# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
import argparse
import logging as log
import os
import os.path as osp
from datumaro.components.environment import DEFAULT_ENVIRONMENT
from datumaro.components.errors import ProjectNotFoundError
from datumaro.util.scope import scope_add, scoped
from ..util import MultilineFormatter
from ..util.errors import CliException
from ..util.project import load_project, parse_full_revpath
[docs]
def build_parser(parser_ctor=argparse.ArgumentParser):
    """Create the argument parser for the dataset "patch" command.

    Args:
        parser_ctor: Callable used to construct the parser. It is invoked
            with ``help``, ``description`` and ``formatter_class`` keyword
            arguments, so it has to accept all three.

    Returns:
        The configured parser. Its ``command`` default is set to
        ``patch_command``.
    """
    # The help text carries '|n' / '|s' markers that are left for
    # MultilineFormatter to interpret (presumably explicit line breaks and
    # spaces - confirm against the formatter). The literal is kept flush-left
    # so that its contents stay exactly as they were.
    description = """
Updates items of the first dataset with items from the second one.|n
|n
By default, datasets are updated in-place. The '-o/--output-dir'
option can be used to specify another output directory. When
updating in-place, use the '--overwrite' parameter along with the
'--save-media' export option (in-place updates fail by default
to prevent data loss).|n
|n
Unlike the regular project data source joining, the datasets are not
required to have the same labels. The labels from the "patch"
dataset are projected onto the labels of the patched dataset,
so only the annotations with the matching labels are used, i.e.
all the annotations having unknown labels are ignored. Currently,
this command doesn't allow to update the label information in the
patched dataset.|n
|n
The command supports passing extra exporting options for the output
dataset. The extra options should be passed after the main arguments
and after the '--' separator. Particularly, this is useful to include
images in the output dataset with '--save-media'.|n
|n
This command can be applied to the current project targets or
arbitrary datasets outside a project. Note that if the target dataset
is read-only (e.g. if it is a project, stage or a cache entry),
the output directory must be provided.|n
|n
This command has the following invocation syntax:
- %(prog)s <target dataset revpath> <patch dataset revpath>|n
|n
<revpath> - either a dataset path or a revision path. The full
syntax is:|n
- Dataset paths:|n
|s|s- <dataset path>[ :<format> ]|n
- Revision paths:|n
|s|s- <project path> [ @<rev> ] [ :<target> ]|n
|s|s- <rev> [ :<target> ]|n
|s|s- <target>|n
|n
The current project (-p/--project) is also used as a context for
plugins, so it can be useful for dataset paths having custom formats.
When not specified, the current project's working tree is used.|n
|n
Examples:|n
- Update a VOC-like dataset with COCO-like annotations:|n
|s|s%(prog)s --overwrite dataset1/:voc dataset2/:coco -- --save-media|n
|n
- Generate a patched dataset, based on a project:|n
|s|s%(prog)s -o patched_proj1/ proj1/ proj2/|n
|n
- Update the "source1" source in the current project with a dataset:|n
|s|s%(prog)s -p proj/ --overwrite source1 path/to/dataset2:coco|n
|n
- Generate a patched source from a previous revision and a dataset:|n
|s|s%(prog)s -o new_src2/ HEAD~2:source-2 path/to/dataset2:yolo|n
|n
- Update a dataset in a custom format, described in a project plugin:|n
|s|s%(prog)s -p proj/ --overwrite dataset/:my_format dataset2/:coco
"""

    cmd_parser = parser_ctor(
        help="Updates dataset from another one",
        description=description,
        formatter_class=MultilineFormatter,
    )

    # Positional arguments: the dataset to be updated, then the update source.
    cmd_parser.add_argument("target", help="Target dataset revpath")
    cmd_parser.add_argument("patch", help="Patch dataset revpath")

    # Output control.
    cmd_parser.add_argument(
        "-o", "--output-dir", dest="dst_dir", default=None,
        help="Output directory (default: save in-place)",
    )
    cmd_parser.add_argument(
        "--overwrite", action="store_true",
        help="Overwrite existing files in the save directory, "
        "if it is not empty",
    )

    # Project used as the context for resolving revpaths.
    cmd_parser.add_argument(
        "-p", "--project", dest="project_dir",
        help="Directory of the 'current' project (default: current dir)",
    )

    # Everything left on the command line is collected for the exporter.
    cmd_parser.add_argument(
        "extra_args", nargs=argparse.REMAINDER,
        help="Additional arguments for exporting (pass '-- -h' for help). "
        "Must be specified after the main command arguments and after "
        "the '--' separator",
    )

    cmd_parser.set_defaults(command=patch_command)
    return cmd_parser
[docs]
def get_sensitive_args():
    """Return the argument names of this command that are marked sensitive.

    Returns:
        A dict with a single entry: ``patch_command`` mapped to the list of
        its argument destination names.
    """
    sensitive_names = ["target", "patch", "dst_dir", "project_dir", "extra_args"]
    return {patch_command: sensitive_names}
[docs]
@scoped
def patch_command(args):
    """Update the target dataset with the patch dataset and save the result.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``target``, ``patch``, ``dst_dir``, ``overwrite``,
            ``project_dir`` and ``extra_args``.

    Returns:
        0 on success.

    Raises:
        ProjectNotFoundError: If ``args.project_dir`` was given explicitly
            and no project could be loaded from it.
        CliException: If no exporter is registered for the target dataset's
            format, or if the destination directory exists, is not empty and
            ``args.overwrite`` is false.
    """
    project = None
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is only an error when one was requested
        # explicitly; otherwise the command continues without a project.
        if args.project_dir:
            raise

    env = project.env if project is not None else DEFAULT_ENVIRONMENT

    target_dataset, _project = parse_full_revpath(args.target, project)
    if _project is not None:
        scope_add(_project)

    try:
        exporter = env.exporters[target_dataset.format]
    except KeyError as err:
        # The parser defines no 'format' argument, so the message has to
        # report the format that was actually looked up.
        raise CliException(
            "Exporter for format '%s' is not found" % target_dataset.format
        ) from err

    extra_args = exporter.parse_cmdline(args.extra_args)

    # Without an explicit output directory the dataset is saved in place.
    dst_dir = args.dst_dir or target_dataset.data_path
    if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
        raise CliException(
            "Directory '%s' already exists " "(pass --overwrite to overwrite)" % dst_dir
        )
    dst_dir = osp.abspath(dst_dir)

    patch_dataset, _project = parse_full_revpath(args.patch, project)
    if _project is not None:
        scope_add(_project)

    target_dataset.update(patch_dataset)
    target_dataset.save(save_dir=dst_dir, **extra_args)

    log.info("Patched dataset has been saved to '%s'", dst_dir)

    return 0