# Source code for datumaro.cli.commands.validate
# Copyright (C) 2020-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
import argparse
import logging as log
from datumaro.components.environment import DEFAULT_ENVIRONMENT
from datumaro.components.errors import ProjectNotFoundError
from datumaro.components.validator import TaskType
from datumaro.util import dump_json_file
from datumaro.util.scope import scope_add, scoped
from ..util import MultilineFormatter
from ..util.errors import CliException
from ..util.project import generate_next_file_name, load_project, parse_full_revpath
def build_parser(parser_ctor=argparse.ArgumentParser):
    """Build the argument parser for the ``validate`` CLI command.

    Args:
        parser_ctor: Factory used to create the parser object. Allows this
            command to be mounted as a subparser (default:
            ``argparse.ArgumentParser``).

    Returns:
        The configured parser, with ``command`` defaulting to
        ``validate_command``.
    """
    parser = parser_ctor(
        help="Validate project",
        description="""
        Validates a dataset according to the task type and
        reports summary in a JSON file.|n
        Target dataset is specified by a revpath. The full syntax is:|n
        - Dataset paths:|n
        |s|s- <dataset path>[ :<format> ]|n
        - Revision paths:|n
        |s|s- <project path> [ @<rev> ] [ :<target> ]|n
        |s|s- <rev> [ :<target> ]|n
        |s|s- <target>|n
        |n
        Both forms use the -p/--project as a context for plugins. It can be
        useful for dataset paths in targets. When not specified, the current
        project's working tree is used.|n
        |n
        Examples:|n
        - Validate a project's subset as a classification dataset:|n |n
        |s|s%(prog)s -t classification -s train
        """,
        formatter_class=MultilineFormatter,
    )

    task_types = ", ".join(t.name for t in TaskType)

    def _parse_task_type(s):
        # Accept task names case-insensitively. argparse expects
        # ArgumentTypeError for invalid values so it can produce a
        # user-friendly error; only KeyError can arise from the enum
        # lookup, so catch exactly that.
        try:
            return TaskType[s.lower()].name
        except KeyError:
            raise argparse.ArgumentTypeError(
                "Unknown task type %s. Expected one of: %s" % (s, task_types)
            ) from None

    parser.add_argument(
        "_positionals", nargs=argparse.REMAINDER, help=argparse.SUPPRESS
    )  # workaround for -- eaten by positionals
    parser.add_argument(
        "target", default="project", nargs="?", help="Target dataset revpath (default: project)"
    )
    parser.add_argument(
        "-t",
        "--task",
        type=_parse_task_type,
        required=True,
        help="Task type for validation, one of %s" % task_types,
    )
    parser.add_argument(
        "-s", "--subset", dest="subset_name", help="Subset to validate (default: whole dataset)"
    )
    parser.add_argument(
        "-p",
        "--project",
        dest="project_dir",
        help="Directory of the project to validate (default: current dir)",
    )
    parser.add_argument(
        "extra_args",
        nargs=argparse.REMAINDER,
        help="Optional arguments for validator (pass '-- -h' for help)",
    )
    parser.set_defaults(command=validate_command)

    return parser
def get_sensitive_args():
    """Declare which arguments of this command may hold sensitive data.

    Returns:
        A mapping from the command callback to the names of its arguments
        that should be masked in telemetry/logs.
    """
    sensitive = ["target", "project_dir", "subset_name", "extra_args"]
    return {validate_command: sensitive}
@scoped
def validate_command(args):
    """Validate a dataset (resolved from a revpath) and dump a JSON report.

    Expects on ``args``: ``_positionals`` (raw trailing argv), ``task``
    (validator/task name), ``subset_name`` (optional subset filter),
    ``project_dir`` (optional project location), and ``extra_args``
    (validator-specific options).
    """
    # Re-split the REMAINDER positionals manually: argparse's REMAINDER
    # swallows the "--" separator, so the target and the validator's own
    # extra args are recovered here (see the parser workaround comment).
    has_sep = "--" in args._positionals
    if has_sep:
        pos = args._positionals.index("--")
        if 1 < pos:
            raise argparse.ArgumentError(None, message="Expected no more than 1 target argument")
    else:
        pos = 1
    args.target = (args._positionals[:pos] or ["project"])[0]
    args.extra_args = args._positionals[pos + has_sep :]

    show_plugin_help = "-h" in args.extra_args or "--help" in args.extra_args

    project = None
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is only fatal when the user explicitly pointed
        # at one and is not merely asking for the validator's help text.
        if not show_plugin_help and args.project_dir:
            raise

    # Use the project's plugin environment when available, so project-local
    # validators are visible; otherwise fall back to the built-in set.
    if project is not None:
        env = project.env
    else:
        env = DEFAULT_ENVIRONMENT

    try:
        validator_type = env.validators[args.task]
    except KeyError:
        raise CliException("Validator type '%s' is not found" % args.task)

    extra_args = validator_type.parse_cmdline(args.extra_args)

    dataset, target_project = parse_full_revpath(args.target, project)
    if target_project:
        # Register the temporary project for cleanup by @scoped.
        scope_add(target_project)

    dst_file_name = "validation-report"
    if args.subset_name is not None:
        dataset = dataset.get_subset(args.subset_name)
        dst_file_name += f"-{args.subset_name}"

    validator = validator_type(**extra_args)
    report = validator.validate(dataset)

    def _make_serializable(d):
        # JSON keys must be strings; recursively convert tuple/int keys
        # produced by the validator into their string form, in place.
        for key, val in list(d.items()):
            # tuple key to str
            if isinstance(key, tuple) or isinstance(key, int):
                d[str(key)] = val
                d.pop(key)
            if isinstance(val, dict):
                _make_serializable(val)

    _make_serializable(report)

    dst_file = generate_next_file_name(dst_file_name, ext=".json")
    log.info("Writing project validation results to '%s'" % dst_file)
    dump_json_file(dst_file, report, indent=True, allow_numpy=True)