Source code for datumaro.cli.commands.validate

# Copyright (C) 2020-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import argparse
import logging as log

from datumaro.components.environment import DEFAULT_ENVIRONMENT
from datumaro.components.errors import ProjectNotFoundError
from datumaro.components.validator import TaskType
from datumaro.util import dump_json_file
from datumaro.util.scope import scope_add, scoped

from ..util import MultilineFormatter
from ..util.errors import CliException
from ..util.project import generate_next_file_name, load_project, parse_full_revpath


def build_parser(parser_ctor=argparse.ArgumentParser):
    """Create and configure the argument parser of the ``validate`` command.

    Args:
        parser_ctor: Callable used to create the parser. It is invoked with
            the ``help``, ``description`` and ``formatter_class`` keyword
            arguments.

    Returns:
        The parser, with all command arguments registered and
        ``validate_command`` stored as the default ``command``.
    """
    # "|n" and "|s" are layout markers; presumably they are expanded by
    # MultilineFormatter - confirm against that class.
    description = (
        " Validates a dataset according to the task type and"
        " reports summary in a JSON file.|n"
        " Target dataset is specified by a revpath. The full syntax is:|n"
        " - Dataset paths:|n"
        " |s|s- <dataset path>[ :<format> ]|n"
        " - Revision paths:|n"
        " |s|s- <project path> [ @<rev> ] [ :<target> ]|n"
        " |s|s- <rev> [ :<target> ]|n"
        " |s|s- <target>|n"
        " |n"
        " Both forms use the -p/--project as a context for plugins. It can be"
        " useful for dataset paths in targets. When not specified, the current"
        " project's working tree is used.|n"
        " |n"
        " Examples:|n"
        " - Validate a project's subset as a classification dataset:|n"
        " |n"
        " |s|s%(prog)s -t classification -s train "
    )
    parser = parser_ctor(
        help="Validate project",
        description=description,
        formatter_class=MultilineFormatter,
    )

    known_task_types = ", ".join(task_type.name for task_type in TaskType)

    def _parse_task_type(s):
        """Convert a command line value to a ``TaskType`` member name."""
        try:
            matched = TaskType[s.lower()]
            return matched.name
        except Exception:
            raise argparse.ArgumentTypeError(
                f"Unknown task type {s}. Expected one of: {known_task_types}"
            )

    # Hidden catch-all positional: a workaround for "--" being eaten by
    # positionals. validate_command() re-derives 'target' and 'extra_args'
    # from it.
    parser.add_argument(
        "_positionals",
        nargs=argparse.REMAINDER,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        "target",
        default="project",
        nargs="?",
        help="Target dataset revpath (default: project)",
    )
    parser.add_argument(
        "-t",
        "--task",
        type=_parse_task_type,
        required=True,
        help=f"Task type for validation, one of {known_task_types}",
    )
    parser.add_argument(
        "-s",
        "--subset",
        dest="subset_name",
        help="Subset to validate (default: whole dataset)",
    )
    parser.add_argument(
        "-p",
        "--project",
        dest="project_dir",
        help="Directory of the project to validate (default: current dir)",
    )
    parser.add_argument(
        "extra_args",
        nargs=argparse.REMAINDER,
        help="Optional arguments for validator (pass '-- -h' for help)",
    )

    parser.set_defaults(command=validate_command)
    return parser
def get_sensitive_args():
    """Return the argument names marked as sensitive for each command.

    Returns:
        A dict mapping the ``validate_command`` handler to a list of the
        attribute names of its parsed arguments.

    NOTE(review): how callers treat "sensitive" arguments (e.g. masking them
    in logs or telemetry) is not visible in this module - confirm with the
    consumer of this function.
    """
    sensitive_names = ["target", "project_dir", "subset_name", "extra_args"]
    return {validate_command: sensitive_names}
@scoped
def validate_command(args):
    """Validate a dataset for the requested task and dump the report as JSON.

    The target revpath and the validator-specific arguments are re-derived
    from ``args._positionals``; ``args.target`` and ``args.extra_args`` are
    overwritten with the result.

    Args:
        args: Parsed command line arguments. The attributes read here are
            ``_positionals``, ``project_dir``, ``task`` and ``subset_name``.

    Raises:
        argparse.ArgumentError: More than one positional argument precedes
            the "--" separator.
        ProjectNotFoundError: The project cannot be loaded, a project
            directory was given explicitly, and validator help was not
            requested.
        CliException: The environment has no validator registered for
            ``args.task``.
    """
    positionals = args._positionals
    has_sep = "--" in positionals
    if has_sep:
        pos = positionals.index("--")
        if 1 < pos:
            raise argparse.ArgumentError(None, message="Expected no more than 1 target argument")
    else:
        pos = 1
    # At most one leading positional is the target; fall back to "project".
    args.target = (positionals[:pos] or ["project"])[0]
    # has_sep is a bool: adding it skips the "--" item itself when present.
    args.extra_args = positionals[pos + has_sep :]

    show_plugin_help = "-h" in args.extra_args or "--help" in args.extra_args

    project = None
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is tolerated when only validator help is wanted
        # or when no project directory was requested explicitly.
        if not show_plugin_help and args.project_dir:
            raise

    if project is not None:
        env = project.env
    else:
        env = DEFAULT_ENVIRONMENT

    try:
        validator_type = env.validators[args.task]
    except KeyError as err:
        # Chain explicitly so the traceback shows the failed lookup as the
        # cause rather than as an error raised while handling another one.
        raise CliException("Validator type '%s' is not found" % args.task) from err

    # Parsed before the dataset is loaded, so bad validator arguments are
    # reported without touching the data.
    extra_args = validator_type.parse_cmdline(args.extra_args)

    dataset, target_project = parse_full_revpath(args.target, project)
    if target_project:
        scope_add(target_project)

    dst_file_name = "validation-report"
    if args.subset_name is not None:
        dataset = dataset.get_subset(args.subset_name)
        dst_file_name += f"-{args.subset_name}"

    validator = validator_type(**extra_args)
    report = validator.validate(dataset)

    def _make_serializable(d):
        """Replace tuple and int keys of ``d`` with ``str(key)``, in place.

        Nested dict values are processed recursively; values of other types
        (including lists) are not descended into.
        """
        # Iterate over a snapshot because the dict is mutated in the loop.
        for key, val in list(d.items()):
            if isinstance(key, (tuple, int)):
                d[str(key)] = val
                d.pop(key)
            if isinstance(val, dict):
                _make_serializable(val)

    _make_serializable(report)

    dst_file = generate_next_file_name(dst_file_name, ext=".json")
    # Lazy logging arguments: the message is only formatted if it is emitted.
    log.info("Writing project validation results to '%s'", dst_file)
    dump_json_file(dst_file, report, indent=True, allow_numpy=True)