Source code for datumaro.cli.commands.stats

# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import argparse
import logging as log

from datumaro.cli.util.errors import CliException, WrongRevpathError
from datumaro.components.errors import ConflictingCategoriesError, ProjectNotFoundError
from datumaro.components.operations import compute_ann_statistics, compute_image_statistics
from datumaro.util import dump_json_file, str_to_bool
from datumaro.util.scope import scope_add, scoped

from ..util import MultilineFormatter
from ..util.project import generate_next_file_name, load_project, parse_full_revpath

__all__ = [
    "build_parser",
    "get_sensitive_args",
]


[docs] def build_parser(parser_ctor=argparse.ArgumentParser): parser = parser_ctor( help="Get project statistics", description=""" Outputs various project statistics like image mean and std (RGB), annotations count etc.|n |n Target dataset is specified by a revpath. The full syntax is:|n - Dataset paths:|n |s|s- <dataset path>[ :<format> ]|n - Revision paths:|n |s|s- <project path> [ @<rev> ] [ :<target> ]|n |s|s- <rev> [ :<target> ]|n |s|s- <target>|n |n Both forms use the -p/--project as a context for plugins. It can be useful for dataset paths in targets. When not specified, the current project's working tree is used.|n |n Examples:|n - Compute project statistics:|n |s|s%(prog)s """, formatter_class=MultilineFormatter, ) parser.add_argument( "target", default="project", nargs="?", help="Target dataset revpath (default: project)" ) parser.add_argument("-s", "--subset", help="Compute stats only for a specific subset") parser.add_argument( "--image-stats", type=str_to_bool, default=True, help="Compute image mean and std (RGB) (default: %(default)s)", ) parser.add_argument( "--ann-stats", type=str_to_bool, default=True, help="Compute annotation statistics (default: %(default)s)", ) parser.add_argument( "-p", "--project", dest="project_dir", help="Directory of the project to operate on (default: current dir)", ) parser.set_defaults(command=stats_command) return parser
[docs] def get_sensitive_args(): return { stats_command: ["project_dir", "target"], }
[docs] @scoped def stats_command(args): project = None try: project = scope_add(load_project(args.project_dir)) except ProjectNotFoundError: if args.project_dir: raise try: dataset, target_project = parse_full_revpath(args.target, project) except WrongRevpathError as e: for p in e.problems: if isinstance(p, ConflictingCategoriesError): src_names = [src for src in project.working_tree.sources] raise CliException( "There are more than two sources with heterogeneous categories in the project. " "This prevents computing the statistics of the merged one. " f"Please specify one of the sources in the project ({src_names}), " f"such as `datum stats {src_names[0]}`" ) from e raise e if target_project: scope_add(target_project) if args.subset: dataset = dataset.get_subset(args.subset) stats = {} if args.image_stats: stats.update(compute_image_statistics(dataset)) if args.ann_stats: stats.update(compute_ann_statistics(dataset)) dst_file = generate_next_file_name("statistics", ext=".json") log.info("Writing project statistics to '%s'" % dst_file) dump_json_file(dst_file, stats, indent=True)