Source code for datumaro.cli.commands.info
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
import argparse
from datumaro.components.annotation import AnnotationType
from datumaro.components.errors import DatasetMergeError, MissingObjectError, ProjectNotFoundError
from datumaro.util.scope import scope_add, scoped
from ..util import MultilineFormatter
from ..util.project import load_project, parse_full_revpath
[docs]
def build_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(
help="Prints dataset overview",
description="""
Prints info about the dataset at <revpath>, or about the current
project's combined dataset, if none is specified.|n
|n
<revpath> - either a dataset path or a revision path. The full
syntax is:|n
- Dataset paths:|n
|s|s- <dataset path>[ :<format> ]|n
- Revision paths:|n
|s|s- <project path> [ @<rev> ] [ :<target> ]|n
|s|s- <rev> [ :<target> ]|n
|s|s- <target>|n
|n
Both forms use the -p/--project as a context for plugins. It can be
useful for dataset paths in targets. When not specified, the current
project's working tree is used.|n
|n
Examples:|n
- Print dataset info for the current project's working tree:|n
|s|s%(prog)s|n
|n
- Print dataset info for a path and a format name:|n
|s|s%(prog)s path/to/dataset:voc|n
|n
- Print dataset info for a source from a past revision:|n
|s|s%(prog)s HEAD~2:source-2
""",
formatter_class=MultilineFormatter,
)
parser.add_argument(
"target", nargs="?", default="project", metavar="revpath", help="Target dataset revpath"
)
parser.add_argument("--all", action="store_true", help="Print all information")
parser.add_argument(
"-p",
"--project",
dest="project_dir",
help="Directory of the current project (default: current dir)",
)
parser.set_defaults(command=info_command)
return parser
[docs]
def get_sensitive_args():
return {
info_command: [
"target",
"project_dir",
],
}
[docs]
@scoped
def info_command(args):
project = None
try:
project = scope_add(load_project(args.project_dir))
except ProjectNotFoundError:
if args.project_dir:
raise
dataset = None
dataset_problem = ""
try:
# TODO: avoid computing working tree hashes
dataset, target_project = parse_full_revpath(args.target, project)
if target_project:
scope_add(target_project)
except DatasetMergeError as e:
dataset_problem = (
"Can't merge project sources automatically: %s. "
"The conflicting sources are: %s"
% (
e,
", ".join(e.sources),
)
)
except MissingObjectError as e:
dataset_problem = str(e)
def print_dataset_info(dataset, indent=""):
print("%slength:" % indent, len(dataset))
categories = dataset.categories()
print("%scategories:" % indent, ", ".join(c.name for c in categories))
for cat_type, cat in categories.items():
print("%s %s:" % (indent, cat_type.name))
if cat_type == AnnotationType.label:
print("%s count:" % indent, len(cat.items))
count_threshold = 10
if args.all:
count_threshold = len(cat.items)
labels = ", ".join(c.name for c in cat.items[:count_threshold])
if count_threshold < len(cat.items):
labels += " (and %s more)" % (len(cat.items) - count_threshold)
print("%s labels:" % indent, labels)
if dataset is not None:
print_dataset_info(dataset)
subsets = dataset.subsets()
print("subsets:", ", ".join(subsets))
for subset_name in subsets:
subset = dataset.get_subset(subset_name)
print(" '%s':" % subset_name)
print_dataset_info(subset, indent=" ")
else:
print("Dataset info is not available: ", dataset_problem)
return 0