datumaro.components.project#

Classes

BuildStageType(value)

An enumeration.

CrudProxy()

DiffStatus(value)

An enumeration.

DvcWrapper(project_dir)

GitWrapper(project_dir[, repo])

IgnoreMode(value)

An enumeration.

Pipeline([config])

Project([path, readonly])

ProjectBuildTargets(tree)

ProjectBuilder(project, tree)

ProjectSourceDataset(path, tree, source[, ...])

ProjectSources(tree)

Tree(project[, config, rev])

class datumaro.components.project.ProjectSourceDataset(path: str, tree: Tree, source: str, readonly: bool = False)[source]#

Bases: IDataset

save(save_dir=None, **kwargs)[source]#
property readonly#
property config#
subsets()[source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

get_subset(name)[source]#
infos()[source]#

Returns meta-info of dataset.

categories()[source]#

Returns meta-info about dataset labels.

get(id, subset=None)[source]#

Provides random access to dataset items.

media_type()[source]#

Returns media type of the dataset items.

All the items are supposed to have the same media type. Supposed to be constant and known immediately after the object construction (i.e. doesn’t require dataset iteration).

task_type()[source]#

Returns available task type from dataset annotation types.

class datumaro.components.project.IgnoreMode(value)[source]#

Bases: Enum

An enumeration.

rewrite = 1#
append = 2#
remove = 3#
class datumaro.components.project.CrudProxy[source]#

Bases: Generic[CrudEntry]

get(name: str, default: None | T | CrudEntry = None) None | T | CrudEntry[source]#
items() Iterable[Tuple[str, CrudEntry]][source]#
class datumaro.components.project.ProjectSources(tree: Tree)[source]#

Bases: _DataSourceBase

class datumaro.components.project.BuildStageType(value)[source]#

Bases: Enum

An enumeration.

source = 1#
project = 2#
transform = 3#
filter = 4#
convert = 5#
inference = 6#
explore = 7#
class datumaro.components.project.Pipeline(config: PipelineConfig | None = None)[source]#

Bases: object

property head: str#
property head_node#
get_slice(target) Pipeline[source]#
class datumaro.components.project.ProjectBuilder(project: Project, tree: Tree)[source]#

Bases: object

make_dataset(pipeline: Pipeline) IDataset[source]#
class datumaro.components.project.ProjectBuildTargets(tree: Tree)[source]#

Bases: CrudProxy[BuildTarget]

MAIN_TARGET = 'project'#
BASE_STAGE = 'root'#
add_target(name) BuildTarget[source]#
add_stage(target, value, prev=None, name=None) str[source]#
remove_target(name: str)[source]#
remove_stage(target: str, name: str)[source]#
add_transform_stage(target: str, transform: str, params: Dict | None = None, name: str | None = None)[source]#
add_inference_stage(target: str, model: str, params: Dict | None = None, name: str | None = None)[source]#
add_filter_stage(target: str, expr: str, params: Dict | None = None, name: str | None = None)[source]#
add_convert_stage(target: str, format: str, params: Dict | None = None, name: str | None = None)[source]#
add_explore_stage(target: str, params: Dict | None = None, name: str | None = None)[source]#
static make_target_name(target: str, stage: str | None = None) str[source]#
classmethod split_target_name(name: str) Tuple[str, str][source]#
classmethod strip_target_name(name: str) str[source]#
make_pipeline(target: str) Pipeline[source]#
class datumaro.components.project.GitWrapper(project_dir, repo=None)[source]#

Bases: object

static module()[source]#
property initialized#
init()[source]#
close()[source]#
checkout(ref: str, dst_dir=None, clean=False, force=False)[source]#
add(paths, base=None)[source]#

Adds paths to index. Paths can be truncated relatively to base.

commit(message) str[source]#

Creates a new revision from index. Returns: new revision hash.

GitTree = datumaro.components.project.GitTree#
GitStatus = datumaro.components.project.GitStatus#
status(paths: str | GitTree | Iterable[str] = None, base_dir: str = None) Dict[str, GitStatus][source]#

Compares working directory and index.

Parameters:
  • paths – an iterable of paths to compare, a git.Tree, or None. When None, uses all the paths from HEAD.

  • base_dir – a base path for paths. Paths will be prepended by this. When None or ‘’, uses repo root. Can be useful if the index contains displaced paths, which need to be mapped onto real paths.

The statuses are:
  • “A” for added paths

  • “D” for deleted paths

  • “R” for renamed paths

  • “M” for paths with modified data

  • “T” for paths whose file type changed

Returns: { abspath(base_dir + path): status }

is_ref(rev)[source]#
has_commits()[source]#
get_tree(ref)[source]#
write_tree(tree, base_path: str, include_files: List[str] | None = None)[source]#
property head: str#
property branch: str#
rev_parse(ref: str) Tuple[str, str][source]#

Expands named refs and tags.

Returns: object type, object hash

ignore(paths: str | List[str], mode: None | str | IgnoreMode = None, gitignore: str | None = None)[source]#
HASH_LEN = 40#
classmethod is_hash(s: str) bool[source]#
log(depth=10) List[Tuple[Any, int]][source]#

Returns: a list of (commit, index) pairs

class datumaro.components.project.DvcWrapper(project_dir)[source]#

Bases: object

static module()[source]#
exception DvcError[source]#

Bases: Exception

property initialized#
init()[source]#
close()[source]#
checkout(targets=None)[source]#
add(paths, no_commit=False)[source]#
is_cached(obj_hash)[source]#
obj_path(obj_hash, root=None)[source]#
ignore(paths: str | List[str], mode: None | str | IgnoreMode = None, dvcignore: str | None = None)[source]#
yaml = <module 'ruamel.yaml' from '/home/runner/work/datumaro/datumaro/.tox/build-docs/lib/python3.10/site-packages/ruamel/yaml/__init__.py'>#
yaml_parser = <ruamel.yaml.main.YAML object>#
classmethod get_hash_from_dvcfile(path) str[source]#
FILE_HASH_LEN = 32#
DIR_HASH_SUFFIX = '.dir'#
DIR_HASH_LEN = 36#
classmethod is_file_hash(s: str) bool[source]#
classmethod is_dir_hash(s: str) bool[source]#
classmethod is_hash(s: str) bool[source]#
write_obj(obj_hash, dst_dir, allow_links=True)[source]#
remove_cache_obj(obj_hash: str)[source]#
class datumaro.components.project.Tree(project: Project, config: None | Dict | Config | TreeConfig = None, rev: None | Revision = None)[source]#

Bases: object

save()[source]#
dump(path)[source]#
clone() Tree[source]#
property sources: ProjectSources#
property build_targets: ProjectBuildTargets#
property config: Config#
property env: Environment#
property rev: None | Revision#
make_pipeline(target: str | None = None) Pipeline[source]#
make_dataset(target: None | str | Pipeline = None) Dataset[source]#
property is_working_tree: bool#
source_data_dir(source) str[source]#
class datumaro.components.project.DiffStatus(value)[source]#

Bases: Enum

An enumeration.

added = 1#
modified = 2#
removed = 3#
missing = 4#
foreign_modified = 5#
class datumaro.components.project.Project(path: str | None = None, readonly=False)[source]#

Bases: object

static find_project_dir(path: str) str | None[source]#
static migrate_from_v1_to_v2(src_dir: str, dst_dir: str, skip_import_errors=False)[source]#
classmethod init(path) Project[source]#
close()[source]#
save()[source]#
property readonly: bool#
property working_tree: Tree#
property head: Tree#
property head_rev: Revision#
property branch: str#
property config: Config#
property env: Environment#
property models: Dict[str, Model]#
get_rev(rev: None | Revision) Tree[source]#
Reference conventions:
  • None or “” - working dir

  • “<40 symbols>” - revision hash

is_rev_cached(rev: Revision) bool[source]#
is_obj_cached(obj_hash: ObjectId) bool[source]#
cache_path(obj_hash: ObjectId) str[source]#
source_data_dir(name: str) str[source]#
remove_cache_obj(ref: Revision | ObjectId)[source]#
validate_source_name(name: str)[source]#
compute_source_hash(data_dir: str, dvcfile: str | None = None, no_cache: bool = True) ObjectId[source]#
refresh_source_hash(source: str, no_cache: bool = True) ObjectId[source]#

Computes and updates the source hash in the working directory.

Returns: hash

import_source(name: str, url: str | None, format: str, options: Dict | None = None, *, no_cache: bool = True, no_hash: bool = True, rpath: str | None = None) Source[source]#

Adds a new source (dataset) to the working directory of the project.

When ‘rpath’ is specified, will copy all the data from URL, but read only the specified file. Required to support subtasks and subsets in datasets.

Parameters:
  • name (str) – Name of the new source

  • url (str) – URL of the new source. A path to a file or directory

  • format (str) – Dataset format

  • options (dict) – Options for the format Extractor

  • no_cache (bool) – Don’t put a copy of files into the project cache. Can be used to reduce project cache size.

  • no_hash (bool) – Don’t compute source data hash. Implies “no_cache”. Useful to reduce import time at the cost of disabled data integrity checks.

  • rpath (str) – Used to specify a relative path to the dataset inside of the directory pointed by URL.

Returns: the new source config

add_source(path: str, format: str, options: Dict | None = None, *, rpath: str | None = None) Tuple[str, Source][source]#

Adds a new source (dataset) from the working directory of the project.

Only directories from the project root can be added. This command is useful after a source was removed and you need to re-add it, or when the dataset was copied or downloaded manually.

When ‘rpath’ is specified, will copy all the data from URL, but read only the specified file. Required to support subtasks and subsets in datasets.

Parameters:
  • path (str) – Path to the new source. A path to a directory

  • format (str) – Dataset format

  • options (dict) – Options for the format Extractor

  • rpath (str) – Used to specify a relative path to the dataset inside of the directory pointed by URL.

Returns: the name and the config of the new source

remove_source(name: str, *, force: bool = False, keep_data: bool = True)[source]#
Options:
  • force (bool) - ignores errors and tries to wipe remaining data

  • keep_data (bool) - leaves source data untouched

commit(message: str, *, no_cache: bool = False, allow_empty: bool = False, allow_foreign: bool = False) Revision[source]#

Copies tree and objects from the working dir to the cache. Creates a new commit. Moves the HEAD pointer to the new commit.

Options:

  • no_cache (bool) - don’t put added dataset data into cache,

    store only metainfo. Can be used to reduce storage size.

  • allow_empty (bool) - allow commits with no changes.

  • allow_foreign (bool) - allow commits with changes made not by Datumaro.

Returns: the new commit hash

checkout(rev: None | Revision = None, sources: None | str | Iterable[str] = None, *, force: bool = False)[source]#

Copies tree and objects from the cache to the working tree.

Sets HEAD to the specified revision, unless sources are specified. When sources are specified, only copies objects from the cache to the working tree. When no revision and no sources are specified, restores the sources from the current revision.

By default, uses the current (HEAD) revision.

Options:
  • force (bool) - ignore unsaved changes. By default, an error is raised

is_ref(ref: None | str) bool[source]#
has_commits() bool[source]#
status() Dict[str, DiffStatus][source]#
history(max_count=10) List[Tuple[Revision, str]][source]#
diff(rev_a: Tree | Revision, rev_b: Tree | Revision) Dict[str, DiffStatus][source]#

Compares 2 revision trees.

Returns: { target_name: status } for changed targets

model_data_dir(name: str) str[source]#
make_model(name: str) Launcher[source]#
add_model(name: str, launcher: str, options: Dict[str, Any] | None = None) Model[source]#
remove_model(name: str)[source]#