Source code for datumaro.components.project

# Copyright (C) 2019-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

from __future__ import annotations

import logging as log
import os
import os.path as osp
import re
import shutil
import tempfile
import unittest.mock
from contextlib import ExitStack, suppress
from enum import Enum, auto
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Generic,
    Iterable,
    Iterator,
    List,
    NewType,
    Optional,
    Tuple,
    TypeVar,
    Union,
)

from datumaro.components.config import Config
from datumaro.components.config_model import (
    BuildStage,
    BuildTarget,
    Model,
    PipelineConfig,
    ProjectConfig,
    ProjectLayout,
    Source,
    TreeConfig,
    TreeLayout,
)
from datumaro.components.dataset import DEFAULT_FORMAT, Dataset, IDataset
from datumaro.components.environment import Environment
from datumaro.components.errors import (
    DatasetMergeError,
    EmptyCommitError,
    EmptyPipelineError,
    ForeignChangesError,
    InvalidStageError,
    MigrationError,
    MismatchingObjectError,
    MissingObjectError,
    MissingPipelineHeadError,
    MissingSourceHashError,
    MultiplePipelineHeadsError,
    OldProjectError,
    PathOutsideSourceError,
    ProjectAlreadyExists,
    ProjectNotFoundError,
    ReadonlyDatasetError,
    ReadonlyProjectError,
    SourceExistsError,
    SourceUrlInsideProjectError,
    UnexpectedUrlError,
    UnknownRefError,
    UnknownSourceError,
    UnknownStageError,
    UnknownTargetError,
    UnsavedChangesError,
    VcsAlreadyExists,
    VcsError,
)
from datumaro.components.launcher import Launcher
from datumaro.util import find, parse_json_file, parse_str_enum_value
from datumaro.util.log_utils import catch_logs, logging_disabled
from datumaro.util.os_util import (
    copytree,
    generate_next_name,
    is_subpath,
    make_file_name,
    rmfile,
    rmtree,
)
from datumaro.util.scope import on_error_do, scope_add, scoped

if TYPE_CHECKING:
    import networkx as nx

else:
    from datumaro.util.import_util import lazy_import

    nx = lazy_import("networkx")



[docs]
class ProjectSourceDataset(IDataset):
    def __init__(self, path: str, tree: Tree, source: str, readonly: bool = False):
        config = tree.sources[source]

        rpath = path
        if config.path:
            rpath = osp.join(path, config.path)

        dataset = Dataset.import_from(rpath, env=tree.env, format=config.format, **config.options)

        # Using rpath won't allow to save directly with .save() when a file
        # path is specified. Dataset doesn't know the root location and if
        # it exists at all, but in a project, we do.
        dataset.bind(path, format=dataset.format, options=dataset.options)

        self.__dict__["_dataset"] = dataset

        self.__dict__["_config"] = config
        self.__dict__["_readonly"] = readonly
        self.__dict__["name"] = source


[docs]
    def save(self, save_dir=None, **kwargs):
        if self.readonly and (
            save_dir is None or osp.abspath(save_dir) == osp.abspath(self.data_path)
        ):
            raise ReadonlyDatasetError()
        self._dataset.save(save_dir, **kwargs)


    @property
    def readonly(self):
        return self._readonly or not self.is_bound

    @property
    def config(self):
        return self._config

    def __getattr__(self, name):
        return getattr(self._dataset, name)

    def __setattr__(self, name, value):
        return setattr(self._dataset, name, value)

    def __iter__(self):
        yield from self._dataset

    def __len__(self):
        return len(self._dataset)


[docs]
    def subsets(self):
        return self._dataset.subsets()



[docs]
    def get_subset(self, name):
        return self._dataset.get_subset(name)



[docs]
    def infos(self):
        return self._dataset.infos()



[docs]
    def categories(self):
        return self._dataset.categories()



[docs]
    def get(self, id, subset=None):
        return self._dataset.get(id, subset)



[docs]
    def media_type(self):
        return self._dataset.media_type()



[docs]
    def ann_types(self):
        return self._dataset.ann_types()





[docs]
class IgnoreMode(Enum):
    rewrite = auto()
    append = auto()
    remove = auto()



def _update_ignore_file(
    paths: Union[str, List[str]],
    repo_root: str,
    filepath: str,
    mode: Union[None, str, IgnoreMode] = None,
):
    def _make_ignored_path(path):
        path = osp.join(repo_root, osp.normpath(path))
        assert is_subpath(path, base=repo_root)

        # Prepend the '/' to match only direct childs.
        # Otherwise the rule can be in any path part.
        return "/" + osp.relpath(path, repo_root).replace("\\", "/")

    header = "# The file is autogenerated by Datumaro"

    mode = parse_str_enum_value(mode, IgnoreMode, IgnoreMode.append)

    if isinstance(paths, str):
        paths = [paths]
    paths = {osp.join(repo_root, osp.normpath(p)): _make_ignored_path(p) for p in paths}

    openmode = "r+"
    if not osp.isfile(filepath):
        openmode = "w+"  # r+ cannot create, w truncates
    with open(filepath, openmode) as f:
        lines = []
        if mode in {IgnoreMode.append, IgnoreMode.remove}:
            for line in f:
                lines.append(line.strip())
            f.seek(0)

        new_lines = []
        for line in lines:
            if not line or line.startswith("#"):
                new_lines.append(line)
                continue

            line_path = osp.join(
                repo_root,
                osp.normpath(line.split("#", maxsplit=1)[0]).replace("\\", "/").lstrip("/"),
            )

            if mode == IgnoreMode.append:
                if line_path in paths:
                    paths.pop(line_path)
                new_lines.append(line)
            elif mode == IgnoreMode.remove:
                if line_path not in paths:
                    new_lines.append(line)

        if mode in {IgnoreMode.rewrite, IgnoreMode.append}:
            new_lines.extend(paths.values())

        if not new_lines or new_lines[0] != header:
            print(header, file=f)
        for line in new_lines:
            print(line, file=f)
        f.truncate()


CrudEntry = TypeVar("CrudEntry")
T = TypeVar("T")



[docs]
class CrudProxy(Generic[CrudEntry]):
    @property
    def _data(self) -> Dict[str, CrudEntry]:
        raise NotImplementedError()

    def __len__(self):
        return len(self._data)

    def __getitem__(self, name: str) -> CrudEntry:
        return self._data[name]


[docs]
    def get(
        self, name: str, default: Union[None, T, CrudEntry] = None
    ) -> Union[None, T, CrudEntry]:
        return self._data.get(name, default)


    def __iter__(self) -> Iterator[CrudEntry]:
        return iter(self._data.keys())


[docs]
    def items(self) -> Iterable[Tuple[str, CrudEntry]]:
        return iter(self._data.items())


    def __contains__(self, name: str):
        return name in self._data



class _DataSourceBase(CrudProxy[Source]):
    def __init__(self, tree: Tree, config_field: str):
        self._tree = tree
        self._field = config_field

    @property
    def _data(self) -> Dict[str, Source]:
        return self._tree.config[self._field]

    def add(self, name: str, value: Union[Dict, Config, Source]) -> Source:
        if name in self:
            raise SourceExistsError(name)

        return self._data.set(name, value)

    def remove(self, name: str):
        self._data.remove(name)



[docs]
class ProjectSources(_DataSourceBase):
    def __init__(self, tree: Tree):
        super().__init__(tree, "sources")

    def __getitem__(self, name):
        try:
            return super().__getitem__(name)
        except KeyError as e:
            raise KeyError("Unknown source '%s'" % name) from e




[docs]
class BuildStageType(Enum):
    source = auto()
    project = auto()
    transform = auto()
    filter = auto()
    convert = auto()
    inference = auto()
    explore = auto()




[docs]
class Pipeline:
    @staticmethod
    def _create_graph(config: PipelineConfig):
        graph = nx.DiGraph()
        for entry in config:
            target_name = entry["name"]
            parents = entry["parents"]
            target = BuildStage(entry["config"])

            graph.add_node(target_name, config=target)
            for prev_stage in parents:
                graph.add_edge(prev_stage, target_name)

        return graph

    def __init__(self, config: PipelineConfig = None):
        self._head = None

        if config is not None:
            self._graph = self._create_craph(config)
            if not self.head:
                raise MissingPipelineHeadError()
        else:
            self._graph = nx.DiGraph()

    def __getattr__(self, key):
        return getattr(self._graph, key)

    @staticmethod
    def _find_head_node(graph) -> Optional[str]:
        head = None
        for node in graph.nodes:
            if graph.out_degree(node) == 0:
                if head is not None:
                    raise MultiplePipelineHeadsError(
                        "A pipeline can have only one "
                        "main target, but it has at least 2: %s, %s" % (head, node)
                    )
                head = node
        return head

    @property
    def head(self) -> str:
        if self._head is None:
            self._head = self._find_head_node(self._graph)
        return self._head

    @property
    def head_node(self):
        return self._graph.nodes[self.head]

    @staticmethod
    def _serialize(graph) -> PipelineConfig:
        serialized = PipelineConfig()
        for node_name, node in graph.nodes.items():
            serialized.nodes.append(
                {
                    "name": node_name,
                    "parents": list(graph.predecessors(node_name)),
                    "config": dict(node["config"]),
                }
            )
        return serialized

    @staticmethod
    def _get_subgraph(graph, target):
        """
        Returns a subgraph with all the target dependencies and
        the target itself.
        """
        return graph.subgraph(nx.ancestors(graph, target) | {target})


[docs]
    def get_slice(self, target) -> Pipeline:
        pipeline = Pipeline()
        pipeline._graph = self._get_subgraph(self._graph, target).copy()
        return pipeline





[docs]
class ProjectBuilder:
    def __init__(self, project: Project, tree: Tree):
        self._project = project
        self._tree = tree


[docs]
    def make_dataset(self, pipeline: Pipeline) -> IDataset:
        dataset = self._get_resulting_dataset(pipeline)

        # TODO: May be need to save and load, because it can modify dataset,
        # unless we work with the internal format. For example, it can
        # add format-specific attributes. It should be needed as soon
        # format converting stages (export, convert, load) are allowed.
        #
        # TODO: If the target was rebuilt from sources, it may require saving
        # and hashing, so the resulting hash could be compared with the saved
        # one in the pipeline. This is needed to make sure the reproduced
        # version of the dataset is correct. Currently we only rely on the
        # initial source version check, which can be not enough if stages
        # produce different result (because of the library changes etc).
        #
        # save_in_cache(project, pipeline) # update and check hash in config!
        # dataset = load_dataset(project, pipeline)

        return dataset


    def _run_pipeline(self, pipeline: Pipeline):
        self._validate_pipeline(pipeline)

        missing_sources, wd_hashes = self._find_missing_sources(pipeline)
        for source_name in missing_sources:
            source = self._tree.sources[source_name]

            if wd_hashes.get(source_name):
                raise ForeignChangesError(
                    "Local source '%s' data does not "
                    "match any previous source revision. Probably, the source "
                    "was modified outside Datumaro. You can restore the "
                    "latest source revision with 'checkout' command." % source_name
                )

            if self._project.readonly:
                # Source re-downloading is prohibited in readonly projects
                # because it can seriously hurt free storage space. It must
                # be run manually, so that the user could know about this.
                log.info(
                    "Skipping re-downloading missing source '%s', "
                    "because the project is read-only. Automatic downloading "
                    "is disabled in read-only projects.",
                    source_name,
                )
                continue

            if not source.hash:
                raise MissingSourceHashError(
                    "Unable to re-download source "
                    "'%s': the source was added with no hash information. " % source_name
                )

            with self._project._make_tmp_dir() as tmp_dir:
                obj_hash, _, _ = self._project._download_source(source.url, tmp_dir)

                if source.hash and source.hash != obj_hash:
                    raise MismatchingObjectError(
                        "Downloaded source '%s' data is different "
                        "from what is saved in the build pipeline: "
                        "'%s' vs '%s'" % (source_name, obj_hash, source.hash)
                    )

        return self._init_pipeline(pipeline, working_dir_hashes=wd_hashes)

    def _get_resulting_dataset(self, pipeline):
        graph, head = self._run_pipeline(pipeline)
        return graph.nodes[head]["dataset"]

    def _init_pipeline(self, pipeline: Pipeline, working_dir_hashes=None):
        """
        Initializes datasets in the pipeline nodes. Currently, only the head
        node will have a dataset on exit, so no extra memory is wasted
        for the intermediate nodes.
        """

        def _join_parent_datasets(force=False):
            parents = {p: graph.nodes[p] for p in graph.predecessors(stage_name)}

            if 1 < len(parents) or force:
                try:
                    dataset = Dataset.from_extractors(
                        *(p["dataset"] for p in parents.values()), env=self._tree.env
                    )
                except DatasetMergeError as e:
                    e.sources = set(parents)
                    raise e
            else:
                dataset = next(iter(parents.values()))["dataset"]

            # clear fully utilized datasets to release memory
            for p_name, p in parents.items():
                p["_use_count"] = p.get("_use_count", 0) + 1

                if p_name != head and p["_use_count"] == graph.out_degree(p_name):
                    p.pop("dataset")

            return dataset

        if working_dir_hashes is None:
            working_dir_hashes = {}

        def _try_load_from_disk(stage_name: str, stage_config: BuildStage) -> Dataset:
            # Check if we can restore this stage from the cache or
            # from the working directory.
            #
            # If we have a hash, we have executed this stage already
            # and can have a cache entry or,
            # if this is the last stage of a target in the working tree,
            # we can use data from the working directory.
            stage_hash = stage_config.hash

            data_dir = None
            cached = False

            source_name, source_stage_name = ProjectBuildTargets.split_target_name(stage_name)
            if self._tree.is_working_tree and source_name in self._tree.sources:
                target = self._tree.build_targets[source_name]
                data_dir = self._project.source_data_dir(source_name)
                wd_hash = working_dir_hashes.get(source_name)

                if not stage_hash:
                    if source_stage_name == target.head.name and osp.isdir(data_dir):
                        pass
                    else:
                        log.debug(
                            "Build: skipping loading stage '%s' from "
                            "working dir '%s', because the stage has no hash "
                            "and is not the head stage",
                            stage_name,
                            data_dir,
                        )
                        data_dir = None
                elif not wd_hash:
                    if osp.isdir(data_dir):
                        wd_hash = self._project.compute_source_hash(data_dir)
                        working_dir_hashes[source_name] = wd_hash
                    else:
                        log.debug(
                            "Build: skipping checking working dir '%s', "
                            "because it does not exist",
                            data_dir,
                        )
                        data_dir = None

                if stage_hash and stage_hash != wd_hash:
                    log.debug(
                        "Build: skipping loading stage '%s' from "
                        "working dir '%s', because hashes do not match",
                        stage_name,
                        data_dir,
                    )
                    data_dir = None

            if not data_dir and stage_hash:
                if self._project._is_cached(stage_hash):
                    data_dir = self._project.cache_path(stage_hash)
                    cached = True
                elif self._project._can_retrieve_from_vcs_cache(stage_hash):
                    data_dir = self._project._materialize_obj(stage_hash)
                    cached = True

                if not data_dir or not osp.isdir(data_dir):
                    log.debug(
                        "Build: skipping loading stage '%s' from "
                        "cache obj '%s', because it is not available",
                        stage_name,
                        stage_hash,
                    )
                    return None

            if data_dir:
                assert osp.isdir(data_dir), data_dir
                log.debug("Build: loading stage '%s' from '%s'", stage_name, data_dir)
                return ProjectSourceDataset(
                    data_dir, self._tree, source_name, readonly=cached or self._project.readonly
                )

            return None

        # Pipeline is assumed to be validated already
        graph = pipeline._graph
        head = pipeline.head

        # traverse the graph and initialize nodes from sources to the head
        to_visit = [head]
        while to_visit:
            stage_name = to_visit.pop()
            stage = graph.nodes[stage_name]
            stage_config = stage["config"]
            stage_type = BuildStageType[stage_config.type]
            stage_hash = stage_config.hash

            assert stage.get("dataset") is None

            dataset = _try_load_from_disk(stage_name, stage_config)
            if dataset is not None:
                stage["dataset"] = dataset
                continue

            uninitialized_parents = []
            for p_name in graph.predecessors(stage_name):
                parent = graph.nodes[p_name]
                if parent.get("dataset") is None:
                    uninitialized_parents.append(p_name)

            if uninitialized_parents:
                to_visit.append(stage_name)
                to_visit.extend(uninitialized_parents)
                continue

            if stage_type == BuildStageType.transform:
                kind = stage_config.kind
                try:
                    transform = self._tree.env.transforms[kind]
                except KeyError as e:
                    raise UnknownStageError("Unknown transform '%s'" % kind) from e

                dataset = _join_parent_datasets()
                dataset = dataset.transform(transform, **stage_config.params)

            elif stage_type == BuildStageType.filter:
                dataset = _join_parent_datasets()
                dataset = dataset.filter(**stage_config.params)

            elif stage_type == BuildStageType.inference:
                kind = stage_config.kind
                model = self._project.make_model(kind)

                dataset = _join_parent_datasets()
                dataset = dataset.run_model(model)

            elif stage_type == BuildStageType.source:
                # Stages of type "Source" cannot have inputs,
                # they are build tree inputs themselves
                assert graph.in_degree(stage_name) == 0, stage_name

                # The only valid situation we get here is that it is a
                # generated source:
                # - No cache entry
                # - No local dir data
                source_name = ProjectBuildTargets.strip_target_name(stage_name)
                source = self._tree.sources[source_name]
                if not source.is_generated:
                    # Source is missing in the cache and the working tree,
                    # and cannot be retrieved from the VCS cache.
                    # It is assumed that all the missing sources were
                    # downloaded earlier.
                    raise MissingObjectError(
                        "Failed to initialize stage '%s': "
                        "object '%s' was not found in cache" % (stage_name, stage_hash)
                    )

                # Generated sources do not require a data directory,
                # but they still can be bound to a directory
                if self._tree.is_working_tree:
                    source_dir = self._project.source_data_dir(source_name)
                else:
                    source_dir = None
                dataset = ProjectSourceDataset(
                    source_dir,
                    self._tree,
                    source_name,
                    readonly=not source_dir or self._project.readonly,
                )

            elif stage_type == BuildStageType.project:
                dataset = _join_parent_datasets(force=True)

            elif stage_type == BuildStageType.convert:
                dataset = _join_parent_datasets()

            else:
                raise UnknownStageError("Unexpected stage type '%s'" % stage_type)

            stage["dataset"] = dataset

        return graph, head

    @staticmethod
    def _validate_pipeline(pipeline: Pipeline):
        graph = pipeline._graph
        if (
            len(graph) == 0
            or len(graph) == 1
            and next(iter(graph.nodes))
            == ProjectBuildTargets.make_target_name(
                ProjectBuildTargets.MAIN_TARGET, ProjectBuildTargets.BASE_STAGE
            )
        ):
            raise EmptyPipelineError()

        head = pipeline.head
        if not head:
            raise MissingPipelineHeadError()

        for stage_name, stage in graph.nodes.items():
            stage_type = BuildStageType[stage["config"].type]

            if graph.in_degree(stage_name) == 0:
                if stage_type != BuildStageType.source:
                    raise InvalidStageError(
                        "Stage '%s' of type '%s' must have inputs" % (stage_name, stage_type.name)
                    )
            else:
                if stage_type == BuildStageType.source:
                    raise InvalidStageError(
                        "Stage '%s' of type '%s' can't have inputs" % (stage_name, stage_type.name)
                    )

            if graph.out_degree(stage_name) == 0:
                if stage_name != head:
                    raise InvalidStageError(
                        "Stage '%s' of type '%s' has no outputs, "
                        "but is not the head stage" % (stage_name, stage_type.name)
                    )

    def _find_missing_sources(self, pipeline: Pipeline):
        work_dir_hashes = {}

        def _can_retrieve(stage_name: str, stage_config: BuildStage):
            stage_hash = stage_config.hash

            source_name, source_stage_name = ProjectBuildTargets.split_target_name(stage_name)
            if self._tree.is_working_tree and source_name in self._tree.sources:
                target = self._tree.build_targets[source_name]
                data_dir = self._project.source_data_dir(source_name)

                if not stage_hash:
                    return source_stage_name == target.head.name and osp.isdir(data_dir)

                wd_hash = work_dir_hashes.get(source_name)
                if not wd_hash and osp.isdir(data_dir):
                    wd_hash = self._project.compute_source_hash(
                        self._project.source_data_dir(source_name)
                    )
                    work_dir_hashes[source_name] = wd_hash

                if stage_hash and stage_hash == wd_hash:
                    return True

            if stage_hash and self._project.is_obj_cached(stage_hash):
                return True

            return False

        missing_sources = set()
        checked_deps = set()
        unchecked_deps = [pipeline.head]
        while unchecked_deps:
            stage_name = unchecked_deps.pop()
            if stage_name in checked_deps:
                continue

            stage_config = pipeline._graph.nodes[stage_name]["config"]

            if not _can_retrieve(stage_name, stage_config):
                if pipeline._graph.in_degree(stage_name) == 0:
                    assert stage_config.type == "source", stage_config.type
                    source_name = self._tree.build_targets.strip_target_name(stage_name)
                    source = self._tree.sources[source_name]
                    if not source.is_generated:
                        missing_sources.add(source_name)
                else:
                    for p in pipeline._graph.predecessors(stage_name):
                        if p not in checked_deps:
                            unchecked_deps.append(p)
                    continue

            checked_deps.add(stage_name)
        return missing_sources, work_dir_hashes




[docs]
class ProjectBuildTargets(CrudProxy[BuildTarget]):
    MAIN_TARGET = "project"
    BASE_STAGE = "root"

    def __init__(self, tree: Tree):
        self._tree = tree

    @property
    def _data(self):
        data = self._tree.config.build_targets

        if self.MAIN_TARGET not in data:
            data[self.MAIN_TARGET] = {
                "stages": [
                    BuildStage(
                        {
                            "name": self.BASE_STAGE,
                            "type": BuildStageType.project.name,
                        }
                    ),
                ]
            }

        for source in self._tree.sources:
            if source not in data:
                data[source] = {
                    "stages": [
                        BuildStage(
                            {
                                "name": self.BASE_STAGE,
                                "type": BuildStageType.source.name,
                            }
                        ),
                    ]
                }

        return data

    def __contains__(self, key):
        if "." in key:
            target, stage = self.split_target_name(key)
            return target in self._data and self._data[target].find_stage(stage) is not None
        return key in self._data


[docs]
    def add_target(self, name) -> BuildTarget:
        return self._data.set(
            name,
            {
                "stages": [
                    BuildStage(
                        {
                            "name": self.BASE_STAGE,
                            "type": BuildStageType.source.name,
                        }
                    ),
                ]
            },
        )



[docs]
    def add_stage(self, target, value, prev=None, name=None) -> str:
        target_name = target
        target_stage_name = None
        if "." in target:
            target_name, target_stage_name = self.split_target_name(target)

        if prev is None:
            prev = target_stage_name

        target = self._data[target_name]

        if prev:
            prev_stage = find(enumerate(target.stages), lambda e: e[1].name == prev)
            if prev_stage is None:
                raise KeyError("Can't find stage '%s'" % prev)
            prev_stage = prev_stage[0]
        else:
            prev_stage = len(target.stages) - 1

        name = value.get("name") or name
        if not name:
            name = generate_next_name(
                (s.name for s in target.stages), "stage", sep="-", default="1"
            )
        else:
            if target.find_stage(name):
                raise VcsError("Stage '%s' already exists" % name)
        value["name"] = name

        value = BuildStage(value)
        assert value.type in BuildStageType.__members__
        target.stages.insert(prev_stage + 1, value)

        return self.make_target_name(target_name, name)



[docs]
    def remove_target(self, name: str):
        assert name != self.MAIN_TARGET, "Can't remove the main target"
        self._data.remove(name)



[docs]
    def remove_stage(self, target: str, name: str):
        assert name not in {self.BASE_STAGE}, "Can't remove a default stage"

        target = self._data[target]
        idx = find(enumerate(target.stages), lambda e: e[1].name == name)
        if idx is None:
            raise KeyError("Can't find stage '%s'" % name)
        target.stages.remove(idx)



[docs]
    def add_transform_stage(
        self, target: str, transform: str, params: Optional[Dict] = None, name: Optional[str] = None
    ):
        if transform not in self._tree.env.transforms:
            raise KeyError("Unknown transform '%s'" % transform)

        return self.add_stage(
            target,
            {
                "type": BuildStageType.transform.name,
                "kind": transform,
                "params": params or {},
            },
            name=name,
        )



[docs]
    def add_inference_stage(
        self, target: str, model: str, params: Optional[Dict] = None, name: Optional[str] = None
    ):
        if model not in self._tree._project.models:
            raise KeyError("Unknown model '%s'" % model)

        return self.add_stage(
            target,
            {
                "type": BuildStageType.inference.name,
                "kind": model,
                "params": params or {},
            },
            name=name,
        )



[docs]
    def add_filter_stage(
        self, target: str, expr: str, params: Optional[Dict] = None, name: Optional[str] = None
    ):
        params = params or {}
        params["expr_or_filter_func"] = expr
        return self.add_stage(
            target,
            {
                "type": BuildStageType.filter.name,
                "params": params,
            },
            name=name,
        )



[docs]
    def add_convert_stage(
        self, target: str, format: str, params: Optional[Dict] = None, name: Optional[str] = None
    ):
        if not self._tree.env.is_format_known(format):
            raise KeyError("Unknown format '%s'" % format)

        return self.add_stage(
            target,
            {
                "type": BuildStageType.convert.name,
                "kind": format,
                "params": params or {},
            },
            name=name,
        )



[docs]
    def add_explore_stage(
        self, target: str, params: Optional[Dict] = None, name: Optional[str] = None
    ):
        return self.add_stage(
            target,
            {
                "type": BuildStageType.explore.name,
                "params": params or {},
            },
            name=name,
        )



[docs]
    @staticmethod
    def make_target_name(target: str, stage: Optional[str] = None) -> str:
        if stage:
            return "%s.%s" % (target, stage)
        return target



[docs]
    @classmethod
    def split_target_name(cls, name: str) -> Tuple[str, str]:
        if "." in name:
            target, stage = name.split(".", maxsplit=1)
            if not target:
                raise ValueError("Wrong build target name '%s': " "a name can't be empty" % name)
            if not stage:
                raise ValueError(
                    "Wrong build target name '%s': "
                    "expected stage name after the separator" % name
                )
        else:
            target = name
            stage = cls.BASE_STAGE
        return target, stage



[docs]
    @classmethod
    def strip_target_name(cls, name: str) -> str:
        return cls.split_target_name(name)[0]


    def _make_full_pipeline(self) -> Pipeline:
        pipeline = Pipeline()
        graph = pipeline._graph

        for target_name, target in self.items():
            if target_name == self.MAIN_TARGET:
                # main target combines all the others
                prev_stages = [
                    self.make_target_name(n, t.head.name)
                    for n, t in self.items()
                    if n != self.MAIN_TARGET
                ]
            else:
                prev_stages = [self.make_target_name(t, self[t].head.name) for t in target.parents]

            for stage in target.stages:
                stage_name = self.make_target_name(target_name, stage["name"])

                graph.add_node(stage_name, config=stage)

                for prev_stage in prev_stages:
                    graph.add_edge(prev_stage, stage_name)
                prev_stages = [stage_name]

        return pipeline


[docs]
    def make_pipeline(self, target: str) -> Pipeline:
        if target not in self:
            raise UnknownTargetError(target)

        # a subgraph with all the target dependencies
        if "." not in target:
            target = self.make_target_name(target, self[target].head.name)

        return self._make_full_pipeline().get_slice(target)





[docs]
class GitWrapper:

[docs]
    @staticmethod
    def module():
        try:
            import git

            return git
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError(
                "Can't import the 'git' package. "
                "Make sure GitPython is installed, or install it with "
                "'pip install datumaro[default]'."
            ) from e


    def _git_dir(self):
        return osp.join(self._project_dir, ".git")

    def __init__(self, project_dir, repo=None):
        self._project_dir = project_dir
        self.repo = repo

        if repo is None and osp.isdir(project_dir) and osp.isdir(self._git_dir()):
            self.repo = self.module().Repo(project_dir)

    @property
    def initialized(self):
        return self.repo is not None


[docs]
    def init(self):
        if self.initialized:
            return

        repo = self.module().Repo.init(path=self._project_dir)
        repo.config_writer().set_value("user", "name", "User").set_value(
            "user", "email", "<>"
        ).release()

        # GitPython's init produces an incomplete repo, which becomes normal
        # only after a first commit. Unless the commit is done, some
        # GitPython's functions will throw useless errors.
        # Call "git init" directly to have the desired behaviour.
        repo.git.init()

        self.repo = repo



[docs]
    def close(self):
        if self.repo:
            self.repo.close()
            self.repo = None


    def __del__(self):
        with suppress(Exception):
            self.close()


[docs]
    def checkout(self, ref: str, dst_dir=None, clean=False, force=False):
        # If user wants to navigate to a head, we need to supply its object
        # insted of just a string. Otherwise, we'll get a detached head.
        try:
            ref_obj = self.repo.heads[ref]
        except IndexError:
            ref_obj = ref

        commit = self.repo.commit(ref)
        tree = commit.tree

        if not dst_dir:
            dst_dir = self._project_dir

        repo_dir = osp.abspath(self._project_dir)
        dst_dir = osp.abspath(dst_dir)
        assert is_subpath(dst_dir, base=repo_dir)

        if not force:
            statuses = self.status(tree, base_dir=dst_dir)

            # Only modified files produce conflicts in checkout
            dst_rpath = osp.relpath(dst_dir, repo_dir)
            conflicts = [osp.join(dst_rpath, p) for p, s in statuses.items() if s == "M"]
            if conflicts:
                raise UnsavedChangesError(conflicts)

        self.repo.head.ref = ref_obj
        self.repo.head.reset(working_tree=False)

        if clean:
            rmtree(dst_dir)

        self.write_tree(tree, dst_dir)



[docs]
    def add(self, paths, base=None):
        """
        Adds paths to index.
        Paths can be truncated relatively to base.
        """

        path_rewriter = None
        if base:
            base = osp.abspath(base)
            repo_root = osp.abspath(self._project_dir)
            assert is_subpath(base, base=repo_root), "Base path should be inside of the repo"
            base = osp.relpath(base, repo_root)
            path_rewriter = lambda entry: osp.relpath(entry.path, base).replace("\\", "/")

        if isinstance(paths, str):
            paths = [paths]

        # A workaround for path_rewriter incompatibility
        # with directory paths expansion
        paths_to_add = []
        for path in paths:
            if not osp.isdir(path):
                paths_to_add.append(path)
                continue

            for d, _, filenames in os.walk(path):
                for fn in filenames:
                    paths_to_add.append(osp.join(d, fn))

        self.repo.index.add(paths_to_add, path_rewriter=path_rewriter)



[docs]
    def commit(self, message) -> str:
        """
        Creates a new revision from index.
        Returns: new revision hash.
        """
        return self.repo.index.commit(message).hexsha


    GitTree = NewType("GitTree", object)
    GitStatus = NewType("GitStatus", str)


[docs]
    def status(
        self, paths: Union[str, GitTree, Iterable[str]] = None, base_dir: str = None
    ) -> Dict[str, GitStatus]:
        """
        Compares working directory and index.

        Parameters:
            paths: an iterable of paths to compare, a git.Tree, or None.
                When None, uses all the paths from HEAD.
            base_dir: a base path for paths. Paths will be prepended by this.
                When None or '', uses repo root. Can be useful, if index contains
                displaced paths, which needs to be mapped on real paths.

        The statuses are:
            - "A" for added paths
            - "D" for deleted paths
            - "R" for renamed paths
            - "M" for paths with modified data
            - "T" for changed in the type paths

        Returns: { abspath(base_dir + path): status }
        """

        if paths is None or isinstance(paths, self.module().objects.tree.Tree):
            if paths is None:
                tree = self.repo.head.commit.tree
            else:
                tree = paths
            paths = (obj.path for obj in tree.traverse() if obj.type == "blob")
        elif isinstance(paths, str):
            paths = [paths]

        if not base_dir:
            base_dir = self._project_dir

        repo_dir = osp.abspath(self._project_dir)
        base_dir = osp.abspath(base_dir)
        assert is_subpath(base_dir, base=repo_dir)

        statuses = {}
        for obj_path in paths:
            file_path = osp.join(base_dir, obj_path)

            index_entry = self.repo.index.entries.get((obj_path, 0), None)
            file_exists = osp.isfile(file_path)
            if not file_exists and index_entry:
                status = "D"
            elif file_exists and not index_entry:
                status = "A"
            elif file_exists and index_entry:
                # '--ignore-cr-at-eol' doesn't affect '--name-status'
                # so we can't really obtain 'T'
                status = self.repo.git.diff("--ignore-cr-at-eol", index_entry.hexsha, file_path)
                if status:
                    status = "M"
                assert status in {"", "M", "T"}, status
            else:
                status = ""  # ignore missing paths

            if status:
                statuses[obj_path] = status

        return statuses



[docs]
    def is_ref(self, rev):
        try:
            self.repo.commit(rev)
            return True
        except (ValueError, self.module().exc.BadName):
            return False



[docs]
    def has_commits(self):
        return self.is_ref("HEAD")



[docs]
    def get_tree(self, ref):
        return self.repo.tree(ref)



[docs]
    def write_tree(self, tree, base_path: str, include_files: Optional[List[str]] = None):
        os.makedirs(base_path, exist_ok=True)

        for obj in tree.traverse(visit_once=True):
            if include_files and obj.path not in include_files:
                continue

            path = osp.join(base_path, obj.path)
            os.makedirs(osp.dirname(path), exist_ok=True)
            if obj.type == "blob":
                with open(path, "wb") as f:
                    obj.stream_data(f)
            elif obj.type == "tree":
                pass
            else:
                raise ValueError(
                    "Unexpected object type in a " "git tree: %s (%s)" % (obj.type, obj.hexsha)
                )


    @property
    def head(self) -> str:
        return self.repo.head.commit.hexsha

    @property
    def branch(self) -> str:
        if self.repo.head.is_detached:
            return None
        return self.repo.active_branch


[docs]
    def rev_parse(self, ref: str) -> Tuple[str, str]:
        """
        Expands named refs and tags.

        Returns: object type, object hash
        """
        obj = self.repo.rev_parse(ref)
        return obj.type, obj.hexsha



[docs]
    def ignore(
        self,
        paths: Union[str, List[str]],
        mode: Union[None, str, IgnoreMode] = None,
        gitignore: Optional[str] = None,
    ):
        if not gitignore:
            gitignore = ".gitignore"
        repo_root = self._project_dir
        gitignore = osp.abspath(osp.join(repo_root, gitignore))
        assert is_subpath(gitignore, base=repo_root), gitignore

        _update_ignore_file(paths, repo_root=repo_root, mode=mode, filepath=gitignore)


    HASH_LEN = 40


[docs]
    @classmethod
    def is_hash(cls, s: str) -> bool:
        return len(s) == cls.HASH_LEN



[docs]
    def log(self, depth=10) -> List[Tuple[Any, int]]:
        """
        Returns: a list of (commit, index) pairs
        """

        commits = []

        if not self.has_commits():
            return commits

        for commit in zip(self.repo.iter_commits(rev="HEAD"), range(depth)):
            commits.append(commit)
        return commits





[docs]
class DvcWrapper:

[docs]
    @staticmethod
    def module():
        try:
            import dvc
            import dvc.cli
            import dvc.env
            import dvc.repo

            return dvc
        except ModuleNotFoundError as e:
            raise ModuleNotFoundError(
                "Can't import the 'dvc' package. "
                "Make sure DVC is installed, or install it with "
                "'pip install datumaro[default]'."
            ) from e


    def _dvc_dir(self):
        return osp.join(self._project_dir, ".dvc")


[docs]
    class DvcError(Exception):
        pass


    def __init__(self, project_dir):
        self._project_dir = project_dir
        self.repo = None

        if osp.isdir(project_dir) and osp.isdir(self._dvc_dir()):
            with logging_disabled():
                self.repo = self.module().repo.Repo(project_dir)

    @property
    def initialized(self):
        return self.repo is not None


[docs]
    def init(self):
        if self.initialized:
            return

        with logging_disabled():
            self.repo = self.module().repo.Repo.init(self._project_dir)

        repo_dir = osp.join(self._project_dir, ".dvc")
        _update_ignore_file(
            [osp.join(repo_dir, "plots")],
            filepath=osp.join(repo_dir, ".gitignore"),
            repo_root=repo_dir,
        )



[docs]
    def close(self):
        if self.repo:
            self.repo.close()
            self.repo = None


    def __del__(self):
        with suppress(Exception):
            self.close()


[docs]
    def checkout(self, targets=None):
        args = ["checkout"]
        if targets:
            if isinstance(targets, str):
                args.append(targets)
            else:
                args.extend(targets)
        self._exec(args)



[docs]
    def add(self, paths, no_commit=False):
        args = ["add"]
        if no_commit:
            args.append("--no-commit")
        if paths:
            if isinstance(paths, str):
                args.append(paths)
            else:
                args.extend(paths)
        self._exec(args)


    def _exec(self, args, hide_output=True, answer_on_input="y"):
        args = ["--cd", self._project_dir] + args

        # Avoid calling an extra process. Improves call performance and
        # removes an extra console window on Windows.
        os.environ[self.module().env.DVC_NO_ANALYTICS] = "1"

        with ExitStack() as es:
            es.callback(os.chdir, os.getcwd())  # restore cd after DVC

            if answer_on_input is not None:

                def _input(*args):
                    return answer_on_input

                es.enter_context(unittest.mock.patch("dvc.prompt.input", new=_input))

            log.debug("Calling DVC main with args: %s", args)

            logs = es.enter_context(catch_logs("dvc"))
            retcode = self.module().cli.main(args)

        logs = logs.getvalue()
        if retcode != 0:
            raise self.DvcError(logs)
        if not hide_output:
            print(logs)
        return logs


[docs]
    def is_cached(self, obj_hash):
        path = self.obj_path(obj_hash)
        if not osp.isfile(path):
            return False

        if obj_hash.endswith(self.DIR_HASH_SUFFIX):
            objects = parse_json_file(path)
            for entry in objects:
                if not osp.isfile(self.obj_path(entry["md5"])):
                    return False

        return True



[docs]
    def obj_path(self, obj_hash, root=None):
        assert self.is_hash(obj_hash), obj_hash
        if not root:
            root = osp.join(self._project_dir, ".dvc", "cache", "files", "md5")
        return osp.join(root, obj_hash[:2], obj_hash[2:])



[docs]
    def ignore(
        self,
        paths: Union[str, List[str]],
        mode: Union[None, str, IgnoreMode] = None,
        dvcignore: Optional[str] = None,
    ):
        if not dvcignore:
            dvcignore = ".dvcignore"
        repo_root = self._project_dir
        dvcignore = osp.abspath(osp.join(repo_root, dvcignore))
        assert is_subpath(dvcignore, base=repo_root), dvcignore

        _update_ignore_file(paths, repo_root=repo_root, mode=mode, filepath=dvcignore)


    # This ruamel parser is needed to preserve comments,
    # order and form (if multiple forms allowed by the standard)
    # of the entries in the file. It can be reused.
    import ruamel.yaml as yaml

    yaml_parser = yaml.YAML(typ="rt")


[docs]
    @classmethod
    def get_hash_from_dvcfile(cls, path) -> str:
        with open(path) as f:
            contents = cls.yaml_parser.load(f)
        return contents["outs"][0]["md5"]


    FILE_HASH_LEN = 32
    DIR_HASH_SUFFIX = ".dir"
    DIR_HASH_LEN = FILE_HASH_LEN + len(DIR_HASH_SUFFIX)


[docs]
    @classmethod
    def is_file_hash(cls, s: str) -> bool:
        return len(s) == cls.FILE_HASH_LEN



[docs]
    @classmethod
    def is_dir_hash(cls, s: str) -> bool:
        return len(s) == cls.DIR_HASH_LEN and s.endswith(cls.DIR_HASH_SUFFIX)



[docs]
    @classmethod
    def is_hash(cls, s: str) -> bool:
        return cls.is_file_hash(s) or cls.is_dir_hash(s)



[docs]
    def write_obj(self, obj_hash, dst_dir, allow_links=True):
        def _copy_obj(src, dst, link=False):
            os.makedirs(osp.dirname(dst), exist_ok=True)
            if link:
                os.link(src, dst)
            else:
                shutil.copy(src, dst, follow_symlinks=True)

        src = self.obj_path(obj_hash)
        if osp.isfile(src):
            _copy_obj(src, dst_dir, link=allow_links)
            return

        src += self.DIR_HASH_SUFFIX
        if not osp.isfile(src):
            raise UnknownRefError(obj_hash)

        src_meta = parse_json_file(src)
        for entry in src_meta:
            _copy_obj(
                self.obj_path(entry["md5"]), osp.join(dst_dir, entry["relpath"]), link=allow_links
            )



[docs]
    def remove_cache_obj(self, obj_hash: str):
        src = self.obj_path(obj_hash)
        if osp.isfile(src):
            rmfile(src)
            return

        src += self.DIR_HASH_SUFFIX
        if not osp.isfile(src):
            raise UnknownRefError(obj_hash)

        src_meta = parse_json_file(src)
        for entry in src_meta:
            entry_path = self.obj_path(entry["md5"])
            if osp.isfile(entry_path):
                rmfile(entry_path)

        rmfile(src)





[docs]
class Tree:
    # can be:
    # - attached to the work dir
    # - attached to a revision

    def __init__(
        self,
        project: Project,
        config: Union[None, Dict, Config, TreeConfig] = None,
        rev: Union[None, Revision] = None,
    ):
        assert isinstance(project, Project)
        assert not rev or project.is_ref(rev), rev

        if not isinstance(config, TreeConfig):
            config = TreeConfig(config)
        if config.format_version != 2:
            raise ValueError(
                "Unexpected tree config version '%s', expected 2" % config.format_version
            )
        self._config = config

        self._project = project
        self._rev = rev

        self._sources = ProjectSources(self)
        self._targets = ProjectBuildTargets(self)


[docs]
    def save(self):
        self.dump(self._config.config_path)



[docs]
    def dump(self, path):
        os.makedirs(osp.dirname(path), exist_ok=True)
        self._config.dump(path)



[docs]
    def clone(self) -> Tree:
        return Tree(self._project, TreeConfig(self.config), self._rev)


    @property
    def sources(self) -> ProjectSources:
        return self._sources

    @property
    def build_targets(self) -> ProjectBuildTargets:
        return self._targets

    @property
    def config(self) -> Config:
        return self._config

    @property
    def env(self) -> Environment:
        return self._project.env

    @property
    def rev(self) -> Union[None, Revision]:
        return self._rev


[docs]
    def make_pipeline(self, target: Optional[str] = None) -> Pipeline:
        if not target:
            target = "project"

        return self.build_targets.make_pipeline(target)



[docs]
    def make_dataset(self, target: Union[None, str, Pipeline] = None) -> Dataset:
        if not target or isinstance(target, str):
            pipeline = self.make_pipeline(target)
        elif isinstance(target, Pipeline):
            pipeline = target
        else:
            raise TypeError(f"Unexpected target type {type(target)}")

        return ProjectBuilder(self._project, self).make_dataset(pipeline)


    @property
    def is_working_tree(self) -> bool:
        return not self._rev


[docs]
    def source_data_dir(self, source) -> str:
        if self.is_working_tree:
            return self._project.source_data_dir(source)

        obj_hash = self.build_targets[source].head.hash
        return self._project.cache_path(obj_hash)





[docs]
class DiffStatus(Enum):
    added = auto()
    modified = auto()
    removed = auto()
    missing = auto()
    foreign_modified = auto()



Revision = NewType("Revision", str)  # a commit hash or a named reference
ObjectId = NewType("ObjectId", str)  # a commit or an object hash



[docs]
class Project:

[docs]
    @staticmethod
    def find_project_dir(path: str) -> Optional[str]:
        path = osp.abspath(path)

        if osp.basename(path) != ProjectLayout.aux_dir:
            path = osp.join(path, ProjectLayout.aux_dir)

        if osp.isdir(path):
            return path

        return None



[docs]
    @staticmethod
    @scoped
    def migrate_from_v1_to_v2(src_dir: str, dst_dir: str, skip_import_errors=False):
        if not osp.isdir(src_dir):
            raise FileNotFoundError("Source project is not found")

        if osp.exists(dst_dir):
            raise FileExistsError("Output path already exists")

        src_dir = osp.abspath(src_dir)
        dst_dir = osp.abspath(dst_dir)
        if src_dir == dst_dir:
            raise MigrationError(
                "Source and destination paths are the same. "
                "Project migration cannot be done inplace."
            )

        old_aux_dir = osp.join(src_dir, ".datumaro")
        old_config = Config.parse(osp.join(old_aux_dir, "config.yaml"))
        if old_config.format_version != 1:
            raise MigrationError(
                "Failed to migrate project: "
                "unexpected old version '%s'" % old_config.format_version
            )

        on_error_do(rmtree, dst_dir, ignore_errors=True)
        new_project = scope_add(Project.init(dst_dir))

        new_wtree_dir = osp.join(new_project._aux_dir, ProjectLayout.working_tree_dir)
        os.makedirs(new_wtree_dir, exist_ok=True)

        old_plugins_dir = osp.join(old_aux_dir, "plugins")
        if osp.isdir(old_plugins_dir):
            copytree(old_plugins_dir, osp.join(new_project._aux_dir, ProjectLayout.plugins_dir))

        old_models_dir = osp.join(old_aux_dir, "models")
        if osp.isdir(old_models_dir):
            copytree(old_models_dir, osp.join(new_project._aux_dir, ProjectLayout.models_dir))

        new_project.env.load_plugins(osp.join(new_project._aux_dir, ProjectLayout.plugins_dir))

        new_tree_config = new_project.working_tree.config
        new_local_config = new_project.config

        if "models" in old_config:
            for name, old_model in old_config.models.items():
                new_local_config.models[name] = Model(
                    {"launcher": old_model["launcher"], "options": old_model["options"]}
                )

        if "sources" in old_config:
            for name, old_source in old_config.sources.items():
                is_local = False
                source_dir = osp.join(src_dir, "sources", name)
                url = osp.abspath(osp.join(source_dir, old_source["url"]))
                rpath = None
                if osp.exists(url):
                    if is_subpath(url, source_dir):
                        if url != source_dir:
                            rpath = osp.relpath(url, source_dir)
                            url = source_dir
                        is_local = True
                    elif osp.isfile(url):
                        url, rpath = osp.split(url)
                elif not old_source["url"]:
                    url = ""

                try:
                    source = new_project.import_source(
                        name,
                        url=url,
                        rpath=rpath,
                        format=old_source["format"],
                        options=old_source["options"],
                    )
                    if is_local:
                        source.url = ""

                    new_project.working_tree.make_dataset(name)
                except Exception as e:
                    if not skip_import_errors:
                        raise MigrationError(f"Failed to migrate the source '{name}'") from e
                    else:
                        log.warning(
                            f"Failed to migrate the source '{name}'. "
                            "Try to add this source manually with "
                            "'datum project import', once migration is finished. The "
                            "reason is: %s",
                            e,
                        )
                        new_project.remove_source(name, force=True, keep_data=False)

        old_dataset_dir = osp.join(src_dir, "dataset")
        if osp.isdir(old_dataset_dir):
            # Such source cannot be represented in v2 directly.
            # However, it can be considered a generated source with
            # working tree data.
            name = generate_next_name(
                list(new_tree_config.sources), "local_dataset", sep="-", default="1"
            )
            source = new_project.import_source(name, url=old_dataset_dir, format=DEFAULT_FORMAT)

            # Make the source generated. It can only have local data.
            source.url = ""

        new_project.save()
        new_project.close()


    def __init__(self, path: Optional[str] = None, readonly=False):
        if not path:
            path = osp.curdir
        found_path = self.find_project_dir(path)
        if not found_path:
            raise ProjectNotFoundError(path)

        old_config_path = osp.join(found_path, "config.yaml")
        if osp.isfile(old_config_path):
            if Config.parse(old_config_path).format_version != 2:
                raise OldProjectError()

        self._aux_dir = found_path
        self._root_dir = osp.dirname(found_path)

        self._readonly = readonly

        # Force import errors on missing dependencies.
        #
        # TODO: maybe allow class use in some cases, which not require
        # Git or DVC
        GitWrapper.module()
        DvcWrapper.module()

        self._git = GitWrapper(self._root_dir)
        self._dvc = DvcWrapper(self._root_dir)

        self._working_tree = None
        self._head_tree = None

        local_config = osp.join(self._aux_dir, ProjectLayout.conf_file)
        if osp.isfile(local_config):
            self._config = ProjectConfig.parse(local_config)
        else:
            self._config = ProjectConfig()

        self._env = Environment()

        plugins_dir = osp.join(self._aux_dir, ProjectLayout.plugins_dir)
        if osp.isdir(plugins_dir):
            self._env.load_plugins(plugins_dir)

    def _init_vcs(self):
        # DVC requires Git to be initialized
        if not self._git.initialized:
            self._git.init()
            self._git.ignore(
                [
                    ProjectLayout.cache_dir,
                ],
                gitignore=osp.join(self._aux_dir, ".gitignore"),
            )
            self._git.ignore([])  # create the file
        if not self._dvc.initialized:
            self._dvc.init()
            self._dvc.ignore(
                [
                    osp.join(self._aux_dir, ProjectLayout.cache_dir),
                    osp.join(self._aux_dir, ProjectLayout.working_tree_dir),
                ]
            )
            self._git.repo.index.remove(
                osp.join(self._root_dir, ".dvc", "plots"), r=True, ignore_unmatch=True
            )
        self.commit("Initial commit", allow_empty=True)


[docs]
    @classmethod
    @scoped
    def init(cls, path) -> Project:
        existing_project = cls.find_project_dir(path)
        if existing_project:
            raise ProjectAlreadyExists(path)

        path = osp.abspath(path)
        if osp.basename(path) != ProjectLayout.aux_dir:
            path = osp.join(path, ProjectLayout.aux_dir)

        project_dir = osp.dirname(path)
        if not osp.isdir(project_dir):
            on_error_do(rmtree, project_dir, ignore_errors=True)

        os.makedirs(path, exist_ok=True)

        on_error_do(rmtree, osp.join(project_dir, ProjectLayout.cache_dir), ignore_errors=True)
        on_error_do(rmtree, osp.join(project_dir, ProjectLayout.tmp_dir), ignore_errors=True)
        os.makedirs(osp.join(path, ProjectLayout.cache_dir))
        os.makedirs(osp.join(path, ProjectLayout.tmp_dir))

        git_dir, dvc_dir = osp.join(project_dir, ".git"), osp.join(project_dir, ".dvc")

        if osp.exists(git_dir):
            raise VcsAlreadyExists(git_dir)
        if osp.exists(dvc_dir):
            raise VcsAlreadyExists(dvc_dir)

        on_error_do(rmtree, git_dir, ignore_errors=True)
        on_error_do(rmtree, dvc_dir, ignore_errors=True)

        project = Project(path)
        project._init_vcs()

        return project



[docs]
    def close(self):
        if self._dvc:
            self._dvc.close()
            self._dvc = None

        if self._git:
            self._git.close()
            self._git = None


    def __del__(self):
        with suppress(Exception):
            self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()


[docs]
    def save(self):
        self._config.dump(osp.join(self._aux_dir, ProjectLayout.conf_file))

        if self._working_tree:
            self._working_tree.save()


    @property
    def readonly(self) -> bool:
        return self._readonly

    @property
    def working_tree(self) -> Tree:
        if self._working_tree is None:
            self._working_tree = self.get_rev(None)
        return self._working_tree

    @property
    def head(self) -> Tree:
        if self._head_tree is None:
            self._head_tree = self.get_rev("HEAD")
        return self._head_tree

    @property
    def head_rev(self) -> Revision:
        return self._git.head

    @property
    def branch(self) -> str:
        return self._git.branch

    @property
    def config(self) -> Config:
        return self._config

    @property
    def env(self) -> Environment:
        return self._env

    @property
    def models(self) -> Dict[str, Model]:
        return dict(self._config.models)


[docs]
    def get_rev(self, rev: Union[None, Revision]) -> Tree:
        """
        Reference conventions:
            - None or "" - working dir
            - "<40 symbols>" - revision hash
        """

        obj_type, obj_hash = self._parse_ref(rev)
        assert obj_type == self._ObjectIdKind.tree, obj_type

        if self._is_working_tree_ref(obj_hash):
            config_path = osp.join(
                self._aux_dir, ProjectLayout.working_tree_dir, TreeLayout.conf_file
            )
            if osp.isfile(config_path):
                tree_config = TreeConfig.parse(config_path)
            else:
                tree_config = TreeConfig()
                os.makedirs(osp.dirname(config_path), exist_ok=True)
                tree_config.dump(config_path)
            tree_config.config_path = config_path
            tree_config.base_dir = osp.dirname(config_path)
            tree = Tree(config=tree_config, project=self, rev=obj_hash)
        else:
            if not self.is_rev_cached(obj_hash):
                self._materialize_rev(obj_hash)

            rev_dir = self.cache_path(obj_hash)
            tree_config = TreeConfig.parse(osp.join(rev_dir, TreeLayout.conf_file))
            tree_config.base_dir = rev_dir
            tree = Tree(config=tree_config, project=self, rev=obj_hash)
        return tree



[docs]
    def is_rev_cached(self, rev: Revision) -> bool:
        obj_type, obj_hash = self._parse_ref(rev)
        assert obj_type == self._ObjectIdKind.tree, obj_type
        return self._is_cached(obj_hash)



[docs]
    def is_obj_cached(self, obj_hash: ObjectId) -> bool:
        return self._is_cached(obj_hash) or self._can_retrieve_from_vcs_cache(obj_hash)


    @staticmethod
    def _is_working_tree_ref(ref: Union[None, Revision, ObjectId]) -> bool:
        return not ref

    class _ObjectIdKind(Enum):
        # Project revision data. Currently, a Git commit hash.
        tree = auto()

        # Source revision data. DVC directories and files.
        blob = auto()

    def _parse_ref(self, ref: Union[None, Revision, ObjectId]) -> Tuple[_ObjectIdKind, ObjectId]:
        """
        Resolves the reference to an object hash.
        """

        if self._is_working_tree_ref(ref):
            return self._ObjectIdKind.tree, ref

        try:
            obj_type, obj_hash = self._git.rev_parse(ref)
        except Exception:  # nosec try_except_pass
            pass  # Ignore git errors
        else:
            if obj_type != "commit":
                raise UnknownRefError(obj_hash)

            return self._ObjectIdKind.tree, obj_hash

        try:
            assert self._dvc.is_hash(ref), ref
            return self._ObjectIdKind.blob, ref
        except Exception as e:
            raise UnknownRefError(ref) from e

    def _materialize_rev(self, rev: Revision) -> str:
        """
        Restores the revision tree data in the project cache from Git.

        Returns: cache object path
        """
        # TODO: maybe avoid this operation by providing a virtual filesystem
        # object

        # Allowed to be run when readonly, because it doesn't modify project
        # data and doesn't hurt disk space.

        obj_dir = self.cache_path(rev)
        if osp.isdir(obj_dir):
            return obj_dir

        tree = self._git.get_tree(rev)
        self._git.write_tree(tree, obj_dir)
        return obj_dir

    def _is_cached(self, obj_hash: ObjectId):
        return osp.isdir(self.cache_path(obj_hash))


[docs]
    def cache_path(self, obj_hash: ObjectId) -> str:
        assert self._git.is_hash(obj_hash) or self._dvc.is_hash(obj_hash), obj_hash
        if self._dvc.is_dir_hash(obj_hash):
            obj_hash = obj_hash[: self._dvc.FILE_HASH_LEN]

        return osp.join(self._aux_dir, ProjectLayout.cache_dir, obj_hash[:2], obj_hash[2:])


    def _can_retrieve_from_vcs_cache(self, obj_hash: ObjectId):
        if not self._dvc.is_dir_hash(obj_hash):
            dir_check = self._dvc.is_cached(obj_hash + self._dvc.DIR_HASH_SUFFIX)
        else:
            dir_check = False
        return dir_check or self._dvc.is_cached(obj_hash)


[docs]
    def source_data_dir(self, name: str) -> str:
        return osp.join(self._root_dir, name)


    def _source_dvcfile_path(self, name: str, root: Optional[str] = None) -> str:
        """
        root - Path to the tree root directory. If not set,
            the working tree is used.
        """

        if not root:
            root = osp.join(self._aux_dir, ProjectLayout.working_tree_dir)
        return osp.join(root, TreeLayout.sources_dir, name, "source.dvc")

    def _make_tmp_dir(self, suffix: Optional[str] = None):
        project_tmp_dir = osp.join(self._aux_dir, ProjectLayout.tmp_dir)
        os.makedirs(project_tmp_dir, exist_ok=True)
        if suffix:
            suffix = "_" + suffix

        return tempfile.TemporaryDirectory(suffix=suffix, dir=project_tmp_dir)


[docs]
    def remove_cache_obj(self, ref: Union[Revision, ObjectId]):
        if self.readonly:
            raise ReadonlyProjectError()

        obj_type, obj_hash = self._parse_ref(ref)

        if self._is_cached(obj_hash):
            rmtree(self.cache_path(obj_hash))

        if obj_type == self._ObjectIdKind.tree:
            # Revision metadata is cheap enough and needed to materialize
            # the revision, so we keep it in the Git cache.
            pass
        elif obj_type == self._ObjectIdKind.blob:
            self._dvc.remove_cache_obj(obj_hash)
        else:
            raise ValueError("Unexpected object type '%s'" % obj_type)



[docs]
    def validate_source_name(self, name: str):
        if not name:
            raise ValueError("Source name cannot be empty")

        disallowed_symbols = r"[^\\ \.\~\-\w]"
        found_wrong_symbols = re.findall(disallowed_symbols, name)
        if found_wrong_symbols:
            raise ValueError("Source name contains invalid symbols: %s" % found_wrong_symbols)

        valid_filename = make_file_name(name)
        if valid_filename != name:
            raise ValueError(
                "Source name contains " "invalid symbols: %s" % (set(name) - set(valid_filename))
            )

        if name.startswith("."):
            raise ValueError("Source name can't start with '.'")

        reserved_names = {"dataset", "build", "project"}
        if name.lower() in reserved_names:
            raise ValueError("Source name is reserved for internal use")


    @scoped
    def _download_source(
        self, url: str, dst_dir: str, *, no_cache: bool = False, no_hash: bool = False
    ) -> Tuple[str, str, str]:
        assert url
        assert dst_dir

        dvcfile = osp.join(dst_dir, "source.dvc")
        data_dir = osp.join(dst_dir, "data")

        log.debug(f"Copying from '{url}' to '{data_dir}'")

        if osp.isdir(url):
            copytree(url, data_dir)
        elif osp.isfile(url):
            os.makedirs(data_dir, exist_ok=True)
            shutil.copy(url, data_dir)
        else:
            raise UnexpectedUrlError(url)
        on_error_do(rmtree, data_dir, ignore_errors=True)

        log.debug("Done")

        if not no_hash:
            obj_hash = self.compute_source_hash(data_dir, dvcfile=dvcfile, no_cache=no_cache)
            if not no_cache:
                log.debug("Data is added to DVC cache")
            log.debug("Data hash: '%s'", obj_hash)
        else:
            obj_hash = ""

        return obj_hash, dvcfile, data_dir

    @staticmethod
    def _get_source_hash(dvcfile):
        obj_hash = DvcWrapper.get_hash_from_dvcfile(dvcfile)
        if obj_hash.endswith(DvcWrapper.DIR_HASH_SUFFIX):
            obj_hash = obj_hash[: -len(DvcWrapper.DIR_HASH_SUFFIX)]
        return obj_hash


[docs]
    @scoped
    def compute_source_hash(
        self,
        data_dir: str,
        dvcfile: Optional[str] = None,
        no_cache: bool = True,
    ) -> ObjectId:
        if not dvcfile:
            tmp_dir = scope_add(self._make_tmp_dir())
            dvcfile = osp.join(tmp_dir, "source.dvc")

        self._dvc.add(data_dir, no_commit=no_cache)

        gen_dvcfile = osp.join(self._root_dir, data_dir + ".dvc")
        if os.path.isfile(gen_dvcfile):
            shutil.move(gen_dvcfile, dvcfile)

        obj_hash = self._get_source_hash(dvcfile)
        return obj_hash



[docs]
    def refresh_source_hash(self, source: str, no_cache: bool = True) -> ObjectId:
        """
        Computes and updates the source hash in the working directory.

        Returns: hash
        """

        if self.readonly:
            raise ReadonlyProjectError()

        build_target = self.working_tree.build_targets[source]
        source_dir = self.source_data_dir(source)

        if not osp.isdir(source_dir):
            return None

        dvcfile = self._source_dvcfile_path(source)
        os.makedirs(osp.dirname(dvcfile), exist_ok=True)
        obj_hash = self.compute_source_hash(source_dir, dvcfile=dvcfile, no_cache=no_cache)

        build_target.head.hash = obj_hash
        if not build_target.has_stages:
            self.working_tree.sources[source].hash = obj_hash

        return obj_hash


    def _materialize_obj(self, obj_hash: ObjectId) -> str:
        """
        Restores the object data in the project cache from DVC.

        Returns: cache object path
        """
        # TODO: maybe avoid this operation by providing a virtual filesystem
        # object

        # Allowed to be run when readonly, because it shouldn't hurt disk
        # space, if object is materialized with symlinks.

        if not self._can_retrieve_from_vcs_cache(obj_hash):
            raise MissingObjectError(obj_hash)

        dst_dir = self.cache_path(obj_hash)
        if osp.isdir(dst_dir):
            return dst_dir

        self._dvc.write_obj(obj_hash, dst_dir, allow_links=True)
        return dst_dir


[docs]
    @scoped
    def import_source(
        self,
        name: str,
        url: Optional[str],
        format: str,
        options: Optional[Dict] = None,
        *,
        no_cache: bool = True,
        no_hash: bool = True,
        rpath: Optional[str] = None,
    ) -> Source:
        """
        Adds a new source (dataset) to the working directory of the project.

        When 'rpath' is specified, will copy all the data from URL, but read
        only the specified file. Required to support subtasks and subsets
        in datasets.

        Parameters:
            name (str): Name of the new source
            url (str): URL of the new source. A path to a file or directory
            format (str): Dataset format
            options (dict): Options for the format Extractor
            no_cache (bool): Don't put a copy of files into the project cache.
                Can be used to reduce project cache size.
            no_hash (bool): Don't compute source data hash. Implies "no_cache".
                Useful to reduce import time at the cost of disabled data
                integrity checks.
            rpath (str): Used to specify a relative path to the dataset
                inside of the directory pointed by URL.

        Returns: the new source config
        """

        if self.readonly:
            raise ReadonlyProjectError()

        self.validate_source_name(name)

        if name in self.working_tree.sources:
            raise SourceExistsError(name)

        data_dir = self.source_data_dir(name)
        if osp.exists(data_dir):
            if os.listdir(data_dir):
                raise FileExistsError("Source directory '%s' already " "exists" % data_dir)
            os.rmdir(data_dir)

        if url:
            url = osp.abspath(url)
            if not osp.exists(url):
                raise FileNotFoundError(url)

            if is_subpath(url, base=self._root_dir):
                raise SourceUrlInsideProjectError()

            if rpath:
                rpath = osp.normpath(osp.join(url, rpath))

                if not osp.exists(rpath):
                    raise FileNotFoundError(rpath)

                if not is_subpath(rpath, base=url):
                    raise PathOutsideSourceError(
                        "Source data path is outside of the directory, "
                        "specified by source URL: '%s', '%s'" % (rpath, url)
                    )

                rpath = osp.relpath(rpath, url)
            elif osp.isfile(url):
                rpath = osp.basename(url)
        else:
            rpath = None

        if no_hash:
            no_cache = True

        config = Source(
            {
                "url": (url or "").replace("\\", "/"),
                "path": (rpath or "").replace("\\", "/"),
                "format": format,
                "options": options or {},
            }
        )

        if not config.is_generated:
            dvcfile = self._source_dvcfile_path(name)
            os.makedirs(osp.dirname(dvcfile), exist_ok=True)

            with self._make_tmp_dir() as tmp_dir:
                obj_hash, tmp_dvcfile, tmp_data_dir = self._download_source(
                    url, tmp_dir, no_cache=no_cache, no_hash=no_hash
                )

                shutil.move(tmp_data_dir, data_dir)
                on_error_do(rmtree, data_dir)

                if not no_hash:
                    os.replace(tmp_dvcfile, dvcfile)
                    config["hash"] = obj_hash

        self._git.ignore([data_dir])

        config = self.working_tree.sources.add(name, config)
        target = self.working_tree.build_targets.add_target(name)
        target.root.hash = config.hash

        self.working_tree.save()

        return config



[docs]
    @scoped
    def add_source(
        self, path: str, format: str, options: Optional[Dict] = None, *, rpath: Optional[str] = None
    ) -> Tuple[str, Source]:
        """
        Adds a new source (dataset) from the working directory of the project.

        Only directories from the project root can be added. This command is
        useful after a source was removed and you need to re-add it, or when
        the dataset was copied or downloaded manually.

        When 'rpath' is specified, will copy all the data from URL, but read
        only the specified file. Required to support subtasks and subsets
        in datasets.

        Parameters:
            url (str): URL of the new source. A path to a directory
            format (str): Dataset format
            options (dict): Options for the format Extractor
            rpath (str): Used to specify a relative path to the dataset
                inside of the directory pointed by URL.

        Returns: the name and the config of the new source
        """

        if self.readonly:
            raise ReadonlyProjectError()

        if not path:
            raise ValueError("Source path cannot be empty")

        path = osp.abspath(path)

        name = osp.basename(path)
        self.validate_source_name(name)

        if name in self.working_tree.sources:
            raise SourceExistsError(name)

        if not osp.isdir(path):
            raise FileNotFoundError("Source directory '%s' is not found" % path)

        if not (is_subpath(path, base=self._root_dir) and osp.dirname(path) == self._root_dir):
            raise UnexpectedUrlError(
                "The source path is expected to be " "a directory in the project root"
            )

        if rpath:
            rpath = osp.normpath(osp.join(path, rpath))

            if not osp.exists(rpath):
                raise FileNotFoundError(rpath)

            if not is_subpath(rpath, base=path):
                raise PathOutsideSourceError(
                    "Source data path is outside of the directory, "
                    "specified by source URL: '%s', '%s'" % (rpath, path)
                )

            rpath = osp.relpath(rpath, path)
        else:
            rpath = None

        self._git.ignore([path])

        config = self.working_tree.sources.add(
            name,
            {
                "url": (path or "").replace("\\", "/"),
                "path": (rpath or "").replace("\\", "/"),
                "format": format,
                "options": options or {},
            },
        )
        self.working_tree.build_targets.add_target(name)

        self.working_tree.save()

        return name, config



[docs]
    def remove_source(self, name: str, *, force: bool = False, keep_data: bool = True):
        """
        Options:
            - force (bool) - ignores errors and tries to wipe remaining data
            - keep_data (bool) - leaves source data untouched
        """

        if self.readonly:
            raise ReadonlyProjectError()

        if name not in self.working_tree.sources and not force:
            raise UnknownSourceError(name)

        self.working_tree.sources.remove(name)

        data_dir = self.source_data_dir(name)
        if not keep_data:
            if osp.isdir(data_dir):
                rmtree(data_dir)

        dvcfile = self._source_dvcfile_path(name)
        if osp.isfile(dvcfile):
            try:
                rmfile(dvcfile)
            except Exception:
                if not force:
                    raise

        self.working_tree.build_targets.remove_target(name)

        self.working_tree.save()

        self._git.ignore([data_dir], mode="remove")



[docs]
    def commit(
        self,
        message: str,
        *,
        no_cache: bool = False,
        allow_empty: bool = False,
        allow_foreign: bool = False,
    ) -> Revision:
        """
        Copies tree and objects from the working dir to the cache.
        Creates a new commit. Moves the HEAD pointer to the new commit.

        Options:

            - no_cache (bool) - don't put added dataset data into cache,
                store only metainfo. Can be used to reduce storage size.
            - allow_empty (bool) - allow commits with no changes.
            - allow_foreign (bool) - allow commits with changes made not by Datumaro.

        Returns: the new commit hash
        """

        if self.readonly:
            raise ReadonlyProjectError()

        statuses = self.status()

        if not allow_empty and not statuses:
            raise EmptyCommitError()

        for t, s in statuses.items():
            if s == DiffStatus.foreign_modified:
                # TODO: compute a patch and a new stage, remove allow_foreign
                if allow_foreign:
                    log.warning(
                        "The source '%s' has been changed "
                        "without Datumaro. It will be saved, but it will "
                        "only be available for reproduction from the cache.",
                        t,
                    )
                else:
                    raise ForeignChangesError(
                        "The source '%s' is changed outside Datumaro. You can "
                        "restore the latest source revision with 'checkout' "
                        "command." % t
                    )

        for s in self.working_tree.sources:
            self.refresh_source_hash(s, no_cache=no_cache)

        wtree_dir = osp.join(self._aux_dir, ProjectLayout.working_tree_dir)
        self.working_tree.save()
        self._git.add(wtree_dir, base=wtree_dir)

        extra_files = [
            osp.join(self._root_dir, ".dvc", ".gitignore"),
            osp.join(self._root_dir, ".dvc", "config"),
            osp.join(self._root_dir, ".dvcignore"),
            osp.join(self._root_dir, ".gitignore"),
            osp.join(self._aux_dir, ".gitignore"),
        ]
        self._git.add(extra_files, base=self._root_dir)

        head = self._git.commit(message)

        rev_dir = self.cache_path(head)
        copytree(wtree_dir, rev_dir)
        for p in extra_files:
            if osp.isfile(p):
                dst_path = osp.join(rev_dir, osp.relpath(p, self._root_dir))
                os.makedirs(osp.dirname(dst_path), exist_ok=True)
                shutil.copyfile(p, dst_path)

        self._head_tree = None

        return head


    @staticmethod
    def _move_dvc_dir(src_dir, dst_dir):
        for name in {"config", ".gitignore"}:
            os.replace(osp.join(src_dir, name), osp.join(dst_dir, name))


[docs]
    def checkout(
        self,
        rev: Union[None, Revision] = None,
        sources: Union[None, str, Iterable[str]] = None,
        *,
        force: bool = False,
    ):
        """
        Copies tree and objects from the cache to the working tree.

        Sets HEAD to the specified revision, unless sources specified.
        When sources specified, only copies objects from the cache to
        the working tree. When no revision and no sources is specified,
        restores the sources from the current revision.

        By default, uses the current (HEAD) revision.

        Options:
            - force (bool) - ignore unsaved changes. By default, an error is raised
        """

        if self.readonly:
            raise ReadonlyProjectError()

        if isinstance(sources, str):
            sources = {sources}
        elif sources is None:
            sources = {}
        else:
            sources = set(sources)

        rev = rev or "HEAD"

        if sources:
            rev_tree = self.get_rev(rev)

            # Check targets
            for s in sources:
                if s not in rev_tree.sources:
                    raise UnknownSourceError(s)

            rev_dir = rev_tree.config.base_dir
            with self._make_tmp_dir() as tmp_dir:
                dvcfiles = []

                for s in sources:
                    dvcfile = self._source_dvcfile_path(s, root=rev_dir)

                    tmp_dvcfile = osp.join(tmp_dir, s + ".dvc")
                    with open(dvcfile) as f:
                        conf = self._dvc.yaml_parser.load(f)

                    conf["wdir"] = self._root_dir

                    with open(tmp_dvcfile, "w") as f:
                        self._dvc.yaml_parser.dump(conf, f)

                    dvcfiles.append(tmp_dvcfile)

                self._dvc.checkout(dvcfiles)

            self._git.ignore(sources)

            for s in sources:
                self.working_tree.config.sources[s] = rev_tree.config.sources[s]
                self.working_tree.config.build_targets[s] = rev_tree.config.build_targets[s]

            self.working_tree.save()
        else:
            # Check working tree for unsaved changes,
            # set HEAD to the revision
            # write revision tree to working tree
            wtree_dir = osp.join(self._aux_dir, ProjectLayout.working_tree_dir)
            self._git.checkout(rev, dst_dir=wtree_dir, clean=True, force=force)
            self._move_dvc_dir(osp.join(wtree_dir, ".dvc"), osp.join(self._root_dir, ".dvc"))

            self._working_tree = None

            # Restore sources from the commit.
            # Work with the working tree instead of cache, to
            # avoid extra memory use from materializing
            # the head commit sources in the cache
            rev_tree = self.working_tree
            with self._make_tmp_dir() as tmp_dir:
                dvcfiles = []

                for s in rev_tree.sources:
                    dvcfile = self._source_dvcfile_path(s)

                    tmp_dvcfile = osp.join(tmp_dir, s + ".dvc")
                    with open(dvcfile) as f:
                        conf = self._dvc.yaml_parser.load(f)

                    conf["wdir"] = self._root_dir

                    with open(tmp_dvcfile, "w") as f:
                        self._dvc.yaml_parser.dump(conf, f)

                    dvcfiles.append(tmp_dvcfile)

                self._dvc.checkout(dvcfiles)

            os.replace(osp.join(wtree_dir, ".gitignore"), osp.join(self._root_dir, ".gitignore"))
            os.replace(osp.join(wtree_dir, ".dvcignore"), osp.join(self._root_dir, ".dvcignore"))

        self._working_tree = None



[docs]
    def is_ref(self, ref: Union[None, str]) -> bool:
        if self._is_working_tree_ref(ref):
            return True
        return self._git.is_ref(ref)



[docs]
    def has_commits(self) -> bool:
        return self._git.has_commits()



[docs]
    def status(self) -> Dict[str, DiffStatus]:
        wd = self.working_tree

        if not self.has_commits():
            return {s: DiffStatus.added for s in wd.sources}

        head = self.head

        changed_targets = {}

        for t_name, wd_target in wd.build_targets.items():
            if t_name == ProjectBuildTargets.MAIN_TARGET:
                continue

            if osp.isdir(self.source_data_dir(t_name)):
                old_hash = wd_target.head.hash
                new_hash = self.compute_source_hash(t_name, no_cache=True)

                if old_hash and old_hash != new_hash:
                    changed_targets[t_name] = DiffStatus.foreign_modified

        for t_name in set(head.build_targets) | set(wd.build_targets):
            if t_name == ProjectBuildTargets.MAIN_TARGET:
                continue
            if t_name in changed_targets:
                continue

            head_target = head.build_targets.get(t_name)
            wd_target = wd.build_targets.get(t_name)

            status = None

            if head_target is None:
                status = DiffStatus.added
            elif wd_target is None:
                status = DiffStatus.removed
            else:
                if head_target != wd_target:
                    status = DiffStatus.modified
                elif not osp.isdir(self.source_data_dir(t_name)):
                    status = DiffStatus.missing

            if status:
                changed_targets[t_name] = status

        return changed_targets



[docs]
    def history(self, max_count=10) -> List[Tuple[Revision, str]]:
        return [(c.hexsha, c.message) for c, _ in self._git.log(max_count)]



[docs]
    def diff(
        self, rev_a: Union[Tree, Revision], rev_b: Union[Tree, Revision]
    ) -> Dict[str, DiffStatus]:
        """
        Compares 2 revision trees.

        Returns: { target_name: status } for changed targets
        """

        if rev_a == rev_b:
            return {}

        if isinstance(rev_a, str):
            tree_a = self.get_rev(rev_a)
        else:
            tree_a = rev_a

        if isinstance(rev_b, str):
            tree_b = self.get_rev(rev_b)
        else:
            tree_b = rev_b

        changed_targets = {}

        for t_name in set(tree_a.build_targets) | set(tree_b.build_targets):
            if t_name == ProjectBuildTargets.MAIN_TARGET:
                continue

            head_target = tree_a.build_targets.get(t_name)
            wd_target = tree_b.build_targets.get(t_name)

            status = None

            if head_target is None:
                status = DiffStatus.added
            elif wd_target is None:
                status = DiffStatus.removed
            else:
                if head_target != wd_target:
                    status = DiffStatus.modified

            if status:
                changed_targets[t_name] = status

        return changed_targets



[docs]
    def model_data_dir(self, name: str) -> str:
        return osp.join(self._aux_dir, ProjectLayout.models_dir, name)



[docs]
    def make_model(self, name: str) -> Launcher:
        model = self._config.models[name]
        model_dir = self.model_data_dir(name)
        if not osp.isdir(model_dir):
            model_dir = None
        return self._env.make_launcher(model.launcher, **model.options, model_dir=model_dir)



[docs]
    def add_model(self, name: str, launcher: str, options: Dict[str, Any] = None) -> Model:
        if self.readonly:
            raise ReadonlyProjectError()

        if launcher not in self.env.launchers:
            raise KeyError("Unknown launcher '%s'" % launcher)

        if not name:
            raise ValueError("Model name can't be empty")

        if name in self.models:
            raise KeyError("Model '%s' already exists" % name)

        return self._config.models.set(name, {"launcher": launcher, "options": options or {}})



[docs]
    def remove_model(self, name: str):
        if self.readonly:
            raise ReadonlyProjectError()

        if name not in self.models:
            raise KeyError("Unknown model '%s'" % name)

        self._config.models.remove(name)

        data_dir = self.model_data_dir(name)
        if osp.isdir(data_dir):
            rmtree(data_dir)