Source code for datumaro.components.importer

# Copyright (C) 2019-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

from __future__ import annotations

import os
import os.path as osp
from contextlib import contextmanager
from functools import wraps
from glob import iglob
from typing import Callable, Dict, List, Optional, Type, TypeVar

from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.contexts.importer import (
    FailingImportErrorPolicy,
    ImportContext,
    ImportErrorPolicy,
    NullImportContext,
    _ImportFail,
)
from datumaro.components.errors import DatasetImportError, DatasetNotFoundError
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.merge.extractor_merger import ExtractorMerger
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST

T = TypeVar("T")

__all__ = [
    "ImportContext",
    "NullImportContext",
    "_ImportFail",
    "Importer",
    "with_subset_dirs",
    "ImportErrorPolicy",
    "FailingImportErrorPolicy",
]


[docs] class Importer(CliPlugin): DETECT_CONFIDENCE = FormatDetectionConfidence.LOW
[docs] @classmethod def detect( cls, context: FormatDetectionContext, ) -> FormatDetectionConfidence: if not cls.find_sources_with_params(context.root_path): context.fail("specific requirement information unavailable") return cls.DETECT_CONFIDENCE
[docs] @classmethod def get_file_extensions(cls) -> List[str]: raise NotImplementedError()
[docs] @classmethod def find_sources(cls, path: str) -> List[Dict]: raise NotImplementedError()
[docs] @classmethod def find_sources_with_params(cls, path: str, **extra_params) -> List[Dict]: return cls.find_sources(path)
def __call__(self, path, stream: bool = False, **extra_params): if not path or not osp.exists(path): raise DatasetNotFoundError(path, self.NAME) found_sources = self.find_sources_with_params(osp.normpath(path), **extra_params) if not found_sources: raise DatasetNotFoundError(path, self.NAME) sources = [] for desc in found_sources: params = dict(extra_params) params.update(desc.get("options", {})) if stream and self.can_stream: params.update({"stream": True}) elif stream and not self.can_stream: raise DatasetImportError( f"{self.__class__.__name__} cannot stream, but stream=True." ) desc["options"] = params sources.append(desc) return sources @classmethod def _find_sources_recursive( cls, path: str, ext: Optional[str], extractor_name: str, filename: str = "*", dirname: str = "", file_filter: Optional[Callable[[str], bool]] = None, max_depth: int = 3, recursive: bool = False, ): """ Finds sources in the specified location, using the matching pattern to filter file names and directories. Supposed to be used, and to be the only call in subclasses. Parameters: path: a directory or file path, where sources need to be found. ext: file extension to match. To match directories, set this parameter to None or ''. Comparison is case-independent, a starting dot is not required. extractor_name: the name of the associated Extractor type filename: a glob pattern for file names dirname: a glob pattern for filename prefixes file_filter: a callable (abspath: str) -> bool, to filter paths found max_depth: the maximum depth for recursive search. recursive: If recursive is true, the pattern '**' will match any files and zero or more directories and subdirectories. Returns: a list of source configurations (i.e. Extractor type names and c-tor parameters) """ if ext: if not ext.startswith("."): ext = "." + ext ext = ext.lower() if (path.lower().endswith(ext) and osp.isfile(path)) or ( not ext and dirname and osp.isdir(path) and os.sep + osp.normpath(dirname.lower()) + os.sep in osp.abspath(path.lower()) + os.sep ): sources = [{"url": path, "format": extractor_name}] else: sources = [] for d in range(max_depth + 1): sources.extend( {"url": p, "format": extractor_name} for p in iglob( osp.join(path, *("*" * d), dirname, filename + ext), recursive=recursive ) if (callable(file_filter) and file_filter(p)) or (not callable(file_filter)) ) if sources: break return sources @property def can_stream(self) -> bool: """Flag to indicate whether the importer can stream the dataset item or not.""" return False
[docs] def get_extractor_merger(self) -> Optional[Type[ExtractorMerger]]: """Extractor merger dedicated for the data format Datumaro import process spawns multiple `DatasetBase` for the detected sources. We can find a bunch of the detected sources from the given directory path. It is usually each detected source is corresponded to the subset of dataset in many data formats. Parameters: stream: There can exist a branch according to `stream` flag Returns: If None, use `Dataset.from_extractors()` to merge the extractors, Otherwise, use the return type to merge the extractors. """ return None
[docs] def with_subset_dirs(input_cls: Importer): @wraps(input_cls, updated=()) class WrappedImporter(input_cls): NAME = input_cls.NAME @classmethod def detect( cls, context: FormatDetectionContext, ) -> Optional[FormatDetectionConfidence]: @contextmanager def _change_context_root_path(context: FormatDetectionContext, path: str): tmp = context.root_path context._root_path = path yield context._root_path = tmp confs = [] path = context.root_path if not osp.isdir(path): context.fail( f"{input_cls.NAME} should require an input as a directory path. " f"However, {path} is not a directory path." ) for sub_dir in os.listdir(path): if sub_dir.lower() in SUBSET_NAME_BLACKLIST: continue sub_path = osp.join(path, sub_dir) if osp.isdir(sub_path): with _change_context_root_path(context, sub_path): conf = input_cls.detect(context) if conf is not None: confs += [conf] if len(confs) == 0: context.fail(f"{input_cls.NAME} cannot find its subdirectory structure.") return max(confs) def __call__(self, path, **extra_params): sources = [] for sub_dir in os.listdir(path): sub_path = osp.join(path, sub_dir) if osp.isdir(sub_path): source = input_cls.__call__(self, sub_path, **extra_params) if len(source) != 1: raise DatasetImportError( f"@with_subset_dirs only allows one source format from {sub_path}." ) if "subset" in source[0]: raise DatasetImportError( f"@with_subset_dirs does not allows a subset key in source: {source[0]}." ) source[0]["options"]["subset"] = sub_dir sources += source return sources def __reduce__(self): return (input_cls.__class__, ()) return WrappedImporter