Source code for datumaro.plugins.data_formats.imagenet

# Copyright (C) 2020-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import errno
import logging as log
import os
import os.path as osp
import warnings
from typing import List, Optional

from datumaro.components.annotation import AnnotationType, Label, LabelCategories
from datumaro.components.dataset_base import DatasetItem, SubsetBase
from datumaro.components.errors import MediaTypeError
from datumaro.components.exporter import Exporter
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer, with_subset_dirs
from datumaro.components.media import Image
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST
from datumaro.util.image import IMAGE_EXTENSIONS, find_images



[docs]
class ImagenetPath:
    """Naming constants shared by the ImageNet importer and exporter."""

    # Separator placed between the label directory name and the image name
    # when an item id is composed.
    SEP_TOKEN = ":"

    # Directory name used for images that carry no label annotation.
    IMAGE_DIR_NO_LABEL = "no_label"




[docs]
class ImagenetBase(SubsetBase):
    """Dataset reader for an ImageNet-style directory (``root/<label>/<image>``).

    Every direct sub-directory of ``path`` other than
    ``ImagenetPath.IMAGE_DIR_NO_LABEL`` becomes a label category, and every
    image found exactly one level below ``path`` becomes a dataset item whose
    id is ``<label dir><SEP_TOKEN><image name without extension>``.

    Args:
        path: Root directory of the dataset.
        subset: Subset name forwarded to the base class.
        ctx: Import context forwarded to the base class.

    Raises:
        NotADirectoryError: If ``path`` is not an existing directory.
    """

    def __init__(
        self,
        path: str,
        *,
        subset: Optional[str] = None,
        ctx: Optional[ImportContext] = None,
    ):
        if not osp.isdir(path):
            raise NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", path)

        super().__init__(subset=subset, ctx=ctx)

        # Categories must be loaded first: _load_items() resolves label
        # indices through self._categories.
        self._categories = self._load_categories(path)
        self._items = list(self._load_items(path).values())

    def _load_categories(self, path):
        """Build label categories from the sub-directory names of ``path``.

        Entries are visited in sorted order. Non-directory entries trigger a
        warning and are skipped; the "no label" directory is not registered
        as a category.

        Returns:
            A dict mapping ``AnnotationType.label`` to the ``LabelCategories``.
        """
        label_cat = LabelCategories()
        for dirname in sorted(os.listdir(path)):
            if not osp.isdir(osp.join(path, dirname)):
                warnings.warn(
                    f"{dirname} is not a directory in the folder {path}, so this will "
                    "be skipped when declaring the categories of `imagenet` dataset."
                )
                continue
            if dirname != ImagenetPath.IMAGE_DIR_NO_LABEL:
                label_cat.add(dirname)
        return {AnnotationType.label: label_cat}

    def _load_items(self, path):
        """Collect dataset items from the images located in ``path/<label>/``.

        Returns:
            A dict mapping item id to ``DatasetItem``.
        """
        items = {}

        # Images should be in root/label_dir/*.img and root/*.img is not allowed.
        # => max_depth=1, min_depth=1
        for image_path in find_images(path, recursive=True, max_depth=1, min_depth=1):
            label_name = osp.basename(osp.dirname(image_path))
            image_name = osp.splitext(osp.basename(image_path))[0]
            item_id = label_name + ImagenetPath.SEP_TOKEN + image_name

            item = items.get(item_id)
            if item is None:
                try:
                    item = DatasetItem(
                        id=item_id, subset=self._subset, media=Image.from_file(path=image_path)
                    )
                except Exception as e:
                    self._ctx.error_policy.report_item_error(e, item_id=(item_id, self._subset))
                    # If the error policy returns instead of raising, there is
                    # no item to annotate: skip this image rather than
                    # dereferencing None below.
                    continue
                items[item_id] = item

            if label_name == ImagenetPath.IMAGE_DIR_NO_LABEL:
                continue

            try:
                label_idx = self._categories[AnnotationType.label].find(label_name)[0]
                item.annotations.append(Label(label=label_idx))
                self._ann_types.add(AnnotationType.label)
            except Exception as e:
                self._ctx.error_policy.report_annotation_error(
                    e, item_id=(item_id, self._subset)
                )

        return items




[docs]
class ImagenetImporter(Importer):
    """Importer for datasets laid out like TorchVision's ``ImageFolder``.

    Each direct child directory of the root is a label and holds the images
    of that label, for example:

    .. code-block:: text

        root
        ├── label_0
        │   ├── label_0_1.jpg
        │   └── label_0_2.jpg
        └── label_1
            └── label_1_1.jpg

    """

    @classmethod
    def detect(cls, context: FormatDetectionContext) -> FormatDetectionConfidence:
        """Fail detection for blacklisted directory names, else defer to the base class.

        ``context.fail`` is called for every directory directly under the
        root whose lower-cased name is in ``SUBSET_NAME_BLACKLIST``.
        """
        # Images must not be under a directory whose name is blacklisted.
        for dname in os.listdir(context.root_path):
            if not osp.isdir(osp.join(context.root_path, dname)):
                continue
            if dname.lower() not in SUBSET_NAME_BLACKLIST:
                continue

            context.fail(
                f"{dname} is found in {context.root_path}. "
                "However, Images must not be under a directory whose name is blacklisted "
                f"(SUBSET_NAME_BLACKLIST={SUBSET_NAME_BLACKLIST})."
            )

        return super().detect(context)

    @classmethod
    def find_sources(cls, path):
        """Return a single source entry for ``path`` if it holds a labeled image.

        The result is an empty list when ``path`` is not a directory or when
        no image exists exactly one directory level below it.
        """
        if not osp.isdir(path):
            return []

        # Images should be in root/label_dir/*.img and root/*.img is not allowed.
        # => max_depth=1, min_depth=1
        labeled_images = find_images(path, recursive=True, max_depth=1, min_depth=1)

        # Only the existence of a first image matters, so stop at the first hit.
        has_image = any(True for _ in labeled_images)
        return [{"url": path, "format": ImagenetBase.NAME}] if has_image else []

    @classmethod
    def get_file_extensions(cls) -> List[str]:
        """Return the known image file extensions as a new list."""
        return [*IMAGE_EXTENSIONS]

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """Extend the base command-line parser with ``--path`` and ``--subset``."""
        parser = super().build_cmdline_parser(**kwargs)

        parser.add_argument("--path", required=True)
        parser.add_argument("--subset")
        return parser





[docs]
@with_subset_dirs
class ImagenetWithSubsetDirsImporter(ImagenetImporter):
    """TorchVision ImageFolder style importer with one directory per subset.

    For example, it imports the following directory structure.

    .. code-block:: text

        root
        ├── train
        │   ├── label_0
        │   │   ├── label_0_1.jpg
        │   │   └── label_0_2.jpg
        │   └── label_1
        │       └── label_1_1.jpg
        ├── val
        │   ├── label_0
        │   │   ├── label_0_1.jpg
        │   │   └── label_0_2.jpg
        │   └── label_1
        │       └── label_1_1.jpg
        └── test
            ├── label_0
            │   ├── label_0_1.jpg
            │   └── label_0_2.jpg
            └── label_1
                └── label_1_1.jpg

    The result has three subsets (train, val, and test), each with the labels
    label_0 and label_1.
    """




[docs]
class ImagenetExporter(Exporter):
    """Export dataset images into an ImageNet-style directory tree.

    An item's image is saved once per label annotation it carries, under
    ``<save_dir>/<label name>/``. Items without any label annotation are saved
    under ``<save_dir>/<ImagenetPath.IMAGE_DIR_NO_LABEL>/``. When
    ``USE_SUBSET_DIRS`` is true, the item's subset name is inserted as an
    extra directory level right below the save directory.
    """

    DEFAULT_IMAGE_EXT = ".jpg"
    USE_SUBSET_DIRS = False

    def _apply_impl(self):
        """Write every item of the extractor to disk.

        Raises:
            MediaTypeError: If the extractor reports a media type that is not
                a subclass of ``Image``.
        """

        def _get_name(item: DatasetItem) -> str:
            """Return the file name for ``item``, dropping the label prefix of its id."""
            _, sep, image_part = item.id.partition(ImagenetPath.SEP_TOKEN)
            if not sep:
                # e.g. item.id = my_img_1
                return item.id
            # e.g. item.id = label_1:my_img_1
            # ":" is not allowed in windows, so any further separators become "_".
            return image_part.replace(ImagenetPath.SEP_TOKEN, "_")

        def _get_subdir(item: DatasetItem, dir_name: str) -> str:
            """Return the output directory for ``item`` under the given label directory."""
            if self.USE_SUBSET_DIRS:
                return osp.join(root_dir, item.subset, dir_name)
            return osp.join(root_dir, dir_name)

        extractor = self._extractor

        media_type = extractor.media_type()
        if media_type and not issubclass(media_type, Image):
            raise MediaTypeError("Media type is not an image")

        num_subsets = len(extractor.subsets())
        if 1 < num_subsets and not self.USE_SUBSET_DIRS:
            log.warning(
                f"There are more than one subset in the dataset ({num_subsets}). "
                "However, ImageNet format exports all dataset items into the same directory. "
                "Therefore, subset information will be lost. To prevent it, please use ImagenetWithSubsetDirsExporter. "
                'For example, dataset.export("<path/to/output>", format="imagenet_with_subset_dirs").'
            )

        root_dir = self._save_dir
        for item in extractor:
            file_name = _get_name(item)
            # A set, so an image is written once per distinct label even if the
            # same label is annotated several times.
            label_ids = {a.label for a in item.annotations if a.type == AnnotationType.label}

            for label_id in label_ids:
                label_name = extractor.categories()[AnnotationType.label][label_id].name
                self._save_image(item, subdir=_get_subdir(item, label_name), name=file_name)

            if not label_ids:
                self._save_image(
                    item,
                    subdir=_get_subdir(item, ImagenetPath.IMAGE_DIR_NO_LABEL),
                    name=file_name,
                )




[docs]
class ImagenetWithSubsetDirsExporter(ImagenetExporter):
    # Same export logic as ImagenetExporter, but with the flag that makes it
    # place each item under a directory named after the item's subset.
    USE_SUBSET_DIRS = True