Source code for datumaro.cli.commands.downloaders.tfds

# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

import contextlib
import logging as log
import os
import os.path as osp
import sys
from typing import Dict, Tuple

from datumaro.components.dataset_base import IDataset
from datumaro.components.environment import DEFAULT_ENVIRONMENT
from datumaro.components.extractor_tfds import (
    AVAILABLE_TFDS_DATASETS,
    TFDS_EXTRACTOR_AVAILABLE,
    TfdsDatasetRemoteMetadata,
)
from datumaro.util import dump_json
from datumaro.util.os_util import make_file_name

from ...util.errors import CliException
from ...util.project import generate_next_file_name
from .downloader import IDatasetDownloader



[docs]
class TfdsDatasetDownloader(IDatasetDownloader):

[docs]
    @classmethod
    def get_extractor(cls, dataset_id: str) -> Tuple[str, IDataset]:
        """Resolve a ``tfds:<name>`` dataset ID to an output format and extractor factory.

        Args:
            dataset_id: Dataset identifier; it must start with the ``tfds:`` prefix.

        Returns:
            A pair ``(default_output_format, extractor_factory)``. The second element
            is the dataset's ``make_extractor`` callable; it is not invoked here.

        Raises:
            CliException: If the ID does not start with ``tfds:``, if
                ``TFDS_EXTRACTOR_AVAILABLE`` is false, or if the dataset name is
                not found in ``AVAILABLE_TFDS_DATASETS``.
        """
        prefix = "tfds:"

        if not dataset_id.startswith(prefix):
            # No dataset name has been parsed on this path, so report the raw ID.
            raise CliException(f"Unknown dataset ID '{dataset_id}'")

        if not TFDS_EXTRACTOR_AVAILABLE:
            raise CliException(
                "TFDS datasets are not available, because TFDS and/or "
                "TensorFlow are not installed.\n"
                "You can install them with: pip install datumaro[tf,tfds]"
            )

        tfds_ds_name = dataset_id[len(prefix) :]
        tfds_ds = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)
        if not tfds_ds:
            raise CliException(f"Unsupported TFDS dataset '{tfds_ds_name}'")

        return tfds_ds.metadata.default_output_format, tfds_ds.make_extractor


    @staticmethod
    def _describe_txt(dataset_metas: Dict[str, TfdsDatasetRemoteMetadata], report_file=None):
        """Print a human-readable listing of the given datasets.

        Args:
            dataset_metas: Mapping from dataset ID to its remote metadata. Entries
                are printed in sorted order of the ID.
            report_file: Path of the file to write the report to. When falsy, the
                report goes to ``print``'s default stream.
        """
        # Without a path the context manager yields None, and print(file=None)
        # falls back to the default output stream.
        with open(report_file, "w") if report_file else contextlib.nullcontext() as out:
            if not dataset_metas:
                print(
                    "No datasets available.\n"
                    "\n"
                    "You can enable TFDS datasets by installing TensorFlow "
                    "and TensorFlow Datasets:\n"
                    "    pip install datumaro[tf,tfds]",
                    file=out,
                )
                return

            print("Available datasets:", file=out)
            for name, meta in sorted(dataset_metas.items()):
                # A blank line separates consecutive dataset entries.
                print(file=out)
                print(f"{name} ({meta.human_name}):", file=out)
                print(f"  default output format: {meta.default_output_format}", file=out)
                print("  description:", file=out)
                for line in meta.description.rstrip("\n").split("\n"):
                    print(f"    {line}", file=out)
                print(f"  download size: {meta.download_size} bytes", file=out)
                print(f"  home URL: {meta.home_url or 'N/A'}", file=out)
                print(f"  number of classes: {meta.num_classes}", file=out)
                print("  subsets:", file=out)
                for subset_name, subset_meta in sorted(meta.subsets.items()):
                    print(f"    {subset_name}: {subset_meta.num_items} items", file=out)
                print(f"  version: {meta.version}", file=out)

    @staticmethod
    def _describe_json(dataset_metas, report_file):
        """Write the given dataset metadata as JSON.

        Args:
            dataset_metas: Mapping from dataset ID to its remote metadata.
            report_file: Path of the file to write to. When falsy, the JSON is
                written to ``sys.stdout``.
        """
        # The media type is deliberately left out of the output: there is currently
        # no mechanism for mapping media types to strings. It could be useful to
        # users, so such a mechanism might be worth implementing eventually.
        exported_fields = (
            "default_output_format",
            "description",
            "download_size",
            "home_url",
            "human_name",
            "num_classes",
            "version",
        )

        def meta_to_raw(meta: TfdsDatasetRemoteMetadata):
            # Plain attributes first, then the per-subset item counts.
            converted = {field: getattr(meta, field) for field in exported_fields}
            converted["subsets"] = {
                subset_name: {"num_items": subset_meta.num_items}
                for subset_name, subset_meta in meta.subsets.items()
            }
            return converted

        if report_file:
            destination = open(report_file, "w")
        else:
            # nullcontext hands back sys.stdout without closing it on exit.
            destination = contextlib.nullcontext(sys.stdout)

        with destination as stream:
            raw_metas = {
                dataset_id: meta_to_raw(dataset_meta)
                for dataset_id, dataset_meta in dataset_metas.items()
            }
            serialized = dump_json(raw_metas, indent=True, append_newline=True)
            stream.write(serialized.decode())


[docs]
    @classmethod
    def describe(cls, report_format, report_file=None):
        """Report metadata for every available TFDS dataset.

        Remote metadata is queried for each entry of ``AVAILABLE_TFDS_DATASETS``
        when ``TFDS_EXTRACTOR_AVAILABLE`` is true; otherwise the report is built
        from an empty mapping.

        Args:
            report_format: ``"text"`` or ``"json"``. Any other value produces no
                output.
            report_file: Optional path forwarded to the selected report writer.
        """
        dataset_metas: Dict[str, TfdsDatasetRemoteMetadata] = (
            {
                f"tfds:{dataset_name}": dataset.query_remote_metadata()
                for dataset_name, dataset in AVAILABLE_TFDS_DATASETS.items()
            }
            if TFDS_EXTRACTOR_AVAILABLE
            else {}
        )

        if report_format == "json":
            cls._describe_json(dataset_metas, report_file)
        elif report_format == "text":
            cls._describe_txt(dataset_metas, report_file)



[docs]
    @classmethod
    def describe_command_description(_):
        """Return the text that points users to the TFDS Catalog for more details."""
        # The continuation lines keep their eight leading spaces so the returned
        # text is unchanged.
        description = (
            "More detailed\n"
            "        information can be found in the TFDS Catalog:\n"
            "        <https://www.tensorflow.org/datasets/catalog/overview>."
        )
        return description



[docs]
    @classmethod
    def get_command_description(cls):
        builtin_writers = sorted(DEFAULT_ENVIRONMENT.exporters)
        if TFDS_EXTRACTOR_AVAILABLE:
            available_datasets = ", ".join(f"tfds:{name}" for name in AVAILABLE_TFDS_DATASETS)
        else:
            available_datasets = "N/A (TensorFlow and/or TensorFlow Datasets are not installed)"
            return f"""
Supported datasets: {available_datasets}|n
|n
Supported output formats: {", ".join(builtin_writers)}|n
|n
Examples:|n
- Download the MNIST dataset:|n
|s|s%(prog)s -i tfds:mnist -- --save-media|n
|n
- Download the VOC 2012 dataset, saving only the annotations in the COCO
format into a specific directory:|n
|s|s%(prog)s -i tfds:voc/2012 -f coco -o path/I/like/
"""



[docs]
    @classmethod
    def download(cls, dataset_id, dst_dir, overwrite, output_format, subset, extra_args):
        """Download a TFDS dataset and export it to a directory.

        Args:
            dataset_id: Dataset identifier, resolved through ``get_extractor``.
            dst_dir: Destination directory. When falsy, a new name is generated
                from the dataset ID and the output format.
            overwrite: When false, a non-empty existing ``dst_dir`` is rejected.
            output_format: Name of the exporter to use. When falsy, the dataset's
                default output format is used.
            subset: Optional subset name; when given, only that subset is exported.
            extra_args: Extra command-line arguments, parsed by the exporter's
                ``parse_cmdline`` and passed to ``convert`` as keyword arguments.

        Raises:
            CliException: If no exporter exists for the format, if ``dst_dir`` is
                a non-empty directory and ``overwrite`` is false, or if the
                requested subset is not present. Errors raised by
                ``get_extractor`` propagate unchanged.
        """
        default_output_format, extractor_factory = cls.get_extractor(dataset_id)
        output_format = output_format or default_output_format

        try:
            exporter = DEFAULT_ENVIRONMENT.exporters[output_format]
        except KeyError as err:
            # Chain explicitly so the traceback shows the lookup failure as the cause.
            raise CliException(f"Exporter for format '{output_format}' is not found") from err
        extra_args = exporter.parse_cmdline(extra_args)

        if not dst_dir:
            dst_dir = generate_next_file_name(
                f"{make_file_name(dataset_id)}-{make_file_name(output_format)}"
            )
        elif not overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
            raise CliException(
                f"Directory '{dst_dir}' already exists (pass --overwrite to overwrite)"
            )
        dst_dir = osp.abspath(dst_dir)

        log.info("Downloading the dataset")
        extractor = extractor_factory()

        if subset:
            try:
                extractor = extractor.subsets()[subset]
            except KeyError as err:
                raise CliException(f"Subset '{subset}' is not present in the dataset") from err

        log.info("Exporting the dataset")
        exporter.convert(extractor, dst_dir, default_image_ext=".png", **extra_args)

        log.info("Dataset exported to '%s' as '%s'", dst_dir, output_format)