Source code for datumaro.plugins.sampler.relevancy_sampler

# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

from collections import defaultdict
from typing import TYPE_CHECKING, Optional, Union

from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.dataset_base import IDataset
from datumaro.components.transformer import Transform
from datumaro.util import parse_str_enum_value

from .algorithm.algorithm import Algorithm, SamplingMethod

if TYPE_CHECKING:
    import pandas as pd
else:
    from datumaro.util.import_util import lazy_import

    pd = lazy_import("pandas")
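    # Note: the lazy proxy postpones the real pandas import until first use,
    # so this plugin can be enumerated without pandas installed.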


class RelevancySampler(Transform, CliPlugin):
    r"""
    Sampler that analyzes model inference results on the dataset |n
    and picks the best sample for training.|n
    |n
    Creates a dataset from the `-k/--count` hardest items for a model.
    The whole dataset or a single subset will be split into the `sampled`
    and `unsampled` subsets based on the model confidence.
    The dataset **must** contain model confidence values in the `scores`
    attributes of annotations.|n
    |n
    There are five methods of sampling (the `-m/--sampling_method` option):|n
    |s|s- `topk` - Return the k items with the highest uncertainty|n
    |s|s- `lowk` - Return the k items with the lowest uncertainty|n
    |s|s- `randk` - Return k random items|n
    |s|s- `mixk` - Return half of the items using `topk`, the other half using `lowk`|n
    |s|s- `randtopk` - Select 3*k items randomly, and return the top k among them|n
    |n
    Notes:|n
    |s|s- Each image's inference result must contain the probability for|n
    |s|s|s|sall classes.|n
    |s|s- Requesting a sample larger than the number of all images will|n
    |s|s|s|sreturn all images.|n
    |n
    Example: select the 20 most relevant images based on model certainty,|n
    |s|sput the result into the 'sample' subset and all the rest into the|n
    |s|s'unsampled' subset, using the 'train' subset as input:|n
    .. code-block::

    |s|s%(prog)s \ |n
    |s|s|s|s--algorithm entropy \ |n
    |s|s|s|s--input_subset train \ |n
    |s|s|s|s--sampled_subset sample \ |n
    |s|s|s|s--unsampled_subset unsampled \ |n
    |s|s|s|s--sampling_method topk -k 20
    """
    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument(
            "-k", "--count", type=int, required=True, help="Number of items to sample"
        )
        parser.add_argument(
            "-a",
            "--algorithm",
            default=Algorithm.entropy.name,
            choices=[t.name for t in Algorithm],
            help="Sampling algorithm (one of {}; default: %(default)s)".format(
                ", ".join(t.name for t in Algorithm)
            ),
        )
        parser.add_argument(
            "-i",
            "--input_subset",
            default=None,
            help="Subset name to select sample from (default: %(default)s)",
        )
        parser.add_argument(
            "-o",
            "--sampled_subset",
            default="sample",
            help="Subset name to put sampled data to (default: %(default)s)",
        )
        parser.add_argument(
            "-u",
            "--unsampled_subset",
            default="unsampled",
            help="Subset name to put the rest data to (default: %(default)s)",
        )
        parser.add_argument(
            "-m",
            "--sampling_method",
            default=SamplingMethod.topk.name,
            choices=[t.name for t in SamplingMethod],
            help="Sampling method (one of {}; default: %(default)s)".format(
                ", ".join(t.name for t in SamplingMethod)
            ),
        )
        parser.add_argument(
            "-d", "--output_file", help="A .csv file path to dump sampling results"
        )
        return parser
    def __init__(
        self,
        extractor: IDataset,
        count: int,
        *,
        algorithm: Union[str, Algorithm],
        sampling_method: Union[str, SamplingMethod],
        input_subset: Optional[str] = None,
        sampled_subset: str = "sample",
        unsampled_subset: str = "unsampled",
        output_file: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        extractor
            The dataset to sample from
        count
            Number of items to sample
        algorithm
            The algorithm used to compute the uncertainty
            for sample selection, default: 'entropy'
        sampling_method
            Method of sampling: 'topk', 'lowk', 'randk', 'mixk' or 'randtopk'
        input_subset
            The name of the subset to select the sample from
        sampled_subset
            The name of the subset for the selected sample, default: 'sample'
        unsampled_subset
            The name of the subset for the remaining items, default: 'unsampled'
        output_file
            A path to a .csv file for the sampling results
        """
        super().__init__(extractor)

        self.input_subset = input_subset
        self.sampled_subset = sampled_subset
        self.unsampled_subset = unsampled_subset
        self.algorithm = parse_str_enum_value(algorithm, Algorithm).name
        self.sampling_method = parse_str_enum_value(sampling_method, SamplingMethod).name
        self.count = count
        self.output_file = output_file

        # Use the --output_file option to save the sample list as a .csv file
        if output_file and not output_file.endswith(".csv"):
            raise ValueError("The output file must have the '.csv' extension")

    @staticmethod
    def _load_inference_from_subset(extractor, subset_name):
        # 1. Get the dataset subset by name
        if subset_name in extractor.subsets():
            subset = extractor.get_subset(subset_name)
        else:
            raise Exception(f"Unknown subset '{subset_name}'")

        data_df = defaultdict(list)
        infer_df = defaultdict(list)

        # 2. Fill data_df and infer_df to fit the input format
        # of the sampler algorithm.
        for item in subset:
            data_df["ImageID"].append(item.id)

            if not item.media or item.media.size is None:
                raise Exception(f"Item {item.id} does not have image info")

            width, height = item.media.size
            data_df["Width"].append(width)
            data_df["Height"].append(height)
            data_df["ImagePath"].append(getattr(item.media, "path", None))

            if not item.annotations:
                raise Exception(f"Item {item.id} does not have annotations")

            for annotation in item.annotations:
                if "scores" not in annotation.attributes:
                    raise Exception(
                        f"Item {item.id} - an annotation does not have 'scores' attribute"
                    )
                probs = annotation.attributes["scores"]

                infer_df["ImageID"].append(item.id)
                for prob_idx, prob in enumerate(probs):
                    infer_df[f"ClassProbability{prob_idx + 1}"].append(prob)

        data_df = pd.DataFrame(data_df)
        infer_df = pd.DataFrame(infer_df)

        return data_df, infer_df

    @staticmethod
    def _calculate_uncertainty(algorithm, data, inference):
        # Check the requested algorithm and create the corresponding sampler
        if algorithm == Algorithm.entropy.name:
            from .algorithm.entropy import SampleEntropy

            # Uncertainty scores are computed when the sampler is constructed
            sampler = SampleEntropy(data, inference)
        else:
            raise Exception(
                f"Unknown algorithm '{algorithm}', available "
                f"algorithms: {[a.name for a in Algorithm]}"
            )
        return sampler

    def _get_sample_subset(self, image):
        if image.subset == self.input_subset:
            # 1. Return the sampled subset name if the item id belongs to the sample
            if image.id in self.sample_id:
                return self.sampled_subset
            else:
                return self.unsampled_subset
        else:
            # 2. Return the existing subset name if the item is not a sampling candidate
            return image.subset
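
    # Data flow summary: _load_inference_from_subset() produces two frames,
    #
    #   data_df:  ImageID | Width | Height | ImagePath
    #   infer_df: ImageID | ClassProbability1 | ClassProbability2 | ...
    #
    # The sampler built by _calculate_uncertainty() consumes both, and its
    # get_sample() result is expected to contain an "ImageID" column, which
    # __iter__() below uses to split items into the sampled and unsampled
    # subsets.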

    def __iter__(self):
        # Load the input subset and convert it, together with the inference
        # results, to the format used by the sampler algorithm.
        data_df, infer_df = self._load_inference_from_subset(self._extractor, self.input_subset)

        sampler = self._calculate_uncertainty(self.algorithm, data_df, infer_df)
        self.result = sampler.get_sample(method=self.sampling_method, k=self.count)

        if self.output_file is not None:
            self.result.to_csv(self.output_file, index=False)

        self.sample_id = self.result["ImageID"].to_list()

        # Transform the subset of each item: after checking whether the item
        # belongs to the sample, rename its subset accordingly.
        for item in self._extractor:
            yield self.wrap_item(item, subset=self._get_sample_subset(item))
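
A minimal sketch of programmatic use, for reference: it drives the sampler directly instead of through the CLI. The dataset path and format below are hypothetical; the only real requirement, as the class docstring notes, is that annotations carry per-class model confidences in their `scores` attributes.

from datumaro.components.dataset import Dataset
from datumaro.plugins.sampler.relevancy_sampler import RelevancySampler

# Hypothetical input: a Datumaro-format dataset whose annotations already
# have a "scores" attribute with per-class probabilities from a model.
dataset = Dataset.import_from("path/to/dataset", "datumaro")

sampler = RelevancySampler(
    dataset,
    count=20,
    algorithm="entropy",
    sampling_method="topk",
    input_subset="train",
    sampled_subset="sample",
    unsampled_subset="unsampled",
)

# The transform is itself a dataset: iterating it yields the items of
# 'train' rerouted into the 'sample' and 'unsampled' subsets.
for item in sampler:
    print(item.id, item.subset)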