Source code for datumaro.plugins.sampler.relevancy_sampler

# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

from collections import defaultdict
from typing import TYPE_CHECKING, Optional, Union

from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.dataset_base import IDataset
from datumaro.components.transformer import Transform
from datumaro.util import parse_str_enum_value

from .algorithm.algorithm import Algorithm, SamplingMethod

if TYPE_CHECKING:
    import pandas as pd
else:
    from datumaro.util.import_util import lazy_import

    pd = lazy_import("pandas")
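    # Note: the lazy proxy postpones the real pandas import until first use,
    # so this plugin can be enumerated without pandas installed.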


class RelevancySampler(Transform, CliPlugin):
    r"""
    Sampler that analyzes model inference results on the dataset |n
    and picks the best sample for training.|n
    |n
    Creates a dataset from the `-k/--count` hardest items for a model.
    The whole dataset or a single subset will be split into the `sampled`
    and `unsampled` subsets based on the model confidence.
    The dataset **must** contain model confidence values in the `scores`
    attributes of annotations.|n
    |n
    There are five methods of sampling (the `-m/--sampling_method` option):|n
    |s|s- `topk` - Return the k items with the highest uncertainty|n
    |s|s- `lowk` - Return the k items with the lowest uncertainty|n
    |s|s- `randk` - Return k random items|n
    |s|s- `mixk` - Return half of the items using `topk`, the other half using `lowk`|n
    |s|s- `randtopk` - Select 3*k items randomly, and return the top k among them|n
    |n
    Notes:|n
    |s|s- Each image's inference result must contain the probability for|n
    |s|s|s|sall classes.|n
    |s|s- Requesting a sample larger than the number of all images will|n
    |s|s|s|sreturn all images.|n
    |n
    Example: select the 20 most relevant images based on model certainty,|n
    |s|sput the result into the 'sample' subset and all the rest into the|n
    |s|s'unsampled' subset, using the 'train' subset as input:|n
    .. code-block::

    |s|s%(prog)s \ |n
    |s|s|s|s--algorithm entropy \ |n
    |s|s|s|s--input_subset train \ |n
    |s|s|s|s--sampled_subset sample \ |n
    |s|s|s|s--unsampled_subset unsampled \ |n
    |s|s|s|s--sampling_method topk -k 20
    """
    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument(
            "-k", "--count", type=int, required=True, help="Number of items to sample"
        )
        parser.add_argument(
            "-a",
            "--algorithm",
            default=Algorithm.entropy.name,
            choices=[t.name for t in Algorithm],
            help="Sampling algorithm (one of {}; default: %(default)s)".format(
                ", ".join(t.name for t in Algorithm)
            ),
        )
        parser.add_argument(
            "-i",
            "--input_subset",
            default=None,
            help="Subset name to select sample from (default: %(default)s)",
        )
        parser.add_argument(
            "-o",
            "--sampled_subset",
            default="sample",
            help="Subset name to put sampled data to (default: %(default)s)",
        )
        parser.add_argument(
            "-u",
            "--unsampled_subset",
            default="unsampled",
            help="Subset name to put the rest data to (default: %(default)s)",
        )
        parser.add_argument(
            "-m",
            "--sampling_method",
            default=SamplingMethod.topk.name,
            choices=[t.name for t in SamplingMethod],
            help="Sampling method (one of {}; default: %(default)s)".format(
                ", ".join(t.name for t in SamplingMethod)
            ),
        )
        parser.add_argument(
            "-d", "--output_file", help="A .csv file path to dump sampling results"
        )
        return parser
    def __init__(
        self,
        extractor: IDataset,
        count: int,
        *,
        algorithm: Union[str, Algorithm],
        sampling_method: Union[str, SamplingMethod],
        input_subset: Optional[str] = None,
        sampled_subset: str = "sample",
        unsampled_subset: str = "unsampled",
        output_file: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        extractor
            The dataset to sample from
        count
            Number of items to sample
        algorithm
            The algorithm used to compute the uncertainty
            for sample selection, default: 'entropy'
        sampling_method
            Method of sampling: 'topk', 'lowk', 'randk', 'mixk' or 'randtopk'
        input_subset
            The name of the subset to select the sample from
        sampled_subset
            The name of the subset for the selected sample, default: 'sample'
        unsampled_subset
            The name of the subset for the remaining items, default: 'unsampled'
        output_file
            A path to a .csv file for the sampling results
        """
        super().__init__(extractor)

        self.input_subset = input_subset
        self.sampled_subset = sampled_subset
        self.unsampled_subset = unsampled_subset
        self.algorithm = parse_str_enum_value(algorithm, Algorithm).name
        self.sampling_method = parse_str_enum_value(sampling_method, SamplingMethod).name
        self.count = count
        self.output_file = output_file

        # Use the --output_file option to save the sample list as a .csv file
        if output_file and not output_file.endswith(".csv"):
            raise ValueError("The output file must have the '.csv' extension")

    @staticmethod
    def _load_inference_from_subset(extractor, subset_name):
        # 1. Get the dataset subset by name
        if subset_name in extractor.subsets():
            subset = extractor.get_subset(subset_name)
        else:
            raise Exception(f"Unknown subset '{subset_name}'")

        data_df = defaultdict(list)
        infer_df = defaultdict(list)

        # 2. Fill data_df and infer_df to fit the input format
        # of the sampler algorithm.
        for item in subset:
            data_df["ImageID"].append(item.id)

            if not item.media or item.media.size is None:
                raise Exception(f"Item {item.id} does not have image info")

            width, height = item.media.size
            data_df["Width"].append(width)
            data_df["Height"].append(height)
            data_df["ImagePath"].append(getattr(item.media, "path", None))

            if not item.annotations:
                raise Exception(f"Item {item.id} does not have annotations")

            for annotation in item.annotations:
                if "scores" not in annotation.attributes:
                    raise Exception(
                        f"Item {item.id} - an annotation does not have 'scores' attribute"
                    )
                probs = annotation.attributes["scores"]

                infer_df["ImageID"].append(item.id)
                for prob_idx, prob in enumerate(probs):
                    infer_df[f"ClassProbability{prob_idx + 1}"].append(prob)

        data_df = pd.DataFrame(data_df)
        infer_df = pd.DataFrame(infer_df)

        return data_df, infer_df

    @staticmethod
    def _calculate_uncertainty(algorithm, data, inference):
        # Check the requested algorithm and create the corresponding sampler
        if algorithm == Algorithm.entropy.name:
            from .algorithm.entropy import SampleEntropy

            # Uncertainty scores are computed when the sampler is constructed
            sampler = SampleEntropy(data, inference)
        else:
            raise Exception(
                f"Unknown algorithm '{algorithm}', available "
                f"algorithms: {[a.name for a in Algorithm]}"
            )
        return sampler

    def _get_sample_subset(self, image):
        if image.subset == self.input_subset:
            # 1. Return the sampled subset name if the item id belongs to the sample
            if image.id in self.sample_id:
                return self.sampled_subset
            else:
                return self.unsampled_subset
        else:
            # 2. Return the existing subset name if the item is not a sampling candidate
            return image.subset
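
    # Data flow summary: _load_inference_from_subset() produces two frames,
    #
    #   data_df:  ImageID | Width | Height | ImagePath
    #   infer_df: ImageID | ClassProbability1 | ClassProbability2 | ...
    #
    # The sampler built by _calculate_uncertainty() consumes both, and its
    # get_sample() result is expected to contain an "ImageID" column, which
    # __iter__() below uses to split items into the sampled and unsampled
    # subsets.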

    def __iter__(self):
        # Load the input subset and convert it, together with the inference
        # results, to the format used by the sampler algorithm.
        data_df, infer_df = self._load_inference_from_subset(self._extractor, self.input_subset)

        sampler = self._calculate_uncertainty(self.algorithm, data_df, infer_df)
        self.result = sampler.get_sample(method=self.sampling_method, k=self.count)

        if self.output_file is not None:
            self.result.to_csv(self.output_file, index=False)

        self.sample_id = self.result["ImageID"].to_list()

        # Transform the subset of each item: after checking whether the item
        # belongs to the sample, rename its subset accordingly.
        for item in self._extractor:
            yield self.wrap_item(item, subset=self._get_sample_subset(item))
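
A minimal sketch of programmatic use, for reference: it drives the sampler directly instead of through the CLI. The dataset path and format below are hypothetical; the only real requirement, as the class docstring notes, is that annotations carry per-class model confidences in their `scores` attributes.

from datumaro.components.dataset import Dataset
from datumaro.plugins.sampler.relevancy_sampler import RelevancySampler

# Hypothetical input: a Datumaro-format dataset whose annotations already
# have a "scores" attribute with per-class probabilities from a model.
dataset = Dataset.import_from("path/to/dataset", "datumaro")

sampler = RelevancySampler(
    dataset,
    count=20,
    algorithm="entropy",
    sampling_method="topk",
    input_subset="train",
    sampled_subset="sample",
    unsampled_subset="unsampled",
)

# The transform is itself a dataset: iterating it yields the items of
# 'train' rerouted into the 'sample' and 'unsampled' subsets.
for item in sampler:
    print(item.id, item.subset)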