Source code for datumaro.plugins.sampler.algorithm.entropy

# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import logging as log
import math
import re
from typing import TYPE_CHECKING

from .algorithm import InferenceResultAnalyzer

if TYPE_CHECKING:
    import pandas as pd
else:
    from datumaro.util.import_util import lazy_import

    pd = lazy_import("pandas")


class SampleEntropy(InferenceResultAnalyzer):
    """
    Entropy-based sample selector.

    Computes an uncertainty score (entropy of the class probabilities) for
    every image in the inference result, ranks the images by that score and
    returns samples from ``data`` according to the requested sampling method.
    """

    def __init__(self, data, inference):
        """
        Validate the inputs and build the uncertainty ranking.

        Args:
            data: Data in pd.DataFrame format. It must contain "ImageID".
            inference: Inference result in pd.DataFrame format. It must
                contain "ImageID" and at least one "ClassProbability<N>"
                column.

        Raises:
            Exception: If "ImageID" is missing from ``data`` or ``inference``,
                or if ``inference`` has no "ClassProbability<N>" column.
        """
        super().__init__(data, inference)

        # check the existence of "ImageID" in data & inference
        if "ImageID" not in data:
            raise Exception("Invalid Data, ImageID not found in data")
        if "ImageID" not in inference:
            raise Exception("Invalid Data, ImageID not found in inference")

        # count the "ClassProbability<N>" columns in inference;
        # non-string labels are skipped because re.match() rejects them
        self.num_classes = sum(
            1
            for head in list(inference)
            if isinstance(head, str) and re.match(r"ClassProbability\d+", head)
        )
        if self.num_classes == 0:
            raise Exception("Invalid data, Inference do not have ClassProbability values")

        # rank: one row per ImageID, ordered so the most uncertain image is first.
        self.rank = self._rank_images().sort_values(by="rank")

    def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract sample data and return it.

        Args:
            method:
                - 'topk' - the k samples with the highest uncertainty.
                - 'lowk' - the k samples with the lowest uncertainty.
                - 'randk' - k samples drawn at random from the data.
                - 'mixk' - half of the samples from the top and half from
                  the bottom of the uncertainty ranking.
                - 'randtopk' - randomly draw n * k samples, then keep the k
                  most uncertain of them.
            k: number of sample data
            n: Multiplier used only by the 'randtopk' method.

        Returns:
            Extracted sample data : pd.DataFrame

        Raises:
            ValueError: If ``k`` is not a positive integer, or if ``method``
                is unknown while ``k`` fits into the ranking.
        """
        temp_rank = self.rank

        # 1. k value check
        if not isinstance(k, int) or k <= 0:
            raise ValueError(f"Invalid value {k}. k must have an integer greater than zero.")

        # 2. Select a sample according to the method
        if k <= len(temp_rank):
            methods = self.sampling_method
            if method == methods.topk.name:
                temp_rank = temp_rank[:k]
            elif method == methods.lowk.name:
                temp_rank = temp_rank[-k:]
            elif method == methods.randk.name:
                return self.data.sample(n=k).reset_index(drop=True)
            elif method in {methods.mixk.name, methods.randtopk.name}:
                return self._get_sample_mixed(method=method, k=k, n=n)
            else:
                raise ValueError(f"Unknown sampling method '{method}'")
        else:
            # More samples requested than available: fall through with the
            # whole ranking, whatever the method is.
            log.warning("The number of samples is greater than the size of the " "selected subset.")

        return self._select_data_rows(temp_rank)

    def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract sample data for the 'mixk' and 'randtopk' methods.

        Args:
            method:
                - 'mixk': Return top-k and low-k halves based on uncertainty.
                  For an odd k the extra sample is taken from the top.
                - 'randtopk': Randomly extract n * k samples and return the k
                  with the highest uncertainty among them.
            k: number of sample data
            n: Multiplier; n * k samples are drawn before the top-k are kept.

        Returns:
            Extracted sample data : pd.DataFrame
        """
        temp_rank = self.rank

        # Select a sample according to the method
        if k <= len(temp_rank):
            if method == self.sampling_method.mixk.name:
                low_count = k // 2
                top_part = temp_rank.head(k - low_count)
                # tail() instead of [-low_count:]: for k == 1 the low half is
                # empty, whereas the slice [-0:] would select every row.
                if low_count:
                    temp_rank = pd.concat([top_part, temp_rank.tail(low_count)])
                else:
                    temp_rank = top_part
            elif method == self.sampling_method.randtopk.name:
                if n * k <= len(temp_rank):
                    temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank")
                else:
                    log.warning(msg="n * k exceeds the length of the inference")
                temp_rank = temp_rank[:k]

        return self._select_data_rows(temp_rank)

    def _select_data_rows(self, ranked: pd.DataFrame) -> pd.DataFrame:
        """
        Return the rows of ``self.data`` whose ImageID appears in ``ranked``.

        Args:
            ranked: Selected part of the ranking; it must contain "ImageID".

        Returns:
            The matching data rows, restricted to the columns of ``self.data``
            and with a fresh index. pd.DataFrame
        """
        columns = list(self.data.columns)
        merged_df = pd.merge(ranked, self.data, how="inner", on=["ImageID"])
        return merged_df[columns].reset_index(drop=True)

    def _rank_images(self) -> pd.DataFrame:
        """
        An internal function that ranks the inference data based on uncertainty.

        Returns:
            One row per ImageID with its mean "Uncertainty" and its "rank"
            (rank 1 is the most uncertain image). pd.DataFrame

        Raises:
            Exception: If there is no inference result, or if some inference
                rows have no Uncertainty value.
        """
        # 1. Load Inference
        if self.inference is None:
            raise Exception("Invalid Data, Failed to load inference result")
        inference = pd.DataFrame(self.inference)

        # 2. If the inference data frame does not contain an uncertainty score, calculate it
        if "Uncertainty" not in inference:
            inference = self._calculate_uncertainty_from_classprob(inference)

        # 3. Check that Uncertainty values are in place.
        if inference["Uncertainty"].isna().any():
            raise Exception("Some inference results do not have Uncertainty values")

        # 4. Rank based on Uncertainty score (averaged over rows sharing an ImageID)
        res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
        res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
        return res.reset_index()

    def _calculate_uncertainty_from_classprob(self, inference: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate uncertainty as the entropy of the ClassProbability values.

        Args:
            inference: Inference data where uncertainty has not been calculated.
                It must contain the columns "ClassProbability1" ..
                "ClassProbability<num_classes>".

        Returns:
            The same inference data with an added "Uncertainty" column.

        Raises:
            Exception: If a probability is below 0 or above 1.
        """
        prob_columns = [f"ClassProbability{j + 1}" for j in range(self.num_classes)]

        # Calculate Entropy (Uncertainty Score).
        # Rows are walked positionally, so the result does not depend on the
        # index labels of the inference frame.
        uncertainty = []
        for probs in inference[prob_columns].itertuples(index=False, name=None):
            entropy = 0
            for p in probs:
                if p < 0 or p > 1:
                    raise Exception("Invalid data, Math domain Error! p is between 0 and 1")
                # the small offset keeps log() defined for p == 0
                entropy -= p * math.log(p + 1e-14, math.e)
            uncertainty.append(entropy)

        inference["Uncertainty"] = uncertainty
        return inference