Source code for datumaro.plugins.sampler.algorithm.entropy

# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import logging as log
import math
import re
from typing import TYPE_CHECKING

from .algorithm import InferenceResultAnalyzer

if TYPE_CHECKING:
    import pandas as pd
else:
    from datumaro.util.import_util import lazy_import

    pd = lazy_import("pandas")



[docs]
class SampleEntropy(InferenceResultAnalyzer):
    """
    Entropy-based sampler.

    Computes an uncertainty score for every image from the entropy of its
    class probabilities, ranks the images by that score, and extracts
    samples according to the ranking.
    """

    def __init__(self, data, inference):
        """
        Validate the inputs and rank the inference results.

        Args:
            data: Data in pd.DataFrame format.
                The "ImageID" column is required.
            inference: Inference results in pd.DataFrame format.
                "ImageID" and at least one "ClassProbability<N>" column
                are required.

        Raises:
            Exception: if "ImageID" is missing from data or inference, if
                inference has no "ClassProbability<N>" column, or if the
                ranking step fails (see _rank_images).
        """
        super().__init__(data, inference)

        # Both frames are joined on "ImageID" later, so it must exist in each.
        if "ImageID" not in data:
            raise Exception("Invalid Data, ImageID not found in data")
        if "ImageID" not in inference:
            raise Exception("Invalid Data, ImageID not found in inference")

        # The number of classes is the number of "ClassProbability<N>" columns.
        self.num_classes = sum(
            1 for head in inference if re.match(r"ClassProbability\d+", head)
        )
        if self.num_classes == 0:
            raise Exception("Invalid data, Inference do not have ClassProbability values")

        # rank: one row per image, ordered from most to least uncertain.
        self.rank = self._rank_images().sort_values(by="rank")

    def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract sample data and return it.

        Args:
            method: Name of the sampling method:
                - 'topk' - extract the k samples with the highest
                  uncertainty.
                - 'lowk' - extract the k samples with the lowest
                  uncertainty.
                - 'randk' - extract k random samples.
                - 'mixk' - extract the top and the low halves of k samples.
                - 'randtopk' - randomly extract n * k samples and return
                  the k most uncertain of them.
            k: Number of samples to extract.
            n: Multiplier used by the 'randtopk' method only.

        Returns:
            Extracted sample data : pd.DataFrame

        Raises:
            ValueError: if k is not a positive integer, or if the method
                is unknown (checked only when k fits the ranked data).
        """
        ranked = self.rank

        # 1. k value check
        if not isinstance(k, int) or k <= 0:
            raise ValueError(f"Invalid value {k}. k must have an integer greater than zero.")

        # 2. Select a sample according to the method
        if k <= len(ranked):
            methods = self.sampling_method
            if method == methods.topk.name:
                ranked = ranked.iloc[:k]
            elif method == methods.lowk.name:
                ranked = ranked.iloc[-k:]
            elif method == methods.randk.name:
                return self.data.sample(n=k).reset_index(drop=True)
            elif method in {methods.mixk.name, methods.randtopk.name}:
                return self._get_sample_mixed(method=method, k=k, n=n)
            else:
                raise ValueError(f"Unknown sampling method '{method}'")
        else:
            # Not enough images: fall through and return everything ranked.
            log.warning("The number of samples is greater than the size of the selected subset.")

        return self._merge_with_data(ranked)

    def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract sample data for the methods that combine several strategies.

        Args:
            method:
                - 'mixk': Return top-k and low-k halves based on uncertainty.
                  For an odd k the extra sample goes to the top half.
                - 'randtopk': Randomly extract n * k samples and return
                  the k of them with the highest uncertainty.
            k: Number of samples to extract.
            n: Multiplier for 'randtopk': n * k samples are drawn first and
                the top k are kept.

        Returns:
            Extracted sample data : pd.DataFrame
        """
        ranked = self.rank

        # If k exceeds the ranked data, or the method is not handled here,
        # the whole ranking is returned unchanged.
        if k <= len(ranked):
            if method == self.sampling_method.mixk.name:
                low_count = k // 2
                top_count = k - low_count
                # Compute the tail start explicitly: a negative slice such as
                # [-low_count:] selects the WHOLE frame when low_count is 0.
                ranked = pd.concat(
                    [
                        ranked.iloc[:top_count],
                        ranked.iloc[len(ranked) - low_count :],
                    ]
                )
            elif method == self.sampling_method.randtopk.name:
                if n * k <= len(ranked):
                    ranked = ranked.sample(n=n * k).sort_values(by="rank")
                else:
                    log.warning(msg="n * k exceeds the length of the inference")
                ranked = ranked.iloc[:k]

        return self._merge_with_data(ranked)

    def _merge_with_data(self, ranked: pd.DataFrame) -> pd.DataFrame:
        """
        Return the rows of self.data selected by ranked, in ranked order.

        Args:
            ranked: Ranking rows; only the "ImageID" column is used.

        Returns:
            Rows of self.data with their original columns : pd.DataFrame
        """
        columns = list(self.data.columns)
        # Join on the key column only, so that data columns that happen to be
        # named "Uncertainty" or "rank" are not renamed with merge suffixes.
        merged_df = pd.merge(ranked[["ImageID"]], self.data, how="inner", on=["ImageID"])
        return merged_df[columns].reset_index(drop=True)

    def _rank_images(self) -> pd.DataFrame:
        """
        Internal function that ranks the inference data based on uncertainty.

        Returns:
            One row per "ImageID" with the mean "Uncertainty" and its "rank"
            (rank 1 is the most uncertain image) : pd.DataFrame

        Raises:
            Exception: if there is no inference result, or if some rows
                have no Uncertainty value.
        """
        # 1. Load Inference
        if self.inference is None:
            raise Exception("Invalid Data, Failed to load inference result")
        inference = pd.DataFrame(self.inference)

        # 2. If the inference data frame does not contain an uncertainty
        #    score, calculate it
        if "Uncertainty" not in inference:
            inference = self._calculate_uncertainty_from_classprob(inference)

        # 3. Check that Uncertainty values are in place.
        if inference["Uncertainty"].isna().any():
            raise Exception("Some inference results do not have Uncertainty values")

        # 4. Rank by the Uncertainty score, averaged per image
        res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
        res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
        return res.reset_index()

    def _calculate_uncertainty_from_classprob(self, inference: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate the entropy-based uncertainty from ClassProbability values.

        The "Uncertainty" column is added to the passed data frame in place.

        Args:
            inference: Inference data where uncertainty has not been
                calculated. The columns "ClassProbability1" ..
                "ClassProbability<num_classes>" are read.

        Returns:
            inference data with the "Uncertainty" column : pd.DataFrame

        Raises:
            Exception: if a probability is outside of the [0, 1] range.
        """
        prob_columns = [f"ClassProbability{j + 1}" for j in range(self.num_classes)]

        # Iterate rows by position, so the result does not depend on the index
        # labels of the data frame (it may be filtered, shuffled, re-indexed).
        uncertainty = []
        for probs in inference[prob_columns].itertuples(index=False, name=None):
            entropy = 0
            for p in probs:
                if p < 0 or p > 1:
                    raise Exception("Invalid data, Math domain Error! p is between 0 and 1")
                # The small epsilon keeps log() defined for p == 0.
                entropy -= p * math.log(p + 1e-14, math.e)
            uncertainty.append(entropy)

        inference["Uncertainty"] = uncertainty

        return inference