class Split(Transform, CliPlugin):
    """
    - classification split |n
    |s|s|s|sSplits dataset into subsets(train/val/test) in class-wise manner. |n
    |s|s|s|sSplits dataset images in the specified ratio, keeping the initial class |n
    |s|s|s|sdistribution.|n
    |n
    - detection & segmentation split |n
    |s|s|s|sEach image can have multiple object annotations - |n
    |s|s|s|s(bbox, mask, polygon). Since an image shouldn't be included |n
    |s|s|s|sin multiple subsets at the same time, and image annotations |n
    |s|s|s|sshouldn't be split, in general, dataset annotations are unlikely |n
    |s|s|s|sto be split exactly in the specified ratio. |n
    |s|s|s|sThis split tries to split dataset images as close as possible |n
    |s|s|s|sto the specified ratio, keeping the initial class distribution.|n
    |n
    - reidentification split |n
    |s|s|s|sIn this task, the test set should consist of images of unseen|n
    |s|s|s|speople or objects during the training phase.|n
    |s|s|s|sThis function splits a dataset in the following way:|n
    |n
    |s|s1. Splits the dataset into 'train + val' and 'test' sets |n
    |s|s|s|s|sbased on person or object ID.|n
    |s|s2. Splits 'test' set into 'test-gallery' and 'test-query' sets |n
    |s|s|s|s|sin class-wise manner.|n
    |s|s3. Splits the 'train + val' set into 'train' and 'val' sets |n
    |s|s|s|s|sin the same way.|n
    |n
    The final subsets would be|n
    'train', 'val', 'test-gallery' and 'test-query'. |n
    |n
    Notes:|n
    |s|s- Each image is expected to have only one Annotation. Unlabeled or |n
    |s|s|s|smulti-labeled images will be split into subsets randomly. |n
    |s|s- If Labels also have attributes, also splits by attribute values.|n
    |s|s- If there is not enough images in some class or attributes group, |n
    |s|s|s|sthe split ratio can't be guaranteed. |n
    |s|s|s|sIn reidentification task, |n
    |s|s- Object ID can be described by Label, or by attribute (--attr parameter)|n
    |s|s- The splits of the test set are controlled by '--query' parameter |n
    |s|s|s|sGallery ratio would be 1.0 - query.|n
    |n
    Example:|n
    .. code-block::

    |s|s%(prog)s -t classification --subset train:.5 --subset val:.2 --subset test:.3 |n
    |s|s%(prog)s -t detection --subset train:.5 --subset val:.2 --subset test:.3 |n
    |s|s%(prog)s -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 |n
    |s|s%(prog)s -t reid --subset train:.5 --subset val:.2 --subset test:.3 --query .5 |n
    |n
    Example: use 'person_id' attribute for splitting|n
    .. code-block::

    |s|s%(prog)s --attr person_id
    """

    # Default subset ratios used when the caller passes splits=None.
    _default_split = [("train", 0.5), ("val", 0.2), ("test", 0.3)]
    # Default 'test-query' share of the test set in the reid task.
    _default_query_ratio = 0.5

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """Build the CLI argument parser for this plugin."""
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument(
            "-t",
            "--task",
            default=SplitTask.classification.name,
            choices=[t.name for t in SplitTask],
            help="(one of {}; default: %(default)s)".format(
                ", ".join(t.name for t in SplitTask)
            ),
        )
        parser.add_argument(
            "-s",
            "--subset",
            action="append",
            type=cls._split_arg,
            dest="splits",
            help="Subsets in the form: '<subset>:<ratio>' "
            "(repeatable, default: %s)" % dict(cls._default_split),
        )
        parser.add_argument(
            "--query",
            type=float,
            default=None,
            help="Query ratio in the test set (default: %.3f)"
            % cls._default_query_ratio,
        )
        parser.add_argument(
            "--attr",
            type=str,
            dest="attr_for_id",
            default=None,
            help="Attribute name representing the ID (default: use label)",
        )
        parser.add_argument("--seed", type=int, help="Random seed")
        return parser

    @staticmethod
    def _split_arg(s):
        """Parse one '--subset' CLI value of the form '<name>:<ratio>'.

        Raises argparse.ArgumentTypeError on a malformed value so argparse
        reports it as a usage error instead of a traceback.
        """
        parts = s.split(":")
        if len(parts) != 2:
            import argparse

            raise argparse.ArgumentTypeError()
        return (parts[0], float(parts[1]))

    def __init__(self, dataset, task, splits, query=None, attr_for_id=None, seed=None):
        """
        Parameters
        ----------
        dataset : Dataset
            The dataset to be split.
        task : str
            One of the SplitTask member names.
        splits : list or None
            A list of (subset(str), ratio(float)); defaults to
            _default_split when None.
        query : float, optional
            'test-query' ratio, reid task only.
        attr_for_id : str, optional
            Attribute name holding the object ID, reid task only.
        seed : int, optional
            Random seed for reproducible splits.
        """
        super().__init__(dataset)

        if splits is None:
            splits = self._default_split

        self.task = task
        self.splitter = self._get_splitter(
            task, dataset, splits, seed, query, attr_for_id
        )
        self._initialized = False
        self._subsets = self.splitter._subsets

    @staticmethod
    def _get_splitter(task, dataset, splits, seed, query, attr_for_id):
        """Instantiate the task-specific splitter implementation."""
        if task == SplitTask.classification.name:
            splitter = _ClassificationSplit(dataset=dataset, splits=splits, seed=seed)
        elif task in {SplitTask.detection.name, SplitTask.segmentation.name}:
            splitter = _InstanceSpecificSplit(
                dataset=dataset, splits=splits, seed=seed, task=task
            )
        elif task == SplitTask.reid.name:
            splitter = _ReidentificationSplit(
                dataset=dataset,
                splits=splits,
                seed=seed,
                query=query,
                attr_for_id=attr_for_id,
            )
        else:
            raise Exception(
                f"Unknown task '{task}', available "
                f"splitter format: {[a.name for a in SplitTask]}"
            )
        return splitter

    def __iter__(self):
        # Lazy splitting: the actual partitioning happens on first iteration,
        # so constructing the transform stays cheap.
        if self._initialized is False:
            self.splitter._split_dataset()
            self._initialized = True
        for i, item in enumerate(self._extractor):
            yield self.wrap_item(item, subset=self.splitter._find_split(i))
class_TaskSpecificSplit:def__init__(self,dataset,splits,seed,restrict=False):self._extractor=datasetsnames,sratio,subsets=self._validate_splits(splits,restrict)self._snames=snamesself._sratio=sratioself._seed=seed# remove subset name restriction# https://github.com/openvinotoolkit/datumaro/issues/194self._subsets=subsetsself._parts=[]self._length="parent"self._initialized=Falsedef_set_parts(self,by_splits):self._parts=[]forsubsetinself._subsets:self._parts.append((set(by_splits[subset]),subset))@staticmethoddef_get_uniq_annotations(dataset):annotations=[]unlabeled_or_multi=[]foridx,iteminenumerate(dataset):labels=[aforainitem.annotationsifa.type==AnnotationType.label]iflen(labels)==1:annotations.append(labels[0])else:unlabeled_or_multi.append(idx)returnannotations,unlabeled_or_multi@staticmethoddef_validate_splits(splits,restrict=False):snames=[]ratios=[]subsets=set()valid=["train","val","test"]forsubset,ratioinsplits:# remove subset name restriction# https://github.com/openvinotoolkit/datumaro/issues/194ifrestrict:assertsubsetinvalid,"Subset name must be one of %s, got %s"%(valid,subset,)assert(0.0<=ratioandratio<=1.0),"Ratio is expected to be in the range ""[0, 1], but got %s for %s"%(ratio,subset,)# ignore near_zero ratio because it may produce partition error.ifratio>NEAR_ZERO:# handling duplicationifsubsetinsnames:raiseException("Subset (%s) is duplicated"%subset)snames.append(subset)ratios.append(float(ratio))subsets.add(subset)ratios=np.array(ratios)total_ratio=np.sum(ratios)ifnotabs(total_ratio-1.0)<=NEAR_ZERO:raiseException("Sum of ratios is expected to be 1, got %s, which is %s"%(splits,total_ratio))returnsnames,ratios,subsets@staticmethoddef_get_required(ratio):iflen(ratio)<2:return1forscalein[10,100]:farray=np.array(ratio)*scaleiarray=farray.astype(int)ifnp.array_equal(iarray,farray):break# find 
gcdcommon_divisor=iarray[0]forvaliniarray[1:]:common_divisor=gcd(common_divisor,val)required=np.sum(np.array(iarray/common_divisor).astype(int))returnrequired@staticmethoddef_get_sections(dataset_size,ratio):n_splits=[int(np.around(dataset_size*r))forrinratio[:-1]]n_splits.append(dataset_size-np.sum(n_splits))# if there are splits with zero samples even if ratio is not 0,# borrow one from the split who has one or more.forii,num_splitinenumerate(n_splits):ifnum_split==0andNEAR_ZERO<ratio[ii]:midx=np.argmax(n_splits)ifn_splits[midx]>0:n_splits[ii]+=1n_splits[midx]-=1sections=np.add.accumulate(n_splits[:-1])returnsections,n_splits@staticmethoddef_group_by_attr(items):""" Args: items: list of (idx_img, ann). ann is the annotation from Label object. Returns: by_attributes: dict of { combination-of-attrs : list of index } """# float--> numerical, others(int, string, bool) --> categoricaldef_is_float(value):ifisinstance(value,str):casted=cast(value,float)ifcastedisnotNone:ifcast(casted,str)==value:returnTruereturnFalseelifisinstance(value,float):cast(value,float)returnTruereturnFalse# group by attributesby_attributes=dict()foridx_img,anninitems:# ignore numeric attributesfiltered={}forattr,valueinann.attributes.items():if_is_float(value):continuefiltered[attr]=valueattributes=tuple(sorted(filtered.items()))ifattributesnotinby_attributes:by_attributes[attributes]=[]by_attributes[attributes].append(idx_img)returnby_attributesdef_split_by_attr(self,datasets,snames,ratio,out_splits,merge_small_classes=True):def_split_indice(indice):sections,_=self._get_sections(len(indice),ratio)splits=np.array_split(indice,sections)forsubset,splitinzip(snames,splits):if0<len(split):out_splits[subset].extend(split)required=self._get_required(ratio)rest=[]for_,itemsindatasets.items():np.random.shuffle(items)by_attributes=self._group_by_attr(items)attr_combinations=list(by_attributes.keys())np.random.shuffle(attr_combinations)# add 
randomnessforattrinattr_combinations:indice=by_attributes[attr]quo=len(indice)//requiredifquo>0:filtered_size=quo*required_split_indice(indice[:filtered_size])rest.extend(indice[filtered_size:])else:rest.extend(indice)quo=len(rest)//requiredifquo>0:filtered_size=quo*required_split_indice(rest[:filtered_size])rest=rest[filtered_size:]ifnotmerge_small_classesandlen(rest)>0:_split_indice(rest)rest=[]iflen(rest)>0:_split_indice(rest)def_split_unlabeled(self,unlabeled,by_splits):""" split unlabeled data into subsets (detection, classification) Args: unlabeled: list of index of unlabeled or multi-labeled data by_splits: splits up to now Returns: by_splits: final splits """dataset_size=len(self._extractor)_,n_splits=list(self._get_sections(dataset_size,self._sratio))counts=[len(by_splits[sname])forsnameinself._snames]expected=[max(0,v)forvinnp.subtract(n_splits,counts)]sections=np.add.accumulate(expected[:-1])np.random.shuffle(unlabeled)splits=np.array_split(unlabeled,sections)forsubset,splitinzip(self._snames,splits):if0<len(split):by_splits[subset].extend(split)def_find_split(self,index):forsubset_indices,subsetinself._parts:ifindexinsubset_indices:returnsubsetreturnDEFAULT_SUBSET_NAME# all the possible remainder --> defaultdef_split_dataset(self):raiseNotImplementedError()class_ClassificationSplit(_TaskSpecificSplit):""" Splits dataset into subsets(train/val/test) in class-wise manner. |n Splits dataset images in the specified ratio, keeping the initial class distribution.|n |n Notes:|n |s|s- Each image is expected to have only one Label. Unlabeled or |s|s|s|smulti-labeled images will be split into subsets randomly. |n |s|s- If Labels also have attributes, also splits by attribute values.|n |s|s- If there is not enough images in some class or attributes group, |s|s|s|sthe split ratio can't be guaranteed.|n |n Example:|n .. 
code-block:: |s|s%(prog)s -t classification --subset train:.5 --subset val:.2 --subset test:.3 """def__init__(self,dataset,splits,seed=None):""" Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) The sum of ratios is expected to be 1. seed : int optional """super().__init__(dataset,splits,seed)def_split_dataset(self):np.random.seed(self._seed)# support only single label for a DatasetItem# 1. group by labelby_labels=dict()annotations,unlabeled=self._get_uniq_annotations(self._extractor)foridx,anninenumerate(annotations):label=getattr(ann,"label",None)iflabelnotinby_labels:by_labels[label]=[]by_labels[label].append((idx,ann))by_splits=dict()forsubsetinself._subsets:by_splits[subset]=[]# 2. group by attributesself._split_by_attr(by_labels,self._snames,self._sratio,by_splits)# 3. split unlabeled dataiflen(unlabeled)>0:self._split_unlabeled(unlabeled,by_splits)# 4. set partsself._set_parts(by_splits)class_ReidentificationSplit(_TaskSpecificSplit):""" Splits a dataset for re-identification task.|n Produces a split with a specified ratio of images, avoiding having same labels in different subsets.|n |n In this task, the test set should consist of images of unseen people or objects during the training phase. |n This function splits a dataset in the following way:|n |n |s|s1. Splits the dataset into 'train + val' and 'test' sets|n |s|s|s|s|sbased on person or object ID.|n |s|s2. Splits 'test' set into 'test-gallery' and 'test-query' sets|n |s|s|s|s|sin class-wise manner.|n |s|s3. Splits the 'train + val' set into 'train' and 'val' sets|n |s|s|s|s|sin the same way.|n |n The final subsets would be 'train', 'val', 'test-gallery' and 'test-query'. |n |n Notes:|n |s|s- Each image is expected to have a single Label. 
Unlabeled or multi-labeled |s|s|s|simages will be split into 'not-supported'.|n |s|s- Object ID can be described by Label, or by attribute (--attr parameter)|n |s|s- The splits of the test set are controlled by '--query' parameter. |n |s|s|s|sGallery ratio would be 1.0 - query.|n |n Example: split a dataset in the specified ratio, split the test set|n into gallery and query in 1:1 ratio|n .. code-block:: |s|s%(prog)s -t reidentification --subset train:.5 --subset val:.2 --subset test:.3 --query .5|n |n Example: use 'person_id' attribute for splitting|n .. code-block:: |s|s%(prog)s --attr person_id """_default_query_ratio=0.5def__init__(self,dataset,splits,query=None,attr_for_id=None,seed=None):""" Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) Subset is expected to be one of ["train", "val", "test"]. The sum of ratios is expected to be 1. query : float The ratio of 'test-query' set. The ratio of 'test-gallery' set would be 1.0 - query. attr_for_id: str attribute name representing the person/object id. if this is not specified, label would be used. 
seed : int optional """super().__init__(dataset,splits,seed,restrict=True)ifqueryisNone:query=self._default_query_ratioassert0.0<=queryandquery<=1.0,("Query ratio is expected to be in the range ""[0, 1], but got %f"%query)test_splits=[("test-query",query),("test-gallery",1.0-query)]# remove subset name restrictionself._subsets={"train","val","test-gallery","test-query"}self._test_splits=test_splitsself._attr_for_id=attr_for_iddef_split_dataset(self):np.random.seed(self._seed)id_snames,id_ratio=self._snames,self._sratioattr_for_id=self._attr_for_iddataset=self._extractor# group by ID(attr_for_id)by_id=dict()annotations,unlabeled=self._get_uniq_annotations(dataset)ifattr_for_idisNone:# use labelforidx,anninenumerate(annotations):ID=getattr(ann,"label",None)ifIDnotinby_id:by_id[ID]=[]by_id[ID].append((idx,ann))else:# use attr_for_idforidx,anninenumerate(annotations):attributes=dict(ann.attributes.items())assertattr_for_idinattributes,("'%s' is expected as an attribute name"%attr_for_id)ID=attributes[attr_for_id]ifIDnotinby_id:by_id[ID]=[]by_id[ID].append((idx,ann))required=self._get_required(id_ratio)iflen(by_id)<required:log.warning("There's not enough IDs, which is %s, ""so train/val/test ratio can't be guaranteed."%len(by_id))# 1. 
split dataset into trval and test# IDs in test set should not exist in train/val set.test=id_ratio[id_snames.index("test")]if"test"inid_snameselse0ifNEAR_ZERO<test:# has testsetsplit_ratio=np.array([test,1.0-test])IDs=list(by_id.keys())np.random.shuffle(IDs)sections,_=self._get_sections(len(IDs),split_ratio)splits=np.array_split(IDs,sections)testset={pid:by_id[pid]forpidinsplits[0]}trval={pid:by_id[pid]forpidinsplits[1]}# follow the ratio of datasetitems as possible.# naive heuristic: exchange the best item one by one.expected_count=int((len(self._extractor)-len(unlabeled))*split_ratio[0])testset_total=int(np.sum([len(v)forvintestset.values()]))self._rebalancing(testset,trval,expected_count,testset_total)else:testset=dict()trval=by_idby_splits=dict()forsubsetinself._subsets:by_splits[subset]=[]# 2. split 'test' into 'test-gallery' and 'test-query'if0<len(testset):test_snames=[]test_ratio=[]forsname,ratioinself._test_splits:test_snames.append(sname)test_ratio.append(float(ratio))self._split_by_attr(testset,test_snames,test_ratio,by_splits,merge_small_classes=False)# 3. 
split 'trval' into 'train' and 'val'trval_snames=["train","val"]trval_ratio=[]forsubsetintrval_snames:ifsubsetinid_snames:val=id_ratio[id_snames.index(subset)]else:val=0.0trval_ratio.append(val)trval_ratio=np.array(trval_ratio)total_ratio=np.sum(trval_ratio)iftotal_ratio<NEAR_ZERO:trval_splits=list(zip(["train","val"],trval_ratio))log.warning("Sum of ratios is expected to be positive, ""got %s, which is %s"%(trval_splits,total_ratio))else:trval_ratio/=total_ratio# normalizeself._split_by_attr(trval,trval_snames,trval_ratio,by_splits,merge_small_classes=False)# split unlabeled data into 'not-supported'.iflen(unlabeled)>0:self._subsets.add("not-supported")by_splits["not-supported"]=unlabeledself._set_parts(by_splits)@staticmethoddef_rebalancing(test,trval,expected_count,testset_total):diffs=dict()forid_test,items_testintest.items():count_test=len(items_test)forid_trval,items_trvalintrval.items():count_trval=len(items_trval)diff=count_trval-count_testifdiff==0:continue# exchange has no effectifdiffnotindiffs:diffs[diff]=[(id_test,id_trval)]else:diffs[diff].append((id_test,id_trval))iflen(diffs)==0:# nothing would be changed by exchangereturnexchanges=[]whileTrue:target_diff=expected_count-testset_total# find nearest diff.keys=np.array(list(diffs.keys()))idx=(np.abs(keys-target_diff)).argmin()nearest=keys[idx]ifabs(target_diff)<=abs(target_diff-nearest):breakchoice=np.random.choice(range(len(diffs[nearest])))id_test,id_trval=diffs[nearest][choice]testset_total+=nearestnew_diffs=dict()fordiff,IDsindiffs.items():new_list=[]forid1,id2inIDs:ifid1==id_testorid2==id_trval:continuenew_list.append((id1,id2))if0<len(new_list):new_diffs[diff]=new_listdiffs=new_diffsexchanges.append((id_test,id_trval))# exchangeforid_test,id_trvalinexchanges:test[id_trval]=trval.pop(id_trval)trval[id_test]=test.pop(id_test)class_InstanceSpecificSplit(_TaskSpecificSplit):""" Splits a dataset into subsets(train/val/test), using object annotations as a basis for splitting.|n Tries to produce an 
image split with the specified ratio, keeping the initial distribution of class objects.|n |n each image can have multiple object annotations - (instance bounding boxes, masks, polygons). Since an image shouldn't be included in multiple subsets at the same time, and image annotations shouldn't be split, in general, dataset annotations are unlikely to be split exactly in the specified ratio. |n This split tries to split dataset images as close as possible to the specified ratio, keeping the initial class distribution.|n |n Notes:|n |s|s- Each image is expected to have one or more annotations.|n |s|s- Only bbox annotations are considered in detection task.|n |s|s- Mask or Polygon annotations are considered in segmentation task.|n |n Example: split dataset so that each object class annotations were split|n in the specified ratio between subsets|n .. code-block:: |s|s%(prog)s -t detection --subset train:.5 --subset val:.2 --subset test:.3 |n |s|s%(prog)s -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 """def__init__(self,dataset,splits,task,seed=None):""" Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) The sum of ratios is expected to be 1. seed : int optional """super().__init__(dataset,splits,seed)iftask==SplitTask.detection.name:self.annotation_type=[AnnotationType.bbox]eliftask==SplitTask.segmentation.name:self.annotation_type=[AnnotationType.mask,AnnotationType.polygon]def_group_by_labels(self,dataset):by_labels=dict()unlabeled=[]foridx,iteminenumerate(dataset):instance_anns=[aforainitem.annotationsifa.typeinself.annotation_type]iflen(instance_anns)==0:unlabeled.append(idx)continueforinstance_annininstance_anns:label=getattr(instance_ann,"label",None)iflabelnotinby_labels:by_labels[label]=[(idx,instance_ann)]else:by_labels[label].append((idx,instance_ann))returnby_labels,unlabeleddef_split_dataset(self):np.random.seed(self._seed)subsets,sratio=self._snames,self._sratio# 1. 
group by bbox labelby_labels,unlabeled=self._group_by_labels(self._extractor)# 2. group by attributesrequired=self._get_required(sratio)by_combinations=list()for_,itemsinby_labels.items():by_attributes=self._group_by_attr(items)# merge groups which have too small samples.attr_combinations=list(by_attributes.keys())np.random.shuffle(attr_combinations)# add randomlesscluster=[]min_cluster=max(required,len(items)*0.01)# temp solutionforattrinattr_combinations:indice=by_attributes[attr]iflen(indice)>=min_cluster:by_combinations.append(indice)else:cluster.extend(indice)iflen(cluster)>=min_cluster:by_combinations.append(cluster)cluster=[]iflen(cluster)>0:by_combinations.append(cluster)cluster=[]total=len(self._extractor)# total number of GT samples per label-attr combinationsn_combs=[len(v)forvinby_combinations]# 3-1. initially count per-image GT samplescounts_all={}foridx_imginrange(total):ifidx_imgnotinunlabeled:counts_all[idx_img]=dict()foridx_comb,indiceinenumerate(by_combinations):foridx_imginindice:ifidx_combnotincounts_all[idx_img]:counts_all[idx_img][idx_comb]=1else:counts_all[idx_img][idx_comb]+=1by_splits=dict()forsnameinself._subsets:by_splits[sname]=[]target_ins=[]# target instance numbers to be splitforsname,ratioinzip(subsets,sratio):target_ins.append([sname,np.array(n_combs)*ratio])init_scores={}foridx_img,distributionsincounts_all.items():norm_sum=0.0foridx_comb,disindistributions.items():norm_sum+=dis/n_combs[idx_comb]init_scores[idx_img]=norm_sumby_scores=dict()foridx_img,scoreininit_scores.items():ifscorenotinby_scores:by_scores[score]=[idx_img]else:by_scores[score].append(idx_img)# functions for keep the # of annotations not exceed the target_ins numdefcompute_penalty(counts,n_combs):p=0foridx_comb,vincounts.items():ifn_combs[idx_comb]<=0:p+=1else:p+=max(0,(v/n_combs[idx_comb])-1.0)returnpdefupdate_nc(counts,n_combs):foridx_comb,vincounts.items():n_combs[idx_comb]=n_combs[idx_comb]-v# 3-2. 
assign each DatasetItem to a split, one by oneactual_ins=copy.deepcopy(target_ins)forscoreinsorted(by_scores.keys(),reverse=True):indice=by_scores[score]np.random.shuffle(indice)# add randomness for the same scoreforidxinindice:counts=counts_all[idx]# shuffling split order to add randomness# when two or more splits have the same penalty valuenp.random.shuffle(actual_ins)pp=[]forsname,ncinactual_ins:ifnp.sum(nc)<=0:# the split has enough instances,# stop adding more images to this splitpp.append(1e08)else:# compute penalty based on the number of GT samples# added in the splitpp.append(compute_penalty(counts,nc))# we push an image to a split with the minimum penaltymidx=np.argmin(pp)sname,nc=actual_ins[midx]by_splits[sname].append(idx)update_nc(counts,nc)# split unlabeled dataiflen(unlabeled)>0:self._split_unlabeled(unlabeled,by_splits)self._set_parts(by_splits)