class Split(Transform, CliPlugin):
    """
    - classification split |n
    |s|s|s|sSplits dataset into subsets(train/val/test) in class-wise manner. |n
    |s|s|s|sSplits dataset images in the specified ratio, keeping the initial class |n
    |s|s|s|sdistribution.|n
    |n
    - detection & segmentation split |n
    |s|s|s|sEach image can have multiple object annotations - |n
    |s|s|s|s(bbox, mask, polygon). Since an image shouldn't be included |n
    |s|s|s|sin multiple subsets at the same time, and image annotations |n
    |s|s|s|sshouldn't be split, in general, dataset annotations are unlikely |n
    |s|s|s|sto be split exactly in the specified ratio. |n
    |s|s|s|sThis split tries to split dataset images as close as possible |n
    |s|s|s|sto the specified ratio, keeping the initial class distribution.|n
    |n
    - reidentification split |n
    |s|s|s|sIn this task, the test set should consist of images of unseen|n
    |s|s|s|speople or objects during the training phase.|n
    |s|s|s|sThis function splits a dataset in the following way:|n
    |n
    |s|s1. Splits the dataset into 'train + val' and 'test' sets |n
    |s|s|s|s|sbased on person or object ID.|n
    |s|s2. Splits 'test' set into 'test-gallery' and 'test-query' sets |n
    |s|s|s|s|sin class-wise manner.|n
    |s|s3. Splits the 'train + val' set into 'train' and 'val' sets |n
    |s|s|s|s|sin the same way.|n
    |n
    The final subsets would be|n
    'train', 'val', 'test-gallery' and 'test-query'. |n
    |n
    Notes:|n
    |s|s- Each image is expected to have only one Annotation. Unlabeled or |n
    |s|s|s|smulti-labeled images will be split into subsets randomly. |n
    |s|s- If Labels also have attributes, also splits by attribute values.|n
    |s|s- If there is not enough images in some class or attributes group, |n
    |s|s|s|sthe split ratio can't be guaranteed. |n
    |s|s|s|sIn reidentification task, |n
    |s|s- Object ID can be described by Label, or by attribute (--attr parameter)|n
    |s|s- The splits of the test set are controlled by '--query' parameter |n
    |s|s|s|sGallery ratio would be 1.0 - query.|n
    |n
    Example:|n
    .. code-block::

    |s|s%(prog)s -t classification --subset train:.5 --subset val:.2 --subset test:.3 |n
    |s|s%(prog)s -t detection --subset train:.5 --subset val:.2 --subset test:.3 |n
    |s|s%(prog)s -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 |n
    |s|s%(prog)s -t reid --subset train:.5 --subset val:.2 --subset test:.3 --query .5 |n
    |n
    Example: use 'person_id' attribute for splitting|n
    .. code-block::

    |s|s%(prog)s --attr person_id
    """

    # Default subset ratios used when the caller passes splits=None.
    _default_split = [("train", 0.5), ("val", 0.2), ("test", 0.3)]
    # Default 'test-query' share of the test set in the reid task.
    _default_query_ratio = 0.5

    @classmethod
    def build_cmdline_parser(cls, **kwargs):
        """Build the CLI argument parser for this plugin."""
        parser = super().build_cmdline_parser(**kwargs)
        parser.add_argument(
            "-t",
            "--task",
            default=SplitTask.classification.name,
            choices=[t.name for t in SplitTask],
            help="(one of {}; default: %(default)s)".format(
                ", ".join(t.name for t in SplitTask)
            ),
        )
        parser.add_argument(
            "-s",
            "--subset",
            action="append",
            type=cls._split_arg,
            dest="splits",
            help="Subsets in the form: '<subset>:<ratio>' "
            "(repeatable, default: %s)" % dict(cls._default_split),
        )
        parser.add_argument(
            "--query",
            type=float,
            default=None,
            help="Query ratio in the test set (default: %.3f)"
            % cls._default_query_ratio,
        )
        parser.add_argument(
            "--attr",
            type=str,
            dest="attr_for_id",
            default=None,
            help="Attribute name representing the ID (default: use label)",
        )
        parser.add_argument("--seed", type=int, help="Random seed")
        return parser

    @staticmethod
    def _split_arg(s):
        """Parse one '--subset' CLI value of the form '<name>:<ratio>'.

        Raises argparse.ArgumentTypeError on a malformed value so argparse
        reports it as a usage error instead of a traceback.
        """
        parts = s.split(":")
        if len(parts) != 2:
            import argparse

            raise argparse.ArgumentTypeError()
        return (parts[0], float(parts[1]))

    def __init__(self, dataset, task, splits, query=None, attr_for_id=None, seed=None):
        """
        Parameters
        ----------
        dataset : Dataset
            The dataset to be split.
        task : str
            One of the SplitTask member names.
        splits : list or None
            A list of (subset(str), ratio(float)); defaults to
            _default_split when None.
        query : float, optional
            'test-query' ratio, reid task only.
        attr_for_id : str, optional
            Attribute name holding the object ID, reid task only.
        seed : int, optional
            Random seed for reproducible splits.
        """
        super().__init__(dataset)

        if splits is None:
            splits = self._default_split

        self.task = task
        self.splitter = self._get_splitter(
            task, dataset, splits, seed, query, attr_for_id
        )
        self._initialized = False
        self._subsets = self.splitter._subsets

    @staticmethod
    def _get_splitter(task, dataset, splits, seed, query, attr_for_id):
        """Instantiate the task-specific splitter implementation."""
        if task == SplitTask.classification.name:
            splitter = _ClassificationSplit(dataset=dataset, splits=splits, seed=seed)
        elif task in {SplitTask.detection.name, SplitTask.segmentation.name}:
            splitter = _InstanceSpecificSplit(
                dataset=dataset, splits=splits, seed=seed, task=task
            )
        elif task == SplitTask.reid.name:
            splitter = _ReidentificationSplit(
                dataset=dataset,
                splits=splits,
                seed=seed,
                query=query,
                attr_for_id=attr_for_id,
            )
        else:
            raise Exception(
                f"Unknown task '{task}', available "
                f"splitter format: {[a.name for a in SplitTask]}"
            )
        return splitter

    def __iter__(self):
        # Lazy splitting: the actual partitioning happens on first iteration,
        # so constructing the transform stays cheap.
        if self._initialized is False:
            self.splitter._split_dataset()
            self._initialized = True
        for i, item in enumerate(self._extractor):
            yield self.wrap_item(item, subset=self.splitter._find_split(i))
class_TaskSpecificSplit:def__init__(self,dataset,splits,seed,restrict=False):self._extractor=datasetsnames,sratio,subsets=self._validate_splits(splits,restrict)self._snames=snamesself._sratio=sratioself._seed=seed# remove subset name restriction# https://github.com/openvinotoolkit/datumaro/issues/194self._subsets=subsetsself._parts=[]self._length="parent"self._initialized=Falsedef_set_parts(self,by_splits):self._parts=[]forsubsetinself._subsets:self._parts.append((set(by_splits[subset]),subset))@staticmethoddef_get_uniq_annotations(dataset):annotations=[]unlabeled_or_multi=[]foridx,iteminenumerate(dataset):labels=[aforainitem.annotationsifa.type==AnnotationType.label]iflen(labels)==1:annotations.append(labels[0])else:unlabeled_or_multi.append(idx)returnannotations,unlabeled_or_multi@staticmethoddef_validate_splits(splits,restrict=False):snames=[]ratios=[]subsets=set()valid=["train","val","test"]forsubset,ratioinsplits:# remove subset name restriction# https://github.com/openvinotoolkit/datumaro/issues/194ifrestrict:assertsubsetinvalid,"Subset name must be one of %s, got %s"%(valid,subset,)assert(0.0<=ratioandratio<=1.0),"Ratio is expected to be in the range ""[0, 1], but got %s for %s"%(ratio,subset,)# ignore near_zero ratio because it may produce partition error.ifratio>NEAR_ZERO:# handling duplicationifsubsetinsnames:raiseException("Subset (%s) is duplicated"%subset)snames.append(subset)ratios.append(float(ratio))subsets.add(subset)ratios=np.array(ratios)total_ratio=np.sum(ratios)ifnotabs(total_ratio-1.0)<=NEAR_ZERO:raiseException("Sum of ratios is expected to be 1, got %s, which is %s"%(splits,total_ratio))returnsnames,ratios,subsets@staticmethoddef_get_required(ratio):iflen(ratio)<2:return1forscalein[10,100]:farray=np.array(ratio)*scaleiarray=farray.astype(int)ifnp.array_equal(iarray,farray):break# find 
gcdcommon_divisor=iarray[0]forvaliniarray[1:]:common_divisor=gcd(common_divisor,val)required=np.sum(np.array(iarray/common_divisor).astype(int))returnrequired@staticmethoddef_get_sections(dataset_size,ratio):n_splits=[int(np.around(dataset_size*r))forrinratio[:-1]]n_splits.append(dataset_size-np.sum(n_splits))# if there are splits with zero samples even if ratio is not 0,# borrow one from the split who has one or more.forii,num_splitinenumerate(n_splits):ifnum_split==0andNEAR_ZERO<ratio[ii]:midx=np.argmax(n_splits)ifn_splits[midx]>0:n_splits[ii]+=1n_splits[midx]-=1sections=np.add.accumulate(n_splits[:-1])returnsections,n_splits@staticmethoddef_group_by_attr(items):""" Args: items: list of (idx_img, ann). ann is the annotation from Label object. Returns: by_attributes: dict of { combination-of-attrs : list of index } """# float--> numerical, others(int, string, bool) --> categoricaldef_is_float(value):ifisinstance(value,str):casted=cast(value,float)ifcastedisnotNone:ifcast(casted,str)==value:returnTruereturnFalseelifisinstance(value,float):cast(value,float)returnTruereturnFalse# group by attributesby_attributes=dict()foridx_img,anninitems:# ignore numeric attributesfiltered={}forattr,valueinann.attributes.items():if_is_float(value):continuefiltered[attr]=valueattributes=tuple(sorted(filtered.items()))ifattributesnotinby_attributes:by_attributes[attributes]=[]by_attributes[attributes].append(idx_img)returnby_attributesdef_split_by_attr(self,datasets,snames,ratio,out_splits,merge_small_classes=True):def_split_indice(indice):sections,_=self._get_sections(len(indice),ratio)splits=np.array_split(indice,sections)forsubset,splitinzip(snames,splits):if0<len(split):out_splits[subset].extend(split)required=self._get_required(ratio)rest=[]for_,itemsindatasets.items():np.random.shuffle(items)by_attributes=self._group_by_attr(items)attr_combinations=list(by_attributes.keys())np.random.shuffle(attr_combinations)# add 
randomnessforattrinattr_combinations:indice=by_attributes[attr]quo=len(indice)//requiredifquo>0:filtered_size=quo*required_split_indice(indice[:filtered_size])rest.extend(indice[filtered_size:])else:rest.extend(indice)quo=len(rest)//requiredifquo>0:filtered_size=quo*required_split_indice(rest[:filtered_size])rest=rest[filtered_size:]ifnotmerge_small_classesandlen(rest)>0:_split_indice(rest)rest=[]iflen(rest)>0:_split_indice(rest)def_split_unlabeled(self,unlabeled,by_splits):""" split unlabeled data into subsets (detection, classification) Args: unlabeled: list of index of unlabeled or multi-labeled data by_splits: splits up to now Returns: by_splits: final splits """dataset_size=len(self._extractor)_,n_splits=list(self._get_sections(dataset_size,self._sratio))counts=[len(by_splits[sname])forsnameinself._snames]expected=[max(0,v)forvinnp.subtract(n_splits,counts)]sections=np.add.accumulate(expected[:-1])np.random.shuffle(unlabeled)splits=np.array_split(unlabeled,sections)forsubset,splitinzip(self._snames,splits):if0<len(split):by_splits[subset].extend(split)def_find_split(self,index):forsubset_indices,subsetinself._parts:ifindexinsubset_indices:returnsubsetreturnDEFAULT_SUBSET_NAME# all the possible remainder --> defaultdef_split_dataset(self):raiseNotImplementedError()class_ClassificationSplit(_TaskSpecificSplit):""" Splits dataset into subsets(train/val/test) in class-wise manner. |n Splits dataset images in the specified ratio, keeping the initial class distribution.|n |n Notes:|n |s|s- Each image is expected to have only one Label. Unlabeled or |s|s|s|smulti-labeled images will be split into subsets randomly. |n |s|s- If Labels also have attributes, also splits by attribute values.|n |s|s- If there is not enough images in some class or attributes group, |s|s|s|sthe split ratio can't be guaranteed.|n |n Example:|n .. 
code-block:: |s|s%(prog)s -t classification --subset train:.5 --subset val:.2 --subset test:.3 """def__init__(self,dataset,splits,seed=None):""" Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) The sum of ratios is expected to be 1. seed : int optional """super().__init__(dataset,splits,seed)def_split_dataset(self):np.random.seed(self._seed)# support only single label for a DatasetItem# 1. group by labelby_labels=dict()annotations,unlabeled=self._get_uniq_annotations(self._extractor)foridx,anninenumerate(annotations):label=getattr(ann,"label",None)iflabelnotinby_labels:by_labels[label]=[]by_labels[label].append((idx,ann))by_splits=dict()forsubsetinself._subsets:by_splits[subset]=[]# 2. group by attributesself._split_by_attr(by_labels,self._snames,self._sratio,by_splits)# 3. split unlabeled dataiflen(unlabeled)>0:self._split_unlabeled(unlabeled,by_splits)# 4. set partsself._set_parts(by_splits)class_ReidentificationSplit(_TaskSpecificSplit):""" Splits a dataset for re-identification task.|n Produces a split with a specified ratio of images, avoiding having same labels in different subsets.|n |n In this task, the test set should consist of images of unseen people or objects during the training phase. |n This function splits a dataset in the following way:|n |n |s|s1. Splits the dataset into 'train + val' and 'test' sets|n |s|s|s|s|sbased on person or object ID.|n |s|s2. Splits 'test' set into 'test-gallery' and 'test-query' sets|n |s|s|s|s|sin class-wise manner.|n |s|s3. Splits the 'train + val' set into 'train' and 'val' sets|n |s|s|s|s|sin the same way.|n |n The final subsets would be 'train', 'val', 'test-gallery' and 'test-query'. |n |n Notes:|n |s|s- Each image is expected to have a single Label. 
Unlabeled or multi-labeled |s|s|s|simages will be split into 'not-supported'.|n |s|s- Object ID can be described by Label, or by attribute (--attr parameter)|n |s|s- The splits of the test set are controlled by '--query' parameter. |n |s|s|s|sGallery ratio would be 1.0 - query.|n |n Example: split a dataset in the specified ratio, split the test set|n into gallery and query in 1:1 ratio|n .. code-block:: |s|s%(prog)s -t reidentification --subset train:.5 --subset val:.2 --subset test:.3 --query .5|n |n Example: use 'person_id' attribute for splitting|n .. code-block:: |s|s%(prog)s --attr person_id """_default_query_ratio=0.5def__init__(self,dataset,splits,query=None,attr_for_id=None,seed=None):""" Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) Subset is expected to be one of ["train", "val", "test"]. The sum of ratios is expected to be 1. query : float The ratio of 'test-query' set. The ratio of 'test-gallery' set would be 1.0 - query. attr_for_id: str attribute name representing the person/object id. if this is not specified, label would be used. 
seed : int optional """super().__init__(dataset,splits,seed,restrict=True)ifqueryisNone:query=self._default_query_ratioassert0.0<=queryandquery<=1.0,("Query ratio is expected to be in the range ""[0, 1], but got %f"%query)test_splits=[("test-query",query),("test-gallery",1.0-query)]# remove subset name restrictionself._subsets={"train","val","test-gallery","test-query"}self._test_splits=test_splitsself._attr_for_id=attr_for_iddef_split_dataset(self):np.random.seed(self._seed)id_snames,id_ratio=self._snames,self._sratioattr_for_id=self._attr_for_iddataset=self._extractor# group by ID(attr_for_id)by_id=dict()annotations,unlabeled=self._get_uniq_annotations(dataset)ifattr_for_idisNone:# use labelforidx,anninenumerate(annotations):ID=getattr(ann,"label",None)ifIDnotinby_id:by_id[ID]=[]by_id[ID].append((idx,ann))else:# use attr_for_idforidx,anninenumerate(annotations):attributes=dict(ann.attributes.items())assertattr_for_idinattributes,("'%s' is expected as an attribute name"%attr_for_id)ID=attributes[attr_for_id]ifIDnotinby_id:by_id[ID]=[]by_id[ID].append((idx,ann))required=self._get_required(id_ratio)iflen(by_id)<required:log.warning("There's not enough IDs, which is %s, ""so train/val/test ratio can't be guaranteed."%len(by_id))# 1. 
split dataset into trval and test# IDs in test set should not exist in train/val set.test=id_ratio[id_snames.index("test")]if"test"inid_snameselse0ifNEAR_ZERO<test:# has testsetsplit_ratio=np.array([test,1.0-test])IDs=list(by_id.keys())np.random.shuffle(IDs)sections,_=self._get_sections(len(IDs),split_ratio)splits=np.array_split(IDs,sections)testset={pid:by_id[pid]forpidinsplits[0]}trval={pid:by_id[pid]forpidinsplits[1]}# follow the ratio of datasetitems as possible.# naive heuristic: exchange the best item one by one.expected_count=int((len(self._extractor)-len(unlabeled))*split_ratio[0])testset_total=int(np.sum([len(v)forvintestset.values()]))self._rebalancing(testset,trval,expected_count,testset_total)else:testset=dict()trval=by_idby_splits=dict()forsubsetinself._subsets:by_splits[subset]=[]# 2. split 'test' into 'test-gallery' and 'test-query'if0<len(testset):test_snames=[]test_ratio=[]forsname,ratioinself._test_splits:test_snames.append(sname)test_ratio.append(float(ratio))self._split_by_attr(testset,test_snames,test_ratio,by_splits,merge_small_classes=False)# 3. 
split 'trval' into 'train' and 'val'trval_snames=["train","val"]trval_ratio=[]forsubsetintrval_snames:ifsubsetinid_snames:val=id_ratio[id_snames.index(subset)]else:val=0.0trval_ratio.append(val)trval_ratio=np.array(trval_ratio)total_ratio=np.sum(trval_ratio)iftotal_ratio<NEAR_ZERO:trval_splits=list(zip(["train","val"],trval_ratio))log.warning("Sum of ratios is expected to be positive, ""got %s, which is %s"%(trval_splits,total_ratio))else:trval_ratio/=total_ratio# normalizeself._split_by_attr(trval,trval_snames,trval_ratio,by_splits,merge_small_classes=False)# split unlabeled data into 'not-supported'.iflen(unlabeled)>0:self._subsets.add("not-supported")by_splits["not-supported"]=unlabeledself._set_parts(by_splits)@staticmethoddef_rebalancing(test,trval,expected_count,testset_total):diffs=dict()forid_test,items_testintest.items():count_test=len(items_test)forid_trval,items_trvalintrval.items():count_trval=len(items_trval)diff=count_trval-count_testifdiff==0:continue# exchange has no effectifdiffnotindiffs:diffs[diff]=[(id_test,id_trval)]else:diffs[diff].append((id_test,id_trval))iflen(diffs)==0:# nothing would be changed by exchangereturnexchanges=[]whileTrue:target_diff=expected_count-testset_total# find nearest diff.keys=np.array(list(diffs.keys()))idx=(np.abs(keys-target_diff)).argmin()nearest=keys[idx]ifabs(target_diff)<=abs(target_diff-nearest):breakchoice=np.random.choice(range(len(diffs[nearest])))id_test,id_trval=diffs[nearest][choice]testset_total+=nearestnew_diffs=dict()fordiff,IDsindiffs.items():new_list=[]forid1,id2inIDs:ifid1==id_testorid2==id_trval:continuenew_list.append((id1,id2))if0<len(new_list):new_diffs[diff]=new_listdiffs=new_diffsexchanges.append((id_test,id_trval))# exchangeforid_test,id_trvalinexchanges:test[id_trval]=trval.pop(id_trval)trval[id_test]=test.pop(id_test)class_InstanceSpecificSplit(_TaskSpecificSplit):""" Splits a dataset into subsets(train/val/test), using object annotations as a basis for splitting.|n Tries to produce an 
image split with the specified ratio, keeping the initial distribution of class objects.|n |n each image can have multiple object annotations - (instance bounding boxes, masks, polygons). Since an image shouldn't be included in multiple subsets at the same time, and image annotations shouldn't be split, in general, dataset annotations are unlikely to be split exactly in the specified ratio. |n This split tries to split dataset images as close as possible to the specified ratio, keeping the initial class distribution.|n |n Notes:|n |s|s- Each image is expected to have one or more annotations.|n |s|s- Only bbox annotations are considered in detection task.|n |s|s- Mask or Polygon annotations are considered in segmentation task.|n |n Example: split dataset so that each object class annotations were split|n in the specified ratio between subsets|n .. code-block:: |s|s%(prog)s -t detection --subset train:.5 --subset val:.2 --subset test:.3 |n |s|s%(prog)s -t segmentation --subset train:.5 --subset val:.2 --subset test:.3 """def__init__(self,dataset,splits,task,seed=None):""" Parameters ---------- dataset : Dataset splits : list A list of (subset(str), ratio(float)) The sum of ratios is expected to be 1. seed : int optional """super().__init__(dataset,splits,seed)iftask==SplitTask.detection.name:self.annotation_type=[AnnotationType.bbox]eliftask==SplitTask.segmentation.name:self.annotation_type=[AnnotationType.mask,AnnotationType.polygon]def_group_by_labels(self,dataset):by_labels=dict()unlabeled=[]foridx,iteminenumerate(dataset):instance_anns=[aforainitem.annotationsifa.typeinself.annotation_type]iflen(instance_anns)==0:unlabeled.append(idx)continueforinstance_annininstance_anns:label=getattr(instance_ann,"label",None)iflabelnotinby_labels:by_labels[label]=[(idx,instance_ann)]else:by_labels[label].append((idx,instance_ann))returnby_labels,unlabeleddef_split_dataset(self):np.random.seed(self._seed)subsets,sratio=self._snames,self._sratio# 1. 
group by bbox labelby_labels,unlabeled=self._group_by_labels(self._extractor)# 2. group by attributesrequired=self._get_required(sratio)by_combinations=list()for_,itemsinby_labels.items():by_attributes=self._group_by_attr(items)# merge groups which have too small samples.attr_combinations=list(by_attributes.keys())np.random.shuffle(attr_combinations)# add randomlesscluster=[]min_cluster=max(required,len(items)*0.01)# temp solutionforattrinattr_combinations:indice=by_attributes[attr]iflen(indice)>=min_cluster:by_combinations.append(indice)else:cluster.extend(indice)iflen(cluster)>=min_cluster:by_combinations.append(cluster)cluster=[]iflen(cluster)>0:by_combinations.append(cluster)cluster=[]total=len(self._extractor)# total number of GT samples per label-attr combinationsn_combs=[len(v)forvinby_combinations]# 3-1. initially count per-image GT samplescounts_all={}foridx_imginrange(total):ifidx_imgnotinunlabeled:counts_all[idx_img]=dict()foridx_comb,indiceinenumerate(by_combinations):foridx_imginindice:ifidx_combnotincounts_all[idx_img]:counts_all[idx_img][idx_comb]=1else:counts_all[idx_img][idx_comb]+=1by_splits=dict()forsnameinself._subsets:by_splits[sname]=[]target_ins=[]# target instance numbers to be splitforsname,ratioinzip(subsets,sratio):target_ins.append([sname,np.array(n_combs)*ratio])init_scores={}foridx_img,distributionsincounts_all.items():norm_sum=0.0foridx_comb,disindistributions.items():norm_sum+=dis/n_combs[idx_comb]init_scores[idx_img]=norm_sumby_scores=dict()foridx_img,scoreininit_scores.items():ifscorenotinby_scores:by_scores[score]=[idx_img]else:by_scores[score].append(idx_img)# functions for keep the # of annotations not exceed the target_ins numdefcompute_penalty(counts,n_combs):p=0foridx_comb,vincounts.items():ifn_combs[idx_comb]<=0:p+=1else:p+=max(0,(v/n_combs[idx_comb])-1.0)returnpdefupdate_nc(counts,n_combs):foridx_comb,vincounts.items():n_combs[idx_comb]=n_combs[idx_comb]-v# 3-2. 
assign each DatasetItem to a split, one by oneactual_ins=copy.deepcopy(target_ins)forscoreinsorted(by_scores.keys(),reverse=True):indice=by_scores[score]np.random.shuffle(indice)# add randomness for the same scoreforidxinindice:counts=counts_all[idx]# shuffling split order to add randomness# when two or more splits have the same penalty valuenp.random.shuffle(actual_ins)pp=[]forsname,ncinactual_ins:ifnp.sum(nc)<=0:# the split has enough instances,# stop adding more images to this splitpp.append(1e08)else:# compute penalty based on the number of GT samples# added in the splitpp.append(compute_penalty(counts,nc))# we push an image to a split with the minimum penaltymidx=np.argmin(pp)sname,nc=actual_ins[midx]by_splits[sname].append(idx)update_nc(counts,nc)# split unlabeled dataiflen(unlabeled)>0:self._split_unlabeled(unlabeled,by_splits)self._set_parts(by_splits)