# Source code for datumaro.components.validator
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
from enum import Enum, auto
from typing import Dict, List
from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.dataset import IDataset
[docs]
class Severity(Enum):
    """
    Severity levels, listed in declaration order: info, warning, error.

    Values are assigned by ``auto()``, so only the member names and their
    relative order are meaningful.
    """

    # NOTE(review): these member names coincide with the "severity" strings
    # that Validator.validate tallies in its summary - confirm that report
    # objects serialize the member name.
    info = auto()
    warning = auto()
    error = auto()
[docs]
class TaskType(Enum):
    """
    Task types, listed in declaration order: classification, detection,
    segmentation, tabular.

    Values are assigned by ``auto()``, so only the member names and their
    relative order are meaningful.
    """

    classification = auto()
    detection = auto()
    segmentation = auto()
    tabular = auto()
[docs]
class Validator(CliPlugin):
    """
    Base class for dataset validators.

    ``validate`` drives the workflow: it computes statistics, asks for
    reports built from those statistics, and summarizes the reports by
    severity. Subclasses must implement ``compute_statistics`` and
    ``generate_reports``; the versions here only raise
    ``NotImplementedError``.
    """

    def validate(self, dataset: IDataset) -> Dict:
        """
        Returns the validation results of a dataset based on task type.

        Args:
            dataset (IDataset): Dataset to be validated

        Raises:
            TypeError: if ``dataset`` is not an ``IDataset`` instance

        Returns:
            validation_results (dict):
                Dict with validation statistics, reports and summary, stored
                under the keys ``"statistics"``, ``"validation_reports"``
                (a list of report dicts) and ``"summary"`` (counts under
                ``"errors"``, ``"warnings"`` and ``"infos"``).
        """
        if not isinstance(dataset, IDataset):
            raise TypeError("Invalid dataset type '%s'" % type(dataset))

        # generate statistics
        stats = self.compute_statistics(dataset)

        # generate validation reports and summary
        reports = [report.to_dict() for report in self.generate_reports(stats)]

        # Each comparison yields a bool, so sum() counts the matching reports.
        summary = {
            "errors": sum(r["severity"] == "error" for r in reports),
            "warnings": sum(r["severity"] == "warning" for r in reports),
            "infos": sum(r["severity"] == "info" for r in reports),
        }

        return {
            "statistics": stats,
            "validation_reports": reports,
            "summary": summary,
        }

    def compute_statistics(self, dataset: IDataset) -> Dict:
        """
        Computes statistics of the dataset based on task type.

        Args:
            dataset (IDataset): a dataset to be validated

        Raises:
            NotImplementedError: always; must be implemented in a subclass

        Returns:
            stats (dict): A dict object containing statistics of the dataset.
        """
        raise NotImplementedError("Must be implemented in a subclass")

    def generate_reports(self, stats: Dict) -> List[Dict]:
        """
        Builds validation reports from the computed statistics.

        ``validate`` calls ``to_dict()`` on every returned item and reads the
        ``"severity"`` key of the result, comparing it against ``"error"``,
        ``"warning"`` and ``"info"``.

        NOTE(review): the ``List[Dict]`` annotation does not match that usage
        (plain dicts have no ``to_dict()``); the items are presumably report
        objects - confirm and tighten the annotation.

        Args:
            stats (dict): statistics as returned by ``compute_statistics``

        Raises:
            NotImplementedError: always; must be implemented in a subclass

        Returns:
            reports (list): validation reports, one item per finding.
        """
        raise NotImplementedError("Must be implemented in a subclass")