datumaro.components.dataset_storage#

Classes

DatasetPatch(data, infos, categories, ...[, ...])

DatasetStorage(source[, infos, categories, ...])

StreamDatasetStorage(source[, infos, ...])

StreamSubset(source, subset)

class datumaro.components.dataset_storage.DatasetPatch(data: DatasetItemStorage, infos: Dict[str, Any], categories: Dict[AnnotationType, Categories], updated_items: Dict[Tuple[str, str], ItemStatus], updated_subsets: Dict[str, ItemStatus] | None = None)[source]#

Bases: object

class DatasetPatchWrapper(patch: DatasetPatch, parent: IDataset)[source]#

Bases: DatasetItemStorageDatasetView

subsets()[source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

property updated_subsets: Dict[str, ItemStatus]#
as_dataset(parent: IDataset) IDataset[source]#
class datumaro.components.dataset_storage.DatasetStorage(source: IDataset | DatasetItemStorage, infos: Dict[str, Any] | None = None, categories: Dict[AnnotationType, Categories] | None = None, media_type: Type[MediaElement] | None = None, ann_types: Set[AnnotationType] | None = None)[source]#

Bases: IDataset

is_cache_initialized() bool[source]#
init_cache() None[source]#
infos() Dict[str, Any][source]#

Returns meta-info of dataset.

define_infos(infos: Dict[str, Any])[source]#
categories() Dict[AnnotationType, Categories][source]#

Returns metainfo about dataset labels.

define_categories(categories: Dict[AnnotationType, Categories])[source]#
media_type() Type[MediaElement][source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

ann_types() Set[AnnotationType][source]#

Returns available task type from dataset annotation types.

put(item: DatasetItem) None[source]#
get(id: str, subset: str | None = None) DatasetItem | None[source]#

Provides random access to dataset items.

remove(id: str, subset: str | None = None) None[source]#
get_subset(name: str) IDataset[source]#
subsets() Dict[str, IDataset][source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

get_annotated_items() int[source]#
get_annotations() int[source]#
get_datasetitem_by_path(path: str) DatasetItem | None[source]#
transform(method: Type[Transform], *args, **kwargs) None[source]#
has_updated_items()[source]#
get_patch() DatasetPatch[source]#
flush_changes()[source]#
update(source: DatasetPatch | IDataset | Iterable[DatasetItem])[source]#
class datumaro.components.dataset_storage.AnnotationType(value)[source]#

Bases: IntEnum

An enumeration.

unknown = 0#
label = 1#
mask = 2#
points = 3#
polygon = 4#
polyline = 5#
bbox = 6#
caption = 7#
cuboid_3d = 8#
super_resolution_annotation = 9#
depth_annotation = 10#
ellipse = 11#
hash_key = 12#
feature_vector = 13#
tabular = 14#
rotated_bbox = 15#
cuboid_2d = 16#
exception datumaro.components.dataset_storage.CategoriesRedefinedError[source]#

Bases: DatasetError

exception datumaro.components.dataset_storage.ConflictingCategoriesError(msg=None, *, sources=None)[source]#

Bases: DatasetMergeError

sources#
class datumaro.components.dataset_storage.DatasetBase(*, length: int | None = None, subsets: Sequence[str] | None = None, media_type: Type[MediaElement] = <class 'datumaro.components.media.Image'>, ann_types: List[AnnotationType] | None = None, ctx: ImportContext | None = None)[source]#

Bases: _DatasetBase, CliPlugin

A base class for user-defined and built-in extractors. It should be used in cases where SubsetBase is not enough, or where its use causes problems with performance, implementation, etc.

media_type()[source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

ann_types()[source]#

Returns available task type from dataset annotation types.

exception datumaro.components.dataset_storage.DatasetInfosRedefinedError[source]#

Bases: DatasetError

class datumaro.components.dataset_storage.DatasetItem(id: str, *, subset: str | None = None, media: str | MediaElement | None = None, annotations: List[Annotation] | None = None, attributes: Dict[str, Any] | None = None)[source]#

Bases: object

id: str#
subset: str#
media: MediaElement | None#
annotations: Annotations#
attributes: Dict[str, Any]#
wrap(**kwargs)[source]#
media_as(t: Type[T]) T[source]#
class datumaro.components.dataset_storage.DatasetItemStorage[source]#

Bases: object

is_empty() bool[source]#
put(item: DatasetItem) bool[source]#
get(id: str | DatasetItem, subset: str | None = None, dummy: Any | None = None) DatasetItem | None[source]#
remove(id: str | DatasetItem, subset: str | None = None) bool[source]#
get_subset(name)[source]#
subsets()[source]#
get_annotated_items()[source]#
get_datasetitem_by_path(path)[source]#
get_annotations()[source]#
class datumaro.components.dataset_storage.DatasetItemStorageDatasetView(parent: DatasetItemStorage, infos: Dict[str, Any], categories: Dict[AnnotationType, Categories], media_type: Type[MediaElement] | None, ann_types: Set[AnnotationType] | None)[source]#

Bases: IDataset

class Subset(parent: DatasetItemStorageDatasetView, name: str)[source]#

Bases: IDataset

put(item)[source]#
get(id, subset=None)[source]#

Provides random access to dataset items.

remove(id, subset=None)[source]#
get_subset(name)[source]#
subsets()[source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

infos()[source]#

Returns meta-info of dataset.

categories()[source]#

Returns metainfo about dataset labels.

media_type()[source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

ann_types()[source]#

Returns available task type from dataset annotation types.

infos()[source]#

Returns meta-info of dataset.

categories()[source]#

Returns metainfo about dataset labels.

get_subset(name)[source]#
subsets()[source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

get(id, subset=None)[source]#

Provides random access to dataset items.

media_type()[source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

ann_types()[source]#

Returns available task type from dataset annotation types.

class datumaro.components.dataset_storage.IDataset[source]#

Bases: object

subsets() Dict[str, IDataset][source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

get_subset(name) IDataset[source]#
infos() Dict[str, Any][source]#

Returns meta-info of dataset.

categories() Dict[AnnotationType, Categories][source]#

Returns metainfo about dataset labels.

get(id: str, subset: str | None = None) DatasetItem | None[source]#

Provides random access to dataset items.

media_type() Type[MediaElement][source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

ann_types() List[AnnotationType][source]#

Returns available task type from dataset annotation types.

property is_stream: bool#

Boolean indicating whether the dataset is a stream.

If the dataset is a stream, dataset items are generated on demand from its iterator.

class datumaro.components.dataset_storage.ItemStatus(value)[source]#

Bases: Enum

An enumeration.

added = 1#
modified = 2#
removed = 3#
class datumaro.components.dataset_storage.ItemTransform(extractor: IDataset)[source]#

Bases: Transform

transform_item(item: DatasetItem) DatasetItem | None[source]#

Returns a modified copy of the input item.

Avoid changing and returning the input item, because it can lead to unexpected problems. Use wrap_item() or item.wrap() to simplify copying.

class datumaro.components.dataset_storage.LabelCategories(items: List[str] = _Nothing.NOTHING, label_groups: List[LabelGroup] = _Nothing.NOTHING, *, attributes: Set[str] = _Nothing.NOTHING)[source]#

Bases: Categories

Method generated by attrs for class LabelCategories.

class Category(name, parent: str = '', attributes: Set[str] = _Nothing.NOTHING)[source]#

Bases: object

Method generated by attrs for class LabelCategories.Category.

name: str#
parent: str#
attributes: Set[str]#
class LabelGroup(name, labels: List[str] = [], group_type: GroupType = GroupType.EXCLUSIVE)[source]#

Bases: object

Method generated by attrs for class LabelCategories.LabelGroup.

name: str#
labels: List[str]#
group_type: GroupType#
items: List[str]#
label_groups: List[LabelGroup]#
classmethod from_iterable(iterable: Iterable[str | Tuple[str] | Tuple[str, str] | Tuple[str, str, List[str]]]) LabelCategories[source]#

Creates a LabelCategories from an iterable.

Parameters:

iterable

This iterable object can be:

  • a list of str - will be interpreted as a list of Category names

  • a list of positional arguments - will generate Categories with these arguments

Returns: a LabelCategories object

add(name: str, parent: str | None = None, attributes: Set[str] | None = None) int[source]#
add_label_group(name: str, labels: List[str], group_type: GroupType) int[source]#
find(name: str) Tuple[int | None, Category | None][source]#
class datumaro.components.dataset_storage.MediaElement(crypter: Crypter = <datumaro.components.crypter.NullCrypter object>, *args, **kwargs)[source]#

Bases: Generic[AnyData]

as_dict() Dict[str, Any][source]#
from_self(**kwargs)[source]#
property is_encrypted: bool#
set_crypter(crypter: Crypter)[source]#
property type: MediaType#
property data: AnyData | None#
property has_data: bool#
property bytes: bytes | None#
save(fp: str | IOBase, crypter: Crypter = <datumaro.components.crypter.NullCrypter object>)[source]#
exception datumaro.components.dataset_storage.MediaTypeError[source]#

Bases: DatumaroError

exception datumaro.components.dataset_storage.NotAvailableError[source]#

Bases: DatumaroError

exception datumaro.components.dataset_storage.RepeatedItemError(item_id)[source]#

Bases: DatasetError

Method generated by attrs for class RepeatedItemError.

item_id#
class datumaro.components.dataset_storage.StreamDatasetStorage(source: IDataset, infos: Dict[str, Any] | None = None, categories: Dict[AnnotationType, Categories] | None = None, media_type: Type[MediaElement] | None = None, ann_types: Set[AnnotationType] | None = None)[source]#

Bases: DatasetStorage

is_cache_initialized() bool[source]#
init_cache() None[source]#
property stacked_transform: IDataset#
put(item: DatasetItem) None[source]#
get(id: str, subset: str | None = None) DatasetItem | None[source]#

Provides random access to dataset items.

remove(id: str, subset: str | None = None) None[source]#
get_subset(name: str) IDataset[source]#
property subset_names#
subsets() Dict[str, IDataset][source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

transform(method: Type[Transform], *args, **kwargs) None[source]#
get_annotated_items() int[source]#
get_annotations() int[source]#
get_datasetitem_by_path(path: str) DatasetItem | None[source]#
get_patch()[source]#
flush_changes()[source]#
update(source: DatasetPatch | IDataset | Iterable[DatasetItem])[source]#
infos() Dict[str, Any][source]#

Returns meta-info of dataset.

categories() Dict[AnnotationType, Categories][source]#

Returns metainfo about dataset labels.

property is_stream: bool#

Boolean indicating whether the dataset is a stream.

If the dataset is a stream, dataset items are generated on demand from its iterator.

class datumaro.components.dataset_storage.StreamSubset(source: IDataset, subset: str)[source]#

Bases: IDataset

subsets() Dict[str, IDataset][source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

get_subset(name) IDataset[source]#
infos() Dict[str, Any][source]#

Returns meta-info of dataset.

categories() Dict[AnnotationType, Categories][source]#

Returns metainfo about dataset labels.

get(id: str, subset: str | None = None) DatasetItem | None[source]#

Provides random access to dataset items.

media_type() Type[MediaElement][source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

ann_types() Set[AnnotationType][source]#

Returns available task type from dataset annotation types.

property is_stream: bool#

Boolean indicating whether the dataset is a stream.

If the dataset is a stream, dataset items are generated on demand from its iterator.

class datumaro.components.dataset_storage.Transform(extractor: IDataset)[source]#

Bases: DatasetBase, CliPlugin

A base class for dataset transformations that change dataset items or their annotations.

static wrap_item(item, **kwargs)[source]#
categories()[source]#

Returns metainfo about dataset labels.

subsets()[source]#

Enumerates subsets in the dataset. Each subset can be a dataset itself.

media_type()[source]#

Returns media type of the dataset items.

All items are expected to have the same media type. The media type is expected to be constant and known immediately after object construction (i.e. it does not require iterating over the dataset).

infos()[source]#

Returns meta-info of dataset.

datumaro.components.dataset_storage.is_method_redefined(method_name, base_class, target) bool[source]#