Source code for datumaro.util.meta_file_util

# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os
import os.path as osp
from collections import OrderedDict

import numpy as np

from datumaro.components.annotation import AnnotationType, HashKey
from datumaro.util import dump_json_file, find, parse_json_file

# File name of the dataset meta file; get_meta_file() joins it onto a dataset path.
DATASET_META_FILE = "dataset_meta.json"
# File name of the hash-key file; get_hashkey_file() places it inside
# DATASET_HASHKEY_FOLDER.
DATASET_HASHKEY_FILE = "hash_keys.json"
# Name of the sub-folder (relative to a dataset path) that holds the hash-key file.
DATASET_HASHKEY_FOLDER = "hash_key_meta"



[docs]
def is_meta_file(path):
    """Tell whether ``path`` names a file with a ``.json`` extension.

    Only the extension of the final path component is inspected, and the
    comparison is case-sensitive. The file system is not touched, so the
    path does not need to exist.
    """
    file_name = osp.basename(path)
    _, extension = osp.splitext(file_name)
    return extension == ".json"




[docs]
def has_meta_file(path):
    """Return True when the dataset meta file exists as a regular file under ``path``."""
    meta_path = get_meta_file(path)
    return osp.isfile(meta_path)




[docs]
def has_hashkey_file(path):
    """Return True when the hash-key file exists as a regular file under ``path``."""
    hashkey_path = get_hashkey_file(path)
    return osp.isfile(hashkey_path)




[docs]
def get_meta_file(path):
    """Build the path of the dataset meta file located directly inside ``path``.

    The result is purely computed; nothing is checked or created on disk.
    """
    meta_path = osp.join(path, DATASET_META_FILE)
    return meta_path




[docs]
def get_hashkey_file(path):
    """Build the path of the hash-key file for the dataset directory ``path``.

    The file lives in the ``DATASET_HASHKEY_FOLDER`` sub-folder of ``path``.
    The result is purely computed; nothing is checked or created on disk.
    """
    return osp.join(path, DATASET_HASHKEY_FOLDER, DATASET_HASHKEY_FILE)




[docs]
def parse_meta_file(path):
    """Read a dataset meta file and return its labels as an ordered mapping.

    Args:
        path: Either the meta file itself or a directory; for a directory the
            file returned by ``get_meta_file(path)`` is read.

    Returns:
        An ``OrderedDict`` mapping label name to a color tuple, or to ``None``
        when no color is available. Names from the ``"labels"`` list come
        first, followed by names from ``"label_map"`` that were not already
        present.

    Raises:
        IndexError: If a ``"label_map"`` index has no matching entry in
            ``"segmentation_colors"`` while that list holds at least one
            truthy entry.
    """
    meta_path = get_meta_file(path) if osp.isdir(path) else path
    meta = parse_json_file(meta_path)

    # Every plain label starts out without a color.
    result = OrderedDict.fromkeys(meta.get("labels", []))

    palette = meta.get("segmentation_colors", [])
    for index, name in meta.get("label_map", {}).items():
        color = None
        # "label_map" keys are indices into the color list.
        if any(palette):
            entry = palette[int(index)]
            if entry is not None:
                color = tuple(entry)
        result[name] = color

    return result




[docs]
def save_meta_file(path, categories):
    """Write the label (and, if present, mask color) categories to a meta file.

    Args:
        path: Target file, or a directory; for a directory the file returned
            by ``get_meta_file(path)`` is written.
        categories: Mapping indexed by ``AnnotationType``. The
            ``AnnotationType.label`` entry is required and is iterated for
            objects with a ``name`` attribute. The ``AnnotationType.mask``
            entry is optional and must expose a ``colormap`` mapping of
            label index to color.

    The JSON document always contains ``"labels"``. When mask categories are
    present it also contains ``"label_map"`` and ``"segmentation_colors"``,
    built only from colormap entries with a truthy color, plus
    ``"background_label"`` when ``find`` yields a colormap entry whose color
    equals ``(0, 0, 0)``.
    """
    label_names = [entry.name for entry in categories[AnnotationType.label]]
    dataset_meta = {"labels": label_names}

    if categories.get(AnnotationType.mask):
        colormap = categories[AnnotationType.mask].colormap

        index_to_name = {}
        rgb_values = []
        for index, color in colormap.items():
            if not color:
                continue
            # Store plain ints so the color is JSON-serializable.
            rgb_values.append([int(color[channel]) for channel in range(3)])
            index_to_name[str(index)] = label_names[index]
        dataset_meta["label_map"] = index_to_name
        dataset_meta["segmentation_colors"] = rgb_values

        background = find(colormap.items(), lambda pair: pair[1] == (0, 0, 0))
        if background is not None:
            dataset_meta["background_label"] = str(background[0])

    target = get_meta_file(path) if osp.isdir(path) else path
    dump_json_file(target, dataset_meta, indent=True)




[docs]
def parse_hashkey_file(path):
    """Read the stored hash keys from a hash-key file.

    Args:
        path: Either the hash-key file itself or a directory; for a directory
            the file returned by ``get_hashkey_file(path)`` is read.

    Returns:
        ``None`` when the resolved file does not exist. Otherwise an
        ``OrderedDict`` with the contents of the file's ``"hashkey"`` object
        (empty when that key is absent).
    """
    hashkey_path = get_hashkey_file(path) if osp.isdir(path) else path
    if not osp.exists(hashkey_path):
        return None

    stored = parse_json_file(hashkey_path).get("hashkey", {})
    return OrderedDict(stored.items())




[docs]
def save_hashkey_file(path, item_list):
    """Merge the hash keys of ``item_list`` into the hash-key file under ``path``.

    Args:
        path: Dataset directory. The hash-key file is written to
            ``get_hashkey_file(path)``; the containing folder is created
            when missing.
        item_list: Iterable of items exposing ``id``, ``subset`` and
            ``annotations``. The first ``HashKey`` annotation of each item
            supplies the hash that is stored under ``"<subset>/<id>"``.

    Entries already present in the file are kept and overwritten only for
    items that appear in ``item_list``. Items without a ``HashKey``
    annotation are skipped.
    """
    # Resolve the target unconditionally. Deriving it only when ``path`` was
    # already a directory left ``meta_file`` unbound for a fresh ``path``,
    # even though the folder is created right here.
    hashkey_folder_path = osp.join(path, DATASET_HASHKEY_FOLDER)
    os.makedirs(hashkey_folder_path, exist_ok=True)
    meta_file = get_hashkey_file(path)

    # Start from whatever is already stored so the file is updated, not replaced.
    hashkey_dict = parse_hashkey_file(path) or {}

    for item in item_list:
        for annotation in item.annotations:
            if isinstance(annotation, HashKey):
                hashkey = annotation.hash_key
                break
        else:
            # No HashKey on this item: skip it rather than fail on an unbound
            # name or silently reuse the previous item's hash.
            continue
        hashkey_dict[item.subset + "/" + item.id] = hashkey.tolist()

    dump_json_file(meta_file, {"hashkey": hashkey_dict}, indent=True)




[docs]
def load_hash_key(path, dataset):
    """Attach stored hash keys to every item of ``dataset``.

    Args:
        path: Dataset directory expected to contain the hash-key file.
        dataset: Iterable of items exposing ``id``, ``subset`` and a mutable
            ``annotations`` list.

    Returns:
        The same ``dataset`` object. It is returned untouched when ``path``
        is not a directory or has no hash-key file; otherwise each item gets
        a ``HashKey`` annotation built from the value stored under
        ``"<subset>/<id>"``, converted to a ``uint8`` array.

    Raises:
        KeyError: If an item has no entry in the hash-key file.
    """
    if os.path.isdir(path) and has_hashkey_file(path):
        stored_keys = parse_hashkey_file(path)
        for entry in dataset:
            lookup = "/".join((entry.subset, entry.id))
            key_array = np.asarray(stored_keys[lookup], dtype=np.uint8)
            entry.annotations.append(HashKey(hash_key=key_array))
    return dataset