Source code for datumaro.util.meta_file_util
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
import os
import os.path as osp
from collections import OrderedDict
import numpy as np
from datumaro.components.annotation import AnnotationType, HashKey
from datumaro.util import dump_json_file, find, parse_json_file
# File name for dataset-level metadata.
# NOTE(review): not referenced by the hash key helpers in this module - presumably used elsewhere; confirm.
DATASET_META_FILE = "dataset_meta.json"
# Name of the JSON file that stores per-item hash keys.
DATASET_HASHKEY_FILE = "hash_keys.json"
# Name of the sub-folder (inside a dataset directory) that contains the hash key file.
DATASET_HASHKEY_FOLDER = "hash_key_meta"
[docs]
def has_hashkey_file(path):
    """Tell whether a hash key file is present for the dataset at ``path``.

    Args:
        path: Dataset directory that is expected to contain the hash key folder.

    Returns:
        ``True`` when the location computed by :func:`get_hashkey_file` is an
        existing regular file, ``False`` otherwise.
    """
    hashkey_file = get_hashkey_file(path)
    return osp.isfile(hashkey_file)
[docs]
def get_hashkey_file(path):
    """Build the location of the hash key file for the dataset at ``path``.

    Args:
        path: Dataset directory.

    Returns:
        ``<path>/<DATASET_HASHKEY_FOLDER>/<DATASET_HASHKEY_FILE>``. The path is
        only composed here; nothing is checked or created on disk.
    """
    return osp.join(path, DATASET_HASHKEY_FOLDER, DATASET_HASHKEY_FILE)
[docs]
def parse_hashkey_file(path):
    """Read the stored hash keys from a hash key file.

    Args:
        path: Either a dataset directory (the hash key file is then looked up
            via :func:`get_hashkey_file`) or the hash key file itself.

    Returns:
        ``None`` when the resolved file does not exist; otherwise an
        ``OrderedDict`` holding the entries found under the ``"hashkey"`` key
        of the parsed JSON, in file order (empty when that key is absent).
    """
    meta_file = get_hashkey_file(path) if osp.isdir(path) else path
    if not osp.exists(meta_file):
        return None

    stored_entries = parse_json_file(meta_file).get("hashkey", {})
    return OrderedDict(stored_entries.items())
[docs]
def save_hashkey_file(path, item_list):
    """Store the hash keys of ``item_list`` in the hash key file under ``path``.

    Entries already present in the file are kept and updated: each item is
    stored under the key ``"<subset>/<id>"`` with its hash key converted to a
    plain list via ``tolist()``.

    Args:
        path: Dataset directory. The hash key folder (and ``path`` itself, if
            missing) is created when it does not exist yet.
        item_list: Iterable of items exposing ``id``, ``subset`` and
            ``annotations``. The first ``HashKey`` annotation of each item is
            used; items without a ``HashKey`` annotation are skipped.
    """
    # Always resolve the target file, so that it is defined even when ``path``
    # is not an existing directory yet.
    meta_file = get_hashkey_file(path)
    os.makedirs(osp.dirname(meta_file), exist_ok=True)

    # Start from what is already stored so repeated saves accumulate entries.
    hashkey_dict = parse_hashkey_file(path) or {}

    for item in item_list:
        hashkey = next(
            (ann.hash_key for ann in item.annotations if isinstance(ann, HashKey)),
            None,
        )
        if hashkey is None:
            # Without this guard the previous item's hash would be written
            # under this item's key (or a NameError raised for the first item).
            continue
        hashkey_dict[item.subset + "/" + item.id] = hashkey.tolist()

    dump_json_file(meta_file, {"hashkey": hashkey_dict}, indent=True)
[docs]
def load_hash_key(path, dataset):
    """Attach stored hash keys to the items of ``dataset``.

    Args:
        path: Dataset directory expected to contain the hash key file.
        dataset: Iterable of items exposing ``subset``, ``id`` and a mutable
            ``annotations`` list.

    Returns:
        The same ``dataset`` object. It is returned untouched when ``path`` is
        not a directory or has no hash key file; otherwise every item gets a
        ``HashKey`` annotation (``uint8`` array) appended.

    Raises:
        KeyError: If the hash key file has no ``"<subset>/<id>"`` entry for an
            item of ``dataset``.
    """
    if not (os.path.isdir(path) and has_hashkey_file(path)):
        return dataset

    stored_hashes = parse_hashkey_file(path)
    for entry in dataset:
        raw_hash = stored_hashes[entry.subset + "/" + entry.id]
        hash_array = np.asarray(raw_hash, dtype=np.uint8)
        entry.annotations.append(HashKey(hash_key=hash_array))

    return dataset