Source code for otx.algorithms.visual_prompting.tasks.openvino

"""OpenVINO Visual Prompting Task."""

# Copyright (C) 2023 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions
# and limitations under the License.

import io
import json
import os
import pickle  # nosec B403
import random
import tempfile
import time
from collections import defaultdict
from copy import deepcopy
from itertools import product
from pathlib import Path
from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Type, Union
from zipfile import ZipFile

import attr
import cv2
import nncf
import numpy as np
import openvino.runtime as ov
from addict import Dict as ADDict
from nncf.common.quantization.structs import QuantizationPreset
from openvino.model_api.adapters import OpenvinoAdapter, create_core
from openvino.model_api.models import Model

from otx.algorithms.common.utils import get_default_async_reqs_num, read_py_config
from import check_if_quantized
from otx.algorithms.visual_prompting.adapters.openvino import model_wrappers
from otx.algorithms.visual_prompting.adapters.pytorch_lightning.datasets.dataset import (
from otx.algorithms.visual_prompting.configs.base import VisualPromptingBaseConfig
from otx.api.entities.annotation import Annotation
from otx.api.entities.dataset_item import DatasetItemEntity
from otx.api.entities.datasets import DatasetEntity
from otx.api.entities.inference_parameters import (
from otx.api.entities.label_schema import LabelSchemaEntity
from otx.api.entities.model import (
from otx.api.entities.model_template import TaskType
from otx.api.entities.optimization_parameters import OptimizationParameters
from otx.api.entities.resultset import ResultSetEntity
from otx.api.entities.subset import Subset
from otx.api.entities.task_environment import TaskEnvironment
from otx.api.serialization.label_mapper import LabelSchemaMapper, label_schema_to_bytes
from otx.api.usecases.evaluation.metrics_helper import MetricsHelper
from otx.api.usecases.exportable_code import demo
from otx.api.usecases.exportable_code.inference.inference import IInferencer
from otx.api.usecases.exportable_code.prediction_to_annotation_converter import (
from otx.api.usecases.tasks.interfaces.deployment_interface import IDeploymentTask
from otx.api.usecases.tasks.interfaces.evaluate_interface import IEvaluationTask
from otx.api.usecases.tasks.interfaces.inference_interface import IInferenceTask
from otx.api.usecases.tasks.interfaces.optimization_interface import (
from otx.utils.logger import get_logger

logger = get_logger()

[docs] class OpenVINOVisualPromptingInferencer(IInferencer): """Inferencer implementation for Visual Prompting using OpenVINO backend. This inferencer has two models, image encoder and decoder. Args: hparams (VisualPromptingBaseConfig): Hyper parameters that the model should use. label_schema (LabelSchemaEntity): LabelSchemaEntity that was used during model training. model_files (Dict[str, Union[str, Path, bytes]]): Path or bytes to model to load, `.xml`, `.bin` or `.onnx` file. weight_files (Dict[str, Union[str, Path, bytes, None]], optional): Path or bytes to weights to load, `.xml`, `.bin` or `.onnx` file. Defaults to None. device (str): Device to run inference on, such as CPU, GPU or MYRIAD. Defaults to "CPU". num_requests (int) : Maximum number of requests that the inferencer can make. Good value is the number of available cores. Defaults to 1. """ def __init__( self, hparams: VisualPromptingBaseConfig, label_schema: LabelSchemaEntity, model_files: Dict[str, Union[str, Path, bytes]], weight_files: Optional[Dict[str, Union[str, Path, bytes, None]]] = {}, device: str = "CPU", num_requests: int = 1, ): assert all(module in model_files for module in ["image_encoder", "decoder"]) self.model = {} model_parameters = {"decoder": {"input_layouts": "image_embeddings:NCHW"}} self.configuration = { "image_encoder": { **attr.asdict( hparams.postprocessing, filter=lambda attr, value: in ["image_size", "resize_type", "downsizing"], ) }, "decoder": { **attr.asdict( hparams.postprocessing, filter=lambda attr, value: not in [ "header", "description", "type", "visible_in_ui", "class_name", "downsizing", ], ) }, } for name in ["image_encoder", "decoder"]: model_adapter = OpenvinoAdapter( core=create_core(), model=model_files.get(name), weights_path=weight_files.get(name, None), model_parameters=model_parameters.get(name, {}), device=device, max_num_requests=num_requests, plugin_config={"PERFORMANCE_HINT": "THROUGHPUT"}, ) self.model[name] = Model.create_model(model_adapter, name, self.configuration.get(name, {}), preload=True) self.converter = VisualPromptingToAnnotationConverter() self.labels = label_schema.get_labels(include_empty=False) self.transform = get_transform() # TODO (sungchul): insert args
[docs] def pre_process( self, dataset_item: DatasetItemEntity, extra_processing: bool = False, use_bbox: bool = False, use_point: bool = False, ) -> Tuple[Dict[str, Any], Dict[str, Any], List[Dict[str, Any]]]: """Pre-process function of OpenVINO Visual Prompting Inferencer for image encoder.""" if use_bbox and use_point: logger.warning("If both use_bbox and use_point are set, bboxes and points will be generated randomly.") prob = 1.0 if not use_point else 0.0 if not use_bbox and use_point else 0.5 images, meta = self.model["image_encoder"].preprocess(dataset_item.numpy, extra_processing) prompts = OTXVisualPromptingDataset.get_prompts(dataset_item, self.labels, prob=prob) prompts = self.model["decoder"].preprocess(prompts, meta) return images, meta, prompts # type: ignore
[docs] def post_process( self, prediction: Dict[str, np.ndarray], metadata: Dict[str, Any] ) -> Tuple[List[Annotation], Any, Any]: """Post-process function of OpenVINO Visual Prompting Inferencer.""" hard_prediction, soft_prediction = self.model["decoder"].postprocess(prediction, metadata) annotation = self.converter.convert_to_annotation(hard_prediction, metadata) return annotation, hard_prediction, soft_prediction
[docs] def predict(self, dataset_item: DatasetItemEntity) -> List[Annotation]: # type: ignore """Perform a prediction for a given input image.""" # forward image encoder images, meta, prompts = self.pre_process(dataset_item) image_embeddings = self.forward_image_encoder(images) annotations: List[Annotation] = [] hard_predictions: List[np.ndarray] = [] soft_predictions: List[np.ndarray] = [] for prompt in prompts: label = prompt.pop("label") prompt.update(image_embeddings) # forward decoder to get predicted mask prediction = self.forward_decoder(prompt) prediction["scores"] = prediction["iou_predictions"] metadata = {"label": label} # set annotation for eval annotation, hard_prediction, soft_prediction = self.post_process(prediction, metadata) annotations.extend(annotation) hard_predictions.append(hard_prediction) soft_predictions.append(soft_prediction) return annotations
[docs] def forward_image_encoder(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: """Forward function of OpenVINO Visual Prompting Inferencer.""" return self.model["image_encoder"].infer_sync(inputs)
[docs] def forward_decoder(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: """Forward function of OpenVINO Visual Prompting Inferencer.""" return self.model["decoder"].infer_sync(inputs)
[docs] def await_all(self) -> None: """Await all running infer requests if any.""" self.model["image_encoder"].await_all() self.model["decoder"].await_all()
[docs] class OpenVINOZeroShotVisualPromptingInferencer(OpenVINOVisualPromptingInferencer): """Inferencer implementation for Zero-shot Visual Prompting using OpenVINO backend. This inferencer has two models, image encoder and decoder. Args: hparams (VisualPromptingBaseConfig): Hyper parameters that the model should use. label_schema (LabelSchemaEntity): LabelSchemaEntity that was used during model training. model_files (Dict[str, Union[str, Path, bytes]]): Path or bytes to model to load, `.xml`, `.bin` or `.onnx` file. weight_files (Dict[str, Union[str, Path, bytes, None]], optional): Path or bytes to weights to load, `.xml`, `.bin` or `.onnx` file. Defaults to None. device (str): Device to run inference on, such as CPU, GPU or MYRIAD. Defaults to "CPU". num_requests (int) : Maximum number of requests that the inferencer can make. Good value is the number of available cores. Defaults to 1. """ def __init__( self, hparams: VisualPromptingBaseConfig, label_schema: LabelSchemaEntity, model_files: Dict[str, Union[str, Path, bytes]], weight_files: Optional[Dict[str, Union[str, Path, bytes, None]]] = {}, device: str = "CPU", num_requests: int = 1, ): super().__init__(hparams, label_schema, model_files, weight_files, device, num_requests) self.point_labels_box = np.array([[2, 3]], dtype=np.float32) self.has_mask_inputs = [np.array([[0.0]]), np.array([[1.0]])] self.reference_feats: Optional[np.ndarray] = None self.used_indices: Optional[np.ndarray] = None
[docs] def pre_process_image_encoder( self, inputs: np.ndarray, extra_processing: bool = False ) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]: """Pre-process function of OpenVINO Zero-shot Visual Prompting Inferencer for image encoder.""" return self.model["image_encoder"].preprocess(inputs, extra_processing)
[docs] def learn( self, dataset_item: DatasetItemEntity, reset_feat: bool = False, use_bbox: bool = False, use_point: bool = False, path_reference_info: str = "vpm_zsl_reference_infos/{}/reference_info.pickle", default_threshold_reference: float = 0.3, ) -> Tuple[Dict[str, np.ndarray], np.ndarray]: """Learn for reference features.""" ref_masks: np.ndarray if reset_feat or self.reference_feats is None: self.initialize_reference_info() images, meta, prompts = self.pre_process(dataset_item, use_bbox, use_point) largest_label: int = max([int(p["label"].id) for p in prompts]) self.expand_reference_info(largest_label) image_embeddings = self.forward_image_encoder(images) processed_embedding = image_embeddings["image_embeddings"].squeeze().transpose(1, 2, 0) original_size = meta["original_shape"][:2] ref_masks = np.zeros((largest_label + 1, *map(int, original_size)), dtype=np.uint8) for prompt in prompts: if "point_coords" in prompt: # bboxes and points label = prompt.pop("label") original_size = prompt.get("orig_size") prompt.update(image_embeddings) prediction = self.forward_decoder(prompt, original_size, is_cascade=False) ref_mask = prediction["upscaled_masks"] else: logger.warning("annotation and polygon will be supported.") continue ref_masks[int(] += ref_mask ref_masks = np.clip(ref_masks, 0, 1) for label in range(largest_label + 1): ref_mask = ref_masks[label] if ref_mask.sum() == 0: # empty prediction continue ref_feat = None cur_default_threshold_reference = deepcopy(default_threshold_reference) while ref_feat is None:"[*] default_threshold_reference : {cur_default_threshold_reference:.4f}") ref_feat = self._generate_masked_features( processed_embedding, ref_masks[label], cur_default_threshold_reference ) cur_default_threshold_reference -= 0.05 self.reference_feats[label] = ref_feat self.used_indices = np.concatenate((self.used_indices, np.array([label]))) reference_info = {"reference_feats": self.reference_feats, "used_indices": self.used_indices} path_reference_info = path_reference_info.format(time.strftime("%Y%m%d-%H%M%S"))"Saved reference info at {path_reference_info}.") pickle.dump(reference_info, open(path_reference_info, "wb")) return reference_info, ref_masks
[docs] def infer( self, images: np.ndarray, reference_feats: np.ndarray, used_indices: np.ndarray, is_cascade: bool = False, threshold: float = 0.0, num_bg_points: int = 1, default_threshold_target: float = 0.65, ) -> Tuple[List[Any], DefaultDict[Any, Any], DefaultDict[Any, Any]]: """Perform a prediction for a given input image.""" points_score: np.ndarray # forward image encoder images, meta = self.pre_process_image_encoder(images) original_shape = np.asarray(meta["original_shape"][:2], dtype=np.int64) image_embeddings = self.forward_image_encoder(images) # get point candidates total_points_scores, total_bg_coords = self._get_prompt_candidates( image_embeddings=image_embeddings["image_embeddings"], reference_feats=reference_feats, used_indices=used_indices, original_shape=original_shape, threshold=threshold, num_bg_points=num_bg_points, default_threshold_target=default_threshold_target, image_size=self.model["image_encoder"].image_size, downsizing=self.model["image_encoder"].downsizing, ) annotations: DefaultDict = defaultdict(list) predicted_masks: DefaultDict = defaultdict(list) used_points: DefaultDict = defaultdict(list) for label in total_points_scores.keys(): points_scores = total_points_scores[label] bg_coords = total_bg_coords[label] for points_score in points_scores: if points_score[-1] in [-1.0, 0.0]: continue x, y = points_score[:2] is_done = False for pm in predicted_masks.get(label, []): # check if that point is already assigned if pm[int(y), int(x)] > 0: is_done = True break if is_done: continue point_coords = np.concatenate((np.array([[x, y]]), bg_coords), axis=0, dtype=np.float32) point_coords = self.model["decoder"]._apply_coords(point_coords, original_shape) point_labels = np.array([1] + [0] * len(bg_coords), dtype=np.float32) inputs_decoder = { "point_coords": point_coords[None], "point_labels": point_labels[None], "orig_size": original_shape[None], } inputs_decoder.update(image_embeddings) prediction = self.forward_decoder(inputs_decoder, original_shape, is_cascade) prediction.update({"scores": points_score[-1]}) predicted_masks[label].append(prediction[self.model["decoder"].output_blob_name]) used_points[label].append(points_score) self._inspect_overlapping_areas(predicted_masks, used_points) for label, predictions in predicted_masks.items(): if len(predictions) == 0: continue metadata = { "label": [_label for _label in self.labels if int(_label.id_) == label][0], "original_size": original_shape, } for prediction, used_point in zip(predictions, used_points[label]): annotation, _, _ = self.post_process( {self.model["decoder"].output_blob_name: prediction, "scores": used_point[-1]}, metadata ) annotations[label].extend(annotation) return sum(annotations.values(), []), predicted_masks, used_points
[docs] def forward_decoder( # type: ignore self, inputs: Dict[str, np.ndarray], original_size: np.ndarray, is_cascade: bool = True, ) -> Dict[str, np.ndarray]: """Forward function of OpenVINO Visual Prompting Inferencer.""" masks: np.ndarray logits: np.ndarray scores: np.ndarray num_iter = 3 if is_cascade else 1 for i in range(num_iter): if i == 0: # First-step prediction mask_input = np.zeros( (1, 1, *map(lambda x: x * 4, inputs["image_embeddings"].shape[2:])), dtype=np.float32 ) has_mask_input = self.has_mask_inputs[0] elif i == 1: # Cascaded Post-refinement-1 mask_input, masks = self._postprocess_masks(masks, logits, scores, is_single=True) # noqa: F821 if masks.sum() == 0: return {"upscaled_masks": masks} has_mask_input = self.has_mask_inputs[1] elif i == 2: # Cascaded Post-refinement-2 mask_input, masks = self._postprocess_masks(masks, logits, scores) # noqa: F821 if masks.sum() == 0: return {"upscaled_masks": masks} has_mask_input = self.has_mask_inputs[1] y, x = np.nonzero(masks) box_coords = self.model["decoder"]._apply_coords( np.array([[[x.min(), y.min()], [x.max(), y.max()]]], dtype=np.float32), original_size ) inputs.update( { "point_coords": np.concatenate((inputs["point_coords"], box_coords), axis=1), "point_labels": np.concatenate((inputs["point_labels"], self.point_labels_box), axis=1), } ) inputs.update({"mask_input": mask_input, "has_mask_input": has_mask_input}) prediction = self.model["decoder"].infer_sync(inputs) upscaled_masks, scores, logits = ( prediction["upscaled_masks"], prediction["iou_predictions"], prediction["low_res_masks"], ) masks = upscaled_masks > self.model["decoder"].mask_threshold _, masks = self._postprocess_masks(masks, logits, scores) return {"upscaled_masks": masks}
def _get_prompt_candidates( self, image_embeddings: np.ndarray, reference_feats: np.ndarray, used_indices: np.ndarray, original_shape: np.ndarray, threshold: float = 0.0, num_bg_points: int = 1, default_threshold_target: float = 0.65, image_size: int = 1024, downsizing: int = 64, ) -> Tuple[Dict[int, np.ndarray], Dict[int, np.ndarray]]: """Get prompt candidates.""" target_feat = image_embeddings.squeeze() c_feat, h_feat, w_feat = target_feat.shape target_feat = target_feat / np.linalg.norm(target_feat, axis=0, keepdims=True) target_feat = target_feat.reshape(c_feat, h_feat * w_feat) total_points_scores: Dict[int, np.ndarray] = {} total_bg_coords: Dict[int, np.ndarray] = {} for label in used_indices: sim = reference_feats[label] @ target_feat sim = sim.reshape(h_feat, w_feat) sim = self._resize_to_original_shape(sim, image_size, original_shape) threshold = (threshold == 0) * default_threshold_target + threshold points_scores, bg_coords = self._point_selection( mask_sim=sim, original_shape=original_shape, threshold=threshold, num_bg_points=num_bg_points, image_size=image_size, downsizing=downsizing, ) if points_scores is not None: total_points_scores[label] = points_scores total_bg_coords[label] = bg_coords return total_points_scores, total_bg_coords def _point_selection( self, mask_sim: np.ndarray, original_shape: np.ndarray, threshold: float = 0.0, num_bg_points: int = 1, image_size: int = 1024, downsizing: int = 64, ) -> Tuple[np.ndarray, np.ndarray]: """Select point used as point prompts.""" _, w_sim = mask_sim.shape # Top-first point selection point_coords = np.where(mask_sim > threshold) fg_coords_scores = np.stack(point_coords[::-1] + (mask_sim[point_coords],), axis=0).T ## skip if there is no point coords if len(fg_coords_scores) == 0: return None, None ratio = image_size / original_shape.max() width = (original_shape[1] * ratio).astype(np.int64) n_w = width // downsizing ## get grid numbers idx_grid = fg_coords_scores[:, 1] * ratio // downsizing * n_w + fg_coords_scores[:, 0] * ratio // downsizing idx_grid_unique = np.unique(idx_grid.astype(np.int64)) ## get matched indices matched_matrix = np.expand_dims(idx_grid, axis=-1) == idx_grid_unique # (totalN, uniqueN) ## sample fg_coords_scores matched by matched_matrix matched_grid = np.expand_dims(fg_coords_scores, axis=1) * np.expand_dims(matched_matrix, axis=-1) ## sample the highest score one of the samples that are in the same grid matched_indices = self._topk_numpy(matched_grid[..., -1], k=1, axis=0, largest=True)[1][0].astype(np.int64) points_scores = matched_grid[matched_indices].diagonal().T ## sort by the highest score sorted_points_scores_indices = np.flip(np.argsort(points_scores[:, -1]), axis=-1).astype(np.int64) points_scores = points_scores[sorted_points_scores_indices] # Top-last point selection bg_indices = self._topk_numpy(mask_sim.flatten(), num_bg_points, largest=False)[1] bg_x = np.expand_dims(bg_indices // w_sim, axis=0) bg_y = bg_indices - bg_x * w_sim bg_coords = np.concatenate((bg_y, bg_x), axis=0).transpose(1, 0) bg_coords = bg_coords.astype(np.float32) return points_scores, bg_coords def _postprocess_masks( self, masks: np.ndarray, logits: np.ndarray, scores: np.ndarray, is_single: bool = False ) -> Tuple[np.ndarray, ...]: """Post-process logits for resized masks according to best index based on scores.""" if is_single: best_idx = 0 else: # skip the first index components scores, masks, logits = map(lambda x: x[:, 1:], (scores, masks, logits)) # filter zero masks while len(scores[0]) > 0 and masks[0, (best_idx := np.argmax(scores[0]))].sum() == 0: scores, masks, logits = map( lambda x: np.concatenate((x[:, :best_idx], x[:, best_idx + 1 :]), axis=1), (scores, masks, logits) ) if len(scores[0]) == 0: # all predicted masks were zero masks, ignore them. return None, np.zeros(masks.shape[-2:]) best_idx = np.argmax(scores[0]) return logits[:, [best_idx]], masks[0, best_idx] def _resize_to_original_shape(self, masks: np.ndarray, image_size: int, original_shape: np.ndarray) -> np.ndarray: """Resize feature size to original shape.""" # resize feature size to input size masks = cv2.resize(masks, (image_size, image_size), interpolation=cv2.INTER_LINEAR) # remove pad prepadded_size = self._get_prepadded_size(original_shape, image_size) masks = masks[..., : prepadded_size[0], : prepadded_size[1]] # resize unpadded one to original shape original_shape = original_shape.astype(np.int64) h, w = original_shape[0], original_shape[1] return cv2.resize(masks, (w, h), interpolation=cv2.INTER_LINEAR) def _get_prepadded_size(self, original_shape: int, image_size: int) -> np.ndarray: """Get pre-padded size.""" scale = image_size / np.max(original_shape) transformed_size = scale * original_shape return np.floor(transformed_size + 0.5).astype(np.int64) def _inspect_overlapping_areas( self, predicted_masks: Dict[int, List[np.ndarray]], used_points: Dict[int, List[np.ndarray]], threshold_iou: float = 0.8, ): def _calculate_mask_iou(mask1: np.ndarray, mask2: np.ndarray): assert mask1.ndim == 2 and mask2.ndim == 2 intersection = np.logical_and(mask1, mask2).sum().item() union = np.logical_or(mask1, mask2).sum().item() # Avoid division by zero if union == 0: return 0.0 iou = intersection / union return iou for (label, masks), (other_label, other_masks) in product(predicted_masks.items(), predicted_masks.items()): if other_label <= label: continue overlapped_label = [] overlapped_other_label = [] for (im, mask), (jm, other_mask) in product(enumerate(masks), enumerate(other_masks)): _mask_iou = _calculate_mask_iou(mask, other_mask) if _mask_iou > threshold_iou: if used_points[label][im][2] > used_points[other_label][jm][2]: overlapped_other_label.append(jm) else: overlapped_label.append(im) elif _mask_iou > 0: # refine the slightly overlapping region overlapped_coords = np.where(np.logical_and(mask, other_mask)) if used_points[label][im][2] > used_points[other_label][jm][2]: other_mask[overlapped_coords] = 0.0 else: mask[overlapped_coords] = 0.0 for im in sorted(list(set(overlapped_label)), reverse=True): masks.pop(im) used_points[label].pop(im) for jm in sorted(list(set(overlapped_other_label)), reverse=True): other_masks.pop(jm) used_points[other_label].pop(jm)
[docs] def predict(self, dataset_item: DatasetItemEntity) -> List[Annotation]: # type: ignore """Perform a prediction for a given input image.""" results = self.infer(dataset_item.numpy, self.reference_feats, self.used_indices) return results[0]
def _find_latest_reference_info(self, root: str = "vpm_zsl_reference_infos") -> Union[str, None]: """Find latest reference info to be used.""" if not os.path.isdir(root): return None if len(stamps := sorted(os.listdir(root), reverse=True)) > 0: return stamps[0] return None def _get_reference_info( self, root: str = "vpm_zsl_reference_infos", path_reference_info: str = "{}/reference_info.pickle" ) -> Union[Tuple[np.ndarray, np.ndarray], None]: """Get reference info through loading previously saved one or running `learn`.""" if (latest_stamp := self._find_latest_reference_info(root)) is not None: # load previously saved reference info latest_reference_info = os.path.join(root, path_reference_info.format(latest_stamp)) # pickle.load() used for getting the latest reference info from the previously dumped object reference_info = pickle.load(open(latest_reference_info, "rb")) # nosec B301 return reference_info["reference_feats"], reference_info["used_indices"] return None, None
[docs] def initialize_reference_info(self) -> None: """Initialize reference information.""" self.reference_feats = np.zeros((0, 1, 256), dtype=np.float32) self.used_indices = np.array([], dtype=np.int64)
[docs] def expand_reference_info(self, new_largest_label: int) -> None: """Expand reference info dimensions if newly given processed prompts have more lables.""" if new_largest_label > (cur_largest_label := len(self.reference_feats) - 1): diff = new_largest_label - cur_largest_label self.reference_feats = np.pad(self.reference_feats, ((0, diff), (0, 0), (0, 0)), constant_values=0.0)
def _generate_masked_features( self, feats: np.ndarray, masks: np.ndarray, threshold_mask: float, ) -> Tuple[np.ndarray, ...]: """Generate masked features. Args: feats (np.ndarray): Raw reference features. It will be filtered with masks. masks (np.ndarray): Reference masks used to filter features. threshold_mask (float): Threshold to control masked region. Returns: (np.ndarray): Masked features. """ target_shape = self.model["image_encoder"].image_size / max(masks.shape) * np.array(masks.shape) target_shape = target_shape[::-1].astype(np.int32) # Post-process masks masks = cv2.resize(masks, target_shape, interpolation=cv2.INTER_LINEAR) masks = self._pad_to_square(masks) masks = cv2.resize(masks, feats.shape[:2][::-1], interpolation=cv2.INTER_LINEAR) # Target feature extraction if (masks > threshold_mask).sum() == 0: # (for stability) there is no area to be extracted return None masked_feat = feats[masks > threshold_mask] masked_feat = masked_feat.mean(0)[None] masked_feat = masked_feat / np.linalg.norm(masked_feat, axis=-1, keepdims=True) return masked_feat def _pad_to_square(self, x: np.ndarray) -> np.ndarray: """Pad to a square input. Args: x (np.ndarray): Mask to be padded. Returns: (np.ndarray): Padded mask. """ h, w = x.shape[-2:] padh = self.model["image_encoder"].image_size - h padw = self.model["image_encoder"].image_size - w x = np.pad(x, ((0, padh), (0, padw)), constant_values=0.0) return x def _topk_numpy(self, x: np.ndarray, k: int, axis: int = -1, largest: bool = True) -> np.ndarray: """Top-k function for numpy same with torch.topk.""" if largest: k = -k indices = range(k, 0) else: indices = range(k) partitioned_ind = np.argpartition(x, k, axis=axis).take(indices=indices, axis=axis) partitioned_scores = np.take_along_axis(x, partitioned_ind, axis=axis) sorted_trunc_ind = np.flip(np.argsort(partitioned_scores, axis=axis), axis=axis) ind = np.take_along_axis(partitioned_ind, sorted_trunc_ind, axis=axis) scores = np.take_along_axis(partitioned_scores, sorted_trunc_ind, axis=axis) return scores, ind
[docs] class OTXOpenVinoDataLoader: """DataLoader implementation for VisualPromptingOpenVINOTask.""" def __init__( self, dataset: Any, inferencer: OpenVINOVisualPromptingInferencer, module_name: str, shuffle: bool = True, output_model: Optional[ModelEntity] = None, **kwargs, ): self.dataset = dataset self.inferencer = inferencer self.module_name = module_name self.shuffler = None if shuffle: self.shuffler = list(range(len(dataset))) random.shuffle(self.shuffler) self.target_length = self.inferencer.model["image_encoder"].orig_width if self.module_name not in ["image_encoder"]: self.image_encoder = self._load_module("image_encoder", output_model) def _load_module(self, module_name: str, output_model: ModelEntity, core=ov.Core()): """Load specific module.""" compressed_model = core.read_model( output_model.get_data(f"visual_prompting_{module_name}.xml"), output_model.get_data(f"visual_prompting_{module_name}.bin"), ) return core.compile_model( model=compressed_model, device_name=self.inferencer.model[module_name].inference_adapter.device ) def __getitem__(self, index: int): """Get item from dataset.""" if self.shuffler is not None: index = self.shuffler[index] items = self.dataset[index] images, _, prompts = self.inferencer.pre_process(items, extra_processing=True) _, _, h, w = images["images"].shape pad_width = ((0, 0), (0, 0), (0, self.target_length - h), (0, self.target_length - w)) images["images"] = np.pad(images["images"], pad_width, mode="constant", constant_values=0) if self.module_name == "image_encoder": return images else: image_embeddings = self.image_encoder(images["images"]) prompt = prompts[0] # only use the first prompt prompt.pop("label") prompt.update({"image_embeddings": image_embeddings["image_embeddings"]}) return prompt # TODO (sungchul): change has_mask_input def __len__(self): """Get length of dataset.""" return len(self.dataset)
[docs] class OpenVINOVisualPromptingTask(IInferenceTask, IEvaluationTask, IOptimizationTask, IDeploymentTask): """Task implementation for Visual Prompting using OpenVINO backend.""" def __init__(self, task_environment: TaskEnvironment) -> None: self.task_environment = task_environment self.model = self.task_environment.model self.model_name = self.task_environment.model_template.model_template_id self.inferencer = self.load_inferencer() self._avg_time_per_image: Optional[float] = None labels = task_environment.get_labels(include_empty=False) self._label_dictionary = dict(enumerate(labels, 1)) template_file_path = self.task_environment.model_template.model_template_path self._base_dir = os.path.abspath(os.path.dirname(template_file_path)) self.task_type = TaskType.VISUAL_PROMPTING @property def hparams(self): """Hparams of OpenVINO Visual Prompting Task.""" return self.task_environment.get_hyper_parameters(VisualPromptingBaseConfig) @property def avg_time_per_image(self) -> Optional[float]: """Average inference time per image.""" return self._avg_time_per_image
[docs] def load_inferencer(self) -> OpenVINOVisualPromptingInferencer: """Load OpenVINO Visual Prompting Inferencer.""" if self.model is None: raise RuntimeError("load_inferencer failed, model is None") return OpenVINOVisualPromptingInferencer( self.hparams, self.task_environment.label_schema, { "image_encoder": self.model.get_data("visual_prompting_image_encoder.xml"), "decoder": self.model.get_data("visual_prompting_decoder.xml"), }, { "image_encoder": self.model.get_data("visual_prompting_image_encoder.bin"), "decoder": self.model.get_data("visual_prompting_decoder.bin"), }, num_requests=get_default_async_reqs_num(), )
[docs] def infer( self, dataset: DatasetEntity, inference_parameters: Optional[InferenceParameters] = None, ) -> DatasetEntity: """Infer function of OpenVINOVisualPromptingTask. Currently, asynchronous execution is not supported, synchronous execution will be executed instead. """ if inference_parameters is not None: update_progress_callback = inference_parameters.update_progress enable_async_inference = inference_parameters.enable_async_inference else: update_progress_callback = default_progress_callback enable_async_inference = True # FIXME (sungchul): Support async inference. if enable_async_inference: logger.warning("Asynchronous inference doesn't work, synchronous inference will be executed.") enable_async_inference = False predicted_validation_dataset = dataset.with_empty_annotations() def add_prediction(id: int, annotations: List[Annotation]): dataset_item = predicted_validation_dataset[id] dataset_item.append_annotations(annotations) total_time = 0.0 dataset_size = len(dataset) for i, dataset_item in enumerate(dataset, 1): start_time = time.perf_counter() annotations = self.inferencer.predict(dataset_item) add_prediction(i - 1, annotations) end_time = time.perf_counter() - start_time total_time += end_time update_progress_callback(int(i / dataset_size * 100), None) self.inferencer.await_all() self._avg_time_per_image = total_time / len(dataset)"Avg time per image: {self._avg_time_per_image} secs")"Total time: {total_time} secs")"Visual Prompting OpenVINO inference completed") return predicted_validation_dataset
[docs] def evaluate(self, output_resultset: ResultSetEntity, evaluation_metric: Optional[str] = None): """Evaluate function of OpenVINOVisualPromptingTask.""""Computing mDice") metrics = MetricsHelper.compute_dice_averaged_over_pixels(output_resultset)"mDice after evaluation: {metrics.overall_dice.value}") output_resultset.performance = metrics.get_performance()
[docs] def deploy(self, output_model: ModelEntity) -> None: """Deploy function of OpenVINOVisualPromptingTask.""""Deploying the model") if self.model is None: raise RuntimeError("deploy failed, model is None") work_dir = os.path.dirname(demo.__file__) parameters: Dict[str, Any] = {} parameters["converter_type"] = f"{self.task_type}" parameters["model_parameters"] = self.inferencer.configuration parameters["model_parameters"]["labels"] = LabelSchemaMapper.forward(self.task_environment.label_schema) zip_buffer = io.BytesIO() with ZipFile(zip_buffer, "w") as arch: # model files arch.writestr( os.path.join("model", "visual_prompting_image_encoder.xml"), self.model.get_data("visual_prompting_image_encoder.xml"), ) arch.writestr( os.path.join("model", "visual_prompting_image_encoder.bin"), self.model.get_data("visual_prompting_image_encoder.bin"), ) arch.writestr( os.path.join("model", "visual_prompting_decoder.xml"), self.model.get_data("visual_prompting_decoder.xml"), ) arch.writestr( os.path.join("model", "visual_prompting_decoder.bin"), self.model.get_data("visual_prompting_decoder.bin"), ) arch.writestr( os.path.join("model", "config.json"), json.dumps(parameters, ensure_ascii=False, indent=4), ) # model_wrappers files for root, _, files in os.walk(os.path.dirname(model_wrappers.__file__)): if "__pycache__" in root: continue for file in files: file_path = os.path.join(root, file) arch.write( file_path, os.path.join( "python", "model_wrappers", file_path.split("model_wrappers/")[0], ), ) # other python files arch.write(os.path.join(work_dir, "requirements.txt"), os.path.join("python", "requirements.txt")) arch.write(os.path.join(work_dir, "LICENSE"), os.path.join("python", "LICENSE")) arch.write(os.path.join(work_dir, ""), os.path.join("python", "")) arch.write(os.path.join(work_dir, ""), os.path.join(".", "")) output_model.exportable_code = zip_buffer.getvalue()"Deploying completed")
[docs] def optimize( self, optimization_type: OptimizationType, dataset: DatasetEntity, output_model: ModelEntity, optimization_parameters: Optional[OptimizationParameters] = None, module_names: List[str] = ["image_encoder", "decoder"], ov_dataloader: Type[OTXOpenVinoDataLoader] = OTXOpenVinoDataLoader, **kwargs, ): """Optimize function of OpenVINOVisualPromptingTask.""""Start PTQ optimization") if self.model is None: raise RuntimeError("PTQ optimize failed, model is None") if optimization_type is not OptimizationType.POT: raise ValueError("PTQ is the only supported optimization type for OpenVino models") dataset = dataset.get_subset(Subset.TRAINING) for i, module_name in enumerate(module_names, 1): data_loader = ov_dataloader( dataset, self.inferencer, module_name=module_name, output_model=output_model, **kwargs ) quantization_dataset = nncf.Dataset(data_loader, lambda data: data) with tempfile.TemporaryDirectory() as tempdir: xml_path = os.path.join(tempdir, f"visual_prompting_{module_name}.xml") bin_path = os.path.join(tempdir, f"visual_prompting_{module_name}.bin") with open(xml_path, "wb") as f: f.write(self.model.get_data(f"visual_prompting_{module_name}.xml")) with open(bin_path, "wb") as f: f.write(self.model.get_data(f"visual_prompting_{module_name}.bin")) ov_model = ov.Core().read_model(xml_path, bin_path) if check_if_quantized(ov_model): raise RuntimeError("Model is already optimized by PTQ") optimization_config_path = os.path.join(self._base_dir, "") ptq_config = ADDict() if os.path.exists(optimization_config_path): ptq_config = read_py_config(optimization_config_path) ptq_config.update( subset_size=min(self.hparams.pot_parameters.stat_subset_size, len(data_loader)), preset=QuantizationPreset(, ) compressed_model = nncf.quantize(ov_model, quantization_dataset, **ptq_config) if optimization_parameters is not None: optimization_parameters.update_progress(90 // len(module_names) * i, None) with tempfile.TemporaryDirectory() as tempdir: xml_path = os.path.join(tempdir, f"visual_prompting_{module_name}.xml") bin_path = os.path.join(tempdir, f"visual_prompting_{module_name}.bin") ov.save_model(compressed_model, xml_path) with open(xml_path, "rb") as f: output_model.set_data(f"visual_prompting_{module_name}.xml", with open(bin_path, "rb") as f: output_model.set_data(f"visual_prompting_{module_name}.bin", output_model.set_data( "label_schema.json", label_schema_to_bytes(self.task_environment.label_schema), ) # set model attributes for quantized model output_model.model_format = ModelFormat.OPENVINO output_model.optimization_type = ModelOptimizationType.POT output_model.optimization_methods = [OptimizationMethod.QUANTIZATION] output_model.precision = [ModelPrecision.INT8] self.model = output_model self.inferencer = self.load_inferencer() if optimization_parameters is not None: optimization_parameters.update_progress(100, None)"PTQ optimization completed")
[docs] class OpenVINOZeroShotVisualPromptingTask(OpenVINOVisualPromptingTask): """Task implementation for Zero-shot Visual Prompting using OpenVINO backend."""
[docs] def load_inferencer(self) -> OpenVINOZeroShotVisualPromptingInferencer: """Load OpenVINO Zero-shot Visual Prompting Inferencer.""" if self.model is None: raise RuntimeError("load_inferencer failed, model is None") return OpenVINOZeroShotVisualPromptingInferencer( self.hparams, self.task_environment.label_schema, model_files={ "image_encoder": self.model.get_data("visual_prompting_image_encoder.xml"), "decoder": self.model.get_data("visual_prompting_decoder.xml"), }, weight_files={ "image_encoder": self.model.get_data("visual_prompting_image_encoder.bin"), "decoder": self.model.get_data("visual_prompting_decoder.bin"), }, num_requests=get_default_async_reqs_num(), )
[docs] def infer( self, dataset: DatasetEntity, inference_parameters: Optional[InferenceParameters] = None, root: str = "vpm_zsl_reference_infos", path_reference_info: str = "{}/reference_info.pickle", ) -> DatasetEntity: """Infer function of OpenVINOVisualPromptingTask. Currently, asynchronous execution is not supported, synchronous execution will be executed instead. """ if inference_parameters is not None: update_progress_callback = inference_parameters.update_progress enable_async_inference = inference_parameters.enable_async_inference else: update_progress_callback = default_progress_callback enable_async_inference = True # FIXME (sungchul): Support async inference. if enable_async_inference: logger.warning("Asynchronous inference doesn't work, synchronous inference will be executed.") enable_async_inference = False predicted_validation_dataset = dataset.with_empty_annotations() def add_prediction(id: int, annotations: List[Annotation]): dataset_item = predicted_validation_dataset[id] dataset_item.append_annotations(annotations) total_time = 0.0 dataset_size = len(dataset) if self.inferencer.reference_feats is None and self.inferencer.used_indices is None: # set reference_feats and used_indices from previously saved reference_info self.inferencer.reference_feats, self.inferencer.used_indices = self.inferencer._get_reference_info( root, path_reference_info ) if self.inferencer.reference_feats is None and self.inferencer.used_indices is None: # if they are empty, stop inference and return empty dataset logger.warning( ( "reference_feats and used_indices are empty, stop inference and return empty dataset. " "Please run learn function first." ) ) return predicted_validation_dataset for i, dataset_item in enumerate(dataset, 1): start_time = time.perf_counter() annotations = self.inferencer.predict(dataset_item) add_prediction(i - 1, annotations) end_time = time.perf_counter() - start_time total_time += end_time update_progress_callback(int(i / dataset_size * 100), None) self.inferencer.await_all() self._avg_time_per_image = total_time / len(dataset)"Avg time per image: {self._avg_time_per_image} secs")"Total time: {total_time} secs")"Visual Prompting OpenVINO inference completed") return predicted_validation_dataset
[docs] def optimize( self, optimization_type: OptimizationType, dataset: DatasetEntity, output_model: ModelEntity, optimization_parameters: Optional[OptimizationParameters] = None, module_names: List[str] = ["image_encoder", "decoder"], ov_dataloader: Type[OTXOpenVinoDataLoader] = OTXOpenVinoDataLoader, **kwargs, ): """Optimize function of OpenVINOZeroShotVisualPromptingTask.""" self.inferencer: OpenVINOZeroShotVisualPromptingInferencer reference_feats, used_indices = self.inferencer._get_reference_info() return super().optimize( optimization_type=optimization_type, dataset=dataset, output_model=output_model, optimization_parameters=optimization_parameters, module_names=module_names, ov_dataloader=ov_dataloader, reference_feats=reference_feats, used_indices=used_indices, )