# Source code for otx.algo.detection.losses.ssd_loss

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) OpenMMLab. All rights reserved.
#
"""SSD criterion."""

from __future__ import annotations

from torch import Tensor, nn

from otx.algo.common.losses import smooth_l1_loss
from otx.algo.common.utils.utils import multi_apply



# [docs]
class SSDCriterion(nn.Module):
    """SSDCriterion is a loss criterion for Single Shot MultiBox Detector (SSD).

    Args:
        num_classes (int): Number of classes including the background class.
        bbox_coder (nn.Module): Bounding box coder module. Defaults to None.
        neg_pos_ratio (int, optional): Ratio of negative to positive samples. Defaults to 3.
        reg_decoded_bbox (bool): If true, the regression loss would be
            applied directly on decoded bounding boxes, converting both
            the predicted boxes and regression targets to absolute
            coordinates format. Defaults to False. It should be `True` when
            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
        smoothl1_beta (float, optional): Beta parameter for the smooth L1 loss. Defaults to 1.0.
    """

    def __init__(
        self,
        num_classes: int,
        bbox_coder: nn.Module | None = None,
        neg_pos_ratio: int = 3,
        reg_decoded_bbox: bool = False,
        smoothl1_beta: float = 1.0,
    ) -> None:
        super().__init__()
        self.num_classes = num_classes
        self.bbox_coder = bbox_coder
        self.neg_pos_ratio = neg_pos_ratio
        self.reg_decoded_bbox = reg_decoded_bbox
        self.smoothl1_beta = smoothl1_beta


[docs]
    def forward(
        self,
        cls_score: Tensor,
        bbox_pred: Tensor,
        anchor: Tensor,
        labels: Tensor,
        label_weights: Tensor,
        bbox_targets: Tensor,
        bbox_weights: Tensor,
        avg_factor: int,
    ) -> dict[str, Tensor]:
        """Compute losses of images.

        Args:
            cls_score (Tensor): Box scores for images have shape (N, num_total_anchors, num_classes).
            bbox_pred (Tensor): Box energies / deltas for image levels with shape (N, num_total_anchors, 4).
            anchors (Tensor): Box reference for for scale levels with shape (N, num_total_anchors, 4).
            labels (Tensor): Labels of anchors with shape (N, num_total_anchors).
            label_weights (Tensor): Label weights of anchors with shape (N, num_total_anchors)
            bbox_targets (Tensor): BBox regression targets of anchors with shape (N, num_total_anchors, 4).
            bbox_weights (Tensor): BBox regression loss weights of anchors with shape (N, num_total_anchors, 4).
            avg_factor (int): Average factor that is used to average
                the loss. When using sampling method, avg_factor is usually
                the sum of positive and negative priors. When using
                `PseudoSampler`, `avg_factor` is usually equal to the number
                of positive priors.

        Returns:
            dict[str, Tensor]: A dictionary of loss components. the dict
            has components below:

            - loss_cls (list[Tensor]): A list containing each feature map \
            classification loss.
            - loss_bbox (list[Tensor]): A list containing each feature map \
            regression loss.
        """
        losses_cls, losses_bbox = multi_apply(
            self._forward,
            cls_score,
            bbox_pred,
            anchor,
            labels,
            label_weights,
            bbox_targets,
            bbox_weights,
            avg_factor=avg_factor,
        )
        return {"loss_cls": losses_cls, "loss_bbox": losses_bbox}


    def _forward(
        self,
        cls_score: Tensor,
        bbox_pred: Tensor,
        anchor: Tensor,
        labels: Tensor,
        label_weights: Tensor,
        bbox_targets: Tensor,
        bbox_weights: Tensor,
        avg_factor: int,
    ) -> tuple[Tensor, Tensor]:
        """Compute loss of a single image."""
        loss_cls_all = nn.functional.cross_entropy(cls_score, labels, reduction="none") * label_weights
        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
        pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(as_tuple=False).reshape(-1)
        neg_inds = (labels == self.num_classes).nonzero(as_tuple=False).view(-1)

        num_pos_samples = pos_inds.size(0)
        num_neg_samples = self.neg_pos_ratio * num_pos_samples
        num_neg_samples = min(num_neg_samples, neg_inds.size(0))
        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
        loss_cls_pos = loss_cls_all[pos_inds].sum()
        loss_cls_neg = topk_loss_cls_neg.sum()
        loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor

        if self.reg_decoded_bbox and self.bbox_coder:
            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
            # is applied directly on the decoded bounding boxes, it
            # decodes the already encoded coordinates to absolute format.
            bbox_pred = self.bbox_coder.decode(anchor, bbox_pred)

        loss_bbox = smooth_l1_loss(
            bbox_pred,
            bbox_targets,
            bbox_weights,
            beta=self.smoothl1_beta,
            avg_factor=avg_factor,
        )
        return loss_cls[None], loss_bbox