Source code for otx.algo.action_classification.backbones.movinet

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) OpenMMLab. All rights reserved.

"""Code modified from: https://github.com/Atze00/MoViNet-pytorch/blob/main/movinets/models.py."""
from __future__ import annotations

from collections import OrderedDict
from typing import Callable

import torch
import torch.nn.functional as F  # noqa: N812
from einops import rearrange
from omegaconf.dictconfig import DictConfig
from torch import Tensor, nn
from torch.nn.modules.utils import _pair, _triple


class Conv2dBNActivation(nn.Sequential):
    """A base module that applies a 2D Conv-BN-Activation.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        kernel_size (Union[int, tuple[int, int]]): Size of the convolution kernel.
        padding (Union[int, tuple[int, int]]): Size of the padding applied to the input.
        stride (Union[int, tuple[int, int]], optional): Stride of the convolution. Default: 1.
        groups (int, optional): Number of groups in the convolution. Default: 1.
        norm_layer (Optional[Callable[..., nn.Module]], optional): Normalization layer to use.
            If None, identity is used. Default: None.
        activation_layer (Optional[Callable[..., nn.Module]], optional): Activation layer to use.
            If None, identity is used. Default: None.
        **kwargs (Any): Additional keyword arguments passed to nn.Conv2d.

    Attributes:
        kernel_size (tuple[int, int]): Size of the convolution kernel.
        stride (tuple[int, int]): Stride of the convolution.
        out_channels (int): Number of output channels.

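    Example:
        A minimal usage sketch (shapes chosen only for illustration):

            >>> block = Conv2dBNActivation(
            ...     3, 8, kernel_size=3, padding=1, stride=2,
            ...     norm_layer=nn.BatchNorm2d, activation_layer=nn.SiLU, bias=False,
            ... )
            >>> block(torch.randn(2, 3, 32, 32)).shape
            torch.Size([2, 8, 16, 16])
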
    """

    def __init__(
        self,
        in_planes: int,
        out_planes: int,
        *,
        kernel_size: int | tuple[int, int],
        padding: int | tuple[int, int],
        stride: int | tuple[int, int] = 1,
        groups: int = 1,
        norm_layer: Callable[..., nn.Module] | None = None,
        activation_layer: Callable[..., nn.Module] | None = None,
        **kwargs,
    ) -> None:
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        if norm_layer is None:
            norm_layer = nn.Identity
        if activation_layer is None:
            activation_layer = nn.Identity
        self.kernel_size = kernel_size
        self.stride = stride
        dict_layers = OrderedDict(
            {
                "conv2d": nn.Conv2d(
                    in_planes,
                    out_planes,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                    groups=groups,
                    **kwargs,
                ),
                "norm": norm_layer(out_planes, eps=0.001),
                "act": activation_layer(),
            },
        )

        self.out_channels = out_planes
        super().__init__(dict_layers)


class Conv3DBNActivation(nn.Sequential):
    """A base module that applies a 3D Conv-BN-Activation.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        kernel_size (Union[int, tuple[int, int, int]]): Size of the convolution kernel.
        padding (Union[int, tuple[int, int, int]]): Size of the padding applied to the input.
        stride (Union[int, tuple[int, int, int]], optional): Stride of the convolution. Default: 1.
        groups (int, optional): Number of groups in the convolution. Default: 1.
        norm_layer (Optional[Callable[..., nn.Module]], optional): Normalization layer to use.
            If None, identity is used. Default: None.
        activation_layer (Optional[Callable[..., nn.Module]], optional): Activation layer to use.
            If None, identity is used. Default: None.
        **kwargs (Any): Additional keyword arguments passed to nn.Conv3d.

    Attributes:
        kernel_size (tuple[int, int, int]): Size of the convolution kernel.
        stride (tuple[int, int, int]): Stride of the convolution.
        out_channels (int): Number of output channels.

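    Example:
        A minimal usage sketch (shapes chosen only for illustration):

            >>> block = Conv3DBNActivation(
            ...     3, 8, kernel_size=(1, 3, 3), padding=(0, 1, 1), stride=(1, 2, 2),
            ...     norm_layer=nn.BatchNorm3d, activation_layer=nn.SiLU, bias=False,
            ... )
            >>> block(torch.randn(1, 3, 8, 32, 32)).shape
            torch.Size([1, 8, 8, 16, 16])
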
    """

    def __init__(
        self,
        in_planes: int,
        out_planes: int,
        *,
        kernel_size: int | tuple[int, int, int],
        padding: int | tuple[int, int, int],
        stride: int | tuple[int, int, int] = 1,
        groups: int = 1,
        norm_layer: Callable[..., nn.Module] | None = None,
        activation_layer: Callable[..., nn.Module] | None = None,
        **kwargs,
    ) -> None:
        kernel_size = _triple(kernel_size)
        stride = _triple(stride)
        padding = _triple(padding)
        if norm_layer is None:
            norm_layer = nn.Identity
        if activation_layer is None:
            activation_layer = nn.Identity
        self.kernel_size = kernel_size
        self.stride = stride

        dict_layers = OrderedDict(
            {
                "conv3d": nn.Conv3d(
                    in_planes,
                    out_planes,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                    groups=groups,
                    **kwargs,
                ),
                "norm": norm_layer(out_planes, eps=0.001),
                "act": activation_layer(),
            },
        )

        self.out_channels = out_planes
        super().__init__(dict_layers)


class ConvBlock3D(nn.Module):
    """A module that applies a 2+1D or 3D Conv-BN-activation sequential.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        kernel_size (tuple[int, int, int]): Size of the convolution kernel.
        tf_like (bool): Whether to use TensorFlow-like padding and convolution.
        conv_type (str): Type of 3D convolution to use. Must be "2plus1d" or "3d".
        padding (tuple[int, int, int], optional): Size of the padding applied to the input.
            Default: (0, 0, 0).
        stride (tuple[int, int, int], optional): Stride of the convolution. Default: (1, 1, 1).
        norm_layer (Optional[Callable[..., nn.Module]], optional): Normalization layer to use.
            If None, identity is used. Default: None.
        activation_layer (Optional[Callable[..., nn.Module]], optional): Activation layer to use.
            If None, identity is used. Default: None.
        bias (bool, optional): Whether to use bias in the convolution. Default: False.
        **kwargs (Any): Additional keyword arguments passed to nn.Conv2d or nn.Conv3d.

    Attributes:
        conv_1 (Union[Conv2dBNActivation, Conv3DBNActivation]): Convolutional layer.
        conv_2 (Optional[Conv2dBNActivation]): Convolutional layer for 2+1D convolution.
        padding (tuple[int, int, int]): Size of the padding applied to the input.
        kernel_size (tuple[int, int, int]): Size of the convolution kernel.
        dim_pad (int): Padding along the temporal dimension.
        stride (tuple[int, int, int]): Stride of the convolution.
        conv_type (str): Type of 3D convolution used.
        tf_like (bool): Whether to use TensorFlow-like padding and convolution.

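    Example:
        A minimal sketch of the "2plus1d" factorization (shapes chosen only for
        illustration): a 2D spatial conv runs per frame, then a temporal conv
        runs over the flattened spatial axes.

            >>> block = ConvBlock3D(
            ...     8, 8, kernel_size=(3, 3, 3), tf_like=False, conv_type="2plus1d",
            ...     padding=(1, 1, 1), stride=(1, 1, 1),
            ...     norm_layer=nn.BatchNorm2d, activation_layer=nn.SiLU,
            ... )
            >>> block(torch.randn(1, 8, 4, 16, 16)).shape
            torch.Size([1, 8, 4, 16, 16])
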
    """

    def __init__(
        self,
        in_planes: int,
        out_planes: int,
        kernel_size: tuple[int, int, int],
        tf_like: bool,
        conv_type: str,
        padding: tuple[int, int, int] = (0, 0, 0),
        stride: tuple[int, int, int] = (1, 1, 1),
        norm_layer: Callable[..., nn.Module] | None = None,
        activation_layer: Callable[..., nn.Module] | None = None,
        bias: bool = False,
        **kwargs,
    ) -> None:
        super().__init__()
        self.conv_2 = None
        if tf_like:
            # tf_like requires an odd temporal kernel (so the padding is even
            # on both sides) and temporal stride == 1 (so the padding can be
            # precomputed).
            if kernel_size[0] % 2 == 0:
                raise ValueError("tf_like supports only odd kernels for the temporal dimension")
            padding = ((kernel_size[0] - 1) // 2, 0, 0)
            if stride[0] != 1:
                raise ValueError("illegal stride value: tf_like supports only stride == 1 for the temporal dimension")
            if stride[1] > kernel_size[1] or stride[2] > kernel_size[2]:
                # these values are not tested, so they should be avoided
                raise ValueError("tf_like supports only stride <= kernel size")

        if conv_type not in ["2plus1d", "3d"]:
            raise ValueError("only 2plus1d or 3d are allowed as 3d convolutions")

        if conv_type == "2plus1d":
            self.conv_1 = Conv2dBNActivation(
                in_planes,
                out_planes,
                kernel_size=(kernel_size[1], kernel_size[2]),
                padding=(padding[1], padding[2]),
                stride=(stride[1], stride[2]),
                activation_layer=activation_layer,
                norm_layer=norm_layer,
                bias=bias,
                **kwargs,
            )
            if kernel_size[0] > 1:
                self.conv_2 = Conv2dBNActivation(
                    in_planes,
                    out_planes,
                    kernel_size=(kernel_size[0], 1),
                    padding=(padding[0], 0),
                    stride=(stride[0], 1),
                    activation_layer=activation_layer,
                    norm_layer=norm_layer,
                    bias=bias,
                    **kwargs,
                )
        elif conv_type == "3d":
            self.conv_1 = Conv3DBNActivation(
                in_planes,
                out_planes,
                kernel_size=kernel_size,
                padding=padding,
                activation_layer=activation_layer,
                norm_layer=norm_layer,
                stride=stride,
                bias=bias,
                **kwargs,
            )
        self.padding = padding
        self.kernel_size = kernel_size
        self.dim_pad = self.kernel_size[0] - 1
        self.stride = stride
        self.conv_type = conv_type
        self.tf_like = tf_like

    def _forward(self, x: Tensor) -> Tensor:
        input_shape = x.shape
        if self.conv_type == "2plus1d":
            # Spatial convolution: fold the temporal axis into the batch axis
            # so each frame is processed by the 2D conv independently.
            x = rearrange(x, "b c t h w -> (b t) c h w")
        x = self.conv_1(x)
        if self.conv_type == "2plus1d":
            x = rearrange(x, "(b t) c h w -> b c t h w", t=input_shape[2])
            if self.conv_2 is not None:
                # Temporal convolution: flatten the spatial axes so the
                # (kernel_size[0], 1) 2D conv slides along time only.
                w = x.shape[-1]
                x = rearrange(x, "b c t h w -> b c t (h w)")
                x = self.conv_2(x)
                x = rearrange(x, "b c t (h w) -> b c t h w", w=w)
        return x

    def forward(self, x: Tensor) -> Tensor:
        """Forward function of ConvBlock3D."""
        if self.tf_like:
            x = same_padding(
                x,
                x.shape[-2],
                x.shape[-1],
                self.stride[-2],
                self.stride[-1],
                self.kernel_size[-2],
                self.kernel_size[-1],
            )
        return self._forward(x)


class SqueezeExcitation(nn.Module):
    """Implements the Squeeze-and-Excitation (SE) block.

    Args:
        input_channels (int): Number of input channels.
        activation_2 (Callable[..., nn.Module]): Activation class applied after the second convolutional block.
        activation_1 (Callable[..., nn.Module]): Activation class applied after the first convolutional block.
        conv_type (str): Convolutional block type ("2plus1d" or "3d").
        squeeze_factor (int, optional): The reduction factor for the number of channels (default: 4).
        bias (bool, optional): Whether to add a bias term to the convolutional blocks (default: True).
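
    Example:
        A minimal usage sketch (channel count chosen only for illustration):

            >>> se = SqueezeExcitation(16, nn.Sigmoid, nn.SiLU, conv_type="3d")
            >>> se(torch.randn(1, 16, 4, 8, 8)).shape
            torch.Size([1, 16, 4, 8, 8])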
    """

    def __init__(
        self,
        input_channels: int,
        activation_2: Callable[..., nn.Module],
        activation_1: Callable[..., nn.Module],
        conv_type: str,
        squeeze_factor: int = 4,
        bias: bool = True,
    ) -> None:
        super().__init__()
        se_multiplier = 1
        squeeze_channels = _make_divisible(input_channels // squeeze_factor * se_multiplier, 8)
        self.fc1 = ConvBlock3D(
            input_channels * se_multiplier,
            squeeze_channels,
            kernel_size=(1, 1, 1),
            padding=(0, 0, 0),
            tf_like=False,
            conv_type=conv_type,
            bias=bias,
        )
        self.activation_1 = activation_1()
        self.activation_2 = activation_2()
        self.fc2 = ConvBlock3D(
            squeeze_channels,
            input_channels,
            kernel_size=(1, 1, 1),
            padding=(0, 0, 0),
            tf_like=False,
            conv_type=conv_type,
            bias=bias,
        )

    def _scale(self, x: Tensor) -> Tensor:
        """Computes the scaling factor for the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time, height, width).

        Returns:
            torch.Tensor: Scaling factor for the input tensor of shape (batch_size, channels, 1, 1, 1).
        """
        scale = F.adaptive_avg_pool3d(x, 1)
        scale = self.fc1(scale)
        scale = self.activation_1(scale)
        scale = self.fc2(scale)
        return self.activation_2(scale)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function of SqueezeExcitation."""
        scale = self._scale(x)
        return scale * x


def _make_divisible(value: float, divisor: int, min_value: int | None = None) -> int:
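    """Round ``value`` to the nearest multiple of ``divisor``.

    The result never falls below ``min_value`` (``divisor`` by default) and is
    bumped up by one ``divisor`` step if plain rounding would lose more than
    10% of ``value``.

    >>> _make_divisible(4, 8)
    8
    >>> _make_divisible(52, 8)
    56
    """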
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(value + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * value:
        new_v += divisor
    return new_v


def same_padding(
    x: Tensor,
    in_height: int,
    in_width: int,
    stride_h: int,
    stride_w: int,
    filter_height: int,
    filter_width: int,
) -> Tensor:
    """Applies padding to the input tensor to ensure that the output tensor size is the same as the input tensor size.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, channels, time, height, width).
        in_height (int): Height of the input tensor.
        in_width (int): Width of the input tensor.
        stride_h (int): Stride in the height dimension.
        stride_w (int): Stride in the width dimension.
        filter_height (int): Height of the filter (kernel).
        filter_width (int): Width of the filter (kernel).

    Returns:
        torch.Tensor: Padded tensor of shape (batch_size, channels, time, height + pad_h, width + pad_w), where
        pad_h and pad_w are the total amounts of padding added along the height and width dimensions.

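    Example:
        With a 3x3 kernel and stride 2 on a 7x7 input (illustrative values), two
        rows and two columns of padding are added in total, one per side:

            >>> x = torch.randn(1, 1, 4, 7, 7)
            >>> same_padding(x, 7, 7, 2, 2, 3, 3).shape
            torch.Size([1, 1, 4, 9, 9])
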
    """
    if in_height % stride_h == 0:
        pad_along_height = max(filter_height - stride_h, 0)
    else:
        pad_along_height = max(filter_height - (in_height % stride_h), 0)
    if in_width % stride_w == 0:
        pad_along_width = max(filter_width - stride_w, 0)
    else:
        pad_along_width = max(filter_width - (in_width % stride_w), 0)
    pad_top = pad_along_height // 2
    pad_bottom = pad_along_height - pad_top
    pad_left = pad_along_width // 2
    pad_right = pad_along_width - pad_left
    padding_pad = (pad_left, pad_right, pad_top, pad_bottom)
    return torch.nn.functional.pad(x, padding_pad)


class TFAvgPool3D(nn.Module):
    """3D average pooling layer with padding."""

    def __init__(self) -> None:
        super().__init__()
        self.avgf = nn.AvgPool3d((1, 3, 3), stride=(1, 2, 2))

    def forward(self, x: Tensor) -> Tensor:
        """Applies 3D average pooling with padding to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time, height, width).

        Returns:
            torch.Tensor: Pooled tensor of shape (batch_size, channels, time, height', width'), where
            height' and width' are the heights and widths of the pooled tensor after padding is applied.

        """
        # TensorFlow 'SAME' average pooling: the two branches reproduce TF's
        # asymmetric padding for odd and even spatial sizes.
        size_is_odd = x.shape[-1] % 2 != 0
        if size_is_odd:
            # Odd size: pad symmetrically inside avg_pool3d and exclude the
            # padded zeros from the average.
            x = torch.nn.functional.avg_pool3d(
                x,
                (1, 3, 3),
                stride=(1, 2, 2),
                count_include_pad=False,
                padding=(0, 1, 1),
            )
        else:
            # Even size: pad one column/row on the right/bottom, pool, then
            # rescale the border outputs whose 3x3 windows covered padded
            # zeros (9 taps counted by AvgPool3d, but only 6 were real).
            x = torch.nn.functional.pad(x, (0, 1, 0, 1))
            x = self.avgf(x)
            x[..., -1] = x[..., -1] * 9 / 6
            x[..., -1, :] = x[..., -1, :] * 9 / 6
        return x
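
# A minimal sketch of TFAvgPool3D (shape chosen only for illustration): a
# (1, 8, 4, 7, 7) input pools to (1, 8, 4, 4, 4), i.e. ceil(7 / 2) spatially,
# matching TensorFlow's 'SAME' average pooling:
#
#   pool = TFAvgPool3D()
#   pool(torch.randn(1, 8, 4, 7, 7)).shape  # torch.Size([1, 8, 4, 4, 4])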


class BasicBneck(nn.Module):
    """Basic bottleneck block of MoViNet network.

    Args:
        cfg (DictConfig): configuration object containing block's hyperparameters.
        tf_like (bool): A boolean indicating whether to use TensorFlow like convolution
            padding or not.
        conv_type (str): A string indicating the type of convolutional layer to use.
            Must be "2plus1d" or "3d".
        norm_layer (Callable[..., nn.Module], optional): A callable normalization layer
            to use. Defaults to None.
        activation_layer (Callable[..., nn.Module], optional): A callable activation
            layer to use. Defaults to None.

    Attributes:
        expand (ConvBlock3D, optional): An optional expansion convolutional block.
        deep (ConvBlock3D): A convolutional block with kernel size, stride, padding,
            and groups as specified in the configuration object.
        se (SqueezeExcitation): A squeeze-and-excitation block.
        project (ConvBlock3D): A projection convolutional block.
        res (nn.Sequential, optional): An optional residual convolutional block.
        alpha (nn.Parameter): A learnable parameter used in the ReZero operation.
    """

    def __init__(
        self,
        cfg: DictConfig,
        tf_like: bool,
        conv_type: str,
        norm_layer: Callable[..., nn.Module] | None = None,
        activation_layer: Callable[..., nn.Module] | None = None,
    ) -> None:
        super().__init__()
        self.res = None

        layers = []
        if cfg.expanded_channels != cfg.out_channels:
            self.expand = ConvBlock3D(
                in_planes=cfg.input_channels,
                out_planes=cfg.expanded_channels,
                kernel_size=(1, 1, 1),
                padding=(0, 0, 0),
                conv_type=conv_type,
                tf_like=tf_like,
                norm_layer=norm_layer,
                activation_layer=activation_layer,
            )
        self.deep = ConvBlock3D(
            in_planes=cfg.expanded_channels,
            out_planes=cfg.expanded_channels,
            kernel_size=cfg.kernel_size,
            padding=cfg.padding,
            stride=cfg.stride,
            groups=cfg.expanded_channels,
            conv_type=conv_type,
            tf_like=tf_like,
            norm_layer=norm_layer,
            activation_layer=activation_layer,
        )
        self.se = SqueezeExcitation(
            cfg.expanded_channels,
            activation_1=activation_layer,
            activation_2=(nn.Sigmoid if conv_type == "3d" else nn.Hardsigmoid),
            conv_type=conv_type,
        )
        self.project = ConvBlock3D(
            cfg.expanded_channels,
            cfg.out_channels,
            kernel_size=(1, 1, 1),
            padding=(0, 0, 0),
            conv_type=conv_type,
            tf_like=tf_like,
            norm_layer=norm_layer,
            activation_layer=nn.Identity,
        )

        if not (cfg.stride == (1, 1, 1) and cfg.input_channels == cfg.out_channels):
            if cfg.stride != (1, 1, 1):
                if tf_like:
                    layers.append(TFAvgPool3D())
                else:
                    layers.append(nn.AvgPool3d((1, 3, 3), stride=cfg.stride, padding=cfg.padding_avg))
            layers.append(
                ConvBlock3D(
                    in_planes=cfg.input_channels,
                    out_planes=cfg.out_channels,
                    kernel_size=(1, 1, 1),
                    padding=(0, 0, 0),
                    norm_layer=norm_layer,
                    activation_layer=nn.Identity,
                    conv_type=conv_type,
                    tf_like=tf_like,
                ),
            )
            self.res = nn.Sequential(*layers)
        # ReZero: scale the block output by a learnable alpha initialized to
        # zero, so each block starts out as (approximately) the identity mapping.
        self.alpha = nn.Parameter(torch.tensor(0.0), requires_grad=True)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function of BasicBneck."""
        residual = self.res(x) if self.res is not None else x
        if hasattr(self, "expand"):
            x = self.expand(x)
        x = self.deep(x)
        x = self.se(x)
        x = self.project(x)
        return residual + self.alpha * x
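
# A minimal sketch of a BasicBneck configuration (values are illustrative and
# mirror the first A0 block defined in MoViNetBackbone below):
#
#   cfg = DictConfig(
#       {
#           "input_channels": 8, "out_channels": 8, "expanded_channels": 24,
#           "kernel_size": (1, 5, 5), "stride": (1, 2, 2),
#           "padding": (0, 2, 2), "padding_avg": (0, 1, 1),
#       },
#   )
#   block = BasicBneck(
#       cfg, tf_like=True, conv_type="3d",
#       norm_layer=nn.BatchNorm3d, activation_layer=nn.SiLU,
#   )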


class MoViNetBackboneBase(nn.Module):
    """MoViNet class used for video classification.

    Args:
        cfg (DictConfig): configuration object containing network's hyperparameters.
        conv_type (str, optional): A string indicating the type of convolutional layer
            to use. Can be "2plus1d" or "3d". Defaults to "3d".
        tf_like (bool, optional): A boolean indicating whether to use TensorFlow like
            convolution padding or not. Defaults to False.

    Attributes:
        conv1 (ConvBlock3D): A convolutional block for the first layer.
        blocks (nn.Sequential): A sequence of basic bottleneck blocks.
        conv7 (ConvBlock3D): A convolutional block for the final layer.

    Methods:
        avg(x: Tensor) -> Tensor: Returns the adaptive average pool of the
            input tensor.
        _init_weights(module): A private method that initializes the weights of the network's
            convolutional, batch normalization, and linear layers.
        forward(x: Tensor) -> Tensor: The forward pass of the network.

    """

    def __init__(
        self,
        cfg: DictConfig,
        conv_type: str = "3d",
        tf_like: bool = False,
    ) -> None:
        super().__init__()
        tf_like = True  # NOTE: TensorFlow-like padding is always enabled, overriding the argument.
        blocks_dic = OrderedDict()

        norm_layer = nn.BatchNorm3d if conv_type == "3d" else nn.BatchNorm2d
        activation_layer = nn.SiLU if conv_type == "3d" else nn.Hardswish

        self.conv1 = ConvBlock3D(
            in_planes=cfg.conv1.input_channels,
            out_planes=cfg.conv1.out_channels,
            kernel_size=cfg.conv1.kernel_size,
            stride=cfg.conv1.stride,
            padding=cfg.conv1.padding,
            conv_type=conv_type,
            tf_like=tf_like,
            norm_layer=norm_layer,
            activation_layer=activation_layer,
        )
        for i, block in enumerate(cfg.blocks):
            for j, basicblock in enumerate(block):
                blocks_dic[f"b{i}_l{j}"] = BasicBneck(
                    basicblock,
                    conv_type=conv_type,
                    tf_like=tf_like,
                    norm_layer=norm_layer,
                    activation_layer=activation_layer,
                )
        self.blocks = nn.Sequential(blocks_dic)
        self.conv7 = ConvBlock3D(
            in_planes=cfg.conv7.input_channels,
            out_planes=cfg.conv7.out_channels,
            kernel_size=cfg.conv7.kernel_size,
            stride=cfg.conv7.stride,
            padding=cfg.conv7.padding,
            conv_type=conv_type,
            tf_like=tf_like,
            norm_layer=norm_layer,
            activation_layer=activation_layer,
        )

    def avg(self, x: Tensor) -> Tensor:
        """Returns the adaptive average pool of the input tensor.

        Args:
            x (Tensor): A tensor to be averaged.

        Returns:
            Tensor: A tensor with the averaged values.

        """
        return F.adaptive_avg_pool3d(x, 1)

    @staticmethod
    def _init_weights(module: nn.Module) -> None:
        if isinstance(module, nn.Conv3d):
            nn.init.kaiming_normal_(module.weight, mode="fan_out")
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, (nn.BatchNorm3d, nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, 0, 0.01)
            nn.init.zeros_(module.bias)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function of MoViNet."""
        x = self.conv1(x)
        x = self.blocks(x)
        x = self.conv7(x)
        return self.avg(x)

    def init_weights(self) -> None:
        """Initializes the weights of network."""
        self.apply(self._init_weights)


class MoViNetBackbone(MoViNetBackboneBase):
    """MoViNet wrapper class for OTX."""

    def __init__(self, **kwargs) -> None:
        cfg = DictConfig({})
        cfg.name = "A0"
        cfg.conv1 = DictConfig({})
        MoViNetBackbone.fill_conv(cfg.conv1, 3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1))

        cfg.blocks = [
            [DictConfig({})],
            [DictConfig({}) for _ in range(3)],
            [DictConfig({}) for _ in range(3)],
            [DictConfig({}) for _ in range(4)],
            [DictConfig({}) for _ in range(4)],
        ]

        # block 2
        MoViNetBackbone.fill_se_config(cfg.blocks[0][0], 8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1))

        # block 3
        MoViNetBackbone.fill_se_config(cfg.blocks[1][0], 8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0))
        MoViNetBackbone.fill_se_config(cfg.blocks[1][1], 32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[1][2], 32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))

        # block 4
        MoViNetBackbone.fill_se_config(cfg.blocks[2][0], 32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0))
        MoViNetBackbone.fill_se_config(cfg.blocks[2][1], 56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[2][2], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))

        # block 5
        MoViNetBackbone.fill_se_config(cfg.blocks[3][0], 56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[3][1], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[3][2], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[3][3], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))

        # block 6
        MoViNetBackbone.fill_se_config(cfg.blocks[4][0], 56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[4][1], 104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[4][2], 104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
        MoViNetBackbone.fill_se_config(cfg.blocks[4][3], 104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))

        cfg.conv7 = DictConfig({})
        MoViNetBackbone.fill_conv(cfg.conv7, 104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0))

        cfg.dense9 = DictConfig({"hidden_dim": 2048})
        super().__init__(cfg)

    @staticmethod
    def fill_se_config(
        conf: DictConfig,
        input_channels: int,
        out_channels: int,
        expanded_channels: int,
        kernel_size: tuple[int, int, int],
        stride: tuple[int, int, int],
        padding: tuple[int, int, int],
        padding_avg: tuple[int, int, int],
    ) -> None:
        """Set the values of a given DictConfig object for the SE module.

        Args:
            conf (DictConfig): The DictConfig object to be updated.
            input_channels (int): The number of input channels.
            out_channels (int): The number of output channels.
            expanded_channels (int): The number of channels after expansion in the basic block.
            kernel_size (tuple[int, int, int]): The size of the kernel.
            stride (tuple[int, int, int]): The stride of the kernel.
            padding (tuple[int, int, int]): The padding of the kernel.
            padding_avg (tuple[int, int, int]): The padding for the average pooling operation.

        Returns:
            None.
        """
        conf.expanded_channels = expanded_channels
        conf.padding_avg = padding_avg
        MoViNetBackbone.fill_conv(
            conf,
            input_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
        )

    @staticmethod
    def fill_conv(
        conf: DictConfig,
        input_channels: int,
        out_channels: int,
        kernel_size: tuple[int, int, int],
        stride: tuple[int, int, int],
        padding: tuple[int, int, int],
    ) -> None:
        """Set the values of a given DictConfig object for the conv layer.

        Args:
            conf (DictConfig): The DictConfig object to be updated.
            input_channels (int): The number of input channels.
            out_channels (int): The number of output channels.
            kernel_size (tuple[int, int, int]): The size of the kernel.
            stride (tuple[int, int, int]): The stride of the kernel.
            padding (tuple[int, int, int]): The padding of the kernel.

        Returns:
            None.
        """
        conf.input_channels = input_channels
        conf.out_channels = out_channels
        conf.kernel_size = kernel_size
        conf.stride = stride
        conf.padding = padding
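

# A minimal usage sketch (clip size is illustrative; the backbone returns
# globally average-pooled features):
#
#   backbone = MoViNetBackbone()
#   backbone.init_weights()
#   feats = backbone(torch.randn(1, 3, 8, 172, 172))  # feats.shape == (1, 480, 1, 1, 1)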