"""Custom Folder Dataset.
This script creates a custom dataset from a folder.
"""
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from pathlib import Path
import albumentations as A
from pandas import DataFrame
from anomalib.data.base import AnomalibDataModule, AnomalibDataset
from anomalib.data.task_type import TaskType
from anomalib.data.utils import (
DirType,
InputNormalizationMethod,
LabelName,
Split,
TestSplitMode,
ValSplitMode,
get_transforms,
)
from anomalib.data.utils.path import _prepare_files_labels, _resolve_path
def make_folder_dataset(
normal_dir: str | Path,
root: str | Path | None = None,
abnormal_dir: str | Path | None = None,
normal_test_dir: str | Path | None = None,
mask_dir: str | Path | None = None,
split: str | Split | None = None,
extensions: tuple[str, ...] | None = None,
) -> DataFrame:
"""Make Folder Dataset.
Args:
normal_dir (str | Path): Path to the directory containing normal images.
root (str | Path | None): Path to the root directory of the dataset.
abnormal_dir (str | Path | None, optional): Path to the directory containing abnormal images.
normal_test_dir (str | Path | None, optional): Path to the directory containing
normal images for the test dataset. Normal test images will be a split of `normal_dir`
if `None`. Defaults to None.
mask_dir (str | Path | None, optional): Path to the directory containing
the mask annotations. Defaults to None.
split (str | Split | None, optional): Dataset split (i.e., Split.FULL, Split.TRAIN or Split.TEST).
Defaults to None.
extensions (tuple[str, ...] | None, optional): Image file extensions to read from the
directory. Defaults to None.
Returns:
DataFrame: An output dataframe containing samples for the requested split (i.e., train or test).
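
Examples:
    A minimal, illustrative call; the root path and the ``good``/``crack``/``mask`` folder
    names below are placeholders, not defaults. The returned dataframe has ``image_path``,
    ``label``, ``label_index``, ``mask_path`` and ``split`` columns.

    >>> samples = make_folder_dataset(
    ...     root="./datasets/hazelnut",
    ...     normal_dir="good",
    ...     abnormal_dir="crack",
    ...     mask_dir="mask",
    ...     split=Split.TRAIN,
    ... )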
"""
normal_dir = _resolve_path(normal_dir, root)
abnormal_dir = _resolve_path(abnormal_dir, root) if abnormal_dir is not None else None
normal_test_dir = _resolve_path(normal_test_dir, root) if normal_test_dir is not None else None
mask_dir = _resolve_path(mask_dir, root) if mask_dir is not None else None
assert normal_dir.is_dir(), "A folder location must be provided in normal_dir."
filenames = []
labels = []
dirs = {DirType.NORMAL: normal_dir}
if abnormal_dir:
dirs = {**dirs, **{DirType.ABNORMAL: abnormal_dir}}
if normal_test_dir:
dirs = {**dirs, **{DirType.NORMAL_TEST: normal_test_dir}}
if mask_dir:
dirs = {**dirs, **{DirType.MASK: mask_dir}}
for dir_type, path in dirs.items():
filename, label = _prepare_files_labels(path, dir_type, extensions)
filenames += filename
labels += label
samples = DataFrame({"image_path": filenames, "label": labels})
samples = samples.sort_values(by="image_path", ignore_index=True)
# Create label index for normal (0) and abnormal (1) images.
samples.loc[
(samples.label == DirType.NORMAL) | (samples.label == DirType.NORMAL_TEST), "label_index"
] = LabelName.NORMAL
samples.loc[(samples.label == DirType.ABNORMAL), "label_index"] = LabelName.ABNORMAL
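# Use pandas' nullable Int64 dtype so rows whose label has no index mapping (e.g. mask images) keep <NA>.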
samples.label_index = samples.label_index.astype("Int64")
# If a path to mask is provided, add it to the sample dataframe.
if mask_dir is not None and abnormal_dir is not None:
samples.loc[samples.label == DirType.ABNORMAL, "mask_path"] = samples.loc[
samples.label == DirType.MASK
].image_path.values
samples["mask_path"].fillna("", inplace=True)
samples = samples.astype({"mask_path": "str"})
# Make sure every anomalous image has a corresponding mask image.
assert (
samples.loc[samples.label_index == LabelName.ABNORMAL]
.apply(lambda x: Path(x.image_path).stem in Path(x.mask_path).stem, axis=1)
.all()
), "Mismatch between anomalous images and mask images. Make sure the mask files \
folder follow the same naming convention as the anomalous images in the dataset \
(e.g. image: '000.png', mask: '000.png')."
else:
samples["mask_path"] = ""
# Keep only the normal, abnormal and normal test rows; the mask rows have already been assigned to mask_path above.
samples = samples.loc[
(samples.label == DirType.NORMAL) | (samples.label == DirType.ABNORMAL) | (samples.label == DirType.NORMAL_TEST)
]
# Ensure the pathlib objects are converted to str.
# This is because torch dataloader doesn't like pathlib.
samples = samples.astype({"image_path": "str"})
# Create train/test split.
# By default, all normal samples are assigned to the train set,
# and all abnormal and normal test samples to the test set.
samples.loc[(samples.label == DirType.NORMAL), "split"] = Split.TRAIN
samples.loc[(samples.label == DirType.ABNORMAL) | (samples.label == DirType.NORMAL_TEST), "split"] = Split.TEST
# Get the data frame for the split.
if split:
samples = samples[samples.split == split]
samples = samples.reset_index(drop=True)
return samples
class FolderDataset(AnomalibDataset):
"""Folder dataset.
Args:
task (TaskType): Task type. (``classification``, ``detection`` or ``segmentation``).
transform (A.Compose): Albumentations Compose object describing the transforms that are applied to the inputs.
split (str | Split | None): Fixed subset split that follows from folder structure on file system.
Choose from [Split.FULL, Split.TRAIN, Split.TEST]
normal_dir (str | Path): Path to the directory containing normal images.
root (str | Path | None): Root folder of the dataset.
abnormal_dir (str | Path | None, optional): Path to the directory containing abnormal images.
normal_test_dir (str | Path | None, optional): Path to the directory containing
normal images for the test dataset. Defaults to None.
mask_dir (str | Path | None, optional): Path to the directory containing
the mask annotations. Defaults to None.
extensions (tuple[str, ...] | None, optional): Image file extensions to read from the
directory. Defaults to None.
Raises:
ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is
provided, `task` should be set to `segmentation`.
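
Examples:
    A minimal sketch; the ``transform`` built here and the root/folder names are illustrative
    placeholders, not prescribed defaults.

    >>> transform = get_transforms(image_size=256)
    >>> dataset = FolderDataset(
    ...     task=TaskType.CLASSIFICATION,
    ...     transform=transform,
    ...     root="./datasets/hazelnut",
    ...     normal_dir="good",
    ...     abnormal_dir="crack",
    ...     split=Split.TRAIN,
    ... )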
"""
def __init__(
self,
task: TaskType,
transform: A.Compose,
normal_dir: str | Path,
root: str | Path | None = None,
abnormal_dir: str | Path | None = None,
normal_test_dir: str | Path | None = None,
mask_dir: str | Path | None = None,
split: str | Split | None = None,
extensions: tuple[str, ...] | None = None,
) -> None:
super().__init__(task, transform)
self.split = split
self.root = root
self.normal_dir = normal_dir
self.abnormal_dir = abnormal_dir
self.normal_test_dir = normal_test_dir
self.mask_dir = mask_dir
self.extensions = extensions
def _setup(self) -> None:
"""Assign samples."""
self.samples = make_folder_dataset(
root=self.root,
normal_dir=self.normal_dir,
abnormal_dir=self.abnormal_dir,
normal_test_dir=self.normal_test_dir,
mask_dir=self.mask_dir,
split=self.split,
extensions=self.extensions,
)
class Folder(AnomalibDataModule):
"""Folder DataModule.
Args:
normal_dir (str | Path): Name of the directory containing normal images.
root (str | Path | None): Path to the root folder containing normal and abnormal dirs.
Defaults to None.
abnormal_dir (str | Path | None): Name of the directory containing abnormal images.
Defaults to None.
normal_test_dir (str | Path | None, optional): Path to the directory containing
normal images for the test dataset. Defaults to None.
mask_dir (str | Path | None, optional): Path to the directory containing
the mask annotations. Defaults to None.
normal_split_ratio (float, optional): Ratio of normal training images to move to the test set
when the test set does not contain any normal images.
Defaults to 0.2.
extensions (tuple[str, ...] | None, optional): Image file extensions to read from the
directory. Defaults to None.
image_size (int | tuple[int, int] | None, optional): Size of the input image.
Defaults to None.
center_crop (int | tuple[int, int] | None, optional): When provided, the images will be center-cropped
to the provided dimensions.
normalization (str | InputNormalizationMethod, optional): Normalization method to apply to the input images.
Defaults to InputNormalizationMethod.IMAGENET.
train_batch_size (int, optional): Training batch size. Defaults to 32.
eval_batch_size (int, optional): Evaluation batch size (used for validation and test). Defaults to 32.
num_workers (int, optional): Number of workers. Defaults to 8.
task (TaskType, optional): Task type. Could be ``classification``, ``detection`` or ``segmentation``.
Defaults to segmentation.
transform_config_train (str | A.Compose | None, optional): Config for pre-processing
during training.
Defaults to None.
transform_config_eval (str | A.Compose | None, optional): Config for pre-processing
during evaluation (validation and test).
Defaults to None.
test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained.
test_split_ratio (float): Fraction of images from the train set that will be reserved for testing.
val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained.
val_split_ratio (float): Fraction of train or test images that will be reserved for validation.
seed (int | None, optional): Seed used during random subset splitting.
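
Examples:
    A minimal sketch with placeholder paths; ``setup()`` creates the train/test/val subsets
    before the dataloaders are requested.

    >>> datamodule = Folder(
    ...     root="./datasets/hazelnut",
    ...     normal_dir="good",
    ...     abnormal_dir="crack",
    ...     mask_dir="mask",
    ...     task=TaskType.SEGMENTATION,
    ...     image_size=256,
    ... )
    >>> datamodule.setup()
    >>> batch = next(iter(datamodule.train_dataloader()))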
"""
def __init__(
self,
normal_dir: str | Path,
root: str | Path | None = None,
abnormal_dir: str | Path | None = None,
normal_test_dir: str | Path | None = None,
mask_dir: str | Path | None = None,
normal_split_ratio: float = 0.2,
extensions: tuple[str, ...] | None = None,
image_size: int | tuple[int, int] | None = None,
center_crop: int | tuple[int, int] | None = None,
normalization: str | InputNormalizationMethod = InputNormalizationMethod.IMAGENET,
train_batch_size: int = 32,
eval_batch_size: int = 32,
num_workers: int = 8,
task: TaskType = TaskType.SEGMENTATION,
transform_config_train: str | A.Compose | None = None,
transform_config_eval: str | A.Compose | None = None,
test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR,
test_split_ratio: float = 0.2,
val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST,
val_split_ratio: float = 0.5,
seed: int | None = None,
) -> None:
super().__init__(
train_batch_size=train_batch_size,
eval_batch_size=eval_batch_size,
num_workers=num_workers,
test_split_mode=test_split_mode,
test_split_ratio=test_split_ratio,
val_split_mode=val_split_mode,
val_split_ratio=val_split_ratio,
seed=seed,
)
self.normal_split_ratio = normal_split_ratio
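# Build separate albumentations pipelines for training and evaluation from the provided configs.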
transform_train = get_transforms(
config=transform_config_train,
image_size=image_size,
center_crop=center_crop,
normalization=InputNormalizationMethod(normalization),
)
transform_eval = get_transforms(
config=transform_config_eval,
image_size=image_size,
center_crop=center_crop,
normalization=InputNormalizationMethod(normalization),
)
self.train_data = FolderDataset(
task=task,
transform=transform_train,
split=Split.TRAIN,
root=root,
normal_dir=normal_dir,
abnormal_dir=abnormal_dir,
normal_test_dir=normal_test_dir,
mask_dir=mask_dir,
extensions=extensions,
)
self.test_data = FolderDataset(
task=task,
transform=transform_eval,
split=Split.TEST,
root=root,
normal_dir=normal_dir,
abnormal_dir=abnormal_dir,
normal_test_dir=normal_test_dir,
mask_dir=mask_dir,
extensions=extensions,
)