Source code for torch_uncertainty.datasets.classification.imagenet.tiny_imagenet_c

import logging
from collections.abc import Callable
from pathlib import Path

from torchvision.datasets import ImageFolder
from torchvision.datasets.utils import (
    check_integrity,
    download_and_extract_archive,
)



[docs]
class TinyImageNetC(ImageFolder):
    # Name of the dataset folder expected under ``root``.
    base_folder = "Tiny-ImageNet-C"
    # MD5 checksums of the archives, in the same order as ``filename``.
    tgz_md5 = [
        "f9c9a9dbdc11469f0b850190f7ad8be1",
        "0db0588d243cf403ef93449ec52b70eb",
    ]
    # Corruption types accepted by the ``subset`` argument (besides ``all``).
    subsets = [
        "brightness",
        "contrast",
        "defocus_blur",
        "elastic_transform",
        "fog",
        "frost",
        "gaussian_blur",
        "gaussian_noise",
        "glass_blur",
        "impulse_noise",
        "jpeg_compression",
        "motion_blur",
        "pixelate",
        "saturate",
        "shot_noise",
        "snow",
        "spatter",
        "speckle_noise",
        "zoom_blur",
    ]

    # Base URL; the archive file names below are appended to it.
    url = "https://zenodo.org/record/8206060/files/"
    filename = ["Tiny-ImageNet-C.tar", "Tiny-ImageNet-C-extra.tar"]

    def __init__(
        self,
        root: str | Path,
        transform: Callable | None = None,
        target_transform: Callable | None = None,
        subset: str = "all",
        shift_severity: int = 1,
        download: bool = False,
    ) -> None:
        """The corrupted TinyImageNet-C Dataset.

        Args:
            root: Root directory of the datasets.
            transform: A function/transform that takes in
                a PIL image and returns a transformed version. E.g,
                ``transforms.RandomCrop``. Defaults to ``None``.
            target_transform: A function/transform that
                takes in the target and transforms it. Defaults to ``None``.
            subset: The subset to use, one of ``all`` or the names listed in
                ``TinyImageNetC.subsets``. Defaults to ``"all"``.
            shift_severity: The shift_severity of the corruption, between ``1`` and ``5``.
                Defaults to ``1``.
            download: If True, downloads the dataset from the
                internet and puts it in root directory. If dataset is already
                downloaded, it is not downloaded again. Defaults to ``False``.

        Raises:
            ValueError: If ``subset`` is not ``all`` or one of
                ``TinyImageNetC.subsets``, or if ``shift_severity`` is not
                between ``1`` and ``5``.
            RuntimeError: If the archives are missing or fail the checksum
                verification.

        References:
            [1] `Benchmarking neural network robustness to common corruptions and perturbations. Dan Hendrycks and Thomas Dietterich. In ICLR, 2019
            <https://arxiv.org/abs/1903.12261>`_.
        """
        # Validate the cheap arguments first so that a typo fails before any
        # download or directory scan is attempted.
        if subset not in ["all", *self.subsets]:
            raise ValueError(f"The subset '{subset}' does not exist in TinyImageNet-C.")
        if shift_severity not in range(1, 6):
            raise ValueError(
                "Corruptions shift_severity should be chosen between 1 and 5 included."
            )

        self.root = Path(root)
        self._data_dir = self.root / self.base_folder
        # Every corruption/severity shares the same class/file layout, so one
        # of them is scanned once and the other paths are derived from it.
        self._reference_dir = self._data_dir / "brightness" / "1"

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError("Dataset not found. You can use download=True to download it.")

        super().__init__(
            root=self._reference_dir,
            transform=transform,
        )
        self.subset = subset
        self.shift_severity = shift_severity

        # ``target_transform`` is not forwarded to the parent constructor; it
        # is attached here instead.
        self.transform = transform
        self.target_transform = target_transform

        # Update samples given the subset and shift_severity
        self._make_c_dataset(self.subset, self.shift_severity)

    def _corrupted_path(self, path: str, subset: str, shift_severity: int) -> str:
        """Map a path of the reference directory to another corruption.

        Only the ``<corruption>/<severity>`` components are swapped; the part
        of the path below the reference directory is kept as is. Working on
        path components (rather than substrings) keeps the rewrite correct
        when ``root`` itself contains ``brightness`` or ``/1/`` and when the
        platform uses a different path separator.

        Args:
            path: Path of an image located under the reference directory.
            subset: Name of the target corruption.
            shift_severity: Target severity level.

        Returns:
            The path of the same image for the requested corruption and
            severity, as a string.
        """
        relative = Path(path).relative_to(self._reference_dir)
        return str(self._data_dir / subset / str(shift_severity) / relative)

    def _make_c_dataset(self, subset: str, shift_severity: int) -> None:
        """Build the corrupted dataset according to the chosen subset and
        shift_severity. If the subset is 'all', gather all corruption types
        in the dataset.

        The samples found in the reference directory are re-targeted to the
        requested corruption(s); ``imgs``, ``samples`` and ``targets`` are
        all updated so that they stay consistent with each other.

        Args:
            subset: The name of the corruption subset to be used. Choose
                `all` for the dataset to contain all subsets.
            shift_severity: The shift_severity of the corruption applied to the
                images.
        """
        corruptions = self.subsets if subset == "all" else [subset]
        reference = list(self.imgs)

        self.imgs = [
            (self._corrupted_path(path, corruption, shift_severity), target)
            for corruption in corruptions
            for path, target in reference
        ]
        # torchvision's ``DatasetFolder`` serves items from ``samples``, so it
        # must be rebound together with ``imgs`` for every subset choice.
        self.samples = self.imgs
        self.targets = [target for _, target in self.samples]

    def _check_integrity(self) -> bool:
        """Check the integrity of the dataset.

        Returns:
            ``True`` if every archive is found at its expected location and
            matches its MD5 checksum, ``False`` otherwise.
        """
        for filename, md5 in zip(self.filename, self.tgz_md5, strict=True):
            # The "extra" archive is kept inside the dataset folder, the main
            # one directly under the root directory.
            parent = self.root / self.base_folder if "extra" in filename else self.root
            if not check_integrity(parent / filename, md5):
                return False
        return True

    def download(self) -> None:
        """Download the dataset.

        Nothing is done when the archives are already present and verified.
        """
        if self._check_integrity():
            logging.info("Files already downloaded and verified")
            return
        for filename, md5 in zip(self.filename, self.tgz_md5, strict=True):
            # Store the "extra" archive inside the dataset folder, which is
            # where ``_check_integrity`` looks for it.
            extra_kwargs = (
                {"filename": f"{self.base_folder}/{filename}"} if "extra" in filename else {}
            )
            download_and_extract_archive(
                self.url + filename,
                self.root,
                md5=md5,
                **extra_kwargs,
            )