Source code for torch_uncertainty.datasets.regression.uci_regression

import logging
from collections.abc import Callable
from importlib import util
from pathlib import Path

# pandas is an optional dependency: import it only when it can be found and
# record the outcome in ``pandas_installed`` so that code needing it can fail
# with an explicit message instead of a NameError on ``pd``.
if util.find_spec("pandas"):
    import pandas as pd

    pandas_installed = True
else:  # coverage: ignore
    pandas_installed = False


import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.datasets.utils import (
    check_integrity,
    download_and_extract_archive,
    download_url,
)

# Column names assigned to the header-less ``housing.data`` file when the
# ``boston`` subset is read (passed as ``names=`` to ``pd.read_table``).
# The last entry is the column that ends up in ``targets``.
boston_column_names = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
    "MEDV",
]

# Columns selected, in this order, from ``energydata_complete.csv`` when the
# ``energy-prediction`` subset is read; every other column of the CSV is
# discarded by the selection.
energy_prediction_column_names = [
    "Appliances",
    "lights",
    "T1",
    "RH_1",
    "T2",
    "RH_2",
    "T3",
    "RH_3",
    "T4",
    "RH_4",
    "T5",
    "RH_5",
    "T6",
    "RH_6",
    "T7",
    "RH_7",
    "T8",
    "RH_8",
    "T9",
    "RH_9",
    # NOTE(review): this entry was annotated "Dropped", but being the last
    # selected column it is what ``_make_dataset`` stores in ``targets``
    # (``array[:, -1]``) - confirm the intent.
    "T_out",
]



[docs]
class UCIRegression(Dataset):
    # Sub-directory of ``root`` under which every subset is stored.
    root_appendix = "uci_regression"
    # Accepted values for ``dataset_name``. ``md5_tgz`` and ``urls`` below are
    # parallel lists: entry ``i`` of each belongs to ``uci_subsets[i]`` (the
    # constructor looks them up through ``uci_subsets.index``).
    uci_subsets = [
        "boston",
        "concrete",
        "energy-efficiency",
        "energy-prediction",
        "kin8nm",
        "naval-propulsion-plant",
        "power-plant",
        "protein",
        "wine-quality-red",
        "yacht",
    ]

    # MD5 checksum expected for the file downloaded from the matching URL.
    # NOTE(review): the name says "tgz" but the URLs below point to zip
    # archives and plain data files.
    md5_tgz = [
        "d4accdce7a25600298819f8e28e8d593",
        "eba3e28907d4515244165b6b2c311b7b",
        "2018fb7b50778fdc1304d50a78874579",
        "d0f0f8ceaaf45df2233ce0600097bd84",
        "df08c665b7665809e74e32b107836a3a",
        "54f4febcf51bdba12e1ca63e28b3e973",
        "f5065a616eae05eb4ecae445ecf6e720",
        "37bcb77a8abad274a987439e6a3de632",
        "0ddfa7a9379510fe7ff88b9930e3c332",
        "4e6727f462779e2d396e8f7d2ddb79a3",
    ]
    # Download location of each subset; the last path component is used as
    # the local file name of the download.
    urls = [
        "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
        "https://archive.ics.uci.edu/static/public/165/concrete+compressive+strength.zip",
        "https://archive.ics.uci.edu/static/public/242/energy+efficiency.zip",
        "https://archive.ics.uci.edu/static/public/374/appliances+energy+prediction.zip",
        "https://zenodo.org/records/14645866/files/kin8nm.csv",
        "https://raw.githubusercontent.com/luishpinto/cm-naval-propulsion-plant/master/data.csv",
        "https://archive.ics.uci.edu/static/public/294/combined+cycle+power+plant.zip",
        # Two adjacent literals: Python concatenates them into a single URL.
        "https://archive.ics.uci.edu/static/public/265/physicochemical+"
        "properties+of+protein+tertiary+structure.zip",
        "https://archive.ics.uci.edu/static/public/186/wine+quality.zip",
        "https://archive.ics.uci.edu/static/public/243/yacht+hydrodynamics.zip",
    ]

    def __init__(
        self,
        root: Path | str,
        transform: Callable | None = None,
        target_transform: Callable | None = None,
        dataset_name: str = "energy",
        download: bool = False,
        seed: int = 42,
        shuffle: bool = True,
    ) -> None:
        """The UCI regression datasets.

        Args:
            root (str): Root directory of the datasets.
            transform (callable, optional): A function/transform that takes in a
                numpy array and returns a transformed version. Defaults to ``None``.
            target_transform (callable, optional): A function/transform that takes
                in the target and transforms it. Defaults to ``None``.
            dataset_name (str, optional): The name of the dataset. One of
                ``boston-housing``, ``concrete``, ``energy``, ``kin8nm``,
                ``naval-propulsion-plant``, ``power-plant``, ``protein``,
                ``wine-quality-red``, and ``yacht``. Defaults to ``energy``.
            download (bool, optional): If ``True``, downloads the dataset from the
                internet and puts it in root directory. If dataset is already
                downloaded, it is not downloaded again. Defaults to ``False``.
            seed (int, optional): The random seed for shuffling the dataset.
                Defaults to ``42``.
            shuffle (bool, optional): If ``True``, shuffles the dataset.
                Defaults to ``True``.

        Note:
            You may want to avoid using the boston-housing dataset because of
            ethical concerns.
            The licenses of the datasets may differ from TorchUncertainty's
            license. Check before use.
        """
        super().__init__()
        self.root = Path(root)
        self.transform = transform
        self.target_transform = target_transform
        self.seed = seed
        self.shuffle = shuffle

        if dataset_name not in self.uci_subsets:
            raise ValueError(
                f"The dataset {dataset_name} is not implemented. "
                "`dataset_name` should be one of {self.uci_subsets}."
            )
        self.dataset_name = dataset_name
        dataset_id = self.uci_subsets.index(dataset_name)
        self.url = self.urls[dataset_id]
        self.start_filename = self.url.split("/")[-1]
        self.md5 = self.md5_tgz[dataset_id]

        if download:
            self.download()

        self._make_dataset()

    def __len__(self) -> int:
        """Get the length of the dataset."""
        return self.data.shape[0]

    def _check_integrity(self) -> bool:
        """Check the integrity of the downloaded file of the current subset.

        The file is looked up in the directory ``download`` writes to, i.e.
        ``root / root_appendix / dataset_name``, under the name taken from the
        last component of the URL.

        Returns:
            bool: Result of torchvision's ``check_integrity`` for that file
            and the subset's expected MD5 checksum.
        """
        # The subset directory must be part of the path: without it the lookup
        # targets a location ``download`` never writes to.
        downloaded_file = (
            self.root / self.root_appendix / self.dataset_name / self.start_filename
        )
        return check_integrity(downloaded_file, self.md5)

    def _standardize(self) -> None:
        self.data = (self.data - self.data_mean) / self.data_std
        self.targets = (self.targets - self.target_mean) / self.target_std

    def _compute_statistics(self) -> None:
        self.data_mean = self.data.mean(axis=0)
        self.data_std = self.data.std(axis=0)
        self.data_std[self.data_std == 0] = 1
        self.target_mean = self.targets.mean(axis=0)
        self.target_std = self.targets.std(axis=0)


[docs]
    def download(self) -> None:
        """Fetch the files of the current subset, extracting archives.

        Nothing is done when ``_check_integrity`` already succeeds. Subsets
        distributed as one plain file are saved as-is; every other subset is
        downloaded as an archive and extracted into the same directory.

        Raises:
            ValueError: If the subset has no URL.
        """
        if self._check_integrity():
            logging.info("Files already downloaded and verified")
            return
        if self.url is None:
            raise ValueError(f"The dataset {self.dataset_name} is not available for download.")

        target_dir = self.root / self.root_appendix / self.dataset_name
        # Subsets served as a single plain file, mapped to the name they are
        # saved under.
        plain_files = {
            "boston": "housing.data",
            "kin8nm": "kin8nm.csv",
            "naval-propulsion-plant": "data.csv",
        }
        plain_name = plain_files.get(self.dataset_name)
        if plain_name is not None:
            download_url(self.url, root=target_dir, filename=plain_name)
            return

        download_and_extract_archive(
            self.url,
            download_root=target_dir,
            extract_root=target_dir,
            filename=self.start_filename,
            md5=self.md5,
        )


    def _make_dataset(self) -> None:
        """Create dataset from extracted files.

        Reads the file of the current subset from
        ``root / root_appendix / dataset_name`` with pandas, converts it to a
        float tensor, splits it into ``data`` and ``targets``, standardizes
        both and, when ``shuffle`` is set, applies one seeded row permutation
        to ``data`` and ``targets`` together.

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If the subset has no reader implemented here.
        """
        if not pandas_installed:  # coverage: ignore
            raise ImportError(
                "Please install torch_uncertainty with the tabular option: "
                """pip install -U "torch_uncertainty[tabular]"."""
            )
        path = self.root / self.root_appendix / self.dataset_name
        if self.dataset_name == "boston":
            array = pd.read_table(
                path / "housing.data",
                names=boston_column_names,
                header=None,
                sep=r"\s+",
            ).to_numpy()
        elif self.dataset_name == "concrete":
            array = pd.read_excel(path / "Concrete_Data.xls").to_numpy()
        elif self.dataset_name == "energy-efficiency":
            array = pd.read_excel(path / "ENB2012_data.xlsx").to_numpy()
        elif self.dataset_name == "energy-prediction":
            array = pd.read_csv(path / "energydata_complete.csv")[
                energy_prediction_column_names
            ].to_numpy()
        elif self.dataset_name == "kin8nm":
            array = pd.read_csv(path / "kin8nm.csv").to_numpy()
        elif self.dataset_name == "naval-propulsion-plant":
            df = pd.read_csv(path / "data.csv", header=None, sep=";", decimal=",")
            # Coerce every column to numbers (unparsable cells become NaN) and
            # drop the last column, i.e. the second target.
            array = df.apply(pd.to_numeric, errors="coerce").to_numpy()[:, :-1]
        elif self.dataset_name == "protein":
            array = pd.read_csv(
                path / "CASP.csv",
            ).to_numpy()
        elif self.dataset_name == "wine-quality-red":
            array = pd.read_csv(
                path / "winequality-red.csv",
                sep=";",
            ).to_numpy()
        elif self.dataset_name == "yacht":
            array = pd.read_csv(
                path / "yacht_hydrodynamics.data",
                sep=r"\s+",
                header=None,
            ).to_numpy()
        else:
            raise ValueError("Dataset not implemented.")

        array = torch.as_tensor(array).float()

        if self.dataset_name == "energy-efficiency":
            self.data = array[:, 2:-3]
            self.targets = array[:, -2]
        else:
            self.data = array[:, :-1]
            self.targets = array[:, -1]

        self._compute_statistics()
        self._standardize()

        if self.dataset_name == "energy-prediction":
            # Prepend 13 all-zero rows; ``__getitem__`` reads this subset as
            # windows of 13 consecutive rows.
            self.data = F.pad(self.data, (0, 0, 13, 0), value=0)
            # The rows are left in file order: permuting them would scramble
            # those contiguous windows.
            # NOTE(review): ``shuffle`` therefore has no effect on this subset
            # (as before this fix) - confirm that this is intended.
            return

        if self.shuffle:
            gen = torch.Generator()
            gen.manual_seed(self.seed)
            # The permutation has to be applied to the stored tensors, with
            # the same indexes for both so samples stay aligned with targets.
            # Permuting the local ``array`` after the split changes nothing.
            indexes = torch.randperm(self.data.shape[0], generator=gen)
            self.data = self.data[indexes]
            self.targets = self.targets[indexes]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Get sample and target for a given index."""
        if self.dataset_name == "energy-prediction":
            data = self.data[index : index + 13, :]
            target = self.data[index : index + 13, :]
            return data, target

        data = self.data[index]
        if self.transform is not None:
            data = self.transform(data)
        target = self.targets[index]
        if self.target_transform is not None:
            target = self.target_transform(target)
        return data, target

    def __repr__(self) -> str:
        """Dataset representation."""
        head = f"Dataset {self.__class__.__name__}: {self.dataset_name}"
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        if hasattr(self, "transforms") and self.transforms is not None:
            body += [repr(self.transforms)]
        lines = [head] + [" " * 4 + line for line in body]
        return "\n".join(lines)

    def _format_transform_repr(self, transform: Callable, head: str) -> list[str]:
        lines = transform.__repr__().splitlines()
        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]

    def extra_repr(self) -> str:
        return f"number of features: {self.data.shape[1]}\nnumber of outputs: {self.targets.shape[1] if self.targets.ndim > 1 else 1}"