Source code for torch_uncertainty.datasets.classification.uci.online_shoppers

from collections.abc import Callable
from pathlib import Path

import pandas as pd
import torch

from .uci_classification import UCIClassificationDataset



[docs]
class OnlineShoppers(UCIClassificationDataset):
    md5_zip = "d835049e5f428f3b8cb8a6e6937f5537"
    url = "https://archive.ics.uci.edu/static/public/468/online+shoppers+purchasing+intention+dataset.zip"
    dataset_name = "online_shoppers"
    filename = "online_shoppers_intention.csv"
    num_features = 28

    def __init__(
        self,
        root: Path | str,
        transform: Callable | None = None,
        target_transform: Callable | None = None,
        binary: bool = True,
        download: bool = False,
        train: bool = True,
        test_split: float = 0.2,
        split_seed: int = 21893027,
    ) -> None:
        """The Online Shoppers Intention UCI classification dataset.

        Args:
            root (str | Path): Root directory of the datasets.
            train (bool, optional): If ``True``, creates dataset from training set,
                otherwise creates from test set.
            transform (callable, optional): A function/transform that takes in a
                numpy array and returns a transformed version. Defaults to ``None``.
            target_transform (callable, optional): A function/transform that takes
                in the target and transforms it. Defaults to ``None``.
            download (bool, optional): If ``True``, downloads the dataset from the
                internet and puts it in root directory. If dataset is already
                downloaded, it is not downloaded again. Defaults to ``False``.
            binary (bool, optional): Whether to use binary classification. Defaults
                to ``True``.
            test_split (float, optional): The fraction of the dataset to use as test set.
                Defaults to ``0.2``.
            split_seed (int, optional): The random seed for splitting the dataset.
                Defaults to ``21893027``.

        Note:
            License: The licenses of the datasets may differ from TorchUncertainty's
            license. Check before use.
        """
        super().__init__(
            root,
            transform,
            target_transform,
            binary,
            download,
            train,
            test_split,
            split_seed,
        )

    def _make_dataset(self) -> None:
        """Create dataset from extracted files."""
        data = pd.read_csv(
            self.root / self.dataset_name / self.filename,
            sep=",",
            true_values=["TRUE"],
            false_values=["FALSE"],
        )
        self.targets = torch.as_tensor(data["Revenue"].values, dtype=torch.long)

        data = pd.get_dummies(data).astype(float)
        data = data.drop(columns=["Revenue"])
        self.data = torch.as_tensor(data.values, dtype=torch.float32)
        self.num_features = self.data.shape[1]