# Source code for torch_uncertainty.datasets.classification.tabular.bank_marketing

import logging

import numpy as np
import pandas as pd
import torch
from torchvision.datasets.utils import download_and_extract_archive, extract_archive

from .base import TabularClassificationDataset



[docs]
class BankMarketing(TabularClassificationDataset):
    """UCI Bank Marketing dataset for binary tabular classification.

    Each sample describes a contact made during a direct marketing campaign;
    the label tells whether the client subscribed to a term deposit
    (``1`` for "yes", ``0`` otherwise).

    Reference:
        S. Moro et al., *A Data-Driven Approach to Predict the Success of Bank
        Telemarketing*, Decision Support Systems, 2014.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # MD5 checksum passed to the downloader for the archive fetched from ``url``.
    md5_zip = "3a3c6c4189975ea1f3040dbd60ad106c"
    url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
    # Sub-directory of ``root`` in which the archives are stored and extracted.
    dataset_name = "bank_marketing"
    # CSV file read by ``_make_dataset``.
    filename = "bank-additional-full.csv"

    def _check_integrity(self) -> bool:
        """Return ``True`` when the extracted CSV file exists on disk.

        Only the presence of the file is checked; its content is not hashed.
        """
        csv_path = self.root / self.dataset_name / "bank-additional" / self.filename
        return csv_path.is_file()

    def download(self) -> None:
        """Download the archive and unpack it, unless the CSV is already there."""
        if self._check_integrity():
            logging.info("Files already downloaded and verified")
            return

        dataset_dir = self.root / self.dataset_name
        download_and_extract_archive(
            self.url,
            download_root=dataset_dir,
            filename="bank+marketing.zip",
            md5=self.md5_zip,
        )
        # The CSV is expected inside a nested archive left in ``dataset_dir`` by
        # the call above, so a second extraction step is required.
        extract_archive(dataset_dir / "bank-additional.zip", dataset_dir)

    def _make_dataset(self) -> None:
        """Read the CSV and populate ``targets``, ``data`` and ``num_features``.

        The ``y`` column becomes the integer targets. Object columns whose
        values are exactly "yes"/"no" are mapped to 1/0; every remaining
        non-numeric column is one-hot encoded.
        """
        csv_path = self.root / self.dataset_name / "bank-additional" / self.filename
        frame = pd.read_csv(csv_path, sep=";")

        labels = np.where(frame["y"] == "yes", 1, 0).copy()
        self.targets = torch.as_tensor(labels, dtype=torch.long)

        features = frame.drop(columns=["y"])

        # Only columns whose unique non-null values are literally {"yes", "no"}
        # are compressed to a single 0/1 column. Other two-valued object columns
        # (e.g. ``contact``: cellular/telephone) must go through one-hot
        # encoding, otherwise a real feature would silently become all zeros.
        object_columns = features.select_dtypes(include="object").columns
        yes_no_columns = [
            name
            for name in object_columns
            if set(features[name].dropna().unique()) == {"yes", "no"}
        ]
        for name in yes_no_columns:
            features[name] = np.where(features[name] == "yes", 1, 0)

        encoded = pd.get_dummies(features).astype(float).values.copy()
        self.data = torch.as_tensor(encoded, dtype=torch.float32)
        self.num_features = self.data.shape[1]