Source code for torch_uncertainty.datasets.classification.tabular.bank_marketing
import logging
import numpy as np
import pandas as pd
import torch
from torchvision.datasets.utils import download_and_extract_archive, extract_archive
from .base import TabularClassificationDataset
[docs]
class BankMarketing(TabularClassificationDataset):
    """UCI Bank Marketing dataset for binary classification.

    The task is to predict, from direct marketing campaign data, whether a
    client subscribes to a term deposit.

    Reference:
        S. Moro et al., *A Data-Driven Approach to Predict the Success of Bank
        Telemarketing*, Decision Support Systems, 2014.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # Sub-folder of ``root`` in which the archives are stored and extracted.
    dataset_name = "bank_marketing"
    # CSV file (inside the ``bank-additional`` folder) read by ``_make_dataset``.
    filename = "bank-additional-full.csv"
    # Address of the outer archive and the MD5 checksum handed to the downloader.
    url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
    md5_zip = "3a3c6c4189975ea1f3040dbd60ad106c"

    def _check_integrity(self) -> bool:
        """Return ``True`` when the extracted CSV file exists on disk."""
        csv_path = self.root / self.dataset_name / "bank-additional" / self.filename
        return csv_path.is_file()

    def download(self) -> None:
        """Download and unpack the dataset unless the CSV is already present."""
        if self._check_integrity():
            logging.info("Files already downloaded and verified")
            return

        dataset_dir = self.root / self.dataset_name
        # First step: fetch the outer archive (checked against ``md5_zip``)
        # and extract it into the dataset folder.
        download_and_extract_archive(
            self.url,
            download_root=dataset_dir,
            filename="bank+marketing.zip",
            md5=self.md5_zip,
        )
        # Second step: unpack ``bank-additional.zip``, expected in the dataset
        # folder after the step above, into that same folder.
        extract_archive(dataset_dir / "bank-additional.zip", dataset_dir)

    def _make_dataset(self) -> None:
        """Load the CSV and populate ``targets``, ``data`` and ``num_features``.

        The ``y`` column becomes the label tensor (``"yes"`` -> 1, anything
        else -> 0). The remaining columns become a float32 feature matrix in
        which the remaining non-numeric columns are one-hot encoded.
        """
        csv_path = self.root / self.dataset_name / "bank-additional" / self.filename
        frame = pd.read_csv(csv_path, sep=";")

        labels = np.where(frame["y"] == "yes", 1, 0)
        self.targets = torch.as_tensor(labels.copy(), dtype=torch.long)
        frame = frame.drop(columns=["y"])

        # Only columns whose non-null values are exactly {"yes", "no"} are
        # collapsed to a single 0/1 column. Any other two-valued text column
        # (e.g. ``contact``: cellular/telephone) is deliberately left for
        # one-hot encoding, since mapping it through ``== "yes"`` would
        # silently turn a real feature into all zeros.
        yes_no_columns = [
            name
            for name in frame.select_dtypes(include="object").columns
            if set(frame[name].dropna().unique()) == {"yes", "no"}
        ]
        for name in yes_no_columns:
            frame[name] = np.where(frame[name] == "yes", 1, 0)

        encoded = pd.get_dummies(frame).astype(float)
        self.data = torch.as_tensor(encoded.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]