Source code for torch_uncertainty.datasets.classification.tabular.telco_churn
import pandas as pd
import torch
from .base import TabularClassificationDataset, load_arff
[docs]
class TelcoChurn(TabularClassificationDataset):
    """Telecom Customer Churn dataset (OpenML 40701).

    Binary classification task: predict whether a customer churns. The data
    is fetched from the OpenML repository as a single ARFF file. The
    ``phone_number`` column is removed because it is an identifier with no
    predictive value, and the remaining object-typed feature columns are
    one-hot encoded.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # OpenML dataset 40701, file_id 4965302
    url = "https://api.openml.org/data/v1/download/4965302"
    dataset_name = "telco_churn"
    filename = "churn.arff"
    is_archive = False

    def _make_dataset(self) -> None:
        """Parse the ARFF file and fill ``targets``, ``data`` and ``num_features``.

        The ``class`` column is normalised (whitespace stripped, trailing
        dots removed, lower-cased) and mapped to ``1`` for a positive alias
        and ``0`` otherwise. All remaining columns become the float32
        feature matrix.

        Raises:
            ValueError: If the normalised ``class`` column contains a value
                outside the accepted true/false, 1/0, yes/no aliases.
        """
        arff_path = self.root / self.dataset_name / self.filename
        frame = load_arff(arff_path)

        # The phone number only identifies a customer; a missing column is
        # tolerated on purpose (errors="ignore").
        frame = frame.drop(columns=["phone_number"], errors="ignore")

        label_column = "class"
        churn_labels = {"true", "1", "yes"}
        stay_labels = {"false", "0", "no"}

        # Canonicalise the raw labels so that spellings such as " True."
        # and "true" compare equal.
        labels = frame[label_column].astype(str)
        labels = labels.str.strip()
        labels = labels.str.rstrip(".")
        labels = labels.str.lower()

        seen = set(labels.unique())
        unrecognised = seen - (churn_labels | stay_labels)
        if unrecognised:
            raise ValueError(
                f"TelcoChurn: unexpected values in '{label_column}': {sorted(seen)}. "
                "Expected a binary nominal attribute (True/False, 1/0, or yes/no)."
            )

        # Positive aliases map to 1, everything else (validated above) to 0.
        is_churn = labels.isin(churn_labels).astype(int)
        self.targets = torch.as_tensor(is_churn.values.copy(), dtype=torch.long)

        features = frame.drop(columns=[label_column])
        # NOTE(review): only object-dtype columns are one-hot encoded here;
        # confirm load_arff never yields category/string dtypes.
        nominal_columns = features.select_dtypes(include="object").columns
        encoded = pd.get_dummies(features, columns=nominal_columns)
        encoded = encoded.astype(float)

        self.data = torch.as_tensor(encoded.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]