Source code for torch_uncertainty.datasets.classification.tabular.kdd_churn

import pandas as pd
import torch

from .base import TabularClassificationDataset, load_arff


class KDDChurn(TabularClassificationDataset):
    """The KDD Cup 2009 Customer Churn dataset (OpenML 1112).

    Predicts telecommunications customer churn from 190 numeric and 40
    categorical features. Missing numeric values are imputed with the column
    mean; a numeric column without any observed value has no defined mean and
    is filled with ``0.0`` so that no NaN reaches the feature tensor.
    Categorical columns use label encoding because several have thousands of
    unique values, making one-hot encoding impractical. Downloaded from
    OpenML as an ARFF.

    Reference:
        G. Lemaitre et al., *Challenges in Representation Learning: A Report
        on Three Machine Learning Contests*, KDD Cup, 2009.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # NOTE(review): the citation in the docstring is kept verbatim from the
    # original; its title does not obviously match KDD Cup 2009 - verify it
    # against the OpenML 1112 dataset page.

    # OpenML dataset 1112, file_id 53995
    url = "https://api.openml.org/data/v1/download/53995"
    dataset_name = "kdd_churn"
    filename = "KDDCup09_churn.arff"
    is_archive = False

    def _make_dataset(self) -> None:
        """Load the ARFF file and build the feature and target tensors.

        Sets:
            ``self.targets``: ``torch.long`` tensor, 1 where the ``CHURN``
                column is strictly positive and 0 otherwise.
            ``self.data``: ``torch.float32`` tensor holding the imputed and
                label-encoded feature columns.
            ``self.num_features``: number of feature columns in ``self.data``.
        """
        df = load_arff(self.root / self.dataset_name / self.filename)

        target_col = "CHURN"
        self.targets = torch.as_tensor(
            (df[target_col].astype(float) > 0).astype(int).values.copy(),
            dtype=torch.long,
        )
        df = df.drop(columns=[target_col])

        # Impute and label-encode: one-hot encoding is impractical here because
        # several categorical columns have thousands of unique values (>70k total).
        for col in df.columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                # Missing categories get their own label before encoding.
                df[col] = (
                    df[col].fillna("__missing__").astype("category").cat.codes.astype(float)
                )
            else:
                fill_value = df[col].mean()
                # The mean of a column with no observed value is NaN, and
                # filling NaN with NaN would leave the whole column NaN.
                if pd.isna(fill_value):
                    fill_value = 0.0
                df[col] = df[col].fillna(fill_value)

        self.data = torch.as_tensor(df.values.astype(float).copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]