# Source code for torch_uncertainty.datasets.classification.tabular.kdd_churn

import pandas as pd
import torch

from .base import TabularClassificationDataset, load_arff



# [docs] (documentation-link marker left over from the rendered source page)
class KDDChurn(TabularClassificationDataset):
    """The KDD Cup 2009 Customer Churn dataset (OpenML 1112).

    Predicts telecommunications customer churn from 190 numeric and 40
    categorical features. Downloaded from OpenML as an ARFF.

    Preprocessing:
        * Numeric columns: missing values are imputed with the column mean.
          A column with no observed value at all has no mean, so it is
          filled with ``0.0`` instead, keeping the feature matrix free of
          NaNs.
        * Categorical columns: missing values become their own
          ``"__missing__"`` category and the column is label-encoded.
          One-hot encoding is impractical because several columns have
          thousands of unique values.

    Reference:
        I. Guyon et al., *Analysis of the KDD Cup 2009: Fast Scoring on a
        Large Orange Customer Database*, JMLR Workshop and Conference
        Proceedings 7, 2009.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # OpenML dataset 1112, file_id 53995
    url = "https://api.openml.org/data/v1/download/53995"
    dataset_name = "kdd_churn"
    filename = "KDDCup09_churn.arff"
    # The download is a plain ARFF file, not an archive.
    is_archive = False

    def _make_dataset(self) -> None:
        """Load the ARFF file and build the feature and target tensors.

        Sets:
            targets: ``torch.long`` tensor, 1 where the ``CHURN`` column is
                strictly positive once cast to float, 0 otherwise.
            data: ``torch.float32`` tensor with one column per non-target
                column of the ARFF file, imputed and label-encoded.
            num_features: number of columns of ``data``.
        """
        df = load_arff(self.root / self.dataset_name / self.filename)
        target_col = "CHURN"
        # Binarise the label: strictly positive -> 1, everything else -> 0.
        self.targets = torch.as_tensor(
            (df[target_col].astype(float) > 0).astype(int).values.copy(), dtype=torch.long
        )
        df = df.drop(columns=[target_col])
        # Impute and label-encode: one-hot encoding is impractical here because
        # several categorical columns have thousands of unique values (>70k total).
        for col in df.columns:
            column = df[col]
            if pd.api.types.is_numeric_dtype(column):
                col_mean = column.mean()
                # The mean of a column without any observed value is missing;
                # filling with it would be a no-op and leak NaNs into the
                # tensor, so fall back to a constant.
                fill_value = 0.0 if pd.isna(col_mean) else col_mean
                df[col] = column.fillna(fill_value)
            else:
                df[col] = (
                    column.fillna("__missing__").astype("category").cat.codes.astype(float)
                )
        self.data = torch.as_tensor(df.values.astype(float).copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]