Source code for torch_uncertainty.datasets.classification.tabular.telco_churn

import pandas as pd
import torch

from .base import TabularClassificationDataset, load_arff



[docs]
class TelcoChurn(TabularClassificationDataset):
    """Telecom Customer Churn tabular dataset (OpenML 40701).

    Binary classification task: does a customer churn or not. The data is
    fetched from the OpenML repository as an ARFF file. The ``phone_number``
    column is removed because it is an identifier with no predictive value.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # Download endpoint for OpenML dataset 40701 (file_id 4965302).
    url = "https://api.openml.org/data/v1/download/4965302"
    dataset_name = "telco_churn"
    filename = "churn.arff"
    is_archive = False

    def _make_dataset(self) -> None:
        """Load the ARFF file and populate ``targets``, ``data`` and ``num_features``.

        The ``class`` column is normalised (whitespace stripped, trailing dots
        removed, lower-cased) and mapped to ``1`` for a positive alias and
        ``0`` otherwise. Remaining ``object`` columns are one-hot encoded and
        every feature is stored as ``float32``.

        Raises:
            ValueError: If the ``class`` column holds a value that is not one
                of the recognised positive/negative aliases.
        """
        label_col = "class"
        churn_labels = {"true", "1", "yes"}
        stay_labels = {"false", "0", "no"}

        arff_path = self.root / self.dataset_name / self.filename
        # ``errors="ignore"`` keeps this a no-op when the identifier column
        # is absent from the loaded frame.
        frame = load_arff(arff_path).drop(columns=["phone_number"], errors="ignore")

        # Bring labels such as " True." and "FALSE" to a canonical spelling.
        labels = frame[label_col].astype(str)
        labels = labels.str.strip()
        labels = labels.str.rstrip(".")
        labels = labels.str.lower()

        seen = set(labels.unique())
        unrecognised = seen - (churn_labels | stay_labels)
        if unrecognised:
            raise ValueError(
                f"TelcoChurn: unexpected values in '{label_col}': {sorted(seen)}. "
                "Expected a binary nominal attribute (True/False, 1/0, or yes/no)."
            )

        is_churn = labels.isin(churn_labels).astype(int)
        self.targets = torch.as_tensor(is_churn.values.copy(), dtype=torch.long)

        # Everything except the label column is a feature; object-typed
        # columns are expanded into indicator columns before the float cast.
        features = frame.drop(columns=[label_col])
        nominal_cols = features.select_dtypes(include="object").columns
        encoded = pd.get_dummies(features, columns=nominal_cols).astype(float)

        self.data = torch.as_tensor(encoded.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]