Source code for torch_uncertainty.datasets.classification.tabular.credit_approval

import pandas as pd
import torch

from .base import TabularClassificationDataset



[docs]
class CreditApproval(TabularClassificationDataset):
    """UCI Credit Approval dataset for binary classification.

    Each sample is a credit card application described by anonymised
    attributes; the label indicates whether the application was approved.
    Entries written as ``?`` in the raw file are treated as missing values:
    numeric attributes are filled with the column mean and categorical
    attributes with the column mode.

    Reference:
        J.R. Quinlan, *Simplifying Decision Trees*, IJMMS, 1987.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    url = "https://archive.ics.uci.edu/static/public/27/credit+approval.zip"
    dataset_name = "credit_approval"
    filename = "crx.data"

    def _make_dataset(self) -> None:
        """Read the raw file and populate ``targets``, ``data`` and ``num_features``.

        The file is read from ``root / dataset_name / filename`` without a
        header row. The last column becomes the integer target, the other
        columns are imputed, one-hot encoded where non-numeric, and stored
        as a ``float32`` tensor.
        """
        csv_path = self.root / self.dataset_name / self.filename
        raw = pd.read_csv(csv_path, header=None, na_values=["?"])

        # The label lives in the last column: '+' maps to 1, anything else to 0.
        approved = raw.iloc[:, -1] == "+"
        self.targets = torch.as_tensor(
            approved.astype(int).values.copy(), dtype=torch.long
        )

        # Impute each attribute column: mean for numeric ones, most frequent
        # value for the rest. Columns are collected into a fresh frame in
        # their original order.
        is_numeric = pd.api.types.is_numeric_dtype
        imputed = {}
        for label, column in raw.iloc[:, :-1].items():
            fill = column.mean() if is_numeric(column) else column.mode()[0]
            imputed[label] = column.fillna(fill)
        features = pd.DataFrame(imputed, index=raw.index)

        # One-hot encode the categorical attributes and convert to a tensor.
        encoded = pd.get_dummies(features).astype(float)
        self.data = torch.as_tensor(encoded.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]