Source code for torch_uncertainty.datasets.classification.tabular.credit_g

import pandas as pd
import torch

from .base import TabularClassificationDataset



[docs]
class GermanCredit(TabularClassificationDataset):
    """Statlog German Credit dataset from the UCI repository.

    A binary credit-risk task (good vs. bad). The raw ``german.data`` file
    stores categorical attributes as string codes (``A11``, ``A34`` ...);
    those are one-hot encoded with :func:`pandas.get_dummies`, while numeric
    attributes are passed through unchanged.

    Reference:
        H. Hofmann, *Statlog (German Credit Data)*, UCI ML Repository, 1994.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    url = "https://archive.ics.uci.edu/static/public/144/statlog+german+credit+data.zip"
    dataset_name = "german_credit"
    filename = "german.data"

    def _make_dataset(self) -> None:
        """Read the raw file and fill ``targets``, ``data`` and ``num_features``.

        The file is parsed as whitespace-separated values without a header
        row. The last column is taken as the label; every other column is a
        feature.
        """
        raw_table = pd.read_csv(
            self.root / self.dataset_name / self.filename,
            sep=r"\s+",
            header=None,
        )
        label_column = raw_table.iloc[:, -1]
        feature_table = raw_table.iloc[:, :-1]

        # Raw labels are 1 (good credit) and 2 (bad credit); shifting by one
        # yields class indices 0 (good) and 1 (bad).
        class_indices = (label_column.values - 1).copy()
        self.targets = torch.as_tensor(class_indices, dtype=torch.long)

        # One-hot expand the string-coded columns, then cast the whole table
        # to float so it converts to a single dense tensor.
        encoded_table = pd.get_dummies(feature_table).astype(float)
        self.data = torch.as_tensor(encoded_table.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]