Source code for torch_uncertainty.datasets.classification.tabular.adult

import numpy as np
import pandas as pd
import torch
from torch import Tensor

from .base import TabularClassificationDataset

_ADULT_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]



[docs]
class AdultCensusIncome(TabularClassificationDataset):
    """The UCI Adult Census Income dataset.

    Predicts whether income exceeds $50K/year. The ``fnlwgt`` sampling-weight
    column is dropped as it is not a predictive feature. Train and test files
    are pre-split by the data provider; one-hot columns and imputation
    statistics are derived from the training file only and then applied to the
    test file, so the feature layout never depends on test-set contents.
    Categories that appear only in the test file are encoded as all zeros.

    Reference:
        R. Kohavi, *Scaling Up the Accuracy of Naive-Bayes Classifiers: a
        Decision-Tree Hybrid*, KDD, 1996.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    url = "https://archive.ics.uci.edu/static/public/2/adult.zip"
    dataset_name = "adult"
    filename = "adult.data"
    need_split = False
    pre_split = True

    def _read(self, fname: str, skiprows: int) -> pd.DataFrame:
        """Load one raw Adult CSV file located at ``root / dataset_name / fname``.

        Args:
            fname: Name of the file inside the dataset directory.
            skiprows: Number of leading lines to skip before parsing.

        Returns:
            A frame whose columns are named after ``_ADULT_COLUMNS``. Entries
            equal to ``"?"`` are read as missing values.
        """
        return pd.read_csv(
            self.root / self.dataset_name / fname,
            names=_ADULT_COLUMNS,
            # Drop whitespace following each delimiter so that padded fields
            # (presumably written as ", ?") still match ``na_values``.
            skipinitialspace=True,
            na_values=["?"],
            skiprows=skiprows,
        )

    def _make_pre_split_dataset(self) -> tuple[Tensor, Tensor, Tensor, Tensor]:
        """Build feature and target tensors from the provider's train/test files.

        Missing categorical values are imputed with the training-set mode and
        categorical columns are one-hot encoded using the training-set column
        layout. As a side effect, ``self.num_features`` is set to the number of
        encoded feature columns.

        Returns:
            ``(train_data, train_targets, test_data, test_targets)`` where the
            data tensors are ``float32`` and the targets are ``long`` with
            ``1`` for ``">50K"`` and ``0`` otherwise.
        """
        train_df = self._read("adult.data", skiprows=0)
        # Test file begins with a header comment line.
        test_df = self._read("adult.test", skiprows=1)

        # Normalise the label text: the test file's labels carry a trailing
        # period, so strip it (harmless on the training labels).
        train_df["income"] = train_df["income"].str.strip().str.rstrip(".")
        test_df["income"] = test_df["income"].str.strip().str.rstrip(".")

        train_targets = torch.as_tensor(
            np.where(train_df["income"] == ">50K", 1, 0), dtype=torch.long
        )
        test_targets = torch.as_tensor(
            np.where(test_df["income"] == ">50K", 1, 0), dtype=torch.long
        )
        train_df = train_df.drop(columns=["income", "fnlwgt"])
        test_df = test_df.drop(columns=["income", "fnlwgt"])

        # Fill missing categorical values with the training-set mode so test
        # imputation does not depend on test statistics.
        for col in train_df.select_dtypes(include="object").columns:
            mode = train_df[col].mode()[0]
            train_df[col] = train_df[col].fillna(mode)
            test_df[col] = test_df[col].fillna(mode)

        # One-hot encode using the training file alone to fix the column
        # layout, then align the test encoding onto it: columns for categories
        # absent from the test file are added as zeros, and columns for
        # categories unseen in training are discarded.
        train_encoded = pd.get_dummies(train_df).astype(float)
        test_encoded = (
            pd.get_dummies(test_df)
            .astype(float)
            .reindex(columns=train_encoded.columns, fill_value=0.0)
        )

        train_data = torch.as_tensor(train_encoded.to_numpy(), dtype=torch.float32)
        test_data = torch.as_tensor(test_encoded.to_numpy(), dtype=torch.float32)
        self.num_features = train_data.shape[1]
        return train_data, train_targets, test_data, test_targets