# Source code for torch_uncertainty.datasets.classification.tabular.adult
import numpy as np
import pandas as pd
import torch
from torch import Tensor
from .base import TabularClassificationDataset
# Column names, in file order, handed to ``pd.read_csv(names=...)`` when the
# raw Adult files are parsed. ``income`` is the target column and ``fnlwgt``
# is dropped before the features are encoded.
_ADULT_COLUMNS = [
    "age", "workclass", "fnlwgt",
    "education", "education-num",
    "marital-status", "occupation", "relationship",
    "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country",
    "income",
]
# [docs]  (link marker left over from the rendered documentation page)
class AdultCensusIncome(TabularClassificationDataset):
    """The UCI Adult Census Income dataset.

    Predicts whether income exceeds $50K/year (label ``1``) or not (label
    ``0``). The ``fnlwgt`` sampling-weight column is dropped as it is not a
    predictive feature. Train and test files are pre-split by the data
    provider; one-hot columns and imputation statistics are derived from the
    training file and applied to the test file to avoid distribution drift.
    As a consequence, a category that only occurs in the test file does not
    create a feature column: such rows are encoded as all zeros for that
    variable.

    Reference:
        R. Kohavi, *Scaling Up the Accuracy of Naive-Bayes Classifiers: a
        Decision-Tree Hybrid*, KDD, 1996.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # Location of the dataset archive.
    url = "https://archive.ics.uci.edu/static/public/2/adult.zip"
    # Sub-directory of ``self.root`` in which the raw files are looked up.
    dataset_name = "adult"
    # Name of the training file.
    filename = "adult.data"
    # NOTE(review): these two flags are presumably read by the base class to
    # route loading through ``_make_pre_split_dataset`` -- confirm in ``.base``.
    need_split = False
    pre_split = True

    def _read(self, fname: str, skiprows: int) -> pd.DataFrame:
        """Parse one raw Adult file into a data frame.

        Args:
            fname: Name of the file inside ``self.root / self.dataset_name``.
            skiprows: Number of leading lines to skip before parsing.

        Returns:
            A frame whose columns are named after ``_ADULT_COLUMNS``. Entries
            equal to ``?`` are parsed as missing values, and whitespace
            following each delimiter is discarded.
        """
        return pd.read_csv(
            self.root / self.dataset_name / fname,
            names=_ADULT_COLUMNS,
            skipinitialspace=True,
            na_values=["?"],
            skiprows=skiprows,
        )

    @staticmethod
    def _income_to_targets(income: pd.Series) -> Tensor:
        """Turn the raw ``income`` column into a ``torch.long`` 0/1 tensor."""
        # Test-file labels carry a trailing period (">50K."); normalise both
        # files the same way so a single comparison works for either.
        cleaned = income.str.strip().str.rstrip(".")
        return torch.as_tensor(np.where(cleaned == ">50K", 1, 0), dtype=torch.long)

    @staticmethod
    def _to_float_tensor(frame: pd.DataFrame) -> Tensor:
        """Convert an all-numeric/boolean frame into a ``float32`` tensor."""
        # Copy so the tensor is built from an array owned by this call rather
        # than from a buffer that pandas may still reference.
        return torch.as_tensor(frame.astype(float).to_numpy().copy(), dtype=torch.float32)

    def _make_pre_split_dataset(self) -> tuple[Tensor, Tensor, Tensor, Tensor]:
        """Build feature and target tensors from the provider's train/test files.

        Also sets ``self.num_features`` to the number of encoded feature
        columns.

        Returns:
            ``(train_data, train_targets, test_data, test_targets)`` where the
            data tensors are ``float32`` with one row per sample and the target
            tensors are ``long`` with values in ``{0, 1}``.
        """
        train_df = self._read("adult.data", skiprows=0)
        # Test file begins with a header comment line.
        test_df = self._read("adult.test", skiprows=1)

        train_targets = self._income_to_targets(train_df["income"])
        test_targets = self._income_to_targets(test_df["income"])

        train_df = train_df.drop(columns=["income", "fnlwgt"])
        test_df = test_df.drop(columns=["income", "fnlwgt"])

        # Fill missing categorical values with the training-set mode so test
        # imputation does not depend on test statistics. Columns are picked as
        # "everything non-numeric" because, depending on the pandas version,
        # text columns are inferred either as ``object`` or as a dedicated
        # string dtype; selecting on ``object`` alone could skip imputation.
        for col in train_df.select_dtypes(exclude="number").columns:
            mode = train_df[col].mode()[0]
            train_df[col] = train_df[col].fillna(mode)
            test_df[col] = test_df[col].fillna(mode)

        # One-hot columns come from the training file only; the test frame is
        # aligned onto that layout. Categories missing from the test file get
        # an all-zero column, categories unseen during training are discarded.
        train_encoded = pd.get_dummies(train_df)
        test_encoded = pd.get_dummies(test_df).reindex(
            columns=train_encoded.columns, fill_value=0
        )

        train_data = self._to_float_tensor(train_encoded)
        test_data = self._to_float_tensor(test_encoded)
        self.num_features = train_data.shape[1]
        return train_data, train_targets, test_data, test_targets