Source code for torch_uncertainty.datasets.classification.tabular.aps_failure
import pandas as pd
import torch
from torch import Tensor
from .base import TabularClassificationDataset
[docs]
class APSFailure(TabularClassificationDataset):
    """The UCI APS Failure at Scania Trucks dataset.

    Predicts whether an air pressure system (APS) component caused a truck
    failure. The dataset is provided pre-split; ``train=False`` loads the
    held-out test set. Missing values (``na``) are imputed with the
    training-set column mean for both splits.

    Reference:
        M. Cerqueira et al., *Predicting Failures in Industrial Plants*,
        UCI ML Repository, 2016.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    url = "https://archive.ics.uci.edu/static/public/421/aps+failure+at+scania+trucks.zip"
    dataset_name = "aps_failure"
    filename = "aps_failure_training_set.csv"
    need_split = False
    pre_split = True

    def _read(self, fname: str) -> pd.DataFrame:
        """Load one CSV file of the dataset into a DataFrame.

        Args:
            fname: Name of the CSV file, resolved relative to
                ``self.root / self.dataset_name``.

        Returns:
            The parsed table, with the literal string ``na`` read as a
            missing value.
        """
        csv_path = self.root / self.dataset_name / fname
        # The first 20 lines of the file are skipped; the line that follows
        # them is used as the header row.
        return pd.read_csv(
            csv_path,
            skiprows=20,
            header=0,
            comment=None,
            na_values=["na"],
        )

    def _make_pre_split_dataset(self) -> tuple[Tensor, Tensor, Tensor, Tensor]:
        """Build feature and target tensors for the train and test splits.

        Also stores the number of feature columns in ``self.num_features``.

        Returns:
            A ``(train_data, train_targets, test_data, test_targets)`` tuple.
            Data tensors are ``float32``; target tensors are ``long`` with
            ``1`` where the ``class`` column equals ``"pos"`` and ``0``
            otherwise.
        """

        def encode_labels(frame: pd.DataFrame) -> Tensor:
            # Binary target: 1 for rows labelled "pos", 0 for everything else.
            is_positive = frame["class"] == "pos"
            return torch.tensor(
                is_positive.astype(int).to_numpy().copy(), dtype=torch.long
            )

        def numeric_features(frame: pd.DataFrame) -> pd.DataFrame:
            # Everything except the label column, with unparsable entries
            # coerced to NaN so they are imputed below.
            without_label = frame.drop(columns=["class"])
            return without_label.apply(pd.to_numeric, errors="coerce")

        def as_float_tensor(frame: pd.DataFrame) -> Tensor:
            return torch.tensor(
                frame.to_numpy(dtype=float).copy(), dtype=torch.float32
            )

        raw_train = self._read("aps_failure_training_set.csv")
        raw_test = self._read("aps_failure_test_set.csv")

        y_train = encode_labels(raw_train)
        y_test = encode_labels(raw_test)

        feats_train = numeric_features(raw_train)
        feats_test = numeric_features(raw_test)

        # Impute both splits with the column means of the training features,
        # so no statistic is computed from the test split.
        column_means = feats_train.mean()
        x_train = as_float_tensor(feats_train.fillna(column_means))
        x_test = as_float_tensor(feats_test.fillna(column_means))

        self.num_features = x_train.shape[1]
        return x_train, y_train, x_test, y_test