Source code for torch_uncertainty.datasets.classification.tabular.aps_failure

import pandas as pd
import torch
from torch import Tensor

from .base import TabularClassificationDataset


class APSFailure(TabularClassificationDataset):
    """APS Failure at Scania Trucks dataset from the UCI repository.

    Binary classification task: decide whether a truck failure was caused by
    a component of the air pressure system (APS). The data is distributed
    already split into a training file and a test file; ``train=False``
    loads the held-out test set.

    Missing entries (written as ``na`` in the CSV files) are replaced, in
    both splits, by the per-column mean computed on the training set.

    Reference:
        M. Cerqueira et al., *Predicting Failures in Industrial Plants*,
        UCI ML Repository, 2016.

    Note:
        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    url = "https://archive.ics.uci.edu/static/public/421/aps+failure+at+scania+trucks.zip"
    dataset_name = "aps_failure"
    filename = "aps_failure_training_set.csv"
    need_split = False
    pre_split = True

    def _read(self, fname: str) -> pd.DataFrame:
        """Parse one CSV file of the dataset into a DataFrame.

        Args:
            fname: Name of the file, looked up under ``root / dataset_name``.

        Returns:
            The parsed table, with ``na`` cells read as missing values.
        """
        csv_path = self.root / self.dataset_name / fname
        # The first 20 lines of each file are a free-text preamble; the line
        # right after it is the column header.
        return pd.read_csv(
            csv_path,
            skiprows=20,
            header=0,
            na_values=["na"],
            comment=None,
        )

    def _make_pre_split_dataset(self) -> tuple[Tensor, Tensor, Tensor, Tensor]:
        """Build feature and label tensors for the training and test splits.

        Also sets ``self.num_features`` to the number of feature columns of
        the training data.

        Returns:
            ``(train_data, train_targets, test_data, test_targets)`` where
            the data tensors are ``float32`` and the target tensors are
            ``long`` with 1 for the ``pos`` class and 0 otherwise.
        """

        def encode_labels(frame: pd.DataFrame) -> Tensor:
            # 1 where the "class" column equals "pos", 0 everywhere else.
            is_positive = frame["class"] == "pos"
            return torch.tensor(is_positive.astype(int).to_numpy().copy(), dtype=torch.long)

        def numeric_features(frame: pd.DataFrame) -> pd.DataFrame:
            # Drop the label column; anything non-numeric becomes NaN.
            features = frame.drop(columns=["class"])
            return features.apply(pd.to_numeric, errors="coerce")

        def as_float_tensor(frame: pd.DataFrame) -> Tensor:
            return torch.tensor(frame.to_numpy(dtype=float).copy(), dtype=torch.float32)

        raw_train = self._read("aps_failure_training_set.csv")
        raw_test = self._read("aps_failure_test_set.csv")

        train_targets = encode_labels(raw_train)
        test_targets = encode_labels(raw_test)

        train_features = numeric_features(raw_train)
        test_features = numeric_features(raw_test)

        # Statistics come from the training split only, so that no
        # information from the test split leaks into the imputation.
        column_means = train_features.mean()
        train_data = as_float_tensor(train_features.fillna(column_means))
        test_data = as_float_tensor(test_features.fillna(column_means))

        self.num_features = train_data.shape[1]
        return train_data, train_targets, test_data, test_targets