Source code for torch_uncertainty.datasets.classification.tabular.amazon_access
import torch
from .base import TabularClassificationDataset, load_arff
[docs]
class AmazonAccess(TabularClassificationDataset):
    """The Amazon Employee Access dataset (OpenML 4135).

    Predicts whether an employee's request for access to a resource should be
    granted, based on role and resource identifiers. All nine predictive
    features are categorical (encoded as integers). Downloaded from OpenML as
    an ARFF file.

    Note:
        The exact OpenML dataset identifier for this dataset should be
        verified before use. The URL below targets the file recorded for
        OpenML dataset 4135; if the download fails, check
        ``https://www.openml.org`` for the correct file identifier.

        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # OpenML dataset 4135 — file_id from https://www.openml.org/api/v1/json/data/4135
    url = "https://api.openml.org/data/v1/download/1681098"
    dataset_name = "amazon_access"
    filename = "amazon_employee_access.arff"
    is_archive = False

    def _make_dataset(self) -> None:
        """Load the ARFF file and populate ``targets``, ``data`` and ``num_features``.

        The file is read from ``root / dataset_name / filename``. The label
        column is the one named ``"target"`` when present, otherwise the last
        column of the loaded table. Labels are cast to integers and stored as
        a ``torch.long`` tensor; every remaining column is label-encoded and
        the result is stored as a ``torch.float32`` tensor.
        """
        df = load_arff(self.root / self.dataset_name / self.filename)

        # Fall back to the last column when the file has no "target" column.
        target_col = "target" if "target" in df.columns else df.columns[-1]
        self.targets = torch.tensor(df[target_col].astype(int).values.copy(), dtype=torch.long)

        features = df.drop(columns=[target_col])

        # All 9 features are high-cardinality nominal integer IDs; label-encode
        # each column to avoid the OOM that one-hot encoding would cause.
        # (pandas category codes use -1 for missing values.)
        for col in features.columns:
            features[col] = features[col].astype("category").cat.codes.astype(float)

        self.data = torch.as_tensor(features.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]