Source code for torch_uncertainty.datasets.classification.tabular.amazon_access

import torch

from .base import TabularClassificationDataset, load_arff


class AmazonAccess(TabularClassificationDataset):
    """The Amazon Employee Access dataset (OpenML 4135).

    Predicts whether an employee's request for access to a resource should be
    granted, based on role and resource identifiers. All nine predictive
    features are categorical (encoded as integers). Downloaded from OpenML as
    an ARFF.

    Note:
        The exact OpenML dataset identifier for this dataset should be
        verified before use. The URL below targets OpenML dataset 4135; if
        the download fails, check ``https://www.openml.org`` for the correct
        file identifier.

        The licenses of the datasets may differ from TorchUncertainty's
        license. Check before use.
    """

    # OpenML dataset 4135 — file_id from https://www.openml.org/api/v1/json/data/4135
    url = "https://api.openml.org/data/v1/download/1681098"
    dataset_name = "amazon_access"
    filename = "amazon_employee_access.arff"
    is_archive = False

    def _make_dataset(self) -> None:
        """Load the ARFF file and populate ``targets``, ``data`` and ``num_features``.

        The label column is ``"target"`` when present, otherwise the last
        column of the loaded frame. Labels are stored as a ``torch.long``
        tensor and the remaining columns as a ``torch.float32`` tensor.
        """
        df = load_arff(self.root / self.dataset_name / self.filename)

        # Fall back to the last column when no column is literally named "target".
        target_col = "target" if "target" in df.columns else df.columns[-1]
        self.targets = torch.tensor(df[target_col].astype(int).values.copy(), dtype=torch.long)
        df = df.drop(columns=[target_col])

        # All 9 features are high-cardinality nominal integer IDs; label-encode
        # each column to avoid the OOM that one-hot encoding would cause.
        # NOTE(review): pandas assigns code -1 to missing values — confirm the
        # file has none if -1 must not appear as a feature value.
        for col in df.columns:
            df[col] = df[col].astype("category").cat.codes.astype(float)

        self.data = torch.as_tensor(df.values.copy(), dtype=torch.float32)
        self.num_features = self.data.shape[1]