Source code for torch_uncertainty.datasets.classification.tabular.pima_diabetes

import torch

from .base import TabularClassificationDataset, load_arff



[docs]
class PimaDiabetes(TabularClassificationDataset):
    """Pima Indians Diabetes dataset (UCI), fetched from OpenML.

    A tabular classification task whose goal is to predict the onset of
    diabetes from clinical measurements. Every feature is numeric. The raw
    ARFF file comes from the OpenML static data server (dataset 37).

    Reference:
        J.W. Smith et al., *Using the ADAP Learning Algorithm to Forecast the
        Onset of Diabetes Mellitus*, Proc. SCAMC, 1988.

    Note:
        Dataset licenses may differ from TorchUncertainty's own license.
        Check before use.
    """

    # Static CDN copy of OpenML dataset 37 (no dependency on the OpenML database).
    url = "https://data.openml.org/datasets/0000/0037/dataset_37.arff"
    dataset_name = "pima_diabetes"
    filename = "pima_diabetes.arff"
    is_archive = False

    def _make_dataset(self) -> None:
        """Read the ARFF file and set ``targets``, ``data`` and ``num_features``.

        The file is read from ``root / dataset_name / filename``. The
        ``"class"`` column becomes the integer target tensor and all other
        columns become the float32 feature tensor.
        """
        arff_path = self.root / self.dataset_name / self.filename
        frame = load_arff(arff_path)

        label_col = "class"
        # Strip surrounding whitespace from the labels before comparing them.
        labels = frame[label_col].str.strip()

        # Binary encoding: "tested_positive" -> 1, every other label -> 0.
        is_positive = labels.eq("tested_positive")
        encoded = is_positive.astype(int).values.copy()
        self.targets = torch.as_tensor(encoded, dtype=torch.long)

        # Everything except the label column is treated as a numeric feature.
        features = frame.drop(columns=[label_col])
        feature_matrix = features.values.astype(float).copy()
        self.data = torch.as_tensor(feature_matrix, dtype=torch.float32)

        # Number of feature columns, i.e. the second dimension of the data.
        self.num_features = self.data.shape[1]