Updated bash scripts. Added background.txt file. Added dataset-loading code to src/dataset.py, with optional (commented-out) OpenML caching

2025-11-11 03:03:01 -04:00
parent 0c2cf3d53d
commit 554605d9ab
5 changed files with 73 additions and 6 deletions
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -0,0 +1,50 @@
+import os
+import pandas as pd
+import openml
+
+# --- CACHE SETUP (optional) ---
+# To cache OpenML downloads locally, uncomment the three lines below and
+# change the path to your preferred cache directory.
+#CACHE_DIR = os.path.expanduser("~/openml_cache")
+#os.makedirs(CACHE_DIR, exist_ok=True)
+#openml.config.cache_directory = CACHE_DIR
+
+# OpenML CC18 classification tasks, keyed by the short name accepted by
+# load_dataset(). Values are OpenML *task* ids (not dataset ids): the task
+# is what supplies both the underlying dataset and the target column.
+TASKS = {
+    "adult": 7592,       # Adult Income classification
+    "spambase": 43,      # Spambase classification
+    "optdigits": 28,     # Optdigits classification
+}
+
+# Regression datasets, keyed by the short name accepted by load_dataset().
+# Values are OpenML *dataset* ids; the target column is taken from the
+# dataset's default target attribute rather than from a task.
+DATASETS = {
+    "cal_housing": 44025
+}
+
+def _load_task_dataframe(task_id: int):
+    """Fetch the features and target of an OpenML task as pandas objects.
+
+    The task is resolved to its dataset, and the dataset is split into
+    features and target using the task's own target column.
+
+    Args:
+        task_id: OpenML task id.
+
+    Returns:
+        An ``(X, y)`` pair. When ``y`` is a ``pd.Series``, rows whose target
+        is missing are removed from both ``X`` and ``y``; otherwise both are
+        returned exactly as OpenML produced them.
+    """
+    oml_task = openml.tasks.get_task(task_id)
+    oml_dataset = openml.datasets.get_dataset(oml_task.dataset_id)
+    # get_data returns a 4-tuple; only features and target are needed here.
+    features, target, *_ = oml_dataset.get_data(
+        dataset_format="dataframe",
+        target=oml_task.target_name,
+    )
+
+    # Only a Series target can be NA-filtered row by row; pass anything
+    # else through untouched.
+    if not isinstance(target, pd.Series):
+        return features, target
+
+    has_target = target.notna()
+    return features.loc[has_target], target.loc[has_target]
+
+def load_dataset(name: str):
+    """Load a registered OpenML dataset and report its problem type.
+
+    Names in ``TASKS`` are loaded through their OpenML task (classification);
+    names in ``DATASETS`` are loaded directly by dataset id, using the
+    dataset's default target attribute (regression).
+
+    Args:
+        name: Key of either ``TASKS`` or ``DATASETS``.
+
+    Returns:
+        A ``(X, y, task_type)`` tuple where ``task_type`` is
+        ``"classification"`` or ``"regression"``. Rows with a missing
+        target are dropped when ``y`` is a ``pd.Series``.
+
+    Raises:
+        ValueError: If ``name`` is not registered, or if a regression
+            dataset declares no default target attribute.
+    """
+    if name in TASKS:
+        X, y = _load_task_dataframe(TASKS[name])
+        return X, y, "classification"
+
+    if name in DATASETS:
+        ds = openml.datasets.get_dataset(DATASETS[name])
+        target = ds.default_target_attribute
+        if target is None:
+            # Without a target, get_data() yields y=None and the NA filter
+            # below would fail with an opaque AttributeError.
+            raise ValueError(
+                f"Dataset {name} has no default target attribute"
+            )
+        X, y, _, _ = ds.get_data(dataset_format="dataframe", target=target)
+        # Same guarded NA-drop as _load_task_dataframe: only a Series target
+        # can be filtered row by row.
+        if isinstance(y, pd.Series):
+            mask = ~y.isna()
+            X, y = X.loc[mask], y.loc[mask]
+        return X, y, "regression"
+
+    raise ValueError(f"Unknown dataset {name}")