Updated bash scripts. Added background.txt file. Added dataset file with optional OpenML caching.

This commit is contained in:
Varyngoth
2025-11-11 03:03:01 -04:00
parent 0c2cf3d53d
commit 554605d9ab
5 changed files with 73 additions and 6 deletions

50
src/dataset.py Normal file
View File

@@ -0,0 +1,50 @@
import os
import pandas as pd
import openml
# --- CACHE SETUP (optional) ---
# To cache OpenML downloads locally, uncomment the three lines below and
# change the path to your preferred cache directory.
#CACHE_DIR = os.path.expanduser("~/openml_cache")
#os.makedirs(CACHE_DIR, exist_ok=True)
#openml.config.cache_directory = CACHE_DIR
# Classification benchmarks from OpenML CC18, keyed by short name.
# The values are OpenML *task* ids (not dataset ids).
TASKS = {
    "adult": 7592,     # Adult Income classification
    "spambase": 43,    # Spambase classification
    "optdigits": 28,   # Optdigits classification
}
# Regression benchmarks, keyed by short name.
# Unlike TASKS, the values here are OpenML *dataset* ids.
DATASETS = {
    "cal_housing": 44025,
}
def _load_task_dataframe(task_id: int):
    """Fetch the dataset behind an OpenML task as a features/target pair.

    Args:
        task_id: OpenML task id. The task supplies both the id of the
            dataset to download and the name of its target column.

    Returns:
        ``(X, y)`` as produced by ``dataset.get_data`` in dataframe format.
        When ``y`` is a ``pd.Series``, rows whose target is NA are removed
        from both ``X`` and ``y``; otherwise both are returned untouched.
    """
    task = openml.tasks.get_task(task_id)
    dataset = openml.datasets.get_dataset(task.dataset_id)
    features, target, _, _ = dataset.get_data(
        target=task.target_name,
        dataset_format="dataframe",
    )

    # Only a Series target can be screened for missing values.
    if not isinstance(target, pd.Series):
        return features, target

    has_target = target.notna()
    return features.loc[has_target], target.loc[has_target]
def load_dataset(name: str):
    """Load one of the known benchmark datasets from OpenML by short name.

    Args:
        name: A key of ``TASKS`` (classification, looked up by task id) or
            of ``DATASETS`` (regression, looked up by dataset id).

    Returns:
        A 3-tuple ``(X, y, problem_type)`` where ``problem_type`` is
        ``"classification"`` for ``TASKS`` entries and ``"regression"``
        for ``DATASETS`` entries. Rows whose target is NA are dropped
        whenever ``y`` is a ``pd.Series``.

    Raises:
        ValueError: If ``name`` is in neither ``TASKS`` nor ``DATASETS``.
    """
    if name in TASKS:
        X, y = _load_task_dataframe(TASKS[name])
        return X, y, "classification"

    if name in DATASETS:
        ds = openml.datasets.get_dataset(DATASETS[name])
        X, y, _, _ = ds.get_data(
            dataset_format="dataframe",
            target=ds.default_target_attribute,
        )
        # Same guard as _load_task_dataframe: y is only a Series when the
        # dataset actually exposes a single target column. Without the
        # check, a missing default target would crash on `y.isna()`.
        if isinstance(y, pd.Series):
            mask = ~y.isna()
            X, y = X.loc[mask], y.loc[mask]
        return X, y, "regression"

    raise ValueError(f"Unknown dataset {name}")