From 554605d9ab3d420b377447124d8c80b407c611d2 Mon Sep 17 00:00:00 2001
From: Varyngoth
Date: Tue, 11 Nov 2025 03:03:01 -0400
Subject: [PATCH] Updated bash scripts. Added background.txt file. Appended
 dataset file with optional OpenML caching

---
 background.txt           |  5 +++
 setup.sh                 | 15 ++++++++
 src/dataset.py           | 75 ++++++++++++++++++++++++++++++++++++++++
 src/grid_search_batch.sh |  6 ++--
 src/nsga_batch.sh        |  6 ++--
 5 files changed, 101 insertions(+), 6 deletions(-)
 create mode 100644 setup.sh
 create mode 100644 src/dataset.py

diff --git a/background.txt b/background.txt
index 58d5acb..f162c63 100644
--- a/background.txt
+++ b/background.txt
@@ -1,5 +1,10 @@
 https://gitlab.com/university-of-prince-edward-isalnd/explanation-aware-optimization-and-automl/-/tree/main/src?ref_type=heads
 
+Operation:
+Specify working directory (local repo location), cache directory (dataset download location), and
+
+
+$WORK_DIR=
 
 
 ############################################################################################################################################################
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000..68e858d
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Mount the shared NFS dataset volume and define the environment used by the
+# batch scripts. Source this file (`source setup.sh`) rather than executing
+# it, otherwise the exported variables are lost when the script exits.
+
+sudo apt install nfs-common -y
+
+sudo mkdir -p /mnt/data
+
+sudo mount 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data
+
+export WORK_DIR=/mnt/data
+
+mkdir -p /mnt/data/cache   # ensure directory exists
+export OPENML_CACHE_DIR=/mnt/data/cache
diff --git a/src/dataset.py b/src/dataset.py
new file mode 100644
index 0000000..5d0208a
--- /dev/null
+++ b/src/dataset.py
@@ -0,0 +1,75 @@
+"""Load the OpenML datasets used by the experiments as pandas objects."""
+import os
+
+import pandas as pd
+import openml
+
+# --- CACHE SETUP ---
+# Change this path to your preferred local cache directory
+# NOTE(review): setup.sh exports OPENML_CACHE_DIR; confirm the installed
+# openml version picks it up, otherwise enable the block below.
+#CACHE_DIR = os.path.expanduser("~/openml_cache")
+#os.makedirs(CACHE_DIR, exist_ok=True)
+#openml.config.cache_directory = CACHE_DIR
+
+# OpenML CC18 classification tasks (task ids)
+TASKS = {
+    "adult": 7592,     # Adult Income classification
+    "spambase": 43,    # Spambase classification
+    "optdigits": 28,   # Optdigits classification
+}
+
+# Regression dataset (dataset id)
+DATASETS = {
+    "cal_housing": 44025,
+}
+
+
+def _drop_missing_target(X, y):
+    """Return ``(X, y)`` restricted to the rows whose target is not NA."""
+    mask = y.notna()
+    return X.loc[mask], y.loc[mask]
+
+
+def _load_task_dataframe(task_id: int):
+    """Fetch the dataset behind OpenML task ``task_id`` as ``(X, y)``.
+
+    The task's own target column is used. Rows with a missing target are
+    dropped when the target comes back as a pandas Series.
+    """
+    task = openml.tasks.get_task(task_id)
+    dataset = openml.datasets.get_dataset(task.dataset_id)
+    X, y, _, _ = dataset.get_data(
+        dataset_format="dataframe",
+        target=task.target_name,
+    )
+    if isinstance(y, pd.Series):
+        X, y = _drop_missing_target(X, y)
+    return X, y
+
+
+def load_dataset(name: str):
+    """Load a dataset by its short name.
+
+    Args:
+        name: A key of ``TASKS`` (classification) or ``DATASETS`` (regression).
+
+    Returns:
+        A ``(X, y, problem_type)`` tuple where ``problem_type`` is
+        ``"classification"`` or ``"regression"``.
+
+    Raises:
+        ValueError: If ``name`` is in neither table.
+    """
+    if name in TASKS:
+        X, y = _load_task_dataframe(TASKS[name])
+        return X, y, "classification"
+    if name in DATASETS:
+        ds = openml.datasets.get_dataset(DATASETS[name])
+        X, y, _, _ = ds.get_data(
+            dataset_format="dataframe",
+            target=ds.default_target_attribute,
+        )
+        X, y = _drop_missing_target(X, y)
+        return X, y, "regression"
+    raise ValueError(f"Unknown dataset {name}")
diff --git a/src/grid_search_batch.sh b/src/grid_search_batch.sh
index feef913..22d55d4 100644
--- a/src/grid_search_batch.sh
+++ b/src/grid_search_batch.sh
@@ -8,16 +8,16 @@
 #SBATCH --error=%x-%j.err # Separate error log
 
 # Load necessary modules
-module load python/3.8
+#module load python/3.8
 
 # Activate your virtual environment
-source ~/envs/workdir/bin/activate
+source /env0/bin/activate
 
 # Parameters
 TIME=$1
 
 # Run the Python script with the specified time parameter
-srun python /home/dvera/scratch/Framework_EXP/grid_search_exp.py --time $TIME
+srun python "${WORK_DIR:?WORK_DIR is not set; source setup.sh first}/src/grid_search_exp.py" --time "$TIME"
 
 # Deactivate the virtual environment
 deactivate
diff --git a/src/nsga_batch.sh b/src/nsga_batch.sh
index cc7e455..0b6ae40 100644
--- a/src/nsga_batch.sh
+++ b/src/nsga_batch.sh
@@ -8,13 +8,13 @@
 #SBATCH --error=%x-%j.err # Separate error log
 
 # Load necessary modules
-module load python/3.8
+#module load python/3.8
 
 # Activate your virtual environment
-source ~/envs/workdir/bin/activate
+source /env0/bin/activate
 
 # Run the Python script with the specified time parameter
-srun python /home/dvera/scratch/Framework_EXP/nsga_exp.py
+srun python "${WORK_DIR:?WORK_DIR is not set; source setup.sh first}/src/nsga_exp.py"
 
 # Deactivate the virtual environment
 deactivate