Updated bash scripts. Added background.txt file. Appended dataset file with optional OpenML caching

2025-11-11 03:03:01 -04:00
parent 0c2cf3d53d
commit 554605d9ab
5 changed files with 73 additions and 6 deletions
--- a/background.txt
+++ b/background.txt
@@ -1,5 +1,10 @@
 https://gitlab.com/university-of-prince-edward-isalnd/explanation-aware-optimization-and-automl/-/tree/main/src?ref_type=heads
 Operation:
 Specify working directory (local repo location), cache directory (dataset download location), and 
 WORK_DIR=
 ############################################################################################################################################################
--- a/setup.sh
+++ b/setup.sh
@@ -0,0 +1,12 @@
 #!/bin/bash
 # Mount the shared NFS dataset store and export the paths the project reads.
 #
 # Source this file (". setup.sh") instead of executing it: the exported
 # variables only reach the calling shell when sourced, and the batch
 # scripts expand $WORK_DIR when they launch the experiment.
 sudo apt install nfs-common -y
 # -p keeps a repeated run from failing once the mount point already exists.
 sudo mkdir -p /mnt/data
 # Skip the mount when something is already mounted there (re-run safe).
 mountpoint -q /mnt/data || sudo mount 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data
 # A shell assignment takes no leading "$"; "$WORK_DIR=..." would be run as
 # the command "=/mnt/data" and leave WORK_DIR unset.
 export WORK_DIR=/mnt/data
 mkdir -p /mnt/data/cache       # ensure directory exists
 export OPENML_CACHE_DIR=/mnt/data/cache
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -0,0 +1,50 @@
 import os
 import pandas as pd
 import openml
 # --- CACHE SETUP ---
 # Change this path to your preferred local cache directory
 #CACHE_DIR = os.path.expanduser("~/openml_cache")
 #os.makedirs(CACHE_DIR, exist_ok=True)
 #openml.config.cache_directory = CACHE_DIR
 # OpenML CC18 classification tasks (task ids)
 # Keys are the names accepted by load_dataset(); values are OpenML *task* ids,
 # resolved to their underlying dataset by _load_task_dataframe().
 TASKS = {
    "adult": 7592,       # Adult Income classification
    "spambase": 43,      # Spambase classification
    "optdigits": 28,     # Optdigits classification
 }
 # Regression dataset (dataset id)
 # Values here are OpenML *dataset* ids (not task ids); load_dataset() fetches
 # them directly and uses the dataset's default target attribute.
 DATASETS = {
    "cal_housing": 44025
 }
 def _load_task_dataframe(task_id: int):
    """Fetch the dataset behind an OpenML task as a features/target pair.

    The task is looked up by ``task_id``, its dataset is retrieved, and the
    data is requested in dataframe format with the task's target name as
    the target column.

    Returns:
        A ``(X, y)`` tuple. When ``y`` is a pandas Series, rows whose target
        is NA are removed from both ``X`` and ``y``; otherwise both are
        returned exactly as OpenML supplied them.
    """
    oml_task = openml.tasks.get_task(task_id)
    oml_dataset = openml.datasets.get_dataset(oml_task.dataset_id)
    features, target, _categorical, _names = oml_dataset.get_data(
        dataset_format="dataframe",
        target=oml_task.target_name,
    )
    # Only a Series target can be screened for missing labels.
    if not isinstance(target, pd.Series):
        return features, target
    labelled = ~target.isna()
    return features.loc[labelled], target.loc[labelled]
 def load_dataset(name: str):
    """Load a named benchmark dataset from OpenML.

    Args:
        name: A key of ``TASKS`` (classification, addressed by task id) or
            of ``DATASETS`` (regression, addressed by dataset id).

    Returns:
        A ``(X, y, problem_type)`` tuple where ``problem_type`` is
        ``"classification"`` or ``"regression"``. Rows whose target is NA
        are dropped whenever ``y`` is a pandas Series.

    Raises:
        ValueError: If ``name`` is in neither ``TASKS`` nor ``DATASETS``.
    """
    if name in TASKS:
        X, y = _load_task_dataframe(TASKS[name])
        return X, y, "classification"
    if name in DATASETS:
        ds = openml.datasets.get_dataset(DATASETS[name])
        X, y, _, _ = ds.get_data(
            dataset_format="dataframe", target=ds.default_target_attribute
        )
        # Same guard as the classification path: only a Series target can be
        # screened for NA, so a missing target no longer fails on y.isna().
        if isinstance(y, pd.Series):
            mask = ~y.isna()
            X, y = X.loc[mask], y.loc[mask]
        return X, y, "regression"
    raise ValueError(f"Unknown dataset {name}")
--- a/src/grid_search_batch.sh
+++ b/src/grid_search_batch.sh
@@ -8,16 +8,16 @@
 #SBATCH --error=%x-%j.err         # Separate error log
 # Load necessary modules
-module load python/3.8
+#module load python/3.8
 # Activate your virtual environment
-source ~/envs/workdir/bin/activate
+source /env0/bin/activate
 # Parameters
 TIME=$1
 # Run the Python script with the specified time parameter
-srun python /home/dvera/scratch/Framework_EXP/grid_search_exp.py --time $TIME
+srun python $WORK_DIR/src/grid_search_exp.py --time $TIME
 # Deactivate the virtual environment
 deactivate
--- a/src/nsga_batch.sh
+++ b/src/nsga_batch.sh
@@ -8,13 +8,13 @@
 #SBATCH --error=%x-%j.err         # Separate error log
 # Load necessary modules
-module load python/3.8
+#module load python/3.8
 # Activate your virtual environment
-source ~/envs/workdir/bin/activate
+source /env0/bin/activate
 # Run the Python script with the specified time parameter
-srun python /home/dvera/scratch/Framework_EXP/nsga_exp.py
+srun python $WORK_DIR/src/nsga_exp.py
 # Deactivate the virtual environment
 deactivate