Major changes for real this time
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
Codebase:
|
||||
https://gitlab.com/university-of-prince-edward-isalnd/explanation-aware-optimization-and-automl/-/tree/main/src?ref_type=heads
|
||||
|
||||
Previous Analysis:
|
||||
https://gitlab.com/agri-food-canada/potato-yield-predictions-by-postal-code-ml
|
||||
|
||||
Operation:
|
||||
Specify working directory (local repo location), cache directory (dataset download location), and
|
||||
|
||||
@@ -12,15 +16,84 @@ Code File Structure
|
||||
|
||||
Shell scripts
|
||||
|
||||
h20_batch.sh ->
|
||||
nsga_batch.sh ->
|
||||
grid_search_batch.sh ->
|
||||
h20_batch.sh -> h20_autoML.py
|
||||
nsga_batch.sh -> nsga_exp.py
|
||||
grid_search_batch.sh -> grid_search_exp.py
|
||||
|
||||
grid_search_batch calls both algorithms and combine_datasets
|
||||
|
||||
Run order should be
|
||||
|
||||
datasets -> algorithms -> combine_datasets -> 3 .sh files -> shap_values_computation.py
|
||||
############################################################################################################################################################
|
||||
Objective:
|
||||
|
||||
Current code is built to perform ML analysis on a potato yield dataset as shown in Potato Yield Predictions by Postal Code ML
|
||||
The code will need to be modified to work with other datasets
|
||||
1. Modify code to work with California Housing Price dataset found in datasets.py
|
||||
(cal_housing, regression dataset)
|
||||
|
||||
2. Modify code to work with some other classification focused dataset
|
||||
(dataset.py code contains cal_housing for regression and three classification datasets)
|
||||
|
||||
3. Compare the performance of the model in both situations to establish a regression vs. classification baseline.
|
||||
Table should include key performance indicators for both datasets as well as number of objects in each dataset
|
||||
|
||||
4. (Ideally) Make models as easy as possible to migrate between datasets through user prompt.
|
||||
Also cache files for easy referencing and to make sure that data can be analysed properly later
|
||||
|
||||
Files that need changing
|
||||
|
||||
dataset = YES
|
||||
algorithms = NO
|
||||
nsga_exp = YES
|
||||
shap_values_computation = NO(?)
|
||||
|
||||
############################################################################################################################################################
|
||||
Scripting Tasks:
|
||||
datasets -> algorithms -> combine datasets -> nsga_exp.py -> shap_values_computation
|
||||
|
||||
1. Make datasets generalizable
|
||||
|
||||
2. Make combine datasets reference generalizable headers / infer from input
|
||||
|
||||
3. Make nsga_exp.py reference the combine_dataset headers
|
||||
|
||||
4. Make output folders specified by user at runtime / in the slurm bash script
|
||||
|
||||
Operation Tasks:
|
||||
1. Run nsga_exp.py using the California Housing Dataset (regression)
|
||||
|
||||
2. Run the nsga_exp.py script using a separate, classification dataset
|
||||
|
||||
3. Compare results
|
||||
############################################################################################################################################################
|
||||
Code Changes:
|
||||
|
||||
nsga_exp.py
|
||||
- Lines 24 & 26 reference yield_t/ha. This should be a parameter
|
||||
|
||||
- Lines 33-36 reference relative paths to previous soil.csv files
|
||||
|
||||
- Lines 112 and 116 reference a set value of k (k=25). It might be better to set this dynamically based on the size of the dataset
|
||||
|
||||
- Lines 141 - 143 reference models_space, pipelines, and k_value range. Should be generalized for other datasets and features
|
||||
|
||||
- Line 134 references an ngsa output directory. This could be parameterized for other datasets
|
||||
|
||||
- Lines 183, 190, and 195 reference specific output path csv files. This will cause overwriting on subsequent runs. Change to store based on run
|
||||
|
||||
- Lines 124 - 129 reference models and functions from algorithms.py. This could be generalized to allow any model dictionary but not likely beneficial for this study
|
||||
|
||||
datasets.py
|
||||
- User prompt was added to allow users to choose a dataset of the four and list its Type
|
||||
- User prompt was added to choose a target feature and features to exclude
|
||||
- User prompt was added for a save location for the processed csv of the dataset output
|
||||
|
||||
|
||||
|
||||
############################################################################################################################################################
|
||||
Code Changes:
|
||||
Code Optimizations:
|
||||
|
||||
- SHAP KernelExplainer
|
||||
Use shap.TreeExplainer on tree-based models instead
|
||||
|
||||
@@ -11,4 +11,4 @@ joblib
|
||||
|
||||
# Data acquisition
|
||||
openml
|
||||
|
||||
deap
|
||||
|
||||
6
setup.sh
6
setup.sh
@@ -1,12 +1,12 @@
|
||||
#!/bin/bash
# Mount the NFS dataset share and configure the OpenML cache directory.
# NOTE(review): assumes a Fedora/RHEL-style host (dnf) — confirm target OS.

# NFS client utilities.
dnf install -y nfs-utils kernel-modules-extra

# -p: do not fail if the mount point already exists (script is re-runnable).
mkdir -p /mnt/data

# Force NFSv3 over TCP; the plain "mount host:path" form was replaced with
# explicit options in this revision.
mount -t nfs -o vers=3,proto=tcp 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data

# Working directory used by the batch scripts.
WORK_DIR=/mnt/data

mkdir -p /mnt/data/cache   # ensure cache directory exists
export OPENML_CACHE_DIR=/mnt/data/cache
|
||||
|
||||
111
src/dataset.py
111
src/dataset.py
@@ -3,48 +3,113 @@ import pandas as pd
|
||||
import openml
|
||||
|
||||
# --- CACHE SETUP ---
# Change this path to your preferred local cache directory.
# The OPENML_CACHE_DIR environment set in setup.sh is independent of this;
# openml.config.cache_directory is what the library actually consults here.
CACHE_DIR = os.path.expanduser("~/openml_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
openml.config.cache_directory = CACHE_DIR

# --- Dataset IDs ---
# OpenML CC18 classification task ids (task id, not dataset id).
TASKS = {
    "adult": 7592,       # Adult Income classification
    "spambase": 43,      # Spambase classification
    "optdigits": 28,     # Optdigits classification
}

# Regression dataset ids (OpenML dataset id, fetched directly, no task).
DATASETS = {
    "cal_housing": 44025,   # California Housing prices
}
|
||||
|
||||
# --- Load functions ---
|
||||
def _load_task_dataframe(task_id: int):
    """Fetch an OpenML task's dataset and return (X, y) as dataframes.

    Parameters
    ----------
    task_id : int
        OpenML task id (an entry of TASKS).

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Features and target, with rows whose target is NA dropped.
    """
    task = openml.tasks.get_task(task_id)
    dataset = openml.datasets.get_dataset(task.dataset_id)
    # Fetch once; splitting on the task's declared target keeps y out of X.
    X, y, _, _ = dataset.get_data(
        dataset_format="dataframe",
        target=task.target_name,
    )
    # Drop rows with NA target, if any.
    if isinstance(y, pd.Series):
        mask = ~y.isna()
        X, y = X.loc[mask], y.loc[mask]
    return X, y
|
||||
|
||||
def load_dataset(name: str):
    """Load a dataset by name and return (X, y, task_type).

    Names in TASKS are classification tasks; names in DATASETS are
    regression datasets. In both cases the target column is separated
    from the features and rows with a missing target are dropped.

    Raises
    ------
    ValueError
        If `name` is in neither TASKS nor DATASETS.
    """
    if name in TASKS:
        X, y = _load_task_dataframe(TASKS[name])
        task_type = "classification"
    elif name in DATASETS:
        ds = openml.datasets.get_dataset(DATASETS[name])
        target_col = ds.default_target_attribute
        # Splitting on target= keeps the target out of X, matching the
        # classification branch (previously the target stayed inside X,
        # leaking it into the feature set).
        X, y, _, _ = ds.get_data(
            dataset_format="dataframe", target=target_col
        )
        mask = ~y.isna()
        X, y = X.loc[mask], y.loc[mask]
        task_type = "regression"
    else:
        raise ValueError(f"Unknown dataset {name}")
    return X, y, task_type
|
||||
|
||||
# --- Interactive main ---
|
||||
def main():
    """Interactively pick a dataset, target, and feature set, then export CSV.

    Flow: list datasets -> load -> optionally override the target column ->
    optionally exclude features -> preview -> save CSV and record its
    absolute path in last_csv_path.txt (consumed by nsga_exp.py).
    """
    print("Available datasets:")
    all_datasets = list(TASKS) + list(DATASETS)
    for i, name in enumerate(all_datasets, start=1):
        print(f"{i}. {name}")

    selection = input("Enter the dataset name: ").strip()
    if selection not in all_datasets:
        raise ValueError(f"Dataset '{selection}' not recognized.")

    X, y, task_type = load_dataset(selection)

    # --- Identify default target ---
    default_target = y.name
    print(f"\nDefault target column: {default_target}")

    # --- Print all features (without explanations) ---
    print("\nFeatures in the dataset:")
    for col in X.columns:
        print(f"- {col} ({X[col].dtype})")

    # --- Target selection ---
    target = input("\nEnter the target feature (or press Enter to use default): ").strip()
    if target:
        if target not in X.columns:
            raise ValueError(f"Target feature '{target}' not found.")
        y = X[target]
    else:
        target = default_target
    # Drop whichever target ended up selected; errors="ignore" covers the
    # default case, where the loader already removed it from X.
    X = X.drop(columns=[target], errors="ignore")

    # --- Feature exclusion ---
    exclude_input = input("\nEnter features to exclude (comma-separated), or press Enter to skip: ").strip()
    if exclude_input:
        for col in (c.strip() for c in exclude_input.split(",")):
            if col in X.columns:
                X = X.drop(columns=[col])
            else:
                print(f"Warning: '{col}' not found in dataset and cannot be excluded.")

    # --- Show preview ---
    print("\nFinal dataset preview (first 5 rows):")
    print(X.head())
    print("\nTarget preview (first 5 rows):")
    print(y.head())
    print(f"\nTask type: {task_type}")
    print(f"Target column: {target}")
    print(f"Number of features: {len(X.columns)}")

    # --- Export to CSV ---
    output_file = input("\nEnter filename to save dataset as CSV (e.g., dataset.csv): ").strip()
    if output_file:
        df_export = X.copy()
        df_export[target] = y  # append target at the end
        df_export.to_csv(output_file, index=False)
        print(f"Dataset saved to {output_file} (target column: '{target}')")

        # Record the absolute CSV path so downstream scripts (nsga_exp.py)
        # can find the processed dataset without re-prompting.
        temp_path_file = "last_csv_path.txt"
        full_path = os.path.abspath(output_file)
        with open(temp_path_file, "w") as f:
            f.write(full_path)
        print(f"CSV path written to {temp_path_file}")


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
#!/bin/bash
#SBATCH --account=def-xander       # Replace with your account
#SBATCH --mem=30G                  # Memory allocation
#SBATCH --time=21:00:00            # Total run time limit (21 hours)
#SBATCH --cpus-per-task=8          # Number of CPU cores per task
#SBATCH --job-name=nsga            # Job name
#SBATCH --output=%x-%j.out         # Standard output log
#SBATCH --error=%x-%j.err          # Separate error log

# Load modules if the cluster requires them
#module load python/3.8

# Activate your virtual environment
#source /env0/bin/activate

# Run the Python script with the specified time parameter
srun python /mnt/data/src/nsga_exp.py

# Deactivate the virtual environment — commented out to match the disabled
# activation above; calling `deactivate` with no active venv errors out.
#deactivate
|
||||
|
||||
@@ -13,31 +13,33 @@ import shap
|
||||
from deap import base, creator, tools, algorithms
|
||||
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
|
||||
|
||||
from combine_datasets import combine_datasets
|
||||
import argparse
|
||||
|
||||
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0)) # Minimize both objectives
|
||||
creator.create("Individual", list, fitness=creator.FitnessMin)
|
||||
|
||||
def load_dataset():
    """Load the processed dataset CSV produced by dataset.py.

    Reads the CSV path from last_csv_path.txt (written by dataset.py's
    export step) and loads it with pandas.

    Returns
    -------
    pd.DataFrame
        The full dataset, features plus target column.

    Raises
    ------
    FileNotFoundError
        If last_csv_path.txt does not exist, or the path it records
        no longer points to a file.
    """
    # Read the CSV path from the handoff file written by dataset.py.
    with open("last_csv_path.txt", "r") as f:
        csv_path = f.read().strip()

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    # Load the dataset
    data = pd.read_csv(csv_path)
    return data
|
||||
|
||||
# Preprocess the data and separate features and target
|
||||
def preprocess_data(input_data, pipeline, k_value, target_column='yield_t/ha'):
    """Split input_data into features/target and fit-transform the features.

    Parameters
    ----------
    input_data : pd.DataFrame
        Full dataset including the target column.
    pipeline : sklearn-style Pipeline
        Must contain a 'feature_selection' named step accepting a `k` param.
    k_value : int
        Number of features for the selection step.
    target_column : str
        Name of the target column. Defaults to 'yield_t/ha' so existing
        potato-yield callers keep working unchanged.

    Returns
    -------
    (array-like, pd.Series)
        Transformed features and the target.
    """
    pipeline.named_steps['feature_selection'].set_params(k=k_value)
    X = input_data.drop(columns=[target_column], errors='ignore')
    print(X.columns)
    y = input_data[target_column]
    X = pipeline.fit_transform(X, y)
    return X, y
|
||||
|
||||
# Load and combine datasets
|
||||
def load_and_combine_datasets(use_ndvi=True, data_dir="./data"):
    """Load the raw potato-yield CSVs and merge them into one DataFrame.

    Parameters
    ----------
    use_ndvi : bool
        Whether to merge the NDVI data as well (forwarded to
        combine_datasets as `ndvi=`).
    data_dir : str
        Directory holding the source CSVs. Defaults to "./data", which
        reproduces the previously hard-coded relative paths, while letting
        other datasets live elsewhere.
    """
    data = combine_datasets(
        f"{data_dir}/potatoes_dataset.csv",
        f"{data_dir}/updated_soil_data_with_awc.csv",
        f"{data_dir}/final_augmented_climate_data.csv",
        f"{data_dir}/NDVI.csv",
        ndvi=use_ndvi,
    )
    return data
|
||||
|
||||
# Function to compute SHAP values using KMeans clustering
|
||||
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
|
||||
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
|
||||
@@ -94,13 +96,12 @@ def evaluate_individual(individual, data, kf):
|
||||
return np.mean(mse_scores), np.mean(shap_stabilities), all_shap_values
|
||||
|
||||
# Run NSGA-II experiment with a time limit
|
||||
def run_nsga_experiment(use_ndvi=True,
|
||||
output_base_directory="./results_nsga",
|
||||
def run_nsga_experiment(output_base_directory="./results_nsga",
|
||||
population_size=30,
|
||||
n_generations=50,
|
||||
time_limit=36000):
|
||||
# Load and combine dataset
|
||||
data = load_and_combine_datasets(use_ndvi=use_ndvi)
|
||||
# Load the dataset
|
||||
data = load_dataset()
|
||||
|
||||
# Pipelines for preprocessing
|
||||
global pipelines
|
||||
@@ -199,7 +200,6 @@ def run_nsga_experiment(use_ndvi=True,
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_nsga_experiment(
|
||||
use_ndvi=True,
|
||||
output_base_directory="./results_nsga",
|
||||
population_size=80, # Larger population size for a comprehensive search
|
||||
n_generations=100, # Increased number of generations
|
||||
|
||||
Reference in New Issue
Block a user