From 5775534e2221d044d36c6e392e2c200eeede58fb Mon Sep 17 00:00:00 2001 From: Varyngoth Date: Tue, 11 Nov 2025 20:24:36 -0400 Subject: [PATCH] Major changes for real this time --- background.txt | 81 +++++++++++++++++++++++++++++++-- requirements.txt | 2 +- setup.sh | 6 +-- src/dataset.py | 113 ++++++++++++++++++++++++++++++++++++---------- src/nsga_batch.sh | 11 ++--- src/nsga_exp.py | 42 ++++++++--------- 6 files changed, 196 insertions(+), 59 deletions(-) diff --git a/background.txt b/background.txt index f162c63..9f58376 100644 --- a/background.txt +++ b/background.txt @@ -1,5 +1,9 @@ +Codebase: https://gitlab.com/university-of-prince-edward-isalnd/explanation-aware-optimization-and-automl/-/tree/main/src?ref_type=heads +Previous Analysis: +https://gitlab.com/agri-food-canada/potato-yield-predictions-by-postal-code-ml + Operation: Specify working directory (local repo location), cache directory (dataset download location), and @@ -12,15 +16,84 @@ Code File Structure Shell scripts - h20_batch.sh -> - nsga_batch.sh -> - grid_search_batch.sh -> + h20_batch.sh -> h20_autoML.py + nsga_batch.sh -> nsga_exp.py + grid_search_batch.sh -> grid_search_exp.py +grid_search_batch calls both algorithms and combine_datasets + +Run order should be + +datasets -> algorithms -> combine_datasets -> 3 .sh files -> shap_values_computation.py +############################################################################################################################################################ +Objective: + +Current code is built to perform ML analysis on a potato yield dataset as shown in Potato Yield Predictions by Postal Code ML +The code will need to be modified to work with other datasets +1. Modify code to work with California Housing Price dataset found in dataset.py + (cal_housing, regression dataset) + +2. Modify code to work with some other classification-focused dataset + (dataset.py code contains cal_housing for regression and three classification datasets) + +3. 
Compare the performance of the model in both situations to establish a baseline for regression vs. classification. + Table should include key performance indicators for both datasets as well as number of objects in each dataset + +4. (Ideally) Make models as easy as possible to migrate between datasets through user prompt. + Also cache files for easy referencing and to make sure that data can be analysed properly later + +Files that need changing + +dataset = YES +algorithms = NO +nsga_exp = YES +shap_values_computation = NO(?) + +############################################################################################################################################################ +Scripting Tasks: +datasets -> algorithms -> combine_datasets -> nsga_exp.py -> shap_values_computation + +1. Make datasets generalizable + +2. Make combine_datasets reference generalizable headers / infer from input + +3. Make nsga_exp.py reference the combine_datasets headers + +4. Make output folders specified by user at runtime / in the slurm bash script + +Operation Tasks: +1. Run nsga_exp.py using the California Housing Dataset (regression) + +2. Run the nsga_exp.py script using a separate, classification dataset + +3. Compare results +############################################################################################################################################################ +Code Changes: + +nsga_exp.py +- Lines 24 & 26 reference yield_t/ha. This should be a parameter + +- Lines 33-36 reference relative paths to previous soil.csv files + +- Lines 112 and 116 reference a set value of k (k=25). It might be better to set this dynamically based on the size of the dataset + +- Lines 141-143 reference models_space, pipelines, and k_value range. Should be generalized for other datasets and features + +- Line 134 references an nsga output directory. This could be parameterized for other datasets + +- Lines 183, 190, and 195 reference specific output path csv files. 
This will cause overwriting on subsequent runs. Change to store based on run + +- Lines 124 - 129 reference models and functions from algorithms.py. This could be generalized to allow any model dictionary but not likely beneficial for this study + +datasets.py +- User prompt was added to allow users to choose a dataset of the four and list its Type +- User prompt was added to choose a target feature and features to exclude +- User prompt was added for a save location for the processed csv of the dataset output ############################################################################################################################################################ -Code Changes: +Code Optimizations: - SHAP KernelExplainer Use shap.TreeExplainer on tree-based models instead diff --git a/requirements.txt b/requirements.txt index 82c9fcc..9a283b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ joblib # Data acquisition openml - +deap diff --git a/setup.sh b/setup.sh index 68e858d..dc6531b 100644 --- a/setup.sh +++ b/setup.sh @@ -1,12 +1,12 @@ #!/bin/bash -sudo apt install nfs-common -y +dnf install -y nfs-utils kernel-modules-extra mkdir /mnt/data -mount 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data +mount -t nfs -o vers=3,proto=tcp 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data -$WORK_DIR=/mnt/data +WORK_DIR=/mnt/data mkdir -p /mnt/data/cache # ensure directory exists export OPENML_CACHE_DIR=/mnt/data/cache diff --git a/src/dataset.py b/src/dataset.py index 5d0208a..3c307bf 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -3,48 +3,113 @@ import pandas as pd import openml # --- CACHE SETUP --- -# Change this path to your preferred local cache directory -#CACHE_DIR = os.path.expanduser("~/openml_cache") -#os.makedirs(CACHE_DIR, exist_ok=True) -#openml.config.cache_directory = CACHE_DIR +CACHE_DIR = os.path.expanduser("~/openml_cache") +os.makedirs(CACHE_DIR, exist_ok=True) +openml.config.cache_directory = CACHE_DIR -# OpenML CC18 classification 
tasks (task ids) +# --- Dataset IDs --- TASKS = { - "adult": 7592, # Adult Income classification - "spambase": 43, # Spambase classification - "optdigits": 28, # Optdigits classification + "adult": 7592, + "spambase": 43, + "optdigits": 28, } -# Regression dataset (dataset id) DATASETS = { "cal_housing": 44025 } +# --- Load functions --- def _load_task_dataframe(task_id: int): task = openml.tasks.get_task(task_id) dataset_id = task.dataset_id dataset = openml.datasets.get_dataset(dataset_id) - X, y, categorical_indicator, _ = dataset.get_data( - dataset_format="dataframe", - target=task.target_name - ) - # drop rows with NA target if any - if isinstance(y, pd.Series): - mask = ~y.isna() - X, y = X.loc[mask], y.loc[mask] - return X, y + X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=task.target_name) + mask = ~y.isna() + return X.loc[mask], y.loc[mask] def load_dataset(name: str): if name in TASKS: X, y = _load_task_dataframe(TASKS[name]) - return X, y, "classification" + task_type = "classification" elif name in DATASETS: ds_id = DATASETS[name] ds = openml.datasets.get_dataset(ds_id) - X, y, categorical_indicator, _ = ds.get_data( - dataset_format="dataframe", target=ds.default_target_attribute - ) - mask = ~y.isna() - return X.loc[mask], y.loc[mask], "regression" + target_col = ds.default_target_attribute + X, y, _, _ = ds.get_data(dataset_format="dataframe", target=None) + mask = ~X[target_col].isna() + X = X.loc[mask] + y = X[target_col].loc[mask] + task_type = "regression" else: raise ValueError(f"Unknown dataset {name}") + return X, y, task_type + +# --- Interactive main --- +def main(): + print("Available datasets:") + all_datasets = list(TASKS.keys()) + list(DATASETS.keys()) + for i, name in enumerate(all_datasets): + print(f"{i+1}. 
{name}") + + selection = input("Enter the dataset name: ").strip() + if selection not in all_datasets: + raise ValueError(f"Dataset '{selection}' not recognized.") + + X, y, task_type = load_dataset(selection) + + # --- Identify default target --- + default_target = y.name + print(f"\nDefault target column: {default_target}") + + # --- Print all features (without explanations) --- + print("\nFeatures in the dataset:") + for col in X.columns.unique(): + print(f"- {col} ({X[col].dtype})") + + # --- Target selection --- + target = input("\nEnter the target feature (or press Enter to use default): ").strip() + if target: + if target not in X.columns: + raise ValueError(f"Target feature '{target}' not found.") + y = X[target] + X = X.drop(columns=[target], errors="ignore") + else: + target = default_target + X = X.drop(columns=[target], errors="ignore") + + # --- Feature exclusion --- + exclude_input = input("\nEnter features to exclude (comma-separated), or press Enter to skip: ").strip() + if exclude_input: + exclude_cols = [col.strip() for col in exclude_input.split(",")] + for col in exclude_cols: + if col in X.columns: + X = X.drop(columns=[col]) + else: + print(f"Warning: '{col}' not found in dataset and cannot be excluded.") + + # --- Show preview --- + print("\nFinal dataset preview (first 5 rows):") + print(X.head()) + print("\nTarget preview (first 5 rows):") + print(y.head()) + print(f"\nTask type: {task_type}") + print(f"Target column: {target}") + print(f"Number of features: {len(X.columns)}") + + # --- Export to CSV --- + output_file = input("\nEnter filename to save dataset as CSV (e.g., dataset.csv): ").strip() + if output_file: + df_export = X.copy() + df_export[target] = y # append target at the end + df_export.to_csv(output_file, index=False) + print(f"Dataset saved to {output_file} (target column: '{target}')") + + # Save the CSV path to a temporary text file in the current directory + temp_path_file = "last_csv_path.txt" + full_path = 
os.path.abspath(output_file) + with open(temp_path_file, "w") as f: + f.write(full_path) + print(f"CSV path written to {temp_path_file}") + +if __name__ == "__main__": + main() diff --git a/src/nsga_batch.sh b/src/nsga_batch.sh index 0b6ae40..69ebddf 100644 --- a/src/nsga_batch.sh +++ b/src/nsga_batch.sh @@ -1,9 +1,8 @@ #!/bin/bash -#SBATCH --account=def-xander # Replace with your account -#SBATCH --mem=10G # Memory allocation +#SBATCH --mem=30G # Memory allocation #SBATCH --time=21:00:00 # Total run time limit (11 hours) -#SBATCH --cpus-per-task=4 # Number of CPU cores per task -#SBATCH --job-name=nsga_%A # Job name with job ID appended +#SBATCH --cpus-per-task=8 # Number of CPU cores per task +#SBATCH --job-name=nsga # Job name with job ID appended #SBATCH --output=%x-%j.out # Standard output and error log #SBATCH --error=%x-%j.err # Separate error log @@ -11,10 +10,10 @@ #module load python/3.8 # Activate your virtual environment -source /env0/bin/activate +#source /env0/bin/activate # Run the Python script with the specified time parameter -srun python $WORK_DIR/src/nsga_exp.py +srun python /mnt/data/src/nsga_exp.py # Deactivate the virtual environment deactivate diff --git a/src/nsga_exp.py b/src/nsga_exp.py index 729d2c9..943888f 100644 --- a/src/nsga_exp.py +++ b/src/nsga_exp.py @@ -13,31 +13,33 @@ import shap from deap import base, creator, tools, algorithms from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso -from combine_datasets import combine_datasets +import argparse creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0)) # Minimize both objectives creator.create("Individual", list, fitness=creator.FitnessMin) +def load_dataset(): + + # Read the CSV path from the temporary file + with open("last_csv_path.txt", "r") as f: + csv_path = f.read().strip() + + if not os.path.exists(csv_path): + raise FileNotFoundError(f"CSV file not found: {csv_path}") + + # Load the dataset + data 
= pd.read_csv(csv_path) + return data + # Preprocess the data and separate features and target -def preprocess_data(input_data, pipeline, k_value): +def preprocess_data(input_data, pipeline, k_value, target_column): pipeline.named_steps['feature_selection'].set_params(k=k_value) - X = input_data.drop(columns=['yield_t/ha'], errors='ignore') - print(X.columns) - y = input_data['yield_t/ha'] + X = input_data.drop(columns=[target_column], errors='ignore') + print(X.columns) + y = input_data[target_column] X = pipeline.fit_transform(X, y) return X, y -# Load and combine datasets -def load_and_combine_datasets(use_ndvi=True): - data = combine_datasets( - "./data/potatoes_dataset.csv", - "./data/updated_soil_data_with_awc.csv", - "./data/final_augmented_climate_data.csv", - "./data/NDVI.csv", - ndvi=use_ndvi - ) - return data - # Function to compute SHAP values using KMeans clustering def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10): kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train) @@ -94,13 +96,12 @@ def evaluate_individual(individual, data, kf): return np.mean(mse_scores), np.mean(shap_stabilities), all_shap_values # Run NSGA-II experiment with a time limit -def run_nsga_experiment(use_ndvi=True, - output_base_directory="./results_nsga", +def run_nsga_experiment(output_base_directory="./results_nsga", population_size=30, n_generations=50, time_limit=36000): - # Load and combine dataset - data = load_and_combine_datasets(use_ndvi=use_ndvi) + # Load the dataset + data = load_dataset() # Pipelines for preprocessing global pipelines @@ -199,7 +200,6 @@ def run_nsga_experiment(use_ndvi=True, if __name__ == "__main__": run_nsga_experiment( - use_ndvi=True, output_base_directory="./results_nsga", population_size=80, # Larger population size for a comprehensive search n_generations=100, # Increased number of generations