diff --git a/Figures/Framework.png b/Figures/Framework.png new file mode 100644 index 0000000..7318489 Binary files /dev/null and b/Figures/Framework.png differ diff --git a/Figures/Model_complex_Opt.png b/Figures/Model_complex_Opt.png new file mode 100644 index 0000000..443b1c1 Binary files /dev/null and b/Figures/Model_complex_Opt.png differ diff --git a/Figures/Pima_mutiplot.png b/Figures/Pima_mutiplot.png new file mode 100644 index 0000000..01a578b Binary files /dev/null and b/Figures/Pima_mutiplot.png differ diff --git a/Figures/features_heatmap.png b/Figures/features_heatmap.png new file mode 100644 index 0000000..3dfb2b3 Binary files /dev/null and b/Figures/features_heatmap.png differ diff --git a/Figures/optimization_surfaces.png b/Figures/optimization_surfaces.png new file mode 100644 index 0000000..3c2aa12 Binary files /dev/null and b/Figures/optimization_surfaces.png differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..0c7ee0f --- /dev/null +++ b/README.md @@ -0,0 +1,23 @@ +# Explanation-Aware Automated Machine Learning + +This repository accompanies the research paper: + +**โ€œMulti-Objective Automated Machine Learning for Explainable Artificial Intelligence: Optimizing Predictive Accuracy and Shapley-Based Feature Stability.โ€** + +In high-stakes domains such as agriculture, machine learning models must be not only accurate but also transparent and aligned with domain knowledge. This project presents a novel **multi-objective optimization framework** that jointly maximizes predictive performance and explanation stability. Specifically, we introduce a formal metric based on the **variance of Shapley Additive Explanations across cross-validation folds**, embedding it directly into the model selection process. + +Our approach leverages the **Non-dominated Sorting Genetic Algorithm II** to evolve models that balance predictive accuracy with robust, semantically consistent explanations. When applied to potato yield prediction, the framework outperforms both **H2O.ai's Automatic Machine Learning platform** and traditional grid search, producing models that are both high-performing and interpretable. + +--- + +## ๐Ÿ” Key Features + +- Multi-objective optimization for predictive accuracy and explanation stability +- Shapley-based metric embedded into the model selection loop +- Implementation using NSGA-II for evolutionary search +- Reproducible case study in potato yield forecasting +- Baseline comparisons with grid search and H2O.aiโ€™s platform + +--- + +## ๐Ÿ“‚ Repository Structure diff --git a/background.txt b/background.txt new file mode 100644 index 0000000..58d5acb --- /dev/null +++ b/background.txt @@ -0,0 +1,42 @@ +https://gitlab.com/university-of-prince-edward-isalnd/explanation-aware-optimization-and-automl/-/tree/main/src?ref_type=heads + + + +############################################################################################################################################################ +Code File Structure + +Shell scripts + + h20_batch.sh -> + nsga_batch.sh -> + grid_search_batch.sh -> + + + + +############################################################################################################################################################ +Code Changes: + +- SHAP KernelExplainer + Use shap.TreeExplainer on tree-based models instead + +- AutoML search size + Reduce max_models or max_runtime_secs per fold or pre-select algorithms + +- Data transformations + Cache intermediate NumPy arrays to skip repeated fit_transform calls in each fold + +- Parallel folds + if CPU has many cores, parallelize the K-fold loop with joblib.parallel to fully use a higher core count CPU + +############################################################################################################################################################ +Notes +- The Slurm headers indicate that the programs should be run on a system with 4 cores per task and 10GB of RAM. + This is quite conservative and would not need to be directed towards a cloud-computing environment to run + +- The three jobs run with a run time limit of 11 hours. Considering average Compute Canada / AceNet servers (approx 2.5GHz CPUs), + allocate a time limit of at least 5 hours to run on a 13600KF system (assuming no hyperthreading and E-core processing) + +- H20 AutoML supports GPU compute using CUDA libraries. A CUDA accelerate GPU may see performance gains for this computation + +- \ No newline at end of file diff --git a/src/algorithms.py b/src/algorithms.py new file mode 100644 index 0000000..6974e08 --- /dev/null +++ b/src/algorithms.py @@ -0,0 +1,112 @@ +from sklearn.linear_model import Lasso, Ridge +from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import StackingRegressor + + +def lasso(): + model = Lasso() + params = { + 'alpha': [0.001 * (10 ** i) for i in range(5)], # Steps by powers of 10 + 'max_iter': [1000 + 1000 * i for i in range(10)], # Steps by 1000 + 'tol': [1e-4 / (10 ** i) for i in range(5)], # Steps by powers of 10 + 'warm_start': [True, False], + 'positive': [True, False], + 'selection': ['cyclic', 'random'] + } + return model, params + + +def random_forest(random_state=42): + model = RandomForestRegressor(random_state=random_state) + params = { + 'n_estimators': [100 + 100 * i for i in range(10)], # Steps by 100 + 'max_depth': [10 + 10 * i for i in range(4)], # Steps by 10 + 'min_samples_split': [2 + i for i in range(3)], # Steps by 1 + 'min_samples_leaf': [1 + i for i in range(4)], # Steps by 1 + 'max_features': ['sqrt', 'log2'], + 'bootstrap': [True, False], + } + return model, params + + +def gradient_boosting(random_state=42): + model = GradientBoostingRegressor(random_state=random_state) + params = { + 'n_estimators': [50, 100, 150, 200, 250, 300], # Reasonable range for most tasks + 'learning_rate': [0.01, 0.05, 0.1, 0.2], # More conservative learning rates + 'max_depth': [3, 4, 5, 6], # Keeps complexity manageable + 'min_samples_split': [2, 3, 4], # Commonly used values + 'min_samples_leaf': [1, 2, 3, 4], # Commonly used values + 'subsample': [0.8, 0.9, 1.0], # Reasonable subsample values + 'max_features': ['sqrt', 'log2'], + } + return model, params + + +def decision_tree_regressor(): + model = DecisionTreeRegressor() + params = { + "splitter": ["best", "random"], + "max_depth": [1 + 2 * i for i in range(6)], # Steps by 2 + "min_samples_split": [2 + i for i in range(3)], # Steps by 1 + "min_samples_leaf": [1 + i for i in range(4)], # Steps by 1 + "min_weight_fraction_leaf": [0.0 + 0.1 * i for i in range(3)], # Steps by 0.1 + "max_features": [None, "sqrt", "log2"], + "max_leaf_nodes": [10 * i for i in range(1, 6)], # Steps by 10 + } + return model, params + + +def ridge_regressor(random_state=42): + model = Ridge(random_state=random_state) + params = { + 'alpha': [0.001 * (10 ** i) for i in range(5)], # Steps by powers of 10 + 'max_iter': [1000 + 1000 * i for i in range(5)], # Steps by 1000 + 'tol': [1e-4 / (10 ** i) for i in range(5)], # Steps by powers of 10 + 'solver': ['auto'], + 'fit_intercept': [True, False], + 'positive': [True, False] + } + return model, params + + +def stacking_lasso(random_state=42): + base_learners = [ + ('gb', GradientBoostingRegressor(random_state=random_state)), + ('dt', DecisionTreeRegressor(random_state=random_state)), + ] + + model = StackingRegressor( + estimators=base_learners, + final_estimator=Lasso(random_state=random_state), + passthrough=True + ) + params = { + # Lasso hyperparameters (slightly expanded search space) + 'final_estimator__alpha': [0.001, 0.01, 0.1], # Expanded to allow some variation + 'final_estimator__max_iter': [3000, 5000], # Expanded to allow more iterations + 'final_estimator__tol': [1e-4, 1e-5], # Expanded tolerance + 'final_estimator__warm_start': [False], # Fixed + 'final_estimator__positive': [True], # Fixed + 'final_estimator__selection': ['random'], # Fixed + + # Gradient Boosting hyperparameters (expanded search space) + 'gb__n_estimators': [50, 100, 200], # Expanded range + 'gb__learning_rate': [0.01, 0.05, 0.1], # Expanded range + 'gb__max_depth': [3, 5, 7], # Expanded range + 'gb__min_samples_split': [2, 4], # Expanded range + 'gb__min_samples_leaf': [1, 2], # Expanded range + 'gb__subsample': [0.8, 0.9, 1.0], # Expanded range + 'gb__max_features': ['sqrt', 'log2'], # Fixed + + # Decision Tree hyperparameters (expanded search space) + 'dt__max_depth': [11, 15, 20], # Expanded depth range + 'dt__max_features': ['log2'], # Fixed + 'dt__max_leaf_nodes': [30, 50, 70], # Expanded range + 'dt__min_samples_leaf': [1, 2, 4], # Expanded range + 'dt__min_samples_split': [2, 4, 6], # Expanded range + 'dt__min_weight_fraction_leaf': [0.1, 0.2], # Reduced range but still variable + 'dt__splitter': ['best', 'random'] # Allowing both splitting strategies + } + return model, params \ No newline at end of file diff --git a/src/combine_datasets.py b/src/combine_datasets.py new file mode 100644 index 0000000..3e4e764 --- /dev/null +++ b/src/combine_datasets.py @@ -0,0 +1,92 @@ +import pandas as pd +from loading_climate_data import load_and_process_climate_data +from loading_crop_data import engineer_crop_features +from loading_ndvi_data import process_ndvi_data +from loading_soil_data import get_soil_data + + +def combine_datasets(crop_data_file, + soil_data_file, + climate_data_file, + ndvi_data_file, + ndvi=False): + dataset_loaders = { + 'soil': lambda: get_soil_data(soil_data_file), + 'climate': lambda: load_and_process_climate_data(climate_data_file), + 'ndvi': lambda: process_ndvi_data(ndvi_data_file), + 'crop': lambda: engineer_crop_features(crop_data_file) + } + + merge_keys = { + 'soil': ['PostalCode'], + 'climate': ['PostalCode', 'Year'], + 'ndvi': ['PostalCode', 'Year'], + 'crop': ['PostalCode', 'Year'] + } + + combined_data = dataset_loaders['crop']() + + # Merge climate data + climate_data = dataset_loaders['climate']() + combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left') + + # Merge NDVI data if required + if ndvi: + ndvi_data = dataset_loaders['ndvi']() + combined_data = combined_data[combined_data['Year'] >= 2000] + combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left') + + # Merge soil data + soil_data = dataset_loaders['soil']() + combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left') + postal_codes = combined_data['PostalCode'] + years = combined_data['Year'] + + # Drop irrelevant or redundant columns + features_to_exclude = ['PostalCode', 'Year', 'SoilID'] + + combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore') + + return combined_data + + +def combine_dataset_pc(crop_data_file, + soil_data_file, + climate_data_file, + ndvi_data_file, + ndvi=False): + dataset_loaders = { + 'soil': lambda: get_soil_data(soil_data_file), + 'climate': lambda: load_and_process_climate_data(climate_data_file), + 'ndvi': lambda: process_ndvi_data(ndvi_data_file), + 'crop': lambda: engineer_crop_features(crop_data_file) + } + + merge_keys = { + 'soil': ['PostalCode'], + 'climate': ['PostalCode', 'Year'], + 'ndvi': ['PostalCode', 'Year'], + 'crop': ['PostalCode', 'Year'] + } + + combined_data = dataset_loaders['crop']() + + # Merge climate data + climate_data = dataset_loaders['climate']() + combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left') + + # Merge NDVI data if required + if ndvi: + ndvi_data = dataset_loaders['ndvi']() + combined_data = combined_data[combined_data['Year'] >= 2000] + combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left') + + # Merge soil data + soil_data = dataset_loaders['soil']() + combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left') + + # Drop irrelevant or redundant columns + features_to_exclude = ['Year', 'SoilID'] + combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore') + + return combined_data diff --git a/src/grid_search_batch.sh b/src/grid_search_batch.sh new file mode 100644 index 0000000..feef913 --- /dev/null +++ b/src/grid_search_batch.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH --account=def-xander # Replace with your account +#SBATCH --mem=10G # Memory allocation +#SBATCH --time=21:00:00 # Total run time limit (11 hours) +#SBATCH --cpus-per-task=4 # Number of CPU cores per task +#SBATCH --job-name=grid_search_%A # Job name with job ID appended +#SBATCH --output=%x-%j.out # Standard output and error log +#SBATCH --error=%x-%j.err # Separate error log + +# Load necessary modules +module load python/3.8 + +# Activate your virtual environment +source ~/envs/workdir/bin/activate + +# Parameters +TIME=$1 + +# Run the Python script with the specified time parameter +srun python /home/dvera/scratch/Framework_EXP/grid_search_exp.py --time $TIME + +# Deactivate the virtual environment +deactivate diff --git a/src/grid_search_exp.py b/src/grid_search_exp.py new file mode 100644 index 0000000..1afb764 --- /dev/null +++ b/src/grid_search_exp.py @@ -0,0 +1,186 @@ +import os +import time +import numpy as np +import pandas as pd +import warnings +import shap +from sklearn.model_selection import KFold, ParameterGrid +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer +from sklearn.impute import SimpleImputer +from sklearn.feature_selection import SelectKBest, f_regression +from sklearn.metrics import mean_squared_error +from sklearn.cluster import KMeans +from sklearn.exceptions import ConvergenceWarning +from joblib import dump + +from combine_datasets import combine_datasets +from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso + +warnings.filterwarnings("ignore", category=ConvergenceWarning) + + +# Preprocess the data and separate features and target +def preprocess_data(input_data, pipeline): + X = input_data.drop(columns=['yield_t/ha'], errors='ignore') + y = input_data['yield_t/ha'] + + # Fit-transform the pipeline steps without `y` first + for name, step in pipeline.steps: + if name != 'feature_selection': + X = step.fit_transform(X) + + # Apply `SelectKBest` separately, which requires `y` + if 'feature_selection' in [name for name, _ in pipeline.steps]: + X = pipeline.named_steps['feature_selection'].fit_transform(X, y) + + return X, y + + +# Function to compute SHAP values using KMeans clustering +def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10): + # Fit KMeans to the training data with explicit n_init parameter + kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train) + cluster_centers = kmeans.cluster_centers_ + + # Compute SHAP values on cluster centers + explainer = shap.KernelExplainer(model.predict, cluster_centers) + shap_values = explainer.shap_values(X_test) + + return shap_values + + +# Iterative grid search with time monitoring +def iterative_grid_search(model, param_grid, X, y, cv, time_limit): + best_model = None + best_score = -np.inf + start_time = time.time() + + # Manually iterate over parameter combinations + for params in ParameterGrid(param_grid): + elapsed_time = time.time() - start_time + if elapsed_time > time_limit: + print("Time limit exceeded. Stopping search.") + break + + model.set_params(**params) + scores = [] + + for train_idx, test_idx in cv.split(X, y): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + model.fit(X_train, y_train) + predictions = model.predict(X_test) + score = -mean_squared_error(y_test, predictions) + scores.append(score) + + mean_score = np.mean(scores) + if mean_score > best_score: + best_score = mean_score + best_model = model + + return best_model + + +# Main experiment function +def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"): + time_per_algorithm = total_hours * 3600 / 5 + + # Load and combine datasets + data = combine_datasets( + "./data/potatoes_dataset.csv", + "./data/updated_soil_data_with_awc.csv", + "./data/final_augmented_climate_data.csv", + "./data/NDVI.csv", + ndvi=True + ) + + pipelines = [ + Pipeline([ + ('normalize', StandardScaler()), + ('imputer', SimpleImputer(strategy='mean')), + ('poly_features', PolynomialFeatures(degree=2)), + ('feature_selection', SelectKBest(score_func=f_regression, k=25)) + ]), + Pipeline([ + ('robust', RobustScaler()), + ('feature_selection', SelectKBest(score_func=f_regression, k=25)), + ('power_transformation', PowerTransformer(method='yeo-johnson')) + ]) + ] + + models = { + 'lasso': lasso, + 'ridge': ridge_regressor, + 'random_forest': random_forest, + 'gradient_boosting': gradient_boosting, + 'decision_tree': decision_tree_regressor, + 'stacking_ensemble': stacking_lasso + } + + output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h") + os.makedirs(output_directory, exist_ok=True) + + kf = KFold(n_splits=5, shuffle=True, random_state=42) + + for model_name, model_func in models.items(): + best_pipeline_result = None + + for pipeline_idx, pipeline in enumerate(pipelines): + X, y = preprocess_data(data, pipeline) + model, params = model_func() + + best_model = iterative_grid_search( + model, params, X, y, cv=kf, time_limit=time_per_algorithm + ) + + shap_stabilities = [] + for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)): + X_train_fold, X_test_fold = X[train_idx], X[test_idx] + y_train_fold = y[train_idx] + + best_model.fit(X_train_fold, y_train_fold) + shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold) + fold_shap_stability = np.std(shap_values, axis=0).mean() + shap_stabilities.append(fold_shap_stability) + + shap_stability = np.mean(shap_stabilities) + predictions = best_model.predict(X) + mse = mean_squared_error(y, predictions) + + if best_pipeline_result is None or ( + mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']): + best_pipeline_result = { + 'model': best_model, + 'pipeline_idx': pipeline_idx, + 'mse': mse, + 'shap_stability': shap_stability + } + + if best_pipeline_result: + model_output_dir = os.path.join(output_directory, model_name) + os.makedirs(model_output_dir, exist_ok=True) + + model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib") + dump(best_pipeline_result['model'], model_file_path) + + metrics_df = pd.DataFrame({ + 'Model': [model_name], + 'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"], + 'MSE': [best_pipeline_result['mse']], + 'SHAP_Stability': [best_pipeline_result['shap_stability']] + }) + metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv") + metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path)) + + print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}") + + print(f"All results saved to {output_directory}") + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='Run Grid Search for different models.') + parser.add_argument('--time', type=float, required=True, help='Time for the model to run in hours') + args = parser.parse_args() + run_experiment_with_dynamic_grid(total_hours=args.time, output_base_directory="./results_grid_search") diff --git a/src/h20_autoML.py b/src/h20_autoML.py new file mode 100644 index 0000000..b505d25 --- /dev/null +++ b/src/h20_autoML.py @@ -0,0 +1,166 @@ +import os +import shap +import numpy as np +import pandas as pd +import h2o +from h2o.automl import H2OAutoML +from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.model_selection import KFold +from sklearn.feature_selection import SelectKBest, f_regression +from sklearn.metrics import mean_squared_error +from sklearn.cluster import KMeans + +from combine_datasets import combine_datasets + +# Initialize H2O +h2o.init() + + +# Preprocess the data and separate features and target +def preprocess_data(input_data, pipeline): + X = input_data.drop(columns=['yield_t/ha'], errors='ignore') + y = input_data['yield_t/ha'] + + # Fit-transform the pipeline steps without `y` first + for name, step in pipeline.steps: + if name != 'feature_selection': + X = step.fit_transform(X) + + # Apply `SelectKBest` separately, which requires `y` + if 'feature_selection' in [name for name, _ in pipeline.steps]: + X = pipeline.named_steps['feature_selection'].fit_transform(X, y) + + return X, y + + +# Function to compute SHAP values using KMeans clustering +def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10): + # Fit KMeans to the training data with explicit n_init parameter + kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train) + cluster_centers = kmeans.cluster_centers_ + + # Convert cluster centers to H2OFrame + cluster_centers_h2o = h2o.H2OFrame(cluster_centers) + + # Convert X_test to H2OFrame + X_test_h2o = h2o.H2OFrame(X_test) + + # Compute SHAP values on cluster centers + explainer = shap.KernelExplainer(lambda x: model.predict(h2o.H2OFrame(x)).as_data_frame().values.flatten(), + cluster_centers_h2o.as_data_frame().values) + shap_values = explainer.shap_values(X_test_h2o.as_data_frame().values) + + return shap_values + + +# Function to load and combine datasets +def load_and_combine_datasets(use_ndvi=True): + data = combine_datasets( + "./data/potatoes_dataset.csv", + "./data/updated_soil_data_with_awc.csv", + "./data/final_augmented_climate_data.csv", + "./data/NDVI.csv", + ndvi=use_ndvi + ) + return data + + +# Main experiment function with H2O AutoML +def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"): + data = load_and_combine_datasets(use_ndvi=use_ndvi) + pipelines = [ + Pipeline([ + ('normalize', StandardScaler()), + ('imputer', SimpleImputer(strategy='mean')), + ('poly_features', PolynomialFeatures(degree=2)), + ('feature_selection', SelectKBest(score_func=f_regression, k=25)) + ]), + Pipeline([ + ('robust', RobustScaler()), + ('feature_selection', SelectKBest(score_func=f_regression, k=25)), + ('power_transformation', PowerTransformer(method='yeo-johnson')) + ]) + ] + + output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h") + os.makedirs(output_directory, exist_ok=True) + + metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability']) + + kf = KFold(n_splits=5, shuffle=True, random_state=42) + + for pipeline_idx, pipeline in enumerate(pipelines): + X_processed, y = preprocess_data(data, pipeline) + + X_h2o = h2o.H2OFrame(X_processed) + y_h2o = h2o.H2OFrame(y.to_frame()) + X_h2o['target'] = y_h2o + + shap_stabilities = [] + all_shap_values = [] + + for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)): + train = X_h2o[train_idx.tolist(), :] + test = X_h2o[test_idx.tolist(), :] + + aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42, + sort_metric='MSE') + aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train) + + best_model = aml.leader + + leaderboard = aml.leaderboard.as_data_frame() + leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv") + leaderboard.to_csv(leaderboard_csv_path, index=False) + + model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}") + os.makedirs(model_output_dir, exist_ok=True) + model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip") + h2o.save_model(best_model, path=model_file_path) + + predictions = best_model.predict(test).as_data_frame()['predict'] + + mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values) + + shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values, + test.as_data_frame().values) + shap_stability = np.std(shap_values, axis=0).mean() + shap_stabilities.append(shap_stability) + all_shap_values.append(shap_values) + + + print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}") + + mean_shap_stability = np.mean(shap_stabilities) + for fold_idx, shap_values in enumerate(all_shap_values): + shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy" + shap_file_path = os.path.join(model_output_dir, shap_file_name) + np.save(shap_file_path, shap_values) + + metrics_df = pd.concat([metrics_df, pd.DataFrame([{ + 'Model': best_model.model_id, + 'Pipeline': f"Pipeline_{pipeline_idx + 1}", + 'MSE': mse, + 'SHAP_Stability': mean_shap_stability + }])], ignore_index=True) + + metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv") + metrics_df.to_csv(metrics_csv_path, index=False) + + print(f"All results saved to {output_directory}") + + h2o.shutdown(prompt=False) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='Run H2O AutoML for different models.') + parser.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.') + args = parser.parse_args() + + # Run the H2O AutoML experiment + run_h2o_experiment(use_ndvi=True, automl_time=args.time, output_base_directory="./results_h2o") + diff --git a/src/h20_batch.sh b/src/h20_batch.sh new file mode 100644 index 0000000..63cf20e --- /dev/null +++ b/src/h20_batch.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH --account=def-xander # Replace with your account +#SBATCH --mem=10G # Memory allocation +#SBATCH --time=11:00:00 # Total run time limit (11 hours) +#SBATCH --cpus-per-task=4 # Number of CPU cores per task +#SBATCH --job-name=h20_%A # Job name with job ID appended +#SBATCH --output=%x-%j.out # Standard output and error log +#SBATCH --error=%x-%j.err # Separate error log + +# Load necessary modules +module load python/3.8 + +# Activate your virtual environment +source ~/envs/workdir/bin/activate + +# Parameters +TIME=$1 + +# Run the Python script with the specified time parameter +srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME + +# Deactivate the virtual environment +deactivate diff --git a/src/nsga_batch.sh b/src/nsga_batch.sh new file mode 100644 index 0000000..cc7e455 --- /dev/null +++ b/src/nsga_batch.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --account=def-xander # Replace with your account +#SBATCH --mem=10G # Memory allocation +#SBATCH --time=21:00:00 # Total run time limit (11 hours) +#SBATCH --cpus-per-task=4 # Number of CPU cores per task +#SBATCH --job-name=nsga_%A # Job name with job ID appended +#SBATCH --output=%x-%j.out # Standard output and error log +#SBATCH --error=%x-%j.err # Separate error log + +# Load necessary modules +module load python/3.8 + +# Activate your virtual environment +source ~/envs/workdir/bin/activate + +# Run the Python script with the specified time parameter +srun python /home/dvera/scratch/Framework_EXP/nsga_exp.py + +# Deactivate the virtual environment +deactivate diff --git a/src/nsga_exp.py b/src/nsga_exp.py new file mode 100644 index 0000000..729d2c9 --- /dev/null +++ b/src/nsga_exp.py @@ -0,0 +1,210 @@ +import os +import time +import numpy as np +import pandas as pd +from sklearn.model_selection import KFold, ParameterGrid +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer +from sklearn.impute import SimpleImputer +from sklearn.feature_selection import SelectKBest, f_regression +from sklearn.metrics import mean_squared_error +from sklearn.cluster import KMeans +import shap +from deap import base, creator, tools, algorithms +from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso + +from combine_datasets import combine_datasets + +creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0)) # Minimize both objectives +creator.create("Individual", list, fitness=creator.FitnessMin) + +# Preprocess the data and separate features and target +def preprocess_data(input_data, pipeline, k_value): + pipeline.named_steps['feature_selection'].set_params(k=k_value) + X = input_data.drop(columns=['yield_t/ha'], errors='ignore') + print(X.columns) + y = input_data['yield_t/ha'] + X = pipeline.fit_transform(X, y) + return X, y + +# Load and combine datasets +def load_and_combine_datasets(use_ndvi=True): + data = combine_datasets( + "./data/potatoes_dataset.csv", + "./data/updated_soil_data_with_awc.csv", + "./data/final_augmented_climate_data.csv", + "./data/NDVI.csv", + ndvi=use_ndvi + ) + return data + +# Function to compute SHAP values using KMeans clustering +def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10): + kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train) + cluster_centers = kmeans.cluster_centers_ + + explainer = shap.KernelExplainer(model.predict, cluster_centers) + shap_values = explainer.shap_values(X_test) + + return shap_values + +# Safe model training and evaluation +def safe_model_fit_evaluate(model, X_train, y_train, X_test, y_test): + try: + model.fit(X_train, y_train) + predictions = model.predict(X_test) + mse = mean_squared_error(y_test, predictions) + return mse, predictions + except (ValueError, OverflowError) as e: + print(f"Model evaluation failed due to: {e}") + return float('inf'), None + +# Updated fitness function to evaluate MSE and SHAP stability using KMeans for SHAP +def evaluate_individual(individual, data, kf): + model_name, model_params, pipeline_idx, k_value = individual[0], individual[1], individual[2], individual[3] + + # Get the model and pipeline + model_func = model_space[model_name] + model, params = model_func() + model.set_params(**model_params) + + pipeline = pipelines[pipeline_idx] + X_processed, y_processed = preprocess_data(data, pipeline, k_value) + + mse_scores = [] + shap_stabilities = [] + all_shap_values = [] + + for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed, y_processed)): + X_train_fold, X_test_fold = X_processed[train_idx], X_processed[test_idx] + y_train_fold, y_test_fold = y_processed[train_idx], y_processed[test_idx] + + # Safe model fitting and evaluation + mse, predictions = safe_model_fit_evaluate(model, X_train_fold, y_train_fold, X_test_fold, y_test_fold) + mse_scores.append(mse) + + if predictions is not None: + # Compute SHAP values using KMeans clustering + shap_values = compute_shap_values_with_kmeans(model, X_train_fold, X_test_fold) + shap_stability = np.std(shap_values, axis=0).mean() + shap_stabilities.append(shap_stability) + all_shap_values.append(shap_values) + + # Return average MSE and SHAP stability + return np.mean(mse_scores), np.mean(shap_stabilities), all_shap_values + +# Run NSGA-II experiment with a time limit +def run_nsga_experiment(use_ndvi=True, + output_base_directory="./results_nsga", + population_size=30, + n_generations=50, + time_limit=36000): + # Load and combine dataset + data = load_and_combine_datasets(use_ndvi=use_ndvi) + + # Pipelines for preprocessing + global pipelines + pipelines = [ + Pipeline([ + ('normalize', StandardScaler()), + ('imputer', SimpleImputer(strategy='mean')), + ('poly_features', PolynomialFeatures(degree=2)), + ('feature_selection', SelectKBest(score_func=f_regression, k=25)) + ]), + Pipeline([ + ('robust', RobustScaler()), + ('feature_selection', SelectKBest(score_func=f_regression, k=25)), + ('power_transformation', PowerTransformer(method='yeo-johnson')) + ]) + ] + + # Model space with their corresponding hyperparameter grids + global model_space + model_space = { + 'lasso': lasso, + 'ridge': ridge_regressor, + 'random_forest': random_forest, + 'gradient_boosting': gradient_boosting, + 'decision_tree': decision_tree_regressor, + 'stacking_ensemble': stacking_lasso + } + + # Create output directory with timestamp + timestamp = time.strftime("%Y%m%d_%H%M%S") + output_directory = os.path.join(output_base_directory, f"nsga_{time_limit // 3600}h_{timestamp}") + os.makedirs(output_directory, exist_ok=True) + + # Define cross-validation strategy + kf = KFold(n_splits=5, shuffle=True, random_state=42) + + # Convert parameter grid to a list of tuples for DEAP + param_space = [(model_name, param_values, i, k_value) for model_name, model_func in model_space.items() + for param_values in list(ParameterGrid(model_func()[1])) for i in range(len(pipelines)) + for k_value in range(10, 31, 5)] # Range for k in SelectKBest + + # Initialize DEAP toolbox + toolbox = base.Toolbox() + toolbox.register("attr_float", lambda: np.random.choice(len(param_space), 4)) # Generate a list with 4 elements + toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.attr_float) + toolbox.register("population", tools.initRepeat, list, toolbox.individual) + toolbox.register("mate", tools.cxTwoPoint) + toolbox.register("mutate", tools.mutPolynomialBounded, low=0, up=len(param_space) - 1, eta=0.5, indpb=0.2) + toolbox.register("select", tools.selNSGA2) + toolbox.register("evaluate", lambda ind: evaluate_individual(param_space[int(ind[0])], data, kf)) + + # Run NSGA-II algorithm + start_time = time.time() + pop = toolbox.population(n=population_size) + + # Evaluate the initial population + fits = toolbox.map(toolbox.evaluate, pop) + for fit, ind in zip(fits, pop): + ind.fitness.values = fit[:2] + ind.shap_values = fit[2] + + # Use the number of parents and offspring in the evolution process + for gen in range(n_generations): + if time.time() - start_time > time_limit: + print("Time limit exceeded, stopping evolution.") + break + + offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2) + fits = toolbox.map(toolbox.evaluate, offspring) + + for fit, ind in zip(fits, offspring): + ind.fitness.values = fit[:2] # Ensure only the first two values are assigned to fitness + ind.shap_values = fit[2] + + # Select the next generation of parents from the combined pool of parents and offspring + pop = toolbox.select(pop + offspring, k=population_size) + + for ind in pop: + for fold_idx, shap_values in enumerate(ind.shap_values): + shap_output_path = os.path.join(output_directory, f"shap_values_{int(ind[0])}_fold_{fold_idx + 1}.npy") + np.save(shap_output_path, shap_values) + + # Extract and save the Pareto front + pareto_front = tools.sortNondominated(pop, len(pop), first_front_only=True)[0] + pareto_results_df = pd.DataFrame([param_space[int(ind[0])] + ind.fitness.values for ind in pareto_front], + columns=['Model', 'Params', 'Pipeline', 'K', 'MSE', 'SHAP_Stability']) + pareto_results_df.to_csv(os.path.join(output_directory, "nsga_pareto_front.csv"), index=False) + + # Save the final population + results_df = pd.DataFrame([param_space[int(ind[0])] + ind.fitness.values for ind in pop], + columns=['Model', 'Params', 'Pipeline', 'K', 'MSE', 'SHAP_Stability']) + results_df.to_csv(os.path.join(output_directory, "nsga_results.csv"), index=False) + + print(f"All results saved to {output_directory}") + + +if __name__ == "__main__": + run_nsga_experiment( + use_ndvi=True, + output_base_directory="./results_nsga", + population_size=80, # Larger population size for a comprehensive search + n_generations=100, # Increased number of generations + num_parents=30, # Increased number of parents + num_offspring=50, # Increased number of offspring + time_limit=108000 # 20 hours (20 * 3600 seconds) + ) + diff --git a/src/shap_values_computation.py b/src/shap_values_computation.py new file mode 100644 index 0000000..943220b --- /dev/null +++ b/src/shap_values_computation.py @@ -0,0 +1,32 @@ +import os +import shap +import numpy as np + + +def compute_and_save_shap_values(model, + X_train, + X_test, + output_directory, + file_name): + """ + Computes SHAP values for the provided model and saves them to the specified directory and file. + + Parameters: + - model: The trained machine learning model. + - X_train: The training data used to fit the SHAP explainer. + - X_test: The test data used to compute SHAP values. + - output_directory: The directory where the SHAP values will be saved. + - file_name: The name of the file to save the SHAP values. + """ + # Create the output directory if it doesn't exist + os.makedirs(output_directory, exist_ok=True) + + # Compute SHAP values using the appropriate explainer + explainer = shap.Explainer(model, X_train) + shap_values = explainer(X_test) + + # Save the SHAP values to a .npy file + shap_file_path = os.path.join(output_directory, f"{file_name}.npy") + np.save(shap_file_path, shap_values.values) + + print(f"SHAP values saved to {shap_file_path}")