First upload

This commit is contained in:
Varyngoth
2025-11-11 02:22:12 -04:00
parent f8a9eb0c44
commit b355f6a1bc
16 changed files with 929 additions and 0 deletions

112
src/algorithms.py Normal file
View File

@@ -0,0 +1,112 @@
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
def lasso():
    """Return an unconfigured Lasso estimator together with its search grid."""
    estimator = Lasso()
    # Alphas and tolerances sweep five decades; iteration counts climb by 1000.
    search_space = dict(
        alpha=[0.001 * (10 ** step) for step in range(5)],
        max_iter=[1000 + 1000 * step for step in range(10)],
        tol=[1e-4 / (10 ** step) for step in range(5)],
        warm_start=[True, False],
        positive=[True, False],
        selection=['cyclic', 'random'],
    )
    return estimator, search_space
def random_forest(random_state=42):
    """Return a seeded RandomForestRegressor together with its search grid."""
    estimator = RandomForestRegressor(random_state=random_state)
    search_space = {
        'n_estimators': [100 * step for step in range(1, 11)],  # 100..1000 by 100
        'max_depth': [10 * step for step in range(1, 5)],       # 10..40 by 10
        'min_samples_split': list(range(2, 5)),                 # 2, 3, 4
        'min_samples_leaf': list(range(1, 5)),                  # 1..4
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
    }
    return estimator, search_space
def gradient_boosting(random_state=42):
    """Return a seeded GradientBoostingRegressor together with its search grid."""
    estimator = GradientBoostingRegressor(random_state=random_state)
    # Conservative, commonly-used ranges that keep model complexity manageable.
    search_space = dict(
        n_estimators=[50, 100, 150, 200, 250, 300],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        max_depth=[3, 4, 5, 6],
        min_samples_split=[2, 3, 4],
        min_samples_leaf=[1, 2, 3, 4],
        subsample=[0.8, 0.9, 1.0],
        max_features=['sqrt', 'log2'],
    )
    return estimator, search_space
def decision_tree_regressor():
    """Return an unconfigured DecisionTreeRegressor together with its search grid."""
    estimator = DecisionTreeRegressor()
    search_space = {
        "splitter": ["best", "random"],
        "max_depth": list(range(1, 12, 2)),                      # 1, 3, ..., 11
        "min_samples_split": list(range(2, 5)),                  # 2, 3, 4
        "min_samples_leaf": list(range(1, 5)),                   # 1..4
        "min_weight_fraction_leaf": [0.0, 0.1, 0.2],
        "max_features": [None, "sqrt", "log2"],
        "max_leaf_nodes": list(range(10, 60, 10)),               # 10..50 by 10
    }
    return estimator, search_space
def ridge_regressor(random_state=42):
    """Return a seeded Ridge estimator together with its search grid."""
    estimator = Ridge(random_state=random_state)
    # Alpha/tolerance sweep powers of 10; the solver is left on 'auto'.
    search_space = dict(
        alpha=[0.001 * (10 ** step) for step in range(5)],
        max_iter=[1000 + 1000 * step for step in range(5)],
        tol=[1e-4 / (10 ** step) for step in range(5)],
        solver=['auto'],
        fit_intercept=[True, False],
        positive=[True, False],
    )
    return estimator, search_space
def stacking_lasso(random_state=42):
    """Return a stacking ensemble (GB + decision-tree bases, Lasso meta) and its grid."""
    # Base learners feed their predictions — plus the raw features, thanks to
    # passthrough=True — into the Lasso meta-estimator.
    bases = [
        ('gb', GradientBoostingRegressor(random_state=random_state)),
        ('dt', DecisionTreeRegressor(random_state=random_state)),
    ]
    ensemble = StackingRegressor(
        estimators=bases,
        final_estimator=Lasso(random_state=random_state),
        passthrough=True
    )
    grid = {}
    # Lasso meta-estimator settings (several fixed to shrink the grid).
    grid.update({
        'final_estimator__alpha': [0.001, 0.01, 0.1],
        'final_estimator__max_iter': [3000, 5000],
        'final_estimator__tol': [1e-4, 1e-5],
        'final_estimator__warm_start': [False],
        'final_estimator__positive': [True],
        'final_estimator__selection': ['random'],
    })
    # Gradient-boosting base-learner settings.
    grid.update({
        'gb__n_estimators': [50, 100, 200],
        'gb__learning_rate': [0.01, 0.05, 0.1],
        'gb__max_depth': [3, 5, 7],
        'gb__min_samples_split': [2, 4],
        'gb__min_samples_leaf': [1, 2],
        'gb__subsample': [0.8, 0.9, 1.0],
        'gb__max_features': ['sqrt', 'log2'],
    })
    # Decision-tree base-learner settings.
    grid.update({
        'dt__max_depth': [11, 15, 20],
        'dt__max_features': ['log2'],
        'dt__max_leaf_nodes': [30, 50, 70],
        'dt__min_samples_leaf': [1, 2, 4],
        'dt__min_samples_split': [2, 4, 6],
        'dt__min_weight_fraction_leaf': [0.1, 0.2],
        'dt__splitter': ['best', 'random'],
    })
    return ensemble, grid

92
src/combine_datasets.py Normal file
View File

@@ -0,0 +1,92 @@
import pandas as pd
from loading_climate_data import load_and_process_climate_data
from loading_crop_data import engineer_crop_features
from loading_ndvi_data import process_ndvi_data
from loading_soil_data import get_soil_data
def combine_datasets(crop_data_file,
                     soil_data_file,
                     climate_data_file,
                     ndvi_data_file,
                     ndvi=False):
    """Load and merge the crop, climate, (optional) NDVI and soil tables.

    Parameters
    ----------
    crop_data_file, soil_data_file, climate_data_file, ndvi_data_file : str
        Paths to the raw input files, forwarded to the per-source loaders.
    ndvi : bool, default False
        When True, rows before 2000 are dropped (assumed start of NDVI
        coverage) and the NDVI features are merged in.

    Returns
    -------
    pandas.DataFrame
        Combined feature table with the identifier columns
        ('PostalCode', 'Year', 'SoilID') removed.
    """
    # Crop records form the spine of the combined table.
    combined_data = engineer_crop_features(crop_data_file)
    # Climate observations join on postal code + year.
    climate_data = load_and_process_climate_data(climate_data_file)
    combined_data = pd.merge(combined_data, climate_data, on=['PostalCode', 'Year'], how='left')
    # Merge NDVI data if required
    if ndvi:
        ndvi_data = process_ndvi_data(ndvi_data_file)
        combined_data = combined_data[combined_data['Year'] >= 2000]
        combined_data = pd.merge(combined_data, ndvi_data, on=['PostalCode', 'Year'], how='left')
    # Soil attributes vary only by location, so they join on postal code alone.
    soil_data = get_soil_data(soil_data_file)
    combined_data = pd.merge(combined_data, soil_data, on=['PostalCode'], how='left')
    # Fix: removed the dead `postal_codes`/`years` locals that were assigned
    # but never used. Drop identifier/bookkeeping columns before returning.
    features_to_exclude = ['PostalCode', 'Year', 'SoilID']
    return combined_data.drop(columns=features_to_exclude, errors='ignore')
def combine_dataset_pc(crop_data_file,
                       soil_data_file,
                       climate_data_file,
                       ndvi_data_file,
                       ndvi=False):
    """Build the combined modelling table with the PostalCode column retained.

    Same assembly as combine_datasets, but only 'Year' and 'SoilID' are
    dropped at the end so rows stay addressable by postal code.
    """
    # Crop records form the spine of the table.
    merged = engineer_crop_features(crop_data_file)
    # Attach climate observations on postal code + year.
    merged = pd.merge(merged, load_and_process_climate_data(climate_data_file),
                      on=['PostalCode', 'Year'], how='left')
    if ndvi:
        # NDVI rows exist from 2000 onward, so older records are removed first.
        merged = merged[merged['Year'] >= 2000]
        merged = pd.merge(merged, process_ndvi_data(ndvi_data_file),
                          on=['PostalCode', 'Year'], how='left')
    # Soil attributes vary only by location.
    merged = pd.merge(merged, get_soil_data(soil_data_file),
                      on=['PostalCode'], how='left')
    # Keep PostalCode; drop the remaining bookkeeping columns.
    return merged.drop(columns=['Year', 'SoilID'], errors='ignore')

23
src/grid_search_batch.sh Normal file
View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Slurm submission script for the grid-search experiment.
# Usage: sbatch grid_search_batch.sh <time-budget-in-hours>
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=21:00:00 # Total run time limit (21 hours)
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=grid_search_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log
# Load necessary modules
module load python/3.8
# Activate your virtual environment
source ~/envs/workdir/bin/activate
# Parameters
# First positional argument: the hour budget forwarded to --time below.
TIME=$1
# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/grid_search_exp.py --time $TIME
# Deactivate the virtual environment
deactivate

186
src/grid_search_exp.py Normal file
View File

@@ -0,0 +1,186 @@
import os
import time
import numpy as np
import pandas as pd
import warnings
import shap
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning
from joblib import dump
from combine_datasets import combine_datasets
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Preprocess the data and separate features and target
def preprocess_data(input_data, pipeline):
    """Split *input_data* into (X, y) and push X through *pipeline*.

    Every step except 'feature_selection' is fit-transformed on the features
    alone; SelectKBest is applied last because it needs the target.
    """
    target = input_data['yield_t/ha']
    features = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    step_names = []
    for step_name, transformer in pipeline.steps:
        step_names.append(step_name)
        if step_name != 'feature_selection':
            features = transformer.fit_transform(features)
    # SelectKBest is supervised, so it is fitted separately with the target.
    if 'feature_selection' in step_names:
        features = pipeline.named_steps['feature_selection'].fit_transform(features, target)
    return features, target
# Function to compute SHAP values using KMeans clustering
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Approximate SHAP values for *model* on X_test.

    The KernelExplainer background set is summarised by KMeans centroids
    (n_init and random_state pinned for reproducibility) so the explainer
    stays tractable on larger training sets.
    """
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
    background = clusterer.cluster_centers_
    kernel = shap.KernelExplainer(model.predict, background)
    return kernel.shap_values(X_test)
# Iterative grid search with time monitoring
def iterative_grid_search(model, param_grid, X, y, cv, time_limit):
    """Exhaustive grid search over *param_grid* with a wall-clock budget.

    Each parameter combination is scored by cross-validated negative MSE;
    the search stops early once *time_limit* seconds have elapsed.

    Returns *model* configured with the best parameters found, or None if
    no combination was evaluated before the time limit.
    """
    best_params = None
    best_score = -np.inf
    start_time = time.time()
    # Manually iterate over parameter combinations
    for params in ParameterGrid(param_grid):
        if time.time() - start_time > time_limit:
            print("Time limit exceeded. Stopping search.")
            break
        model.set_params(**params)
        scores = []
        for train_idx, test_idx in cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            scores.append(-mean_squared_error(y_test, predictions))
        mean_score = np.mean(scores)
        if mean_score > best_score:
            best_score = mean_score
            # BUG FIX: snapshot the winning parameters instead of aliasing
            # `model` (`best_model = model`), which later set_params calls
            # would mutate — the original returned the LAST grid point tried.
            best_params = dict(params)
    if best_params is None:
        return None
    model.set_params(**best_params)
    return model
# Main experiment function
def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"):
    """Run a time-budgeted grid search for every model family and pipeline.

    For each model a budgeted grid search is run per preprocessing pipeline;
    the winning pipeline (lower MSE AND lower SHAP instability) has its model
    and metrics persisted under *output_base_directory*.

    Parameters:
    - total_hours: overall wall-clock budget, in hours.
    - output_base_directory: root directory for saved models and metrics.
    """
    # NOTE(review): the budget is split 5 ways but `models` below lists 6
    # entries — confirm whether this should be len(models).
    time_per_algorithm = total_hours * 3600 / 5
    # Load and combine datasets
    data = combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=True
    )
    # Two alternative preprocessing pipelines are compared per model.
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]
    # Factory functions from algorithms.py: each returns (estimator, grid).
    models = {
        'lasso': lasso,
        'ridge': ridge_regressor,
        'random_forest': random_forest,
        'gradient_boosting': gradient_boosting,
        'decision_tree': decision_tree_regressor,
        'stacking_ensemble': stacking_lasso
    }
    output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h")
    os.makedirs(output_directory, exist_ok=True)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for model_name, model_func in models.items():
        best_pipeline_result = None
        for pipeline_idx, pipeline in enumerate(pipelines):
            X, y = preprocess_data(data, pipeline)
            model, params = model_func()
            best_model = iterative_grid_search(
                model, params, X, y, cv=kf, time_limit=time_per_algorithm
            )
            # SHAP stability: mean per-feature std of SHAP values, averaged
            # over the CV folds (lower = more stable explanations).
            shap_stabilities = []
            for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
                X_train_fold, X_test_fold = X[train_idx], X[test_idx]
                y_train_fold = y[train_idx]
                best_model.fit(X_train_fold, y_train_fold)
                shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold)
                fold_shap_stability = np.std(shap_values, axis=0).mean()
                shap_stabilities.append(fold_shap_stability)
            shap_stability = np.mean(shap_stabilities)
            # NOTE(review): MSE here is computed on the full dataset the model
            # was just fitted on (last fold) — an in-sample estimate.
            predictions = best_model.predict(X)
            mse = mean_squared_error(y, predictions)
            # A pipeline only replaces the incumbent when it wins on BOTH
            # objectives simultaneously.
            if best_pipeline_result is None or (
                    mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']):
                best_pipeline_result = {
                    'model': best_model,
                    'pipeline_idx': pipeline_idx,
                    'mse': mse,
                    'shap_stability': shap_stability
                }
        if best_pipeline_result:
            model_output_dir = os.path.join(output_directory, model_name)
            os.makedirs(model_output_dir, exist_ok=True)
            model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib")
            dump(best_pipeline_result['model'], model_file_path)
            # Append this model's row to the shared metrics CSV.
            metrics_df = pd.DataFrame({
                'Model': [model_name],
                'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"],
                'MSE': [best_pipeline_result['mse']],
                'SHAP_Stability': [best_pipeline_result['shap_stability']]
            })
            metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv")
            metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path))
            print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}")
    print(f"All results saved to {output_directory}")
if __name__ == "__main__":
    import argparse

    # Command-line entry point: the hour budget is the only knob exposed.
    cli = argparse.ArgumentParser(description='Run Grid Search for different models.')
    cli.add_argument('--time', type=float, required=True, help='Time for the model to run in hours')
    opts = cli.parse_args()
    run_experiment_with_dynamic_grid(total_hours=opts.time, output_base_directory="./results_grid_search")

166
src/h20_autoML.py Normal file
View File

@@ -0,0 +1,166 @@
import os
import shap
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from combine_datasets import combine_datasets
# Initialize H2O
# Starts (or attaches to) a local H2O cluster at import time; every H2OFrame
# operation below requires this to have run first.
h2o.init()
# Preprocess the data and separate features and target
def preprocess_data(input_data, pipeline):
    """Separate *input_data* into features/target and apply *pipeline* to X.

    Unsupervised steps are fit-transformed first; the supervised
    'feature_selection' step (SelectKBest) is applied last with the target.
    """
    target = input_data['yield_t/ha']
    features = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    seen_steps = []
    for step_name, transformer in pipeline.steps:
        seen_steps.append(step_name)
        if step_name != 'feature_selection':
            features = transformer.fit_transform(features)
    # SelectKBest needs y, so it is fitted outside the loop above.
    if 'feature_selection' in seen_steps:
        features = pipeline.named_steps['feature_selection'].fit_transform(features, target)
    return features, target
# Function to compute SHAP values using KMeans clustering
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Approximate SHAP values for an H2O model via a KMeans background set.

    Parameters:
    - model: trained H2O model whose .predict accepts an H2OFrame.
    - X_train: numpy array used to fit the KMeans summary.
    - X_test: numpy array of rows to explain.
    - n_clusters: number of centroids forming the explainer background.
    """
    # Fit KMeans to the training data with explicit n_init parameter
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
    cluster_centers = kmeans.cluster_centers_
    # Convert cluster centers to H2OFrame
    # NOTE(review): the centers are converted to an H2OFrame here and then
    # straight back to numpy below — the round-trip looks redundant; confirm.
    cluster_centers_h2o = h2o.H2OFrame(cluster_centers)
    # Convert X_test to H2OFrame
    X_test_h2o = h2o.H2OFrame(X_test)
    # Compute SHAP values on cluster centers
    # The lambda adapts H2O's frame-based predict into the plain
    # numpy -> numpy function KernelExplainer expects.
    explainer = shap.KernelExplainer(lambda x: model.predict(h2o.H2OFrame(x)).as_data_frame().values.flatten(),
                                     cluster_centers_h2o.as_data_frame().values)
    shap_values = explainer.shap_values(X_test_h2o.as_data_frame().values)
    return shap_values
# Function to load and combine datasets
def load_and_combine_datasets(use_ndvi=True):
    """Assemble the potato-yield modelling table from the raw CSV inputs."""
    return combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=use_ndvi,
    )
# Main experiment function with H2O AutoML
def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"):
    """Run H2O AutoML over each preprocessing pipeline with K-fold CV.

    Parameters:
    - use_ndvi: include NDVI features (restricts rows to years >= 2000).
    - automl_time: total AutoML budget in hours, split evenly across folds.
    - output_base_directory: root for saved models, leaderboards and metrics.

    Side effects: writes models/leaderboards/SHAP arrays per pipeline, a
    metrics_summary.csv, and shuts the H2O cluster down at the end.
    """
    data = load_and_combine_datasets(use_ndvi=use_ndvi)
    # Two alternative preprocessing pipelines are compared.
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]
    output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h")
    os.makedirs(output_directory, exist_ok=True)
    metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability'])
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for pipeline_idx, pipeline in enumerate(pipelines):
        X_processed, y = preprocess_data(data, pipeline)
        # Build one H2OFrame holding the features plus a 'target' column.
        X_h2o = h2o.H2OFrame(X_processed)
        y_h2o = h2o.H2OFrame(y.to_frame())
        X_h2o['target'] = y_h2o
        shap_stabilities = []
        all_shap_values = []
        for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)):
            train = X_h2o[train_idx.tolist(), :]
            test = X_h2o[test_idx.tolist(), :]
            # Give each fold an equal share of the overall time budget.
            aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42,
                            sort_metric='MSE')
            aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train)
            best_model = aml.leader
            # Persist the per-fold leaderboard for later inspection.
            leaderboard = aml.leaderboard.as_data_frame()
            leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv")
            leaderboard.to_csv(leaderboard_csv_path, index=False)
            model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}")
            os.makedirs(model_output_dir, exist_ok=True)
            # NOTE(review): h2o.save_model treats `path` as a directory, so the
            # ".zip" name here becomes a directory name — confirm intent.
            model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip")
            h2o.save_model(best_model, path=model_file_path)
            predictions = best_model.predict(test).as_data_frame()['predict']
            mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values)
            shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values,
                                                          test.as_data_frame().values)
            # Stability = mean per-feature std of SHAP values within the fold.
            shap_stability = np.std(shap_values, axis=0).mean()
            shap_stabilities.append(shap_stability)
            all_shap_values.append(shap_values)
            print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}")
        mean_shap_stability = np.mean(shap_stabilities)
        # Dump the per-fold SHAP arrays next to the saved models.
        for fold_idx, shap_values in enumerate(all_shap_values):
            shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy"
            shap_file_path = os.path.join(model_output_dir, shap_file_name)
            np.save(shap_file_path, shap_values)
        # NOTE(review): `mse` and `best_model` in this row come from the LAST
        # fold only, while SHAP stability is averaged over folds — confirm.
        metrics_df = pd.concat([metrics_df, pd.DataFrame([{
            'Model': best_model.model_id,
            'Pipeline': f"Pipeline_{pipeline_idx + 1}",
            'MSE': mse,
            'SHAP_Stability': mean_shap_stability
        }])], ignore_index=True)
    metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv")
    metrics_df.to_csv(metrics_csv_path, index=False)
    print(f"All results saved to {output_directory}")
    h2o.shutdown(prompt=False)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: only the AutoML hour budget is exposed.
    cli = argparse.ArgumentParser(description='Run H2O AutoML for different models.')
    cli.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.')
    cli_args = cli.parse_args()
    # Run the H2O AutoML experiment
    run_h2o_experiment(use_ndvi=True, automl_time=cli_args.time, output_base_directory="./results_h2o")

23
src/h20_batch.sh Normal file
View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Slurm submission script for the H2O AutoML experiment.
# Usage: sbatch h20_batch.sh <time-budget-in-hours>
# NOTE(review): "h20" looks like a typo of "h2o" — kept because it matches
# the script and Python filenames.
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=11:00:00 # Total run time limit (11 hours)
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=h20_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log
# Load necessary modules
module load python/3.8
# Activate your virtual environment
source ~/envs/workdir/bin/activate
# Parameters
# First positional argument: the hour budget forwarded to --time below.
TIME=$1
# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME
# Deactivate the virtual environment
deactivate

20
src/nsga_batch.sh Normal file
View File

@@ -0,0 +1,20 @@
#!/bin/bash
# Slurm submission script for the NSGA-II experiment (no arguments; the
# Python script hard-codes its own configuration).
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=21:00:00 # Total run time limit (21 hours)
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=nsga_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log
# Load necessary modules
module load python/3.8
# Activate your virtual environment
source ~/envs/workdir/bin/activate
# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/nsga_exp.py
# Deactivate the virtual environment
deactivate

210
src/nsga_exp.py Normal file
View File

@@ -0,0 +1,210 @@
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
import shap
from deap import base, creator, tools, algorithms
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
from combine_datasets import combine_datasets
# DEAP type setup: both objectives (MSE and SHAP instability) are minimised,
# hence the two -1.0 weights; individuals are plain lists carrying a fitness.
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0))  # Minimize both objectives
creator.create("Individual", list, fitness=creator.FitnessMin)
# Preprocess the data and separate features and target
def preprocess_data(input_data, pipeline, k_value):
    """Split *input_data* into features/target and fit-transform *pipeline*.

    The 'feature_selection' (SelectKBest) step inside *pipeline* is
    re-parameterised to keep *k_value* features before fitting.

    Returns the transformed feature matrix and the untouched target series.
    """
    pipeline.named_steps['feature_selection'].set_params(k=k_value)
    X = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    y = input_data['yield_t/ha']
    # Fix: removed a leftover debug print of the feature column names.
    X = pipeline.fit_transform(X, y)
    return X, y
# Load and combine datasets
def load_and_combine_datasets(use_ndvi=True):
    """Build the combined potato-yield modelling table from the raw CSVs."""
    return combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=use_ndvi,
    )
# Function to compute SHAP values using KMeans clustering
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Approximate SHAP values for *model* on X_test.

    KMeans centroids of the training data (seeded, n_init pinned) serve as
    the KernelExplainer background set to keep the computation tractable.
    """
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
    background = clusterer.cluster_centers_
    return shap.KernelExplainer(model.predict, background).shap_values(X_test)
# Safe model training and evaluation
def safe_model_fit_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit and score *model*, degrading gracefully when training blows up.

    Returns (mse, predictions); on ValueError/OverflowError the pair is
    (inf, None) so callers can penalise the configuration instead of dying.
    """
    try:
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        error = mean_squared_error(y_test, predicted)
    except (ValueError, OverflowError) as err:
        print(f"Model evaluation failed due to: {err}")
        return float('inf'), None
    return error, predicted
# Updated fitness function to evaluate MSE and SHAP stability using KMeans for SHAP
def evaluate_individual(individual, data, kf):
    """Score one NSGA-II individual on the two objectives.

    Parameters:
    - individual: 4-sequence (model_name, model_params, pipeline_idx, k_value)
      drawn from the `param_space` built in run_nsga_experiment.
    - data: combined feature/target DataFrame.
    - kf: cross-validation splitter.

    Returns (mean MSE, mean SHAP stability, list of per-fold SHAP arrays).
    Relies on the module-level globals `model_space` and `pipelines` that
    run_nsga_experiment installs before any evaluation happens.
    """
    model_name, model_params, pipeline_idx, k_value = individual[0], individual[1], individual[2], individual[3]
    # Get the model and pipeline
    model_func = model_space[model_name]
    model, params = model_func()
    model.set_params(**model_params)
    pipeline = pipelines[pipeline_idx]
    X_processed, y_processed = preprocess_data(data, pipeline, k_value)
    mse_scores = []
    shap_stabilities = []
    all_shap_values = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed, y_processed)):
        X_train_fold, X_test_fold = X_processed[train_idx], X_processed[test_idx]
        y_train_fold, y_test_fold = y_processed[train_idx], y_processed[test_idx]
        # Safe model fitting and evaluation
        mse, predictions = safe_model_fit_evaluate(model, X_train_fold, y_train_fold, X_test_fold, y_test_fold)
        mse_scores.append(mse)
        if predictions is not None:
            # Compute SHAP values using KMeans clustering
            shap_values = compute_shap_values_with_kmeans(model, X_train_fold, X_test_fold)
            # Stability = mean per-feature std of SHAP values within the fold.
            shap_stability = np.std(shap_values, axis=0).mean()
            shap_stabilities.append(shap_stability)
            all_shap_values.append(shap_values)
    # Return average MSE and SHAP stability
    # NOTE(review): if every fold fails, `shap_stabilities` is empty and
    # np.mean([]) yields nan with a RuntimeWarning — confirm desired handling.
    return np.mean(mse_scores), np.mean(shap_stabilities), all_shap_values
# Run NSGA-II experiment with a time limit
def run_nsga_experiment(use_ndvi=True,
                        output_base_directory="./results_nsga",
                        population_size=30,
                        n_generations=50,
                        time_limit=36000):
    """Multi-objective (MSE vs. SHAP stability) model search via NSGA-II.

    Parameters:
    - use_ndvi: include NDVI features in the combined dataset.
    - output_base_directory: root for result CSVs and SHAP dumps.
    - population_size: NSGA-II population size.
    - n_generations: maximum number of generations.
    - time_limit: wall-clock budget in seconds.

    Side effects: sets the module globals `pipelines` and `model_space`
    (read by evaluate_individual) and writes Pareto-front / population CSVs
    plus per-individual SHAP arrays under a timestamped directory.
    """
    # Load and combine dataset
    data = load_and_combine_datasets(use_ndvi=use_ndvi)
    # Pipelines for preprocessing (published as a global for the evaluator)
    global pipelines
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]
    # Model space with their corresponding hyperparameter grids
    global model_space
    model_space = {
        'lasso': lasso,
        'ridge': ridge_regressor,
        'random_forest': random_forest,
        'gradient_boosting': gradient_boosting,
        'decision_tree': decision_tree_regressor,
        'stacking_ensemble': stacking_lasso
    }
    # Create output directory with timestamp
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output_directory = os.path.join(output_base_directory, f"nsga_{time_limit // 3600}h_{timestamp}")
    os.makedirs(output_directory, exist_ok=True)
    # Define cross-validation strategy
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # Flatten every (model, params, pipeline, k) combination into one list so
    # an individual can be addressed by a single integer index.
    param_space = [(model_name, param_values, i, k_value) for model_name, model_func in model_space.items()
                   for param_values in list(ParameterGrid(model_func()[1])) for i in range(len(pipelines))
                   for k_value in range(10, 31, 5)]  # Range for k in SelectKBest
    # Initialize DEAP toolbox
    toolbox = base.Toolbox()
    # NOTE(review): each individual holds 4 random indices into param_space,
    # but only ind[0] is ever used by the evaluate lambda below — confirm.
    toolbox.register("attr_float", lambda: np.random.choice(len(param_space), 4))  # Generate a list with 4 elements
    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.attr_float)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxTwoPoint)
    # NOTE(review): mutPolynomialBounded produces floats; they are re-cast
    # with int(ind[0]) wherever the index is consumed.
    toolbox.register("mutate", tools.mutPolynomialBounded, low=0, up=len(param_space) - 1, eta=0.5, indpb=0.2)
    toolbox.register("select", tools.selNSGA2)
    toolbox.register("evaluate", lambda ind: evaluate_individual(param_space[int(ind[0])], data, kf))
    # Run NSGA-II algorithm
    start_time = time.time()
    pop = toolbox.population(n=population_size)
    # Evaluate the initial population
    fits = toolbox.map(toolbox.evaluate, pop)
    for fit, ind in zip(fits, pop):
        # Fitness takes the two objectives; SHAP arrays ride along on the
        # individual for later serialisation.
        ind.fitness.values = fit[:2]
        ind.shap_values = fit[2]
    # Use the number of parents and offspring in the evolution process
    for gen in range(n_generations):
        if time.time() - start_time > time_limit:
            print("Time limit exceeded, stopping evolution.")
            break
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        fits = toolbox.map(toolbox.evaluate, offspring)
        for fit, ind in zip(fits, offspring):
            ind.fitness.values = fit[:2]  # Ensure only the first two values are assigned to fitness
            ind.shap_values = fit[2]
        # Select the next generation of parents from the combined pool of parents and offspring
        pop = toolbox.select(pop + offspring, k=population_size)
    # Persist the per-fold SHAP arrays of the final population.
    for ind in pop:
        for fold_idx, shap_values in enumerate(ind.shap_values):
            shap_output_path = os.path.join(output_directory, f"shap_values_{int(ind[0])}_fold_{fold_idx + 1}.npy")
            np.save(shap_output_path, shap_values)
    # Extract and save the Pareto front
    pareto_front = tools.sortNondominated(pop, len(pop), first_front_only=True)[0]
    pareto_results_df = pd.DataFrame([param_space[int(ind[0])] + ind.fitness.values for ind in pareto_front],
                                     columns=['Model', 'Params', 'Pipeline', 'K', 'MSE', 'SHAP_Stability'])
    pareto_results_df.to_csv(os.path.join(output_directory, "nsga_pareto_front.csv"), index=False)
    # Save the final population
    results_df = pd.DataFrame([param_space[int(ind[0])] + ind.fitness.values for ind in pop],
                              columns=['Model', 'Params', 'Pipeline', 'K', 'MSE', 'SHAP_Stability'])
    results_df.to_csv(os.path.join(output_directory, "nsga_results.csv"), index=False)
    print(f"All results saved to {output_directory}")
if __name__ == "__main__":
    # Fix: dropped the `num_parents`/`num_offspring` keyword arguments —
    # run_nsga_experiment() does not accept them, so passing them raised
    # TypeError before any work started. Also corrected the time comment:
    # 108000 s is 30 hours, not 20.
    run_nsga_experiment(
        use_ndvi=True,
        output_base_directory="./results_nsga",
        population_size=80,   # Larger population size for a comprehensive search
        n_generations=100,    # Increased number of generations
        time_limit=108000     # 30 hours (30 * 3600 seconds)
    )

View File

@@ -0,0 +1,32 @@
import os
import shap
import numpy as np
def compute_and_save_shap_values(model,
                                 X_train,
                                 X_test,
                                 output_directory,
                                 file_name):
    """Compute SHAP values for *model* on *X_test* and persist them.

    The explainer is fitted against *X_train*; the resulting value matrix is
    written to ``<output_directory>/<file_name>.npy``.

    Parameters:
    - model: the trained machine learning model.
    - X_train: training data used to fit the SHAP explainer.
    - X_test: test data whose SHAP values are computed.
    - output_directory: destination directory (created if missing).
    - file_name: file stem for the saved ``.npy`` array.
    """
    os.makedirs(output_directory, exist_ok=True)
    # shap.Explainer picks the appropriate algorithm for the model type.
    explanation = shap.Explainer(model, X_train)(X_test)
    destination = os.path.join(output_directory, f"{file_name}.npy")
    np.save(destination, explanation.values)
    print(f"SHAP values saved to {destination}")