First upload
This commit is contained in:
112
src/algorithms.py
Normal file
112
src/algorithms.py
Normal file
@@ -0,0 +1,112 @@
|
||||
from sklearn.linear_model import Lasso, Ridge
|
||||
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.ensemble import StackingRegressor
|
||||
|
||||
|
||||
def lasso():
    """Return an unconfigured Lasso regressor and its hyperparameter search grid."""
    search_grid = {
        'alpha': [0.001 * (10 ** power) for power in range(5)],      # 1e-3 .. 10, powers of 10
        'max_iter': [1000 + 1000 * step for step in range(10)],      # 1000 .. 10000, step 1000
        'tol': [1e-4 / (10 ** power) for power in range(5)],         # 1e-4 .. 1e-8, powers of 10
        'warm_start': [True, False],
        'positive': [True, False],
        'selection': ['cyclic', 'random'],
    }
    return Lasso(), search_grid
|
||||
|
||||
|
||||
def random_forest(random_state=42):
    """Return a seeded RandomForestRegressor and its hyperparameter search grid."""
    estimator = RandomForestRegressor(random_state=random_state)
    search_grid = {
        'n_estimators': list(range(100, 1100, 100)),   # 100 .. 1000
        'max_depth': list(range(10, 50, 10)),          # 10 .. 40
        'min_samples_split': list(range(2, 5)),        # 2 .. 4
        'min_samples_leaf': list(range(1, 5)),         # 1 .. 4
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
    }
    return estimator, search_grid
|
||||
|
||||
|
||||
def gradient_boosting(random_state=42):
    """Return a seeded GradientBoostingRegressor and its hyperparameter search grid."""
    estimator = GradientBoostingRegressor(random_state=random_state)
    # Ranges kept deliberately moderate so grid search stays tractable.
    search_grid = dict(
        n_estimators=[50, 100, 150, 200, 250, 300],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        max_depth=[3, 4, 5, 6],
        min_samples_split=[2, 3, 4],
        min_samples_leaf=[1, 2, 3, 4],
        subsample=[0.8, 0.9, 1.0],
        max_features=['sqrt', 'log2'],
    )
    return estimator, search_grid
|
||||
|
||||
|
||||
def decision_tree_regressor(random_state=42):
    """Return a DecisionTreeRegressor and its hyperparameter search grid.

    Fix: a ``random_state`` parameter (default 42) was added for consistency
    with the sibling factories (``random_forest``, ``gradient_boosting``,
    ``ridge_regressor``) and for reproducibility — the grid includes
    ``splitter='random'`` and ``max_features`` subsampling, both of which are
    non-deterministic without a seed. Existing callers that pass no argument
    keep working.
    """
    model = DecisionTreeRegressor(random_state=random_state)
    params = {
        "splitter": ["best", "random"],
        "max_depth": [1 + 2 * i for i in range(6)],  # 1, 3, ..., 11 — steps by 2
        "min_samples_split": [2 + i for i in range(3)],  # 2 .. 4
        "min_samples_leaf": [1 + i for i in range(4)],  # 1 .. 4
        "min_weight_fraction_leaf": [0.0 + 0.1 * i for i in range(3)],  # 0.0, 0.1, 0.2
        "max_features": [None, "sqrt", "log2"],
        "max_leaf_nodes": [10 * i for i in range(1, 6)],  # 10 .. 50 — steps by 10
    }
    return model, params
|
||||
|
||||
|
||||
def ridge_regressor(random_state=42):
    """Return a seeded Ridge regressor together with its hyperparameter search grid."""
    alphas = [0.001 * (10 ** power) for power in range(5)]   # powers of 10 from 1e-3
    iterations = [1000 + 1000 * step for step in range(5)]   # 1000 .. 5000
    tolerances = [1e-4 / (10 ** power) for power in range(5)]  # 1e-4 .. 1e-8
    search_grid = {
        'alpha': alphas,
        'max_iter': iterations,
        'tol': tolerances,
        'solver': ['auto'],
        'fit_intercept': [True, False],
        'positive': [True, False],
    }
    return Ridge(random_state=random_state), search_grid
|
||||
|
||||
|
||||
def stacking_lasso(random_state=42):
    """Return a stacking ensemble (GB + DT base learners, Lasso meta-learner)
    together with its hyperparameter search grid.
    """
    ensemble = StackingRegressor(
        estimators=[
            ('gb', GradientBoostingRegressor(random_state=random_state)),
            ('dt', DecisionTreeRegressor(random_state=random_state)),
        ],
        final_estimator=Lasso(random_state=random_state),
        passthrough=True,
    )

    # Meta-learner (Lasso) search space — slightly expanded.
    meta_grid = {
        'final_estimator__alpha': [0.001, 0.01, 0.1],
        'final_estimator__max_iter': [3000, 5000],
        'final_estimator__tol': [1e-4, 1e-5],
        'final_estimator__warm_start': [False],      # fixed
        'final_estimator__positive': [True],         # fixed
        'final_estimator__selection': ['random'],    # fixed
    }
    # Gradient-boosting base learner search space.
    gb_grid = {
        'gb__n_estimators': [50, 100, 200],
        'gb__learning_rate': [0.01, 0.05, 0.1],
        'gb__max_depth': [3, 5, 7],
        'gb__min_samples_split': [2, 4],
        'gb__min_samples_leaf': [1, 2],
        'gb__subsample': [0.8, 0.9, 1.0],
        'gb__max_features': ['sqrt', 'log2'],
    }
    # Decision-tree base learner search space.
    dt_grid = {
        'dt__max_depth': [11, 15, 20],
        'dt__max_features': ['log2'],                # fixed
        'dt__max_leaf_nodes': [30, 50, 70],
        'dt__min_samples_leaf': [1, 2, 4],
        'dt__min_samples_split': [2, 4, 6],
        'dt__min_weight_fraction_leaf': [0.1, 0.2],
        'dt__splitter': ['best', 'random'],
    }
    return ensemble, {**meta_grid, **gb_grid, **dt_grid}
|
||||
92
src/combine_datasets.py
Normal file
92
src/combine_datasets.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import pandas as pd
|
||||
from loading_climate_data import load_and_process_climate_data
|
||||
from loading_crop_data import engineer_crop_features
|
||||
from loading_ndvi_data import process_ndvi_data
|
||||
from loading_soil_data import get_soil_data
|
||||
|
||||
|
||||
def combine_datasets(crop_data_file,
                     soil_data_file,
                     climate_data_file,
                     ndvi_data_file,
                     ndvi=False):
    """Load crop, climate, optional NDVI and soil CSVs and merge them into one frame.

    Merge order: crop (base) <- climate on (PostalCode, Year)
    <- NDVI on (PostalCode, Year) if *ndvi* (rows restricted to Year >= 2000,
    the NDVI coverage window) <- soil on PostalCode.
    Identifier columns (PostalCode, Year, SoilID) are dropped from the result.

    Fix: removed dead locals ``postal_codes`` / ``years`` that were extracted
    after the soil merge and never used.

    Returns:
        pd.DataFrame: feature matrix including the 'yield_t/ha' target column.
    """
    dataset_loaders = {
        'soil': lambda: get_soil_data(soil_data_file),
        'climate': lambda: load_and_process_climate_data(climate_data_file),
        'ndvi': lambda: process_ndvi_data(ndvi_data_file),
        'crop': lambda: engineer_crop_features(crop_data_file)
    }

    merge_keys = {
        'soil': ['PostalCode'],
        'climate': ['PostalCode', 'Year'],
        'ndvi': ['PostalCode', 'Year'],
        'crop': ['PostalCode', 'Year']
    }

    combined_data = dataset_loaders['crop']()

    # Merge climate data
    climate_data = dataset_loaders['climate']()
    combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left')

    # Merge NDVI data if required
    if ndvi:
        ndvi_data = dataset_loaders['ndvi']()
        combined_data = combined_data[combined_data['Year'] >= 2000]
        combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left')

    # Merge soil data
    soil_data = dataset_loaders['soil']()
    combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left')

    # Drop irrelevant or redundant identifier columns
    features_to_exclude = ['PostalCode', 'Year', 'SoilID']
    combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore')

    return combined_data
|
||||
|
||||
|
||||
def combine_dataset_pc(crop_data_file,
                       soil_data_file,
                       climate_data_file,
                       ndvi_data_file,
                       ndvi=False):
    """Build the combined dataset like ``combine_datasets`` but KEEP PostalCode.

    Crop data is the base; climate is left-joined on (PostalCode, Year),
    NDVI optionally joined (rows restricted to Year >= 2000), soil joined
    on PostalCode. Only 'Year' and 'SoilID' are dropped at the end.
    """
    # Base frame: engineered crop features.
    merged = engineer_crop_features(crop_data_file)

    # Left-join climate observations per postal code and year.
    merged = pd.merge(merged,
                      load_and_process_climate_data(climate_data_file),
                      on=['PostalCode', 'Year'],
                      how='left')

    # Optionally left-join NDVI (only available from year 2000 onward).
    if ndvi:
        merged = merged[merged['Year'] >= 2000]
        merged = pd.merge(merged,
                          process_ndvi_data(ndvi_data_file),
                          on=['PostalCode', 'Year'],
                          how='left')

    # Left-join static soil attributes per postal code.
    merged = pd.merge(merged,
                      get_soil_data(soil_data_file),
                      on=['PostalCode'],
                      how='left')

    # Keep PostalCode; drop only the redundant identifiers.
    return merged.drop(columns=['Year', 'SoilID'], errors='ignore')
|
||||
23
src/grid_search_batch.sh
Normal file
23
src/grid_search_batch.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
#SBATCH --account=def-xander          # Replace with your account
#SBATCH --mem=10G                     # Memory allocation
#SBATCH --time=21:00:00               # Total run time limit (21 hours)
#SBATCH --cpus-per-task=4             # Number of CPU cores per task
#SBATCH --job-name=grid_search_%A     # Job name with job ID appended
#SBATCH --output=%x-%j.out            # Standard output and error log
#SBATCH --error=%x-%j.err             # Separate error log

# Load necessary modules
module load python/3.8

# Activate your virtual environment
source ~/envs/workdir/bin/activate

# Parameters
TIME=$1    # Wall-clock budget in hours, forwarded as --time to the experiment

# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/grid_search_exp.py --time $TIME

# Deactivate the virtual environment
deactivate
|
||||
186
src/grid_search_exp.py
Normal file
186
src/grid_search_exp.py
Normal file
@@ -0,0 +1,186 @@
|
||||
import os
|
||||
import time
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import warnings
|
||||
import shap
|
||||
from sklearn.model_selection import KFold, ParameterGrid
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from joblib import dump
|
||||
|
||||
from combine_datasets import combine_datasets
|
||||
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
|
||||
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
|
||||
|
||||
# Preprocess the data and separate features and target
|
||||
def preprocess_data(input_data, pipeline):
    """Split *input_data* into features/target and fit-transform *pipeline* on it.

    Args:
        input_data: DataFrame containing the 'yield_t/ha' target column.
        pipeline: sklearn Pipeline whose steps may include a 'feature_selection'
            SelectKBest step.

    Returns:
        (X, y): transformed feature matrix and the untouched target Series.

    Fix: the previous version applied every non-'feature_selection' step first
    and ran SelectKBest last, silently REORDERING any pipeline in which feature
    selection is not the final step (the second pipeline here places it before
    the power transform). ``Pipeline.fit_transform(X, y)`` already forwards
    ``y`` to each step's ``fit`` (including SelectKBest), so the manual loop
    was unnecessary as well as order-changing.
    """
    X = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    y = input_data['yield_t/ha']
    X = pipeline.fit_transform(X, y)
    return X, y
|
||||
|
||||
|
||||
# Function to compute SHAP values using KMeans clustering
|
||||
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Approximate SHAP values for *model* on *X_test*.

    The KernelExplainer background is summarised by *n_clusters* KMeans
    centroids fitted on *X_train* (seeded, explicit n_init) to keep the
    explainer tractable.
    """
    centroids = (
        KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        .fit(X_train)
        .cluster_centers_
    )
    return shap.KernelExplainer(model.predict, centroids).shap_values(X_test)
|
||||
|
||||
|
||||
# Iterative grid search with time monitoring
|
||||
def iterative_grid_search(model, param_grid, X, y, cv, time_limit):
    """Exhaustive grid search with a wall-clock budget.

    Iterates over ``ParameterGrid(param_grid)``, scoring each configuration by
    mean negative MSE over the folds of *cv*, and stops early once
    *time_limit* seconds have elapsed.

    Bug fixed: the original stored ``best_model = model``, but ``model`` is a
    single object mutated in place by every subsequent ``set_params`` call, so
    the function always returned the LAST configuration tried rather than the
    best one. We now remember the best parameter dict and re-apply it before
    returning. As before, the returned estimator is configured but not refit
    on the full data — callers are expected to fit it themselves.

    Returns:
        The estimator configured with the best-scoring parameters found.
    """
    best_params = None
    best_score = -np.inf
    start_time = time.time()

    # Manually iterate over parameter combinations so we can enforce the budget.
    for params in ParameterGrid(param_grid):
        if time.time() - start_time > time_limit:
            print("Time limit exceeded. Stopping search.")
            break

        model.set_params(**params)
        fold_scores = []
        for train_idx, test_idx in cv.split(X, y):
            model.fit(X[train_idx], y[train_idx])
            predictions = model.predict(X[test_idx])
            fold_scores.append(-mean_squared_error(y[test_idx], predictions))

        mean_score = np.mean(fold_scores)
        if mean_score > best_score:
            best_score = mean_score
            best_params = params

    # Restore the winning configuration (model currently holds the last one).
    if best_params is not None:
        model.set_params(**best_params)
    return model
|
||||
|
||||
|
||||
# Main experiment function
|
||||
def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"):
    """Run a time-budgeted grid search for every model/pipeline pair.

    For each model factory, both preprocessing pipelines are tried; the
    best grid-search result per pipeline is evaluated by refitting on each
    CV fold, measuring SHAP stability (mean std of SHAP values), and the
    best (model, pipeline) pair per model is persisted to joblib plus a
    row appended to metrics_summary_GS.csv.

    Args:
        total_hours: overall wall-clock budget in hours, split evenly.
        output_base_directory: root directory for result artifacts.
    """
    # NOTE(review): budget divides by 5 but `models` below has 6 entries —
    # confirm whether the divisor should be len(models).
    time_per_algorithm = total_hours * 3600 / 5

    # Load and combine datasets (NDVI included, so rows are Year >= 2000)
    data = combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=True
    )

    # Two candidate preprocessing pipelines compared per model.
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]

    # Model factories: each returns (estimator, hyperparameter grid).
    models = {
        'lasso': lasso,
        'ridge': ridge_regressor,
        'random_forest': random_forest,
        'gradient_boosting': gradient_boosting,
        'decision_tree': decision_tree_regressor,
        'stacking_ensemble': stacking_lasso
    }

    output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h")
    os.makedirs(output_directory, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for model_name, model_func in models.items():
        best_pipeline_result = None

        for pipeline_idx, pipeline in enumerate(pipelines):
            X, y = preprocess_data(data, pipeline)
            model, params = model_func()

            # Time-limited search; returns the configured (not refit) estimator.
            best_model = iterative_grid_search(
                model, params, X, y, cv=kf, time_limit=time_per_algorithm
            )

            # Per-fold SHAP stability of the chosen configuration.
            shap_stabilities = []
            for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
                X_train_fold, X_test_fold = X[train_idx], X[test_idx]
                # NOTE(review): y is a pandas Series indexed positionally here;
                # if the Year>=2000 filter left a non-reset index this is a
                # label lookup — confirm indices are positional.
                y_train_fold = y[train_idx]

                best_model.fit(X_train_fold, y_train_fold)
                shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold)
                fold_shap_stability = np.std(shap_values, axis=0).mean()
                shap_stabilities.append(fold_shap_stability)

            shap_stability = np.mean(shap_stabilities)
            # NOTE(review): MSE is computed on the FULL dataset, including the
            # data the model was last fit on — an optimistic, in-sample figure.
            predictions = best_model.predict(X)
            mse = mean_squared_error(y, predictions)

            # A pipeline only replaces the incumbent if it wins on BOTH
            # objectives (lower MSE and lower SHAP instability).
            if best_pipeline_result is None or (
                    mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']):
                best_pipeline_result = {
                    'model': best_model,
                    'pipeline_idx': pipeline_idx,
                    'mse': mse,
                    'shap_stability': shap_stability
                }

        if best_pipeline_result:
            model_output_dir = os.path.join(output_directory, model_name)
            os.makedirs(model_output_dir, exist_ok=True)

            # Persist the winning estimator.
            model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib")
            dump(best_pipeline_result['model'], model_file_path)

            # Append a one-row summary; header only when the CSV is new.
            metrics_df = pd.DataFrame({
                'Model': [model_name],
                'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"],
                'MSE': [best_pipeline_result['mse']],
                'SHAP_Stability': [best_pipeline_result['shap_stability']]
            })
            metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv")
            metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path))

            print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}")

    print(f"All results saved to {output_directory}")
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # CLI entry point: --time is the total wall-clock budget (hours),
    # split across the algorithms inside run_experiment_with_dynamic_grid.
    parser = argparse.ArgumentParser(description='Run Grid Search for different models.')
    parser.add_argument('--time', type=float, required=True, help='Time for the model to run in hours')
    args = parser.parse_args()
    run_experiment_with_dynamic_grid(total_hours=args.time, output_base_directory="./results_grid_search")
|
||||
166
src/h20_autoML.py
Normal file
166
src/h20_autoML.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import os
|
||||
import shap
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import h2o
|
||||
from h2o.automl import H2OAutoML
|
||||
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from combine_datasets import combine_datasets
|
||||
|
||||
# Initialize H2O
|
||||
h2o.init()
|
||||
|
||||
|
||||
# Preprocess the data and separate features and target
|
||||
def preprocess_data(input_data, pipeline):
    """Split *input_data* into features/target and fit-transform *pipeline* on it.

    Args:
        input_data: DataFrame containing the 'yield_t/ha' target column.
        pipeline: sklearn Pipeline; may include a 'feature_selection'
            SelectKBest step.

    Returns:
        (X, y): transformed feature matrix and the untouched target Series.

    Fix: the previous version ran every non-'feature_selection' step first and
    applied SelectKBest last, silently REORDERING pipelines where feature
    selection is not the final step (the second pipeline places it before the
    power transform). ``Pipeline.fit_transform(X, y)`` already forwards ``y``
    to each step's ``fit``, so the manual loop was redundant and incorrect.
    """
    X = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    y = input_data['yield_t/ha']
    X = pipeline.fit_transform(X, y)
    return X, y
|
||||
|
||||
|
||||
# Function to compute SHAP values using KMeans clustering
|
||||
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """KernelSHAP values for an H2O model, with a KMeans-summarised background.

    Args:
        model: trained H2O model (its ``predict`` consumes H2OFrames).
        X_train: numpy array used only to fit the KMeans background clusters.
        X_test: numpy array of rows to explain.
        n_clusters: number of KMeans centroids used as the SHAP background.

    Fix: the original converted the numpy centroids and X_test to H2OFrame and
    immediately back (`h2o.H2OFrame(...).as_data_frame().values`) before
    handing them to SHAP — a pointless and potentially dtype-coercing
    round-trip. Only the per-call conversion inside the prediction wrapper is
    actually needed, since SHAP supplies numpy arrays and H2O expects frames.
    """
    # Fit KMeans to the training data with explicit n_init for reproducibility.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
    background = kmeans.cluster_centers_

    def predict_fn(x):
        # SHAP passes numpy; wrap for H2O, then flatten back to a 1-D array.
        return model.predict(h2o.H2OFrame(x)).as_data_frame().values.flatten()

    explainer = shap.KernelExplainer(predict_fn, background)
    return explainer.shap_values(np.asarray(X_test))
|
||||
|
||||
|
||||
# Function to load and combine datasets
|
||||
def load_and_combine_datasets(use_ndvi=True):
    """Assemble the full modelling dataset from the on-disk CSV sources.

    The positional order matches combine_datasets: crop, soil, climate, NDVI.
    """
    source_files = (
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
    )
    return combine_datasets(*source_files, ndvi=use_ndvi)
|
||||
|
||||
|
||||
# Main experiment function with H2O AutoML
|
||||
def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"):
    """Run H2O AutoML per preprocessing pipeline with 5-fold CV and SHAP analysis.

    For each pipeline, each CV fold trains a fresh AutoML run (budget split
    evenly across folds), saves the leaderboard, the leader model, per-fold
    SHAP values, and appends one summary row per pipeline to metrics_summary.csv.

    Args:
        use_ndvi: include NDVI features (restricts data to Year >= 2000).
        automl_time: total AutoML budget in hours per pipeline.
        output_base_directory: root directory for artifacts.
    """
    data = load_and_combine_datasets(use_ndvi=use_ndvi)
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]

    output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h")
    os.makedirs(output_directory, exist_ok=True)

    metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability'])

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for pipeline_idx, pipeline in enumerate(pipelines):
        X_processed, y = preprocess_data(data, pipeline)

        # Single H2OFrame holding features plus a 'target' column appended last.
        X_h2o = h2o.H2OFrame(X_processed)
        y_h2o = h2o.H2OFrame(y.to_frame())
        X_h2o['target'] = y_h2o

        shap_stabilities = []
        all_shap_values = []

        for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)):
            # H2OFrame row slicing needs plain Python lists, not numpy arrays.
            train = X_h2o[train_idx.tolist(), :]
            test = X_h2o[test_idx.tolist(), :]

            # Split the hour budget evenly across the folds.
            aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42,
                            sort_metric='MSE')
            aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train)

            best_model = aml.leader

            # Persist the per-fold leaderboard.
            leaderboard = aml.leaderboard.as_data_frame()
            leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv")
            leaderboard.to_csv(leaderboard_csv_path, index=False)

            model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}")
            os.makedirs(model_output_dir, exist_ok=True)
            model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip")
            # NOTE(review): h2o.save_model's `path` is conventionally a
            # directory; passing a .zip-named path here — confirm intent.
            h2o.save_model(best_model, path=model_file_path)

            predictions = best_model.predict(test).as_data_frame()['predict']

            mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values)

            # NOTE(review): `train`/`test` here still contain the 'target'
            # column, so it is fed to KMeans and SHAP as if it were a
            # feature — confirm whether it should be dropped first.
            shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values,
                                                          test.as_data_frame().values)
            shap_stability = np.std(shap_values, axis=0).mean()
            shap_stabilities.append(shap_stability)
            all_shap_values.append(shap_values)


            print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}")

        mean_shap_stability = np.mean(shap_stabilities)
        for fold_idx, shap_values in enumerate(all_shap_values):
            shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy"
            shap_file_path = os.path.join(model_output_dir, shap_file_name)
            np.save(shap_file_path, shap_values)

        # NOTE(review): `best_model` and `mse` below come from the LAST fold
        # only, while SHAP stability is averaged over folds — confirm this is
        # the intended summary.
        metrics_df = pd.concat([metrics_df, pd.DataFrame([{
            'Model': best_model.model_id,
            'Pipeline': f"Pipeline_{pipeline_idx + 1}",
            'MSE': mse,
            'SHAP_Stability': mean_shap_stability
        }])], ignore_index=True)

    metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv")
    metrics_df.to_csv(metrics_csv_path, index=False)

    print(f"All results saved to {output_directory}")

    h2o.shutdown(prompt=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # CLI entry point; note h2o.init() already ran at module import time.
    parser = argparse.ArgumentParser(description='Run H2O AutoML for different models.')
    parser.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.')
    args = parser.parse_args()

    # Run the H2O AutoML experiment
    run_h2o_experiment(use_ndvi=True, automl_time=args.time, output_base_directory="./results_h2o")
|
||||
|
||||
23
src/h20_batch.sh
Normal file
23
src/h20_batch.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
#SBATCH --account=def-xander          # Replace with your account
#SBATCH --mem=10G                     # Memory allocation
#SBATCH --time=11:00:00               # Total run time limit (11 hours)
#SBATCH --cpus-per-task=4             # Number of CPU cores per task
#SBATCH --job-name=h20_%A             # Job name with job ID appended ("h20" matches the script filename)
#SBATCH --output=%x-%j.out            # Standard output and error log
#SBATCH --error=%x-%j.err             # Separate error log

# Load necessary modules
module load python/3.8

# Activate your virtual environment
source ~/envs/workdir/bin/activate

# Parameters
TIME=$1    # AutoML budget in hours, forwarded as --time to the experiment

# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME

# Deactivate the virtual environment
deactivate
|
||||
20
src/nsga_batch.sh
Normal file
20
src/nsga_batch.sh
Normal file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
#SBATCH --account=def-xander          # Replace with your account
#SBATCH --mem=10G                     # Memory allocation
#SBATCH --time=21:00:00               # Total run time limit (21 hours)
#SBATCH --cpus-per-task=4             # Number of CPU cores per task
#SBATCH --job-name=nsga_%A            # Job name with job ID appended
#SBATCH --output=%x-%j.out            # Standard output and error log
#SBATCH --error=%x-%j.err             # Separate error log

# Load necessary modules
module load python/3.8

# Activate your virtual environment
source ~/envs/workdir/bin/activate

# Run the NSGA-II experiment (takes no CLI parameters; limits are hard-coded)
srun python /home/dvera/scratch/Framework_EXP/nsga_exp.py

# Deactivate the virtual environment
deactivate
|
||||
210
src/nsga_exp.py
Normal file
210
src/nsga_exp.py
Normal file
@@ -0,0 +1,210 @@
|
||||
import os
|
||||
import time
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import KFold, ParameterGrid
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.cluster import KMeans
|
||||
import shap
|
||||
from deap import base, creator, tools, algorithms
|
||||
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
|
||||
|
||||
from combine_datasets import combine_datasets
|
||||
|
||||
# DEAP type registration (module-level side effect): a two-objective fitness
# where BOTH objectives (MSE and SHAP instability) are minimised, and an
# Individual that is a plain list carrying that fitness.
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0))  # Minimize both objectives
creator.create("Individual", list, fitness=creator.FitnessMin)
|
||||
|
||||
# Preprocess the data and separate features and target
|
||||
def preprocess_data(input_data, pipeline, k_value):
    """Fit-transform *pipeline* on *input_data* with SelectKBest set to *k_value*.

    Args:
        input_data: DataFrame containing the 'yield_t/ha' target column.
        pipeline: sklearn Pipeline with a 'feature_selection' SelectKBest step.
        k_value: number of features SelectKBest should keep.

    Returns:
        (X, y): transformed feature matrix and the untouched target Series.

    Fix: removed a leftover ``print(X.columns)`` debug statement that spammed
    stdout on every evaluation.
    """
    pipeline.named_steps['feature_selection'].set_params(k=k_value)
    X = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    y = input_data['yield_t/ha']
    X = pipeline.fit_transform(X, y)
    return X, y
|
||||
|
||||
# Load and combine datasets
|
||||
def load_and_combine_datasets(use_ndvi=True):
    """Load and merge the crop/soil/climate/NDVI CSVs into one modelling frame."""
    csv_paths = (
        "./data/potatoes_dataset.csv",            # crop
        "./data/updated_soil_data_with_awc.csv",  # soil
        "./data/final_augmented_climate_data.csv",  # climate
        "./data/NDVI.csv",                        # NDVI
    )
    return combine_datasets(*csv_paths, ndvi=use_ndvi)
|
||||
|
||||
# Function to compute SHAP values using KMeans clustering
|
||||
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """SHAP values for *model* on *X_test* via KernelExplainer.

    The background distribution is compressed to *n_clusters* seeded KMeans
    centroids of *X_train* so the kernel explainer stays affordable.
    """
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    background = clusterer.fit(X_train).cluster_centers_

    explainer = shap.KernelExplainer(model.predict, background)
    return explainer.shap_values(X_test)
|
||||
|
||||
# Safe model training and evaluation
|
||||
def safe_model_fit_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split and score it on the test split.

    Returns:
        (mse, predictions) on success; on numerical failure (ValueError or
        OverflowError, e.g. from a degenerate hyperparameter combination)
        returns (inf, None) so the caller can penalise the configuration
        instead of crashing.
    """
    try:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
    except (ValueError, OverflowError) as err:
        print(f"Model evaluation failed due to: {err}")
        return float('inf'), None
    else:
        return mse, predictions
|
||||
|
||||
# Updated fitness function to evaluate MSE and SHAP stability using KMeans for SHAP
|
||||
def evaluate_individual(individual, data, kf):
    """Fitness evaluation for one NSGA-II individual.

    An individual encodes (model_name, model_params, pipeline_idx, k_value).
    The model is instantiated from the module-level ``model_space``, its
    hyperparameters applied, the pipeline taken from the module-level
    ``pipelines`` list (both set up by run_nsga_experiment), and the
    configuration scored by k-fold CV.

    Returns:
        (mean MSE, mean SHAP stability, list of per-fold SHAP arrays).
    """
    model_name, model_params, pipeline_idx, k_value = individual[0], individual[1], individual[2], individual[3]

    # Get the model and pipeline (the factory's default grid `params` is
    # unused here — the individual carries its own hyperparameters).
    model_func = model_space[model_name]
    model, params = model_func()
    model.set_params(**model_params)

    pipeline = pipelines[pipeline_idx]
    X_processed, y_processed = preprocess_data(data, pipeline, k_value)

    mse_scores = []
    shap_stabilities = []
    all_shap_values = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed, y_processed)):
        X_train_fold, X_test_fold = X_processed[train_idx], X_processed[test_idx]
        # NOTE(review): positional indexing into y_processed assumes a
        # reset/positional index — confirm after the Year>=2000 filter.
        y_train_fold, y_test_fold = y_processed[train_idx], y_processed[test_idx]

        # Safe model fitting and evaluation (inf MSE on numerical failure)
        mse, predictions = safe_model_fit_evaluate(model, X_train_fold, y_train_fold, X_test_fold, y_test_fold)
        mse_scores.append(mse)

        if predictions is not None:
            # Compute SHAP values using KMeans clustering
            shap_values = compute_shap_values_with_kmeans(model, X_train_fold, X_test_fold)
            shap_stability = np.std(shap_values, axis=0).mean()
            shap_stabilities.append(shap_stability)
            all_shap_values.append(shap_values)

    # Return average MSE and SHAP stability.
    # NOTE(review): if every fold failed, shap_stabilities is empty and
    # np.mean([]) yields nan with a RuntimeWarning — confirm callers handle it.
    return np.mean(mse_scores), np.mean(shap_stabilities), all_shap_values
|
||||
|
||||
# Run NSGA-II experiment with a time limit
def run_nsga_experiment(use_ndvi=True,
                        output_base_directory="./results_nsga",
                        population_size=30,
                        n_generations=50,
                        time_limit=36000):
    """Run a time-limited NSGA-II search over model / hyperparameter /
    pipeline / SelectKBest-k choices.

    Two objectives (minimised) come from ``evaluate_individual``: mean
    cross-validated MSE and mean SHAP stability.  Per-fold SHAP values for
    the final population, the Pareto front, and the full final population
    are written under a timestamped subdirectory of *output_base_directory*.

    Parameters
    ----------
    use_ndvi : bool
        Forwarded to ``load_and_combine_datasets``.
    output_base_directory : str
        Root directory for result files.
    population_size : int
        NSGA-II population size (also the number of survivors per generation).
    n_generations : int
        Maximum number of generations.
    time_limit : int
        Wall-clock budget in seconds; evolution stops once exceeded.
    """
    # Load and combine dataset
    data = load_and_combine_datasets(use_ndvi=use_ndvi)

    # Pipelines for preprocessing — made global because evaluate_individual
    # reads `pipelines` at module scope.
    global pipelines
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]

    # Model space with their corresponding hyperparameter grids — global for
    # the same reason as `pipelines`.
    global model_space
    model_space = {
        'lasso': lasso,
        'ridge': ridge_regressor,
        'random_forest': random_forest,
        'gradient_boosting': gradient_boosting,
        'decision_tree': decision_tree_regressor,
        'stacking_ensemble': stacking_lasso
    }

    # Create output directory with timestamp
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    output_directory = os.path.join(output_base_directory, f"nsga_{time_limit // 3600}h_{timestamp}")
    os.makedirs(output_directory, exist_ok=True)

    # Define cross-validation strategy
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Convert parameter grid to a list of tuples for DEAP.
    # NOTE(review): this materialises the full cartesian product of every
    # model's ParameterGrid x pipelines x k values — can be very large.
    param_space = [(model_name, param_values, i, k_value) for model_name, model_func in model_space.items()
                   for param_values in list(ParameterGrid(model_func()[1])) for i in range(len(pipelines))
                   for k_value in range(10, 31, 5)]  # Range for k in SelectKBest

    # Initialize DEAP toolbox
    toolbox = base.Toolbox()
    # NOTE(review): each individual holds 4 random indices into param_space,
    # but the evaluate lambda below only ever reads gene 0 — confirm intent.
    toolbox.register("attr_float", lambda: np.random.choice(len(param_space), 4))  # Generate a list with 4 elements
    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.attr_float)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxTwoPoint)
    # NOTE(review): mutPolynomialBounded produces floats; the int(ind[0])
    # cast in evaluate truncates them back to valid indices.
    toolbox.register("mutate", tools.mutPolynomialBounded, low=0, up=len(param_space) - 1, eta=0.5, indpb=0.2)
    toolbox.register("select", tools.selNSGA2)
    toolbox.register("evaluate", lambda ind: evaluate_individual(param_space[int(ind[0])], data, kf))

    # Run NSGA-II algorithm
    start_time = time.time()
    pop = toolbox.population(n=population_size)

    # Evaluate the initial population; fitness takes the first two objective
    # values, SHAP arrays ride along as an extra attribute on the individual.
    fits = toolbox.map(toolbox.evaluate, pop)
    for fit, ind in zip(fits, pop):
        ind.fitness.values = fit[:2]
        ind.shap_values = fit[2]

    # Main evolution loop — bounded by both generations and wall-clock time.
    for gen in range(n_generations):
        if time.time() - start_time > time_limit:
            print("Time limit exceeded, stopping evolution.")
            break

        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        fits = toolbox.map(toolbox.evaluate, offspring)

        for fit, ind in zip(fits, offspring):
            ind.fitness.values = fit[:2]  # Ensure only the first two values are assigned to fitness
            ind.shap_values = fit[2]

        # Select the next generation of parents from the combined pool of parents and offspring
        pop = toolbox.select(pop + offspring, k=population_size)

    # Persist the per-fold SHAP arrays of every surviving individual.
    for ind in pop:
        for fold_idx, shap_values in enumerate(ind.shap_values):
            shap_output_path = os.path.join(output_directory, f"shap_values_{int(ind[0])}_fold_{fold_idx + 1}.npy")
            np.save(shap_output_path, shap_values)

    # Extract and save the Pareto front
    pareto_front = tools.sortNondominated(pop, len(pop), first_front_only=True)[0]
    pareto_results_df = pd.DataFrame([param_space[int(ind[0])] + ind.fitness.values for ind in pareto_front],
                                     columns=['Model', 'Params', 'Pipeline', 'K', 'MSE', 'SHAP_Stability'])
    pareto_results_df.to_csv(os.path.join(output_directory, "nsga_pareto_front.csv"), index=False)

    # Save the final population
    results_df = pd.DataFrame([param_space[int(ind[0])] + ind.fitness.values for ind in pop],
                              columns=['Model', 'Params', 'Pipeline', 'K', 'MSE', 'SHAP_Stability'])
    results_df.to_csv(os.path.join(output_directory, "nsga_results.csv"), index=False)

    print(f"All results saved to {output_directory}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # run_nsga_experiment's signature has no num_parents/num_offspring
    # parameters (NSGA-II survivor selection is driven by population_size),
    # so passing them — as an earlier revision did — raises TypeError at
    # startup.  Only the supported keyword arguments are used here.
    run_nsga_experiment(
        use_ndvi=True,
        output_base_directory="./results_nsga",
        population_size=80,  # Larger population size for a comprehensive search
        n_generations=100,  # Increased number of generations
        time_limit=108000  # 30 hours (30 * 3600 seconds)
    )
|
||||
|
||||
32
src/shap_values_computation.py
Normal file
32
src/shap_values_computation.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import os
|
||||
import shap
|
||||
import numpy as np
|
||||
|
||||
|
||||
def compute_and_save_shap_values(model,
                                 X_train,
                                 X_test,
                                 output_directory,
                                 file_name):
    """Compute SHAP values for *model* on *X_test* and persist them to disk.

    The explainer is built from *X_train*; the resulting SHAP value matrix
    is written to ``<output_directory>/<file_name>.npy``.

    Parameters
    ----------
    model :
        The trained machine learning model to explain.
    X_train :
        Background/training data used to fit the SHAP explainer.
    X_test :
        Samples whose SHAP values are computed.
    output_directory :
        Destination directory (created if it does not exist).
    file_name :
        File stem (without extension) for the saved ``.npy`` array.
    """
    # Ensure the destination exists before attempting to write.
    os.makedirs(output_directory, exist_ok=True)

    # Build the explainer from the training data and explain the test set.
    explanation = shap.Explainer(model, X_train)(X_test)

    # Persist the raw SHAP value matrix as a NumPy binary file.
    shap_file_path = os.path.join(output_directory, f"{file_name}.npy")
    np.save(shap_file_path, explanation.values)

    print(f"SHAP values saved to {shap_file_path}")
|
||||
Reference in New Issue
Block a user