Removed dangling, non-NSGA experiments

This commit is contained in:
Varyngoth
2025-11-11 20:26:18 -04:00
parent 5775534e22
commit 376bc2a8c5
5 changed files with 41 additions and 479 deletions

View File

@@ -1,92 +1,52 @@
import pandas as pd
from loading_climate_data import load_and_process_climate_data
from loading_crop_data import engineer_crop_features
from loading_ndvi_data import process_ndvi_data
from loading_soil_data import get_soil_data
def combine_datasets_single_csv(
        csv_file: str,
        target_column: str = None,
        exclude_features: list = None
):
    """
    Load and process a single CSV file into a DataFrame suitable for modeling.

    Mimics the original combine_datasets() structure, returning a single
    DataFrame.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to load.
    target_column : str, optional
        Column to use as target. If None, assumes last column is the target.
    exclude_features : list of str, optional
        List of features to exclude from the DataFrame.

    Returns
    -------
    combined_data : pd.DataFrame
        Processed DataFrame, target column removed if specified.
        The format is compatible with previous combine_datasets outputs.

    Raises
    ------
    ValueError
        If the (given or inferred) target column is not present in the CSV.
    """
    # Load CSV
    combined_data = pd.read_csv(csv_file)

    # Determine target: default to the last column when none is given
    if target_column is None:
        target_column = combined_data.columns[-1]

    if target_column not in combined_data.columns:
        raise ValueError(f"Target column '{target_column}' not found in CSV.")

    # Drop the target from features
    combined_data = combined_data.drop(columns=[target_column])

    # Drop additional user-specified features; warn (don't fail) on unknown names
    if exclude_features:
        for col in exclude_features:
            if col in combined_data.columns:
                combined_data = combined_data.drop(columns=[col])
            else:
                print(f"Warning: '{col}' not found in CSV and cannot be excluded.")

    # NOTE(review): unlike the old combine_datasets(), this does not drop
    # 'PostalCode'/'Year'/'SoilID' — the CSV is assumed to be pre-merged;
    # confirm those metadata columns are already absent from the input file.
    # Mimic original structure: return combined_data as a DataFrame
    return combined_data
def combine_dataset_pc(crop_data_file,
                       soil_data_file,
                       climate_data_file,
                       ndvi_data_file,
                       ndvi=False):
    """
    Build the merged modeling dataset, keeping the PostalCode column ("pc").

    Loads crop, climate, (optionally) NDVI, and soil data, left-merges them
    on PostalCode/Year keys with the crop table as the base, and drops
    redundant columns. Unlike combine_datasets(), 'PostalCode' is retained
    in the output.

    Parameters
    ----------
    crop_data_file, soil_data_file, climate_data_file, ndvi_data_file : str
        Paths to the respective raw data files.
    ndvi : bool, optional
        When True, merge NDVI data and restrict rows to Year >= 2000
        (presumably NDVI coverage starts in 2000 — confirm with data owner).

    Returns
    -------
    pd.DataFrame
        Merged dataset with 'PostalCode' kept and 'Year'/'SoilID' dropped.
    """
    # Lazy loaders so each source is only read when it is actually merged
    dataset_loaders = {
        'soil': lambda: get_soil_data(soil_data_file),
        'climate': lambda: load_and_process_climate_data(climate_data_file),
        'ndvi': lambda: process_ndvi_data(ndvi_data_file),
        'crop': lambda: engineer_crop_features(crop_data_file)
    }

    merge_keys = {
        'soil': ['PostalCode'],
        'climate': ['PostalCode', 'Year'],
        'ndvi': ['PostalCode', 'Year'],
        'crop': ['PostalCode', 'Year']
    }

    # Crop data is the base table
    combined_data = dataset_loaders['crop']()

    # Merge climate data
    climate_data = dataset_loaders['climate']()
    combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left')

    # Merge NDVI data if required
    if ndvi:
        ndvi_data = dataset_loaders['ndvi']()
        combined_data = combined_data[combined_data['Year'] >= 2000]
        combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left')

    # Merge soil data
    soil_data = dataset_loaders['soil']()
    combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left')

    # Drop irrelevant or redundant columns (PostalCode intentionally kept)
    features_to_exclude = ['Year', 'SoilID']
    combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore')

    # BUG FIX: as rendered, the function built the dataset but never
    # returned it (callers would get None); return the merged frame.
    return combined_data

View File

@@ -1,23 +0,0 @@
#!/bin/bash
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=21:00:00 # Total run time limit (21 hours) -- comment previously said 11h; limit is 21h
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=grid_search_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log

# Load necessary modules
#module load python/3.8

# Activate your virtual environment
source /env0/bin/activate

# Parameters: first positional argument is the grid-search time budget (hours)
TIME=$1

# Fail fast if the time budget was not supplied
if [ -z "$TIME" ]; then
    echo "Usage: sbatch $0 <time_hours>" >&2
    exit 1
fi

# Run the Python script with the specified time parameter
# (expansions quoted so paths/values with spaces survive word splitting)
srun python "$WORK_DIR/src/grid_search_exp.py" --time "$TIME"

# Deactivate the virtual environment
deactivate

View File

@@ -1,186 +0,0 @@
import os
import time
import numpy as np
import pandas as pd
import warnings
import shap
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning
from joblib import dump
from combine_datasets import combine_datasets
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
# Silence sklearn ConvergenceWarning spam (e.g. coordinate-descent solvers
# hitting max_iter during the grid search); note this applies process-wide.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Split the raw frame into model inputs and target, applying the pipeline.
def preprocess_data(input_data, pipeline):
    """Return (X, y) for modeling.

    y is the 'yield_t/ha' column; X is everything else, pushed through the
    pipeline's steps. Every step except 'feature_selection' is fit-transformed
    on X alone; the 'feature_selection' step is applied last because it
    requires the target.
    """
    features = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    target = input_data['yield_t/ha']

    step_names = [step_name for step_name, _ in pipeline.steps]

    # Unsupervised transforms first — these never see the target.
    for step_name, transformer in pipeline.steps:
        if step_name != 'feature_selection':
            features = transformer.fit_transform(features)

    # Supervised selection last, since it needs y.
    if 'feature_selection' in step_names:
        features = pipeline.named_steps['feature_selection'].fit_transform(features, target)

    return features, target
# SHAP via KernelExplainer, with KMeans cluster centers as background data.
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Estimate SHAP values for X_test.

    KernelExplainer is quadratic in the size of its background set, so the
    training data is summarized by its KMeans cluster centers (n_init set
    explicitly; fixed random_state for reproducibility).
    """
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusterer.fit(X_train)
    background = clusterer.cluster_centers_

    explainer = shap.KernelExplainer(model.predict, background)
    return explainer.shap_values(X_test)
# Iterative grid search with time monitoring
def iterative_grid_search(model, param_grid, X, y, cv, time_limit):
    """Exhaustive grid search over `param_grid` under a wall-clock budget.

    Each parameter combination is scored by mean negative MSE over the `cv`
    splits; the search stops early once `time_limit` seconds have elapsed.

    Parameters
    ----------
    model : estimator supporting set_params/fit/predict
    param_grid : dict mapping parameter names to candidate value lists
    X, y : indexable arrays of features and targets
    cv : cross-validator providing split(X, y)
    time_limit : float, budget in seconds

    Returns
    -------
    The best-scoring estimator (snapshotted at selection time), or None if
    no combination finished before the time limit.
    """
    import copy  # local: only needed to snapshot the best estimator

    best_model = None
    best_score = -np.inf
    start_time = time.time()

    # Manually iterate over parameter combinations
    for params in ParameterGrid(param_grid):
        elapsed_time = time.time() - start_time
        if elapsed_time > time_limit:
            print("Time limit exceeded. Stopping search.")
            break
        model.set_params(**params)
        scores = []
        for train_idx, test_idx in cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            # Negative MSE so "higher is better"
            scores.append(-mean_squared_error(y_test, predictions))
        mean_score = np.mean(scores)
        if mean_score > best_score:
            best_score = mean_score
            # BUG FIX: the original stored `model` itself, but set_params()
            # mutates that same object on the next iteration, so the
            # returned "best" model silently carried the LAST grid point's
            # parameters. Deep-copy to freeze the winning configuration.
            best_model = copy.deepcopy(model)
    return best_model
# Main experiment function
# Runs a time-budgeted grid search for each model over two preprocessing
# pipelines, scores each (model, pipeline) pair by MSE and SHAP stability,
# and persists the winning model plus a one-row metrics summary per model.
def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"):
# NOTE(review): the budget divides by 5, but `models` below has 6 entries —
# total runtime can exceed `total_hours`; confirm the divisor is intended.
time_per_algorithm = total_hours * 3600 / 5
# Load and combine datasets
data = combine_datasets(
"./data/potatoes_dataset.csv",
"./data/updated_soil_data_with_awc.csv",
"./data/final_augmented_climate_data.csv",
"./data/NDVI.csv",
ndvi=True
)
# Two candidate preprocessing pipelines; preprocess_data() applies every
# step except 'feature_selection' first, then SelectKBest with the target.
pipelines = [
Pipeline([
('normalize', StandardScaler()),
('imputer', SimpleImputer(strategy='mean')),
('poly_features', PolynomialFeatures(degree=2)),
('feature_selection', SelectKBest(score_func=f_regression, k=25))
]),
Pipeline([
('robust', RobustScaler()),
('feature_selection', SelectKBest(score_func=f_regression, k=25)),
('power_transformation', PowerTransformer(method='yeo-johnson'))
])
]
# Model factories (from algorithms.py); each returns (estimator, param_grid).
models = {
'lasso': lasso,
'ridge': ridge_regressor,
'random_forest': random_forest,
'gradient_boosting': gradient_boosting,
'decision_tree': decision_tree_regressor,
'stacking_ensemble': stacking_lasso
}
output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h")
os.makedirs(output_directory, exist_ok=True)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for model_name, model_func in models.items():
best_pipeline_result = None
for pipeline_idx, pipeline in enumerate(pipelines):
X, y = preprocess_data(data, pipeline)
model, params = model_func()
# Time-limited manual grid search (see iterative_grid_search in this file)
best_model = iterative_grid_search(
model, params, X, y, cv=kf, time_limit=time_per_algorithm
)
# SHAP stability = per-feature std of SHAP values averaged over CV
# folds; lower means explanations are more consistent across folds.
shap_stabilities = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
X_train_fold, X_test_fold = X[train_idx], X[test_idx]
y_train_fold = y[train_idx]
best_model.fit(X_train_fold, y_train_fold)
shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold)
fold_shap_stability = np.std(shap_values, axis=0).mean()
shap_stabilities.append(fold_shap_stability)
shap_stability = np.mean(shap_stabilities)
# NOTE(review): MSE is computed on the full data the model was fitted
# on — an in-sample (training) error, not a held-out estimate; confirm.
predictions = best_model.predict(X)
mse = mean_squared_error(y, predictions)
# NOTE(review): a pipeline replaces the incumbent only if it improves
# BOTH metrics (strict AND); improving just one is rejected — confirm
# this dominance rule is intended rather than a weighted comparison.
if best_pipeline_result is None or (
mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']):
best_pipeline_result = {
'model': best_model,
'pipeline_idx': pipeline_idx,
'mse': mse,
'shap_stability': shap_stability
}
if best_pipeline_result:
# Persist the winning estimator and append a one-row metrics summary
model_output_dir = os.path.join(output_directory, model_name)
os.makedirs(model_output_dir, exist_ok=True)
model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib")
dump(best_pipeline_result['model'], model_file_path)
metrics_df = pd.DataFrame({
'Model': [model_name],
'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"],
'MSE': [best_pipeline_result['mse']],
'SHAP_Stability': [best_pipeline_result['shap_stability']]
})
# Append mode: the header is only written when the CSV doesn't exist yet
metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv")
metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path))
print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}")
print(f"All results saved to {output_directory}")
if __name__ == "__main__":
    import argparse

    # CLI: --time is the total wall-clock budget (hours) for the experiment
    cli = argparse.ArgumentParser(description='Run Grid Search for different models.')
    cli.add_argument('--time', type=float, required=True, help='Time for the model to run in hours')
    cli_args = cli.parse_args()
    run_experiment_with_dynamic_grid(total_hours=cli_args.time, output_base_directory="./results_grid_search")

View File

@@ -1,166 +0,0 @@
import os
import shap
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from combine_datasets import combine_datasets
# Initialize H2O
# NOTE(review): runs at module import time — starting (or attaching to) an
# H2O cluster is a side effect of merely importing this file; consider moving
# it into the __main__ entry point. Confirm before relocating.
h2o.init()
# Separate target from features and run the preprocessing pipeline.
def preprocess_data(input_data, pipeline):
    """Return (X, y): y is 'yield_t/ha', X is the rest after the pipeline.

    Steps that need the target (the 'feature_selection' step) are deferred
    until after all the unsupervised transforms have been fit.
    """
    y = input_data['yield_t/ha']
    X = input_data.drop(columns=['yield_t/ha'], errors='ignore')

    deferred = 'feature_selection'

    for name, transformer in pipeline.steps:
        if name == deferred:
            continue  # needs y; handled below
        X = transformer.fit_transform(X)

    if any(name == deferred for name, _ in pipeline.steps):
        X = pipeline.named_steps[deferred].fit_transform(X, y)

    return X, y
# SHAP for an H2O model via KernelExplainer with KMeans background data.
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Estimate SHAP values for X_test against an H2O model.

    The training data is summarized by KMeans cluster centers (keeps
    KernelExplainer tractable). H2O models predict on H2OFrames, so the
    prediction is wrapped to accept/return plain ndarrays.
    """
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)

    def predict_fn(arr):
        # ndarray -> H2OFrame -> predict -> pandas -> flat ndarray
        return model.predict(h2o.H2OFrame(arr)).as_data_frame().values.flatten()

    # Round-trip through H2OFrame, mirroring the original conversion order
    cluster_centers_h2o = h2o.H2OFrame(km.cluster_centers_)
    X_test_h2o = h2o.H2OFrame(X_test)

    explainer = shap.KernelExplainer(predict_fn, cluster_centers_h2o.as_data_frame().values)
    return explainer.shap_values(X_test_h2o.as_data_frame().values)
# Thin wrapper around combine_datasets() with the project's fixed file paths.
def load_and_combine_datasets(use_ndvi=True):
    """Load the merged modeling dataset; `use_ndvi` toggles the NDVI merge."""
    return combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=use_ndvi
    )
# Main experiment function with H2O AutoML
# For each preprocessing pipeline: run AutoML per CV fold, save leaderboard,
# best model and SHAP values, then append an MSE / SHAP-stability row to a
# summary CSV. Shuts the H2O cluster down when finished.
def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"):
data = load_and_combine_datasets(use_ndvi=use_ndvi)
# Same two candidate pipelines as the grid-search experiment
pipelines = [
Pipeline([
('normalize', StandardScaler()),
('imputer', SimpleImputer(strategy='mean')),
('poly_features', PolynomialFeatures(degree=2)),
('feature_selection', SelectKBest(score_func=f_regression, k=25))
]),
Pipeline([
('robust', RobustScaler()),
('feature_selection', SelectKBest(score_func=f_regression, k=25)),
('power_transformation', PowerTransformer(method='yeo-johnson'))
])
]
output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h")
os.makedirs(output_directory, exist_ok=True)
metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for pipeline_idx, pipeline in enumerate(pipelines):
X_processed, y = preprocess_data(data, pipeline)
# Combine features and target into one H2OFrame ('target' column appended)
X_h2o = h2o.H2OFrame(X_processed)
y_h2o = h2o.H2OFrame(y.to_frame())
X_h2o['target'] = y_h2o
shap_stabilities = []
all_shap_values = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)):
train = X_h2o[train_idx.tolist(), :]
test = X_h2o[test_idx.tolist(), :]
# AutoML budget split evenly across the folds
aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42,
sort_metric='MSE')
aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train)
best_model = aml.leader
leaderboard = aml.leaderboard.as_data_frame()
leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv")
leaderboard.to_csv(leaderboard_csv_path, index=False)
model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}")
os.makedirs(model_output_dir, exist_ok=True)
model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip")
h2o.save_model(best_model, path=model_file_path)
predictions = best_model.predict(test).as_data_frame()['predict']
mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values)
# NOTE(review): train/test are sliced from X_h2o which already contains
# the 'target' column, so the SHAP background and test matrices include
# the target as a feature column — confirm this is intended.
shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values,
test.as_data_frame().values)
shap_stability = np.std(shap_values, axis=0).mean()
shap_stabilities.append(shap_stability)
all_shap_values.append(shap_values)
print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}")
mean_shap_stability = np.mean(shap_stabilities)
for fold_idx, shap_values in enumerate(all_shap_values):
shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy"
shap_file_path = os.path.join(model_output_dir, shap_file_name)
np.save(shap_file_path, shap_values)
# NOTE(review): this row records `mse`/`best_model` from the LAST fold
# only, while SHAP stability is averaged over folds — confirm whether
# MSE should also be a per-fold mean.
metrics_df = pd.concat([metrics_df, pd.DataFrame([{
'Model': best_model.model_id,
'Pipeline': f"Pipeline_{pipeline_idx + 1}",
'MSE': mse,
'SHAP_Stability': mean_shap_stability
}])], ignore_index=True)
metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv")
metrics_df.to_csv(metrics_csv_path, index=False)
print(f"All results saved to {output_directory}")
# Shuts down the whole H2O cluster (as rendered this runs after the loop;
# confirm its placement in the original file)
h2o.shutdown(prompt=False)
if __name__ == "__main__":
    import argparse

    # CLI: --time is the AutoML wall-clock budget in hours
    arg_parser = argparse.ArgumentParser(description='Run H2O AutoML for different models.')
    arg_parser.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.')
    parsed = arg_parser.parse_args()
    # Run the H2O AutoML experiment
    run_h2o_experiment(use_ndvi=True, automl_time=parsed.time, output_base_directory="./results_h2o")

View File

@@ -1,23 +0,0 @@
#!/bin/bash
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=11:00:00 # Total run time limit (11 hours)
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=h20_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log
# Load necessary modules
module load python/3.8
# Activate your virtual environment
source ~/envs/workdir/bin/activate
# Parameters
TIME=$1
# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME
# Deactivate the virtual environment
deactivate