From 376bc2a8c5e339837f55f6a97e7210750725c17d Mon Sep 17 00:00:00 2001 From: Varyngoth Date: Tue, 11 Nov 2025 20:26:18 -0400 Subject: [PATCH] Removed dangling, non-NSGA experiments --- src/combine_datasets.py | 122 +++++++++---------------- src/grid_search_batch.sh | 23 ----- src/grid_search_exp.py | 186 --------------------------------------- src/h20_autoML.py | 166 ---------------------------------- src/h20_batch.sh | 23 ----- 5 files changed, 41 insertions(+), 479 deletions(-) delete mode 100644 src/grid_search_batch.sh delete mode 100644 src/grid_search_exp.py delete mode 100644 src/h20_autoML.py delete mode 100644 src/h20_batch.sh diff --git a/src/combine_datasets.py b/src/combine_datasets.py index 3e4e764..f709061 100644 --- a/src/combine_datasets.py +++ b/src/combine_datasets.py @@ -1,92 +1,52 @@ import pandas as pd -from loading_climate_data import load_and_process_climate_data -from loading_crop_data import engineer_crop_features -from loading_ndvi_data import process_ndvi_data -from loading_soil_data import get_soil_data +def combine_datasets_single_csv( + csv_file: str, + target_column: str = None, + exclude_features: list = None +): + """ + Load and process a single CSV file into a DataFrame suitable for modeling. + Mimics the original combine_datasets() structure, returning a single DataFrame. -def combine_datasets(crop_data_file, - soil_data_file, - climate_data_file, - ndvi_data_file, - ndvi=False): - dataset_loaders = { - 'soil': lambda: get_soil_data(soil_data_file), - 'climate': lambda: load_and_process_climate_data(climate_data_file), - 'ndvi': lambda: process_ndvi_data(ndvi_data_file), - 'crop': lambda: engineer_crop_features(crop_data_file) - } + Parameters + ---------- + csv_file : str + Path to the CSV file to load. + target_column : str, optional + Column to use as target. If None, assumes last column is the target. + exclude_features : list of str, optional + List of features to exclude from the DataFrame. - merge_keys = { - 'soil': ['PostalCode'], - 'climate': ['PostalCode', 'Year'], - 'ndvi': ['PostalCode', 'Year'], - 'crop': ['PostalCode', 'Year'] - } + Returns + ------- + combined_data : pd.DataFrame + Processed DataFrame, target column removed if specified. + The format is compatible with previous combine_datasets outputs. + """ + # Load CSV + combined_data = pd.read_csv(csv_file) - combined_data = dataset_loaders['crop']() + # Determine target + if target_column is None: + target_column = combined_data.columns[-1] # default last column - # Merge climate data - climate_data = dataset_loaders['climate']() - combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left') + if target_column not in combined_data.columns: + raise ValueError(f"Target column '{target_column}' not found in CSV.") - # Merge NDVI data if required - if ndvi: - ndvi_data = dataset_loaders['ndvi']() - combined_data = combined_data[combined_data['Year'] >= 2000] - combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left') + # Separate target column internally (optional) + target_series = combined_data[target_column] - # Merge soil data - soil_data = dataset_loaders['soil']() - combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left') - postal_codes = combined_data['PostalCode'] - years = combined_data['Year'] + # Drop the target from features + combined_data = combined_data.drop(columns=[target_column]) - # Drop irrelevant or redundant columns - features_to_exclude = ['PostalCode', 'Year', 'SoilID'] - - combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore') - - return combined_data - - -def combine_dataset_pc(crop_data_file, - soil_data_file, - climate_data_file, - ndvi_data_file, - ndvi=False): - dataset_loaders = { - 'soil': lambda: get_soil_data(soil_data_file), - 'climate': lambda: load_and_process_climate_data(climate_data_file), - 'ndvi': lambda: process_ndvi_data(ndvi_data_file), - 'crop': lambda: engineer_crop_features(crop_data_file) - } - - merge_keys = { - 'soil': ['PostalCode'], - 'climate': ['PostalCode', 'Year'], - 'ndvi': ['PostalCode', 'Year'], - 'crop': ['PostalCode', 'Year'] - } - - combined_data = dataset_loaders['crop']() - - # Merge climate data - climate_data = dataset_loaders['climate']() - combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left') - - # Merge NDVI data if required - if ndvi: - ndvi_data = dataset_loaders['ndvi']() - combined_data = combined_data[combined_data['Year'] >= 2000] - combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left') - - # Merge soil data - soil_data = dataset_loaders['soil']() - combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left') - - # Drop irrelevant or redundant columns - features_to_exclude = ['Year', 'SoilID'] - combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore') + # Drop additional user-specified features + if exclude_features: + for col in exclude_features: + if col in combined_data.columns: + combined_data = combined_data.drop(columns=[col]) + else: + print(f"Warning: '{col}' not found in CSV and cannot be excluded.") + # Mimic original structure: return combined_data as a DataFrame return combined_data diff --git a/src/grid_search_batch.sh b/src/grid_search_batch.sh deleted file mode 100644 index 22d55d4..0000000 --- a/src/grid_search_batch.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -#SBATCH --account=def-xander # Replace with your account -#SBATCH --mem=10G # Memory allocation -#SBATCH --time=21:00:00 # Total run time limit (11 hours) -#SBATCH --cpus-per-task=4 # Number of CPU cores per task -#SBATCH --job-name=grid_search_%A # Job name with job ID appended -#SBATCH --output=%x-%j.out # Standard output and error log -#SBATCH --error=%x-%j.err # Separate error log - -# Load necessary modules -#module load python/3.8 - -# Activate your virtual environment -source /env0/bin/activate - -# Parameters -TIME=$1 - -# Run the Python script with the specified time parameter -srun python $WORK_DIR/src/grid_search_exp.py --time $TIME - -# Deactivate the virtual environment -deactivate diff --git a/src/grid_search_exp.py b/src/grid_search_exp.py deleted file mode 100644 index 1afb764..0000000 --- a/src/grid_search_exp.py +++ /dev/null @@ -1,186 +0,0 @@ -import os -import time -import numpy as np -import pandas as pd -import warnings -import shap -from sklearn.model_selection import KFold, ParameterGrid -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer -from sklearn.impute import SimpleImputer -from sklearn.feature_selection import SelectKBest, f_regression -from sklearn.metrics import mean_squared_error -from sklearn.cluster import KMeans -from sklearn.exceptions import ConvergenceWarning -from joblib import dump - -from combine_datasets import combine_datasets -from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso - -warnings.filterwarnings("ignore", category=ConvergenceWarning) - - -# Preprocess the data and separate features and target -def preprocess_data(input_data, pipeline): - X = input_data.drop(columns=['yield_t/ha'], errors='ignore') - y = input_data['yield_t/ha'] - - # Fit-transform the pipeline steps without `y` first - for name, step in pipeline.steps: - if name != 'feature_selection': - X = step.fit_transform(X) - - # Apply `SelectKBest` separately, which requires `y` - if 'feature_selection' in [name for name, _ in pipeline.steps]: - X = pipeline.named_steps['feature_selection'].fit_transform(X, y) - - return X, y - - -# Function to compute SHAP values using KMeans clustering -def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10): - # Fit KMeans to the training data with explicit n_init parameter - kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train) - cluster_centers = kmeans.cluster_centers_ - - # Compute SHAP values on cluster centers - explainer = shap.KernelExplainer(model.predict, cluster_centers) - shap_values = explainer.shap_values(X_test) - - return shap_values - - -# Iterative grid search with time monitoring -def iterative_grid_search(model, param_grid, X, y, cv, time_limit): - best_model = None - best_score = -np.inf - start_time = time.time() - - # Manually iterate over parameter combinations - for params in ParameterGrid(param_grid): - elapsed_time = time.time() - start_time - if elapsed_time > time_limit: - print("Time limit exceeded. Stopping search.") - break - - model.set_params(**params) - scores = [] - - for train_idx, test_idx in cv.split(X, y): - X_train, X_test = X[train_idx], X[test_idx] - y_train, y_test = y[train_idx], y[test_idx] - model.fit(X_train, y_train) - predictions = model.predict(X_test) - score = -mean_squared_error(y_test, predictions) - scores.append(score) - - mean_score = np.mean(scores) - if mean_score > best_score: - best_score = mean_score - best_model = model - - return best_model - - -# Main experiment function -def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"): - time_per_algorithm = total_hours * 3600 / 5 - - # Load and combine datasets - data = combine_datasets( - "./data/potatoes_dataset.csv", - "./data/updated_soil_data_with_awc.csv", - "./data/final_augmented_climate_data.csv", - "./data/NDVI.csv", - ndvi=True - ) - - pipelines = [ - Pipeline([ - ('normalize', StandardScaler()), - ('imputer', SimpleImputer(strategy='mean')), - ('poly_features', PolynomialFeatures(degree=2)), - ('feature_selection', SelectKBest(score_func=f_regression, k=25)) - ]), - Pipeline([ - ('robust', RobustScaler()), - ('feature_selection', SelectKBest(score_func=f_regression, k=25)), - ('power_transformation', PowerTransformer(method='yeo-johnson')) - ]) - ] - - models = { - 'lasso': lasso, - 'ridge': ridge_regressor, - 'random_forest': random_forest, - 'gradient_boosting': gradient_boosting, - 'decision_tree': decision_tree_regressor, - 'stacking_ensemble': stacking_lasso - } - - output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h") - os.makedirs(output_directory, exist_ok=True) - - kf = KFold(n_splits=5, shuffle=True, random_state=42) - - for model_name, model_func in models.items(): - best_pipeline_result = None - - for pipeline_idx, pipeline in enumerate(pipelines): - X, y = preprocess_data(data, pipeline) - model, params = model_func() - - best_model = iterative_grid_search( - model, params, X, y, cv=kf, time_limit=time_per_algorithm - ) - - shap_stabilities = [] - for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)): - X_train_fold, X_test_fold = X[train_idx], X[test_idx] - y_train_fold = y[train_idx] - - best_model.fit(X_train_fold, y_train_fold) - shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold) - fold_shap_stability = np.std(shap_values, axis=0).mean() - shap_stabilities.append(fold_shap_stability) - - shap_stability = np.mean(shap_stabilities) - predictions = best_model.predict(X) - mse = mean_squared_error(y, predictions) - - if best_pipeline_result is None or ( - mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']): - best_pipeline_result = { - 'model': best_model, - 'pipeline_idx': pipeline_idx, - 'mse': mse, - 'shap_stability': shap_stability - } - - if best_pipeline_result: - model_output_dir = os.path.join(output_directory, model_name) - os.makedirs(model_output_dir, exist_ok=True) - - model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib") - dump(best_pipeline_result['model'], model_file_path) - - metrics_df = pd.DataFrame({ - 'Model': [model_name], - 'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"], - 'MSE': [best_pipeline_result['mse']], - 'SHAP_Stability': [best_pipeline_result['shap_stability']] - }) - metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv") - metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path)) - - print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}") - - print(f"All results saved to {output_directory}") - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description='Run Grid Search for different models.') - parser.add_argument('--time', type=float, required=True, help='Time for the model to run in hours') - args = parser.parse_args() - run_experiment_with_dynamic_grid(total_hours=args.time, output_base_directory="./results_grid_search") diff --git a/src/h20_autoML.py b/src/h20_autoML.py deleted file mode 100644 index b505d25..0000000 --- a/src/h20_autoML.py +++ /dev/null @@ -1,166 +0,0 @@ -import os -import shap -import numpy as np -import pandas as pd -import h2o -from h2o.automl import H2OAutoML -from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer -from sklearn.impute import SimpleImputer -from sklearn.pipeline import Pipeline -from sklearn.model_selection import KFold -from sklearn.feature_selection import SelectKBest, f_regression -from sklearn.metrics import mean_squared_error -from sklearn.cluster import KMeans - -from combine_datasets import combine_datasets - -# Initialize H2O -h2o.init() - - -# Preprocess the data and separate features and target -def preprocess_data(input_data, pipeline): - X = input_data.drop(columns=['yield_t/ha'], errors='ignore') - y = input_data['yield_t/ha'] - - # Fit-transform the pipeline steps without `y` first - for name, step in pipeline.steps: - if name != 'feature_selection': - X = step.fit_transform(X) - - # Apply `SelectKBest` separately, which requires `y` - if 'feature_selection' in [name for name, _ in pipeline.steps]: - X = pipeline.named_steps['feature_selection'].fit_transform(X, y) - - return X, y - - -# Function to compute SHAP values using KMeans clustering -def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10): - # Fit KMeans to the training data with explicit n_init parameter - kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train) - cluster_centers = kmeans.cluster_centers_ - - # Convert cluster centers to H2OFrame - cluster_centers_h2o = h2o.H2OFrame(cluster_centers) - - # Convert X_test to H2OFrame - X_test_h2o = h2o.H2OFrame(X_test) - - # Compute SHAP values on cluster centers - explainer = shap.KernelExplainer(lambda x: model.predict(h2o.H2OFrame(x)).as_data_frame().values.flatten(), - cluster_centers_h2o.as_data_frame().values) - shap_values = explainer.shap_values(X_test_h2o.as_data_frame().values) - - return shap_values - - -# Function to load and combine datasets -def load_and_combine_datasets(use_ndvi=True): - data = combine_datasets( - "./data/potatoes_dataset.csv", - "./data/updated_soil_data_with_awc.csv", - "./data/final_augmented_climate_data.csv", - "./data/NDVI.csv", - ndvi=use_ndvi - ) - return data - - -# Main experiment function with H2O AutoML -def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"): - data = load_and_combine_datasets(use_ndvi=use_ndvi) - pipelines = [ - Pipeline([ - ('normalize', StandardScaler()), - ('imputer', SimpleImputer(strategy='mean')), - ('poly_features', PolynomialFeatures(degree=2)), - ('feature_selection', SelectKBest(score_func=f_regression, k=25)) - ]), - Pipeline([ - ('robust', RobustScaler()), - ('feature_selection', SelectKBest(score_func=f_regression, k=25)), - ('power_transformation', PowerTransformer(method='yeo-johnson')) - ]) - ] - - output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h") - os.makedirs(output_directory, exist_ok=True) - - metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability']) - - kf = KFold(n_splits=5, shuffle=True, random_state=42) - - for pipeline_idx, pipeline in enumerate(pipelines): - X_processed, y = preprocess_data(data, pipeline) - - X_h2o = h2o.H2OFrame(X_processed) - y_h2o = h2o.H2OFrame(y.to_frame()) - X_h2o['target'] = y_h2o - - shap_stabilities = [] - all_shap_values = [] - - for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)): - train = X_h2o[train_idx.tolist(), :] - test = X_h2o[test_idx.tolist(), :] - - aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42, - sort_metric='MSE') - aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train) - - best_model = aml.leader - - leaderboard = aml.leaderboard.as_data_frame() - leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv") - leaderboard.to_csv(leaderboard_csv_path, index=False) - - model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}") - os.makedirs(model_output_dir, exist_ok=True) - model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip") - h2o.save_model(best_model, path=model_file_path) - - predictions = best_model.predict(test).as_data_frame()['predict'] - - mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values) - - shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values, - test.as_data_frame().values) - shap_stability = np.std(shap_values, axis=0).mean() - shap_stabilities.append(shap_stability) - all_shap_values.append(shap_values) - - - print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}") - - mean_shap_stability = np.mean(shap_stabilities) - for fold_idx, shap_values in enumerate(all_shap_values): - shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy" - shap_file_path = os.path.join(model_output_dir, shap_file_name) - np.save(shap_file_path, shap_values) - - metrics_df = pd.concat([metrics_df, pd.DataFrame([{ - 'Model': best_model.model_id, - 'Pipeline': f"Pipeline_{pipeline_idx + 1}", - 'MSE': mse, - 'SHAP_Stability': mean_shap_stability - }])], ignore_index=True) - - metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv") - metrics_df.to_csv(metrics_csv_path, index=False) - - print(f"All results saved to {output_directory}") - - h2o.shutdown(prompt=False) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description='Run H2O AutoML for different models.') - parser.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.') - args = parser.parse_args() - - # Run the H2O AutoML experiment - run_h2o_experiment(use_ndvi=True, automl_time=args.time, output_base_directory="./results_h2o") - diff --git a/src/h20_batch.sh b/src/h20_batch.sh deleted file mode 100644 index 63cf20e..0000000 --- a/src/h20_batch.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -#SBATCH --account=def-xander # Replace with your account -#SBATCH --mem=10G # Memory allocation -#SBATCH --time=11:00:00 # Total run time limit (11 hours) -#SBATCH --cpus-per-task=4 # Number of CPU cores per task -#SBATCH --job-name=h20_%A # Job name with job ID appended -#SBATCH --output=%x-%j.out # Standard output and error log -#SBATCH --error=%x-%j.err # Separate error log - -# Load necessary modules -module load python/3.8 - -# Activate your virtual environment -source ~/envs/workdir/bin/activate - -# Parameters -TIME=$1 - -# Run the Python script with the specified time parameter -srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME - -# Deactivate the virtual environment -deactivate