Commit message: "Removed dangling, non-NSGA experiments."
(The following is the diff contained in this commit.)
@@ -1,92 +1,52 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from loading_climate_data import load_and_process_climate_data
|
|
||||||
from loading_crop_data import engineer_crop_features
|
|
||||||
from loading_ndvi_data import process_ndvi_data
|
|
||||||
from loading_soil_data import get_soil_data
|
|
||||||
|
|
||||||
|
def combine_datasets_single_csv(
        csv_file: str,
        target_column: str = None,
        exclude_features: list = None
):
    """
    Load and process a single CSV file into a DataFrame suitable for modeling.

    Mimics the original combine_datasets() structure, returning a single DataFrame.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to load.
    target_column : str, optional
        Column to use as target. If None, assumes last column is the target.
    exclude_features : list of str, optional
        List of features to exclude from the DataFrame.

    Returns
    -------
    combined_data : pd.DataFrame
        Processed DataFrame, target column removed if specified.
        The format is compatible with previous combine_datasets outputs.
    """
    # Load CSV
    combined_data = pd.read_csv(csv_file)

    # Determine target: default to the last column when none is given.
    if target_column is None:
        target_column = combined_data.columns[-1]

    if target_column not in combined_data.columns:
        raise ValueError(f"Target column '{target_column}' not found in CSV.")

    # Drop the target from the feature frame (callers model on features only).
    combined_data = combined_data.drop(columns=[target_column])

    # Drop additional user-specified features; missing names only warn so a
    # stale exclude list does not abort the run.
    if exclude_features:
        for col in exclude_features:
            if col in combined_data.columns:
                combined_data = combined_data.drop(columns=[col])
            else:
                print(f"Warning: '{col}' not found in CSV and cannot be excluded.")

    return combined_data


def combine_datasets(crop_data_file,
                     soil_data_file,
                     climate_data_file,
                     ndvi_data_file,
                     ndvi=False):
    """Merge crop, climate, optional NDVI, and soil sources into one DataFrame.

    Starts from engineered crop features and left-joins the other sources on
    their PostalCode/Year keys; key/ID columns are dropped before returning.

    Parameters
    ----------
    crop_data_file, soil_data_file, climate_data_file, ndvi_data_file : str
        Paths passed through to the respective project loaders.
    ndvi : bool, optional
        When True, filter to Year >= 2000 and join the NDVI data.
    """
    # Loaders are wrapped in lambdas so each file is only read when needed.
    dataset_loaders = {
        'soil': lambda: get_soil_data(soil_data_file),
        'climate': lambda: load_and_process_climate_data(climate_data_file),
        'ndvi': lambda: process_ndvi_data(ndvi_data_file),
        'crop': lambda: engineer_crop_features(crop_data_file)
    }

    merge_keys = {
        'soil': ['PostalCode'],
        'climate': ['PostalCode', 'Year'],
        'ndvi': ['PostalCode', 'Year'],
        'crop': ['PostalCode', 'Year']
    }

    combined_data = dataset_loaders['crop']()

    # Merge climate data
    climate_data = dataset_loaders['climate']()
    combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left')

    # Merge NDVI data if required; rows before 2000 are dropped first
    # (presumably because NDVI coverage starts then — TODO confirm).
    if ndvi:
        ndvi_data = dataset_loaders['ndvi']()
        combined_data = combined_data[combined_data['Year'] >= 2000]
        combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left')

    # Merge soil data
    soil_data = dataset_loaders['soil']()
    combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left')

    # Drop irrelevant or redundant key/ID columns
    features_to_exclude = ['PostalCode', 'Year', 'SoilID']
    combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore')

    return combined_data


def combine_dataset_pc(crop_data_file,
                       soil_data_file,
                       climate_data_file,
                       ndvi_data_file,
                       ndvi=False):
    """Same as combine_datasets(), but keeps PostalCode in the output.

    Only 'Year' and 'SoilID' are dropped, so downstream code can still group
    or report per postal code.
    """
    dataset_loaders = {
        'soil': lambda: get_soil_data(soil_data_file),
        'climate': lambda: load_and_process_climate_data(climate_data_file),
        'ndvi': lambda: process_ndvi_data(ndvi_data_file),
        'crop': lambda: engineer_crop_features(crop_data_file)
    }

    merge_keys = {
        'soil': ['PostalCode'],
        'climate': ['PostalCode', 'Year'],
        'ndvi': ['PostalCode', 'Year'],
        'crop': ['PostalCode', 'Year']
    }

    combined_data = dataset_loaders['crop']()

    # Merge climate data
    climate_data = dataset_loaders['climate']()
    combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left')

    # Merge NDVI data if required
    if ndvi:
        ndvi_data = dataset_loaders['ndvi']()
        combined_data = combined_data[combined_data['Year'] >= 2000]
        combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left')

    # Merge soil data
    soil_data = dataset_loaders['soil']()
    combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left')

    # Drop irrelevant or redundant columns (PostalCode deliberately kept)
    features_to_exclude = ['Year', 'SoilID']
    combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore')

    # Mimic original structure: return combined_data as a DataFrame
    return combined_data
|
||||||
|
|||||||
@@ -1,23 +0,0 @@
|
|||||||
#!/bin/bash
# SLURM batch script: run the grid-search experiment with a wall-clock budget
# passed as the first positional argument (in hours).
#SBATCH --account=def-xander  # Replace with your account
#SBATCH --mem=10G  # Memory allocation
#SBATCH --time=21:00:00  # Total run time limit (21 hours)
#SBATCH --cpus-per-task=4  # Number of CPU cores per task
#SBATCH --job-name=grid_search_%A  # Job name with job ID appended
#SBATCH --output=%x-%j.out  # Standard output log
#SBATCH --error=%x-%j.err  # Separate error log

# Load necessary modules (module load is currently disabled)
#module load python/3.8

# Activate your virtual environment
source /env0/bin/activate

# First argument: time budget in hours, forwarded to --time
TIME=$1

# Run the Python script with the specified time parameter
srun python $WORK_DIR/src/grid_search_exp.py --time $TIME

# Deactivate the virtual environment
deactivate
|
|
||||||
@@ -1,186 +0,0 @@
|
|||||||
import os
|
|
||||||
import time
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import warnings
|
|
||||||
import shap
|
|
||||||
from sklearn.model_selection import KFold, ParameterGrid
|
|
||||||
from sklearn.pipeline import Pipeline
|
|
||||||
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
from sklearn.feature_selection import SelectKBest, f_regression
|
|
||||||
from sklearn.metrics import mean_squared_error
|
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
|
||||||
from joblib import dump
|
|
||||||
|
|
||||||
from combine_datasets import combine_datasets
|
|
||||||
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
|
|
||||||
|
|
||||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
|
||||||
|
|
||||||
|
|
||||||
# Preprocess the data and separate features and target
def preprocess_data(input_data, pipeline):
    """Split *input_data* into features/target and fit-transform the pipeline.

    The target column is 'yield_t/ha'. Steps named 'feature_selection' need
    `y`, so they are skipped in the first pass and applied afterwards —
    NOTE(review): this means a 'feature_selection' step always runs LAST,
    regardless of its position in the pipeline; confirm that is intended.
    """
    target = input_data['yield_t/ha']
    features = input_data.drop(columns=['yield_t/ha'], errors='ignore')

    # First pass: every step except feature selection, in declared order.
    selector_present = False
    for step_name, transformer in pipeline.steps:
        if step_name == 'feature_selection':
            selector_present = True
            continue
        features = transformer.fit_transform(features)

    # Second pass: SelectKBest-style selection, which requires the target.
    if selector_present:
        features = pipeline.named_steps['feature_selection'].fit_transform(features, target)

    return features, target
|
|
||||||
|
|
||||||
|
|
||||||
# Function to compute SHAP values using KMeans clustering
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Explain *X_test* with a KernelExplainer whose background set is the
    KMeans cluster centers of *X_train* (keeps the SHAP computation tractable
    on large training sets)."""
    # Summarize the training distribution into n_clusters representatives.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusterer.fit(X_train)
    background = clusterer.cluster_centers_

    # Kernel SHAP against the compact background.
    explainer = shap.KernelExplainer(model.predict, background)
    return explainer.shap_values(X_test)
|
|
||||||
|
|
||||||
|
|
||||||
# Iterative grid search with time monitoring
def iterative_grid_search(model, param_grid, X, y, cv, time_limit):
    """Exhaustive grid search with a wall-clock budget.

    Iterates over ParameterGrid(param_grid); each candidate is scored by
    cross-validated negative MSE. The search stops early once *time_limit*
    (seconds) has elapsed.

    Returns
    -------
    The best estimator found (a deep copy, fitted on the last CV fold it was
    evaluated on), or None if no candidate finished within the budget.
    """
    import copy  # local import: stdlib, keeps module imports untouched

    best_model = None
    best_score = -np.inf
    start_time = time.time()

    # Manually iterate over parameter combinations so we can check the clock.
    for params in ParameterGrid(param_grid):
        if time.time() - start_time > time_limit:
            print("Time limit exceeded. Stopping search.")
            break

        model.set_params(**params)
        scores = []

        for train_idx, test_idx in cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            # Negated MSE so "higher is better" like sklearn scorers.
            scores.append(-mean_squared_error(y_test, predictions))

        mean_score = np.mean(scores)
        if mean_score > best_score:
            best_score = mean_score
            # BUG FIX: the original did `best_model = model`, storing an alias
            # of the single estimator object. The next set_params() call
            # mutated it, so the function always returned the LAST parameter
            # combination tried, never the best one. Deep-copying snapshots
            # the winning configuration.
            best_model = copy.deepcopy(model)

    return best_model
|
|
||||||
|
|
||||||
|
|
||||||
# Main experiment function
def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"):
    """Run a time-budgeted grid search for each model family over two
    preprocessing pipelines, then persist the best model per family plus a
    metrics CSV (in-sample MSE and mean SHAP stability).

    Parameters
    ----------
    total_hours : float
        Overall wall-clock budget, split across the model families.
    output_base_directory : str
        Root directory for joblib models and the metrics summary.
    """
    # NOTE(review): the budget is split 5 ways but `models` below has 6
    # entries — the total run may exceed total_hours; confirm intended.
    time_per_algorithm = total_hours * 3600 / 5

    # Load and combine datasets (hard-coded project data paths; NDVI on).
    data = combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=True
    )

    # Two alternative preprocessing pipelines to compare per model.
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]

    # Model factories: each returns (estimator, parameter grid).
    models = {
        'lasso': lasso,
        'ridge': ridge_regressor,
        'random_forest': random_forest,
        'gradient_boosting': gradient_boosting,
        'decision_tree': decision_tree_regressor,
        'stacking_ensemble': stacking_lasso
    }

    output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h")
    os.makedirs(output_directory, exist_ok=True)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for model_name, model_func in models.items():
        best_pipeline_result = None

        for pipeline_idx, pipeline in enumerate(pipelines):
            X, y = preprocess_data(data, pipeline)
            model, params = model_func()

            # Time-bounded search for this model/pipeline combination.
            best_model = iterative_grid_search(
                model, params, X, y, cv=kf, time_limit=time_per_algorithm
            )

            # Estimate SHAP stability as the mean per-feature std of SHAP
            # values, averaged over CV folds (lower = more stable).
            shap_stabilities = []
            for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
                X_train_fold, X_test_fold = X[train_idx], X[test_idx]
                y_train_fold = y[train_idx]

                best_model.fit(X_train_fold, y_train_fold)
                shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold)
                fold_shap_stability = np.std(shap_values, axis=0).mean()
                shap_stabilities.append(fold_shap_stability)

            shap_stability = np.mean(shap_stabilities)
            # NOTE(review): MSE is computed on the full dataset the model was
            # (partially) fitted on — this is an in-sample figure.
            predictions = best_model.predict(X)
            mse = mean_squared_error(y, predictions)

            # NOTE(review): a challenger replaces the incumbent only when it
            # is better on BOTH metrics (logical `and`) — confirm intended.
            if best_pipeline_result is None or (
                    mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']):
                best_pipeline_result = {
                    'model': best_model,
                    'pipeline_idx': pipeline_idx,
                    'mse': mse,
                    'shap_stability': shap_stability
                }

        if best_pipeline_result:
            model_output_dir = os.path.join(output_directory, model_name)
            os.makedirs(model_output_dir, exist_ok=True)

            # Persist the winning estimator for this model family.
            model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib")
            dump(best_pipeline_result['model'], model_file_path)

            # Append one metrics row per model family (header only on first write).
            metrics_df = pd.DataFrame({
                'Model': [model_name],
                'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"],
                'MSE': [best_pipeline_result['mse']],
                'SHAP_Stability': [best_pipeline_result['shap_stability']]
            })
            metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv")
            metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path))

            print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}")

    print(f"All results saved to {output_directory}")
|
|
||||||
|
|
||||||
# Command-line entry point: forward the --time budget (in hours) to the run.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description='Run Grid Search for different models.')
    cli.add_argument('--time', type=float, required=True, help='Time for the model to run in hours')
    cli_args = cli.parse_args()

    run_experiment_with_dynamic_grid(total_hours=cli_args.time, output_base_directory="./results_grid_search")
|
|
||||||
@@ -1,166 +0,0 @@
|
|||||||
import os
|
|
||||||
import shap
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import h2o
|
|
||||||
from h2o.automl import H2OAutoML
|
|
||||||
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer
|
|
||||||
from sklearn.impute import SimpleImputer
|
|
||||||
from sklearn.pipeline import Pipeline
|
|
||||||
from sklearn.model_selection import KFold
|
|
||||||
from sklearn.feature_selection import SelectKBest, f_regression
|
|
||||||
from sklearn.metrics import mean_squared_error
|
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
|
|
||||||
from combine_datasets import combine_datasets
|
|
||||||
|
|
||||||
# Initialize H2O
|
|
||||||
h2o.init()
|
|
||||||
|
|
||||||
|
|
||||||
# Preprocess the data and separate features and target
def preprocess_data(input_data, pipeline):
    """Separate the 'yield_t/ha' target from *input_data* and fit-transform
    the pipeline over the features.

    'feature_selection' steps are held back during the first pass (they need
    the target) and applied afterwards — NOTE(review): this effectively moves
    feature selection to the end regardless of pipeline order.
    """
    target = input_data['yield_t/ha']
    features = input_data.drop(columns=['yield_t/ha'], errors='ignore')

    has_selector = False
    for step_name, transformer in pipeline.steps:
        if step_name == 'feature_selection':
            has_selector = True
        else:
            features = transformer.fit_transform(features)

    # Apply `SelectKBest` separately, which requires the target.
    if has_selector:
        features = pipeline.named_steps['feature_selection'].fit_transform(features, target)

    return features, target
|
|
||||||
|
|
||||||
|
|
||||||
# Function to compute SHAP values using KMeans clustering
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Explain *X_test* for an H2O model with Kernel SHAP, using KMeans
    cluster centers of *X_train* as the background set."""
    # Compress the training data into n_clusters representative points.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
    centers_frame = h2o.H2OFrame(clusterer.cluster_centers_)
    test_frame = h2o.H2OFrame(X_test)

    def _predict_numpy(batch):
        # Bridge numpy -> H2OFrame -> numpy so shap can call the H2O model.
        return model.predict(h2o.H2OFrame(batch)).as_data_frame().values.flatten()

    explainer = shap.KernelExplainer(_predict_numpy, centers_frame.as_data_frame().values)
    return explainer.shap_values(test_frame.as_data_frame().values)
|
|
||||||
|
|
||||||
|
|
||||||
# Function to load and combine datasets
def load_and_combine_datasets(use_ndvi=True):
    """Thin wrapper over combine_datasets() with the project's fixed data
    paths; *use_ndvi* toggles the NDVI join."""
    return combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=use_ndvi
    )
|
|
||||||
|
|
||||||
|
|
||||||
# Main experiment function with H2O AutoML
def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"):
    """Run H2O AutoML per preprocessing pipeline with 5-fold CV, saving
    leaderboards, best models, SHAP values, and a metrics summary CSV.

    Parameters
    ----------
    use_ndvi : bool
        Forwarded to the dataset loader (NDVI join on/off).
    automl_time : float
        Total AutoML budget in hours, split evenly across the CV folds.
    output_base_directory : str
        Root directory for all artifacts.
    """
    data = load_and_combine_datasets(use_ndvi=use_ndvi)

    # Two alternative preprocessing pipelines to compare.
    pipelines = [
        Pipeline([
            ('normalize', StandardScaler()),
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly_features', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25))
        ]),
        Pipeline([
            ('robust', RobustScaler()),
            ('feature_selection', SelectKBest(score_func=f_regression, k=25)),
            ('power_transformation', PowerTransformer(method='yeo-johnson'))
        ])
    ]

    output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h")
    os.makedirs(output_directory, exist_ok=True)

    metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability'])

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for pipeline_idx, pipeline in enumerate(pipelines):
        X_processed, y = preprocess_data(data, pipeline)

        # Move the processed features/target into H2O; the target is appended
        # as a 'target' column of the same frame.
        X_h2o = h2o.H2OFrame(X_processed)
        y_h2o = h2o.H2OFrame(y.to_frame())
        X_h2o['target'] = y_h2o

        shap_stabilities = []
        all_shap_values = []

        for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)):
            train = X_h2o[train_idx.tolist(), :]
            test = X_h2o[test_idx.tolist(), :]

            # Per-fold AutoML run; the hour budget is split across folds.
            aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42,
                            sort_metric='MSE')
            aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train)

            best_model = aml.leader

            # Persist this fold's leaderboard.
            leaderboard = aml.leaderboard.as_data_frame()
            leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv")
            leaderboard.to_csv(leaderboard_csv_path, index=False)

            # Persist this fold's leader model.
            model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}")
            os.makedirs(model_output_dir, exist_ok=True)
            model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip")
            h2o.save_model(best_model, path=model_file_path)

            # Hold-out MSE for this fold.
            predictions = best_model.predict(test).as_data_frame()['predict']
            mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values)

            # SHAP stability: mean per-feature std of SHAP values on this fold.
            shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values,
                                                          test.as_data_frame().values)
            shap_stability = np.std(shap_values, axis=0).mean()
            shap_stabilities.append(shap_stability)
            all_shap_values.append(shap_values)

            print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}")

        mean_shap_stability = np.mean(shap_stabilities)
        for fold_idx, shap_values in enumerate(all_shap_values):
            shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy"
            shap_file_path = os.path.join(model_output_dir, shap_file_name)
            np.save(shap_file_path, shap_values)

        # NOTE(review): this row uses `best_model` and `mse` from the LAST
        # fold only (SHAP stability is the cross-fold mean) — confirm intended.
        metrics_df = pd.concat([metrics_df, pd.DataFrame([{
            'Model': best_model.model_id,
            'Pipeline': f"Pipeline_{pipeline_idx + 1}",
            'MSE': mse,
            'SHAP_Stability': mean_shap_stability
        }])], ignore_index=True)

    metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv")
    metrics_df.to_csv(metrics_csv_path, index=False)

    print(f"All results saved to {output_directory}")

    # Shut down the H2O cluster started at import time.
    h2o.shutdown(prompt=False)
|
|
||||||
|
|
||||||
|
|
||||||
# Command-line entry point: --time sets the AutoML budget in hours.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description='Run H2O AutoML for different models.')
    cli.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.')
    cli_args = cli.parse_args()

    # Run the H2O AutoML experiment
    run_h2o_experiment(use_ndvi=True, automl_time=cli_args.time, output_base_directory="./results_h2o")
|
|
||||||
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
#!/bin/bash
# SLURM batch script: run the H2O AutoML experiment; the first positional
# argument is the time budget in hours, forwarded as --time.
#SBATCH --account=def-xander  # Replace with your account
#SBATCH --mem=10G  # Memory allocation
#SBATCH --time=11:00:00  # Total run time limit (11 hours)
#SBATCH --cpus-per-task=4  # Number of CPU cores per task
#SBATCH --job-name=h20_%A  # Job name with job ID appended
#SBATCH --output=%x-%j.out  # Standard output log
#SBATCH --error=%x-%j.err  # Separate error log

# Load necessary modules
module load python/3.8

# Activate your virtual environment
source ~/envs/workdir/bin/activate

# First argument: time budget in hours
TIME=$1

# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME

# Deactivate the virtual environment
deactivate
|
|
||||||
Reference in New Issue
Block a user