Removed dangling, non-NSGA experiments

This commit is contained in:
Varyngoth
2025-11-11 20:26:18 -04:00
parent 5775534e22
commit 376bc2a8c5
5 changed files with 41 additions and 479 deletions

View File

@@ -1,92 +1,52 @@
import pandas as pd
from loading_climate_data import load_and_process_climate_data
from loading_crop_data import engineer_crop_features
from loading_ndvi_data import process_ndvi_data
from loading_soil_data import get_soil_data
def combine_datasets_single_csv(
        csv_file: str,
        target_column: str = None,
        exclude_features: list = None
):
    """
    Load and process a single CSV file into a DataFrame suitable for modeling.

    Mimics the original combine_datasets() structure, returning a single
    DataFrame.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to load.
    target_column : str, optional
        Column to use as target. If None, assumes last column is the target.
    exclude_features : list of str, optional
        List of features to exclude from the DataFrame.

    Returns
    -------
    combined_data : pd.DataFrame
        Processed DataFrame, target column removed if specified.
        The format is compatible with previous combine_datasets outputs.

    Raises
    ------
    ValueError
        If the (given or inferred) target column is not present in the CSV.
    """
    # Load CSV
    combined_data = pd.read_csv(csv_file)

    # Determine target: default to the last column when none is given
    if target_column is None:
        target_column = combined_data.columns[-1]

    if target_column not in combined_data.columns:
        raise ValueError(f"Target column '{target_column}' not found in CSV.")

    # Drop the target from features
    combined_data = combined_data.drop(columns=[target_column])

    # Drop additional user-specified features; warn (don't fail) on unknown names
    if exclude_features:
        for col in exclude_features:
            if col in combined_data.columns:
                combined_data = combined_data.drop(columns=[col])
            else:
                print(f"Warning: '{col}' not found in CSV and cannot be excluded.")

    # NOTE(review): unlike the old combine_datasets(), this does not drop
    # 'PostalCode'/'Year'/'SoilID' — the CSV is assumed to be pre-merged;
    # confirm those metadata columns are already absent from the input file.
    # Mimic original structure: return combined_data as a DataFrame
    return combined_data
def combine_dataset_pc(crop_data_file,
                       soil_data_file,
                       climate_data_file,
                       ndvi_data_file,
                       ndvi=False):
    """
    Build the merged modeling dataset, keeping the PostalCode column ("pc").

    Loads crop, climate, (optionally) NDVI, and soil data, left-merges them
    on PostalCode/Year keys with the crop table as the base, and drops
    redundant columns. Unlike combine_datasets(), 'PostalCode' is retained
    in the output.

    Parameters
    ----------
    crop_data_file, soil_data_file, climate_data_file, ndvi_data_file : str
        Paths to the respective raw data files.
    ndvi : bool, optional
        When True, merge NDVI data and restrict rows to Year >= 2000
        (presumably NDVI coverage starts in 2000 — confirm with data owner).

    Returns
    -------
    pd.DataFrame
        Merged dataset with 'PostalCode' kept and 'Year'/'SoilID' dropped.
    """
    # Lazy loaders so each source is only read when it is actually merged
    dataset_loaders = {
        'soil': lambda: get_soil_data(soil_data_file),
        'climate': lambda: load_and_process_climate_data(climate_data_file),
        'ndvi': lambda: process_ndvi_data(ndvi_data_file),
        'crop': lambda: engineer_crop_features(crop_data_file)
    }

    merge_keys = {
        'soil': ['PostalCode'],
        'climate': ['PostalCode', 'Year'],
        'ndvi': ['PostalCode', 'Year'],
        'crop': ['PostalCode', 'Year']
    }

    # Crop data is the base table
    combined_data = dataset_loaders['crop']()

    # Merge climate data
    climate_data = dataset_loaders['climate']()
    combined_data = pd.merge(combined_data, climate_data, on=merge_keys['climate'], how='left')

    # Merge NDVI data if required
    if ndvi:
        ndvi_data = dataset_loaders['ndvi']()
        combined_data = combined_data[combined_data['Year'] >= 2000]
        combined_data = pd.merge(combined_data, ndvi_data, on=merge_keys['ndvi'], how='left')

    # Merge soil data
    soil_data = dataset_loaders['soil']()
    combined_data = pd.merge(combined_data, soil_data, on=merge_keys['soil'], how='left')

    # Drop irrelevant or redundant columns (PostalCode intentionally kept)
    features_to_exclude = ['Year', 'SoilID']
    combined_data = combined_data.drop(columns=features_to_exclude, errors='ignore')

    # BUG FIX: as rendered, the function built the dataset but never
    # returned it (callers would get None); return the merged frame.
    return combined_data

View File

@@ -1,23 +0,0 @@
#!/bin/bash
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=21:00:00 # Total run time limit (21 hours) -- comment previously said 11h; limit is 21h
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=grid_search_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log

# Load necessary modules
#module load python/3.8

# Activate your virtual environment
source /env0/bin/activate

# Parameters: first positional argument is the grid-search time budget (hours)
TIME=$1

# Fail fast if the time budget was not supplied
if [ -z "$TIME" ]; then
    echo "Usage: sbatch $0 <time_hours>" >&2
    exit 1
fi

# Run the Python script with the specified time parameter
# (expansions quoted so paths/values with spaces survive word splitting)
srun python "$WORK_DIR/src/grid_search_exp.py" --time "$TIME"

# Deactivate the virtual environment
deactivate

View File

@@ -1,186 +0,0 @@
import os
import time
import numpy as np
import pandas as pd
import warnings
import shap
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning
from joblib import dump
from combine_datasets import combine_datasets
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
# Silence sklearn ConvergenceWarning spam (e.g. coordinate-descent solvers
# hitting max_iter during the grid search); note this applies process-wide.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Split the raw frame into model inputs and target, applying the pipeline.
def preprocess_data(input_data, pipeline):
    """Return (X, y) for modeling.

    y is the 'yield_t/ha' column; X is everything else, pushed through the
    pipeline's steps. Every step except 'feature_selection' is fit-transformed
    on X alone; the 'feature_selection' step is applied last because it
    requires the target.
    """
    features = input_data.drop(columns=['yield_t/ha'], errors='ignore')
    target = input_data['yield_t/ha']

    step_names = [step_name for step_name, _ in pipeline.steps]

    # Unsupervised transforms first — these never see the target.
    for step_name, transformer in pipeline.steps:
        if step_name != 'feature_selection':
            features = transformer.fit_transform(features)

    # Supervised selection last, since it needs y.
    if 'feature_selection' in step_names:
        features = pipeline.named_steps['feature_selection'].fit_transform(features, target)

    return features, target
# SHAP via KernelExplainer, with KMeans cluster centers as background data.
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Estimate SHAP values for X_test.

    KernelExplainer is quadratic in the size of its background set, so the
    training data is summarized by its KMeans cluster centers (n_init set
    explicitly; fixed random_state for reproducibility).
    """
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusterer.fit(X_train)
    background = clusterer.cluster_centers_

    explainer = shap.KernelExplainer(model.predict, background)
    return explainer.shap_values(X_test)
# Iterative grid search with time monitoring
def iterative_grid_search(model, param_grid, X, y, cv, time_limit):
    """Exhaustive grid search over `param_grid` under a wall-clock budget.

    Each parameter combination is scored by mean negative MSE over the `cv`
    splits; the search stops early once `time_limit` seconds have elapsed.

    Parameters
    ----------
    model : estimator supporting set_params/fit/predict
    param_grid : dict mapping parameter names to candidate value lists
    X, y : indexable arrays of features and targets
    cv : cross-validator providing split(X, y)
    time_limit : float, budget in seconds

    Returns
    -------
    The best-scoring estimator (snapshotted at selection time), or None if
    no combination finished before the time limit.
    """
    import copy  # local: only needed to snapshot the best estimator

    best_model = None
    best_score = -np.inf
    start_time = time.time()

    # Manually iterate over parameter combinations
    for params in ParameterGrid(param_grid):
        elapsed_time = time.time() - start_time
        if elapsed_time > time_limit:
            print("Time limit exceeded. Stopping search.")
            break
        model.set_params(**params)
        scores = []
        for train_idx, test_idx in cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            # Negative MSE so "higher is better"
            scores.append(-mean_squared_error(y_test, predictions))
        mean_score = np.mean(scores)
        if mean_score > best_score:
            best_score = mean_score
            # BUG FIX: the original stored `model` itself, but set_params()
            # mutates that same object on the next iteration, so the
            # returned "best" model silently carried the LAST grid point's
            # parameters. Deep-copy to freeze the winning configuration.
            best_model = copy.deepcopy(model)
    return best_model
# Main experiment function
# Runs a time-budgeted grid search for each model over two preprocessing
# pipelines, scores each (model, pipeline) pair by MSE and SHAP stability,
# and persists the winning model plus a one-row metrics summary per model.
def run_experiment_with_dynamic_grid(total_hours=10.0, output_base_directory="./results"):
# NOTE(review): the budget divides by 5, but `models` below has 6 entries —
# total runtime can exceed `total_hours`; confirm the divisor is intended.
time_per_algorithm = total_hours * 3600 / 5
# Load and combine datasets
data = combine_datasets(
"./data/potatoes_dataset.csv",
"./data/updated_soil_data_with_awc.csv",
"./data/final_augmented_climate_data.csv",
"./data/NDVI.csv",
ndvi=True
)
# Two candidate preprocessing pipelines; preprocess_data() applies every
# step except 'feature_selection' first, then SelectKBest with the target.
pipelines = [
Pipeline([
('normalize', StandardScaler()),
('imputer', SimpleImputer(strategy='mean')),
('poly_features', PolynomialFeatures(degree=2)),
('feature_selection', SelectKBest(score_func=f_regression, k=25))
]),
Pipeline([
('robust', RobustScaler()),
('feature_selection', SelectKBest(score_func=f_regression, k=25)),
('power_transformation', PowerTransformer(method='yeo-johnson'))
])
]
# Model factories (from algorithms.py); each returns (estimator, param_grid).
models = {
'lasso': lasso,
'ridge': ridge_regressor,
'random_forest': random_forest,
'gradient_boosting': gradient_boosting,
'decision_tree': decision_tree_regressor,
'stacking_ensemble': stacking_lasso
}
output_directory = os.path.join(output_base_directory, f"grid_search_results_{total_hours}h")
os.makedirs(output_directory, exist_ok=True)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for model_name, model_func in models.items():
best_pipeline_result = None
for pipeline_idx, pipeline in enumerate(pipelines):
X, y = preprocess_data(data, pipeline)
model, params = model_func()
# Time-limited manual grid search (see iterative_grid_search in this file)
best_model = iterative_grid_search(
model, params, X, y, cv=kf, time_limit=time_per_algorithm
)
# SHAP stability = per-feature std of SHAP values averaged over CV
# folds; lower means explanations are more consistent across folds.
shap_stabilities = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
X_train_fold, X_test_fold = X[train_idx], X[test_idx]
y_train_fold = y[train_idx]
best_model.fit(X_train_fold, y_train_fold)
shap_values = compute_shap_values_with_kmeans(best_model, X_train_fold, X_test_fold)
fold_shap_stability = np.std(shap_values, axis=0).mean()
shap_stabilities.append(fold_shap_stability)
shap_stability = np.mean(shap_stabilities)
# NOTE(review): MSE is computed on the full data the model was fitted
# on — an in-sample (training) error, not a held-out estimate; confirm.
predictions = best_model.predict(X)
mse = mean_squared_error(y, predictions)
# NOTE(review): a pipeline replaces the incumbent only if it improves
# BOTH metrics (strict AND); improving just one is rejected — confirm
# this dominance rule is intended rather than a weighted comparison.
if best_pipeline_result is None or (
mse < best_pipeline_result['mse'] and shap_stability < best_pipeline_result['shap_stability']):
best_pipeline_result = {
'model': best_model,
'pipeline_idx': pipeline_idx,
'mse': mse,
'shap_stability': shap_stability
}
if best_pipeline_result:
# Persist the winning estimator and append a one-row metrics summary
model_output_dir = os.path.join(output_directory, model_name)
os.makedirs(model_output_dir, exist_ok=True)
model_file_path = os.path.join(model_output_dir, f"{model_name}_best_model.joblib")
dump(best_pipeline_result['model'], model_file_path)
metrics_df = pd.DataFrame({
'Model': [model_name],
'Pipeline': [f"Pipeline_{best_pipeline_result['pipeline_idx'] + 1}"],
'MSE': [best_pipeline_result['mse']],
'SHAP_Stability': [best_pipeline_result['shap_stability']]
})
# Append mode: the header is only written when the CSV doesn't exist yet
metrics_csv_path = os.path.join(output_directory, "metrics_summary_GS.csv")
metrics_df.to_csv(metrics_csv_path, index=False, mode='a', header=not os.path.exists(metrics_csv_path))
print(f"Saved best results for {model_name} from Pipeline {best_pipeline_result['pipeline_idx'] + 1}")
print(f"All results saved to {output_directory}")
if __name__ == "__main__":
    import argparse

    # CLI: --time is the total wall-clock budget (hours) for the experiment
    cli = argparse.ArgumentParser(description='Run Grid Search for different models.')
    cli.add_argument('--time', type=float, required=True, help='Time for the model to run in hours')
    cli_args = cli.parse_args()
    run_experiment_with_dynamic_grid(total_hours=cli_args.time, output_base_directory="./results_grid_search")

View File

@@ -1,166 +0,0 @@
import os
import shap
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from combine_datasets import combine_datasets
# Initialize H2O
# NOTE(review): runs at module import time — starting (or attaching to) an
# H2O cluster is a side effect of merely importing this file; consider moving
# it into the __main__ entry point. Confirm before relocating.
h2o.init()
# Separate target from features and run the preprocessing pipeline.
def preprocess_data(input_data, pipeline):
    """Return (X, y): y is 'yield_t/ha', X is the rest after the pipeline.

    Steps that need the target (the 'feature_selection' step) are deferred
    until after all the unsupervised transforms have been fit.
    """
    y = input_data['yield_t/ha']
    X = input_data.drop(columns=['yield_t/ha'], errors='ignore')

    deferred = 'feature_selection'

    for name, transformer in pipeline.steps:
        if name == deferred:
            continue  # needs y; handled below
        X = transformer.fit_transform(X)

    if any(name == deferred for name, _ in pipeline.steps):
        X = pipeline.named_steps[deferred].fit_transform(X, y)

    return X, y
# SHAP for an H2O model via KernelExplainer with KMeans background data.
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
    """Estimate SHAP values for X_test against an H2O model.

    The training data is summarized by KMeans cluster centers (keeps
    KernelExplainer tractable). H2O models predict on H2OFrames, so the
    prediction is wrapped to accept/return plain ndarrays.
    """
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)

    def predict_fn(arr):
        # ndarray -> H2OFrame -> predict -> pandas -> flat ndarray
        return model.predict(h2o.H2OFrame(arr)).as_data_frame().values.flatten()

    # Round-trip through H2OFrame, mirroring the original conversion order
    cluster_centers_h2o = h2o.H2OFrame(km.cluster_centers_)
    X_test_h2o = h2o.H2OFrame(X_test)

    explainer = shap.KernelExplainer(predict_fn, cluster_centers_h2o.as_data_frame().values)
    return explainer.shap_values(X_test_h2o.as_data_frame().values)
# Thin wrapper around combine_datasets() with the project's fixed file paths.
def load_and_combine_datasets(use_ndvi=True):
    """Load the merged modeling dataset; `use_ndvi` toggles the NDVI merge."""
    return combine_datasets(
        "./data/potatoes_dataset.csv",
        "./data/updated_soil_data_with_awc.csv",
        "./data/final_augmented_climate_data.csv",
        "./data/NDVI.csv",
        ndvi=use_ndvi
    )
# Main experiment function with H2O AutoML
# For each preprocessing pipeline: run AutoML per CV fold, save leaderboard,
# best model and SHAP values, then append an MSE / SHAP-stability row to a
# summary CSV. Shuts the H2O cluster down when finished.
def run_h2o_experiment(use_ndvi=True, automl_time=10, output_base_directory="./results_h2o"):
data = load_and_combine_datasets(use_ndvi=use_ndvi)
# Same two candidate pipelines as the grid-search experiment
pipelines = [
Pipeline([
('normalize', StandardScaler()),
('imputer', SimpleImputer(strategy='mean')),
('poly_features', PolynomialFeatures(degree=2)),
('feature_selection', SelectKBest(score_func=f_regression, k=25))
]),
Pipeline([
('robust', RobustScaler()),
('feature_selection', SelectKBest(score_func=f_regression, k=25)),
('power_transformation', PowerTransformer(method='yeo-johnson'))
])
]
output_directory = os.path.join(output_base_directory, f"h2o_automl_results_{automl_time}h")
os.makedirs(output_directory, exist_ok=True)
metrics_df = pd.DataFrame(columns=['Model', 'Pipeline', 'MSE', 'SHAP_Stability'])
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for pipeline_idx, pipeline in enumerate(pipelines):
X_processed, y = preprocess_data(data, pipeline)
# Combine features and target into one H2OFrame ('target' column appended)
X_h2o = h2o.H2OFrame(X_processed)
y_h2o = h2o.H2OFrame(y.to_frame())
X_h2o['target'] = y_h2o
shap_stabilities = []
all_shap_values = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_processed)):
train = X_h2o[train_idx.tolist(), :]
test = X_h2o[test_idx.tolist(), :]
# AutoML budget split evenly across the folds
aml = H2OAutoML(max_runtime_secs=int(automl_time * 3600 / kf.get_n_splits()), max_models=5, seed=42,
sort_metric='MSE')
aml.train(x=list(X_h2o.columns[:-1]), y='target', training_frame=train)
best_model = aml.leader
leaderboard = aml.leaderboard.as_data_frame()
leaderboard_csv_path = os.path.join(output_directory, f"leaderboard_pipeline_{pipeline_idx + 1}_fold_{fold + 1}.csv")
leaderboard.to_csv(leaderboard_csv_path, index=False)
model_output_dir = os.path.join(output_directory, f"pipeline_{pipeline_idx + 1}")
os.makedirs(model_output_dir, exist_ok=True)
model_file_path = os.path.join(model_output_dir, f"h2o_best_model_{pipeline_idx + 1}_fold_{fold + 1}.zip")
h2o.save_model(best_model, path=model_file_path)
predictions = best_model.predict(test).as_data_frame()['predict']
mse = mean_squared_error(test['target'].as_data_frame().values, predictions.values)
# NOTE(review): train/test are sliced from X_h2o which already contains
# the 'target' column, so the SHAP background and test matrices include
# the target as a feature column — confirm this is intended.
shap_values = compute_shap_values_with_kmeans(best_model, train.as_data_frame().values,
test.as_data_frame().values)
shap_stability = np.std(shap_values, axis=0).mean()
shap_stabilities.append(shap_stability)
all_shap_values.append(shap_values)
print(f"Completed H2O AutoML and SHAP computation for Pipeline {pipeline_idx + 1}, Fold {fold + 1}")
mean_shap_stability = np.mean(shap_stabilities)
for fold_idx, shap_values in enumerate(all_shap_values):
shap_file_name = f"shap_values_pipeline_{pipeline_idx + 1}_fold_{fold_idx + 1}.npy"
shap_file_path = os.path.join(model_output_dir, shap_file_name)
np.save(shap_file_path, shap_values)
# NOTE(review): this row records `mse`/`best_model` from the LAST fold
# only, while SHAP stability is averaged over folds — confirm whether
# MSE should also be a per-fold mean.
metrics_df = pd.concat([metrics_df, pd.DataFrame([{
'Model': best_model.model_id,
'Pipeline': f"Pipeline_{pipeline_idx + 1}",
'MSE': mse,
'SHAP_Stability': mean_shap_stability
}])], ignore_index=True)
metrics_csv_path = os.path.join(output_directory, "metrics_summary.csv")
metrics_df.to_csv(metrics_csv_path, index=False)
print(f"All results saved to {output_directory}")
# Shuts down the whole H2O cluster (as rendered this runs after the loop;
# confirm its placement in the original file)
h2o.shutdown(prompt=False)
if __name__ == "__main__":
    import argparse

    # CLI: --time is the AutoML wall-clock budget in hours
    arg_parser = argparse.ArgumentParser(description='Run H2O AutoML for different models.')
    arg_parser.add_argument('--time', type=float, required=True, help='Time in hours for the AutoML process to run.')
    parsed = arg_parser.parse_args()
    # Run the H2O AutoML experiment
    run_h2o_experiment(use_ndvi=True, automl_time=parsed.time, output_base_directory="./results_h2o")

View File

@@ -1,23 +0,0 @@
#!/bin/bash
#SBATCH --account=def-xander # Replace with your account
#SBATCH --mem=10G # Memory allocation
#SBATCH --time=11:00:00 # Total run time limit (11 hours)
#SBATCH --cpus-per-task=4 # Number of CPU cores per task
#SBATCH --job-name=h20_%A # Job name with job ID appended
#SBATCH --output=%x-%j.out # Standard output and error log
#SBATCH --error=%x-%j.err # Separate error log
# Load necessary modules
module load python/3.8
# Activate your virtual environment
source ~/envs/workdir/bin/activate
# Parameters
TIME=$1
# Run the Python script with the specified time parameter
srun python /home/dvera/scratch/Framework_EXP/h20_autoML.py --time $TIME
# Deactivate the virtual environment
deactivate