Major changes for real this time
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
Codebase:
|
||||
https://gitlab.com/university-of-prince-edward-isalnd/explanation-aware-optimization-and-automl/-/tree/main/src?ref_type=heads
|
||||
|
||||
Previous Analysis:
|
||||
https://gitlab.com/agri-food-canada/potato-yield-predictions-by-postal-code-ml
|
||||
|
||||
Operation:
|
||||
Specify working directory (local repo location), cache directory (dataset download location), and
|
||||
|
||||
@@ -12,15 +16,84 @@ Code File Structure
|
||||
|
||||
Shell scripts
|
||||
|
||||
h20_batch.sh ->
|
||||
nsga_batch.sh ->
|
||||
grid_search_batch.sh ->
|
||||
h20_batch.sh -> h20_autoML.py
|
||||
nsga_batch.sh -> nsga_exp.py
|
||||
grid_search_batch.sh -> grid_search_exp.py
|
||||
|
||||
grid_search_batch calls both algorithms and combine_datasets
|
||||
|
||||
Run order should be
|
||||
|
||||
datasets -> algorithms -> combine_datasets -> 3 .sh files -> shap_values_computation.py
|
||||
############################################################################################################################################################
|
||||
Objective:
|
||||
|
||||
Current code is built to perform ML analysis on a potato yield dataset as shown in Potato Yield Predictions by Postal Code ML
|
||||
The code will need to be modified to work with other datasets
|
||||
1. Modify code to work with California Housing Price dataset found in datasets.py
|
||||
(cal_housing, regression dataset)
|
||||
|
||||
2. Modify code to work with some other classification focused dataset
|
||||
(dataset.py code contains cal_housing for regression and three classification datasets)
|
||||
|
||||
3. Compare the performance of the model in both situations to establish a regression vs. classification baseline.
|
||||
Table should include key performance indicators for both datasets as well as number of objects in each dataset
|
||||
|
||||
4. (Ideally) Make models as easy as possible to migrate between datasets through user prompt.
|
||||
Also cache files for easy referencing and to make sure that data can be analysed properly later
|
||||
|
||||
Files that need changing
|
||||
|
||||
dataset = YES
|
||||
algorithms = NO
|
||||
nsga_exp = YES
|
||||
shap_values_computation = NO(?)
|
||||
|
||||
############################################################################################################################################################
|
||||
Scripting Tasks:
|
||||
datasets -> algorithms -> combine datasets -> nsga_exp.py -> shap_values_computation
|
||||
|
||||
1. Make datasets generalizable
|
||||
|
||||
2. Make combine datasets reference generalizable headers / infer from input
|
||||
|
||||
3. Make nsga_exp.py reference the combine_dataset headers
|
||||
|
||||
4. Make output folders specified by user at runtime / in the slurm bash script
|
||||
|
||||
Operation Tasks:
|
||||
1. Run nsga_exp.py using the California Housing Dataset (regression)
|
||||
|
||||
2. Run the nsga_exp.py script using a separate, classification dataset
|
||||
|
||||
3. Compare results
|
||||
############################################################################################################################################################
|
||||
Code Changes:
|
||||
|
||||
nsga_exp.py
|
||||
- Lines 24 & 26 reference yield_t/ha. This should be a parameter
|
||||
|
||||
- Lines 33-36 reference relative paths to previous soil.csv files
|
||||
|
||||
- Lines 112 and 116 reference a set value of k (k=25). It might be better to set this dynamically based on the size of the dataset
|
||||
|
||||
- Lines 141 - 143 reference models_space, pipelines, and k_value range. Should be generalized for other datasets and features
|
||||
|
||||
- Line 134 references an ngsa output directory. This could be parameterized for other datasets
|
||||
|
||||
- Lines 183, 190, and 195 reference specific output path csv files. This will cause overwriting on subsequent runs. Change to store based on run
|
||||
|
||||
- Lines 124 - 129 reference models and functions from algorithms.py. This could be generalized to allow any model dictionary but not likely beneficial for this study
|
||||
|
||||
datasets.py
|
||||
- User prompt was added to allow users to choose a dataset of the four and list its Type
|
||||
- User prompt was added to choose a target feature and features to exclude
|
||||
- User prompt was added for a save location for the processed csv of the dataset output
|
||||
|
||||
|
||||
|
||||
############################################################################################################################################################
|
||||
Code Changes:
|
||||
Code Optimizations:
|
||||
|
||||
- SHAP KernelExplainer
|
||||
Use shap.TreeExplainer on tree-based models instead
|
||||
|
||||
@@ -11,4 +11,4 @@ joblib
|
||||
|
||||
# Data acquisition
|
||||
openml
|
||||
|
||||
deap
|
||||
|
||||
6
setup.sh
6
setup.sh
@@ -1,12 +1,12 @@
|
||||
#!/bin/bash
# Mount the NFS dataset share and configure the OpenML cache directory.
# NOTE(review): assumes a Fedora/RHEL-style host (dnf) — confirm target OS.

# NFS client utilities.
dnf install -y nfs-utils kernel-modules-extra

# -p: do not fail if the mount point already exists (script is re-runnable).
mkdir -p /mnt/data

# Force NFSv3 over TCP; the plain "mount host:path" form was replaced with
# explicit options in this revision.
mount -t nfs -o vers=3,proto=tcp 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data

# Working directory used by the batch scripts.
WORK_DIR=/mnt/data

mkdir -p /mnt/data/cache   # ensure cache directory exists
export OPENML_CACHE_DIR=/mnt/data/cache
|
||||
|
||||
111
src/dataset.py
111
src/dataset.py
@@ -3,48 +3,113 @@ import pandas as pd
|
||||
import openml
|
||||
|
||||
# --- CACHE SETUP ---
# Change this path to your preferred local cache directory.
# The OPENML_CACHE_DIR environment set in setup.sh is independent of this;
# openml.config.cache_directory is what the library actually consults here.
CACHE_DIR = os.path.expanduser("~/openml_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
openml.config.cache_directory = CACHE_DIR

# --- Dataset IDs ---
# OpenML CC18 classification task ids (task id, not dataset id).
TASKS = {
    "adult": 7592,       # Adult Income classification
    "spambase": 43,      # Spambase classification
    "optdigits": 28,     # Optdigits classification
}

# Regression dataset ids (OpenML dataset id, fetched directly, no task).
DATASETS = {
    "cal_housing": 44025,   # California Housing prices
}
|
||||
|
||||
# --- Load functions ---
|
||||
def _load_task_dataframe(task_id: int):
    """Fetch an OpenML task's dataset and return (X, y) as dataframes.

    Parameters
    ----------
    task_id : int
        OpenML task id (an entry of TASKS).

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Features and target, with rows whose target is NA dropped.
    """
    task = openml.tasks.get_task(task_id)
    dataset = openml.datasets.get_dataset(task.dataset_id)
    # Fetch once; splitting on the task's declared target keeps y out of X.
    X, y, _, _ = dataset.get_data(
        dataset_format="dataframe",
        target=task.target_name,
    )
    # Drop rows with NA target, if any.
    if isinstance(y, pd.Series):
        mask = ~y.isna()
        X, y = X.loc[mask], y.loc[mask]
    return X, y
|
||||
|
||||
def load_dataset(name: str):
    """Load a dataset by name and return (X, y, task_type).

    Names in TASKS are classification tasks; names in DATASETS are
    regression datasets. In both cases the target column is separated
    from the features and rows with a missing target are dropped.

    Raises
    ------
    ValueError
        If `name` is in neither TASKS nor DATASETS.
    """
    if name in TASKS:
        X, y = _load_task_dataframe(TASKS[name])
        task_type = "classification"
    elif name in DATASETS:
        ds = openml.datasets.get_dataset(DATASETS[name])
        target_col = ds.default_target_attribute
        # Splitting on target= keeps the target out of X, matching the
        # classification branch (previously the target stayed inside X,
        # leaking it into the feature set).
        X, y, _, _ = ds.get_data(
            dataset_format="dataframe", target=target_col
        )
        mask = ~y.isna()
        X, y = X.loc[mask], y.loc[mask]
        task_type = "regression"
    else:
        raise ValueError(f"Unknown dataset {name}")
    return X, y, task_type
|
||||
|
||||
# --- Interactive main ---
|
||||
def main():
    """Interactively pick a dataset, target, and feature set, then export CSV.

    Flow: list datasets -> load -> optionally override the target column ->
    optionally exclude features -> preview -> save CSV and record its
    absolute path in last_csv_path.txt (consumed by nsga_exp.py).
    """
    print("Available datasets:")
    all_datasets = list(TASKS) + list(DATASETS)
    for i, name in enumerate(all_datasets, start=1):
        print(f"{i}. {name}")

    selection = input("Enter the dataset name: ").strip()
    if selection not in all_datasets:
        raise ValueError(f"Dataset '{selection}' not recognized.")

    X, y, task_type = load_dataset(selection)

    # --- Identify default target ---
    default_target = y.name
    print(f"\nDefault target column: {default_target}")

    # --- Print all features (without explanations) ---
    print("\nFeatures in the dataset:")
    for col in X.columns:
        print(f"- {col} ({X[col].dtype})")

    # --- Target selection ---
    target = input("\nEnter the target feature (or press Enter to use default): ").strip()
    if target:
        if target not in X.columns:
            raise ValueError(f"Target feature '{target}' not found.")
        y = X[target]
    else:
        target = default_target
    # Drop whichever target ended up selected; errors="ignore" covers the
    # default case, where the loader already removed it from X.
    X = X.drop(columns=[target], errors="ignore")

    # --- Feature exclusion ---
    exclude_input = input("\nEnter features to exclude (comma-separated), or press Enter to skip: ").strip()
    if exclude_input:
        for col in (c.strip() for c in exclude_input.split(",")):
            if col in X.columns:
                X = X.drop(columns=[col])
            else:
                print(f"Warning: '{col}' not found in dataset and cannot be excluded.")

    # --- Show preview ---
    print("\nFinal dataset preview (first 5 rows):")
    print(X.head())
    print("\nTarget preview (first 5 rows):")
    print(y.head())
    print(f"\nTask type: {task_type}")
    print(f"Target column: {target}")
    print(f"Number of features: {len(X.columns)}")

    # --- Export to CSV ---
    output_file = input("\nEnter filename to save dataset as CSV (e.g., dataset.csv): ").strip()
    if output_file:
        df_export = X.copy()
        df_export[target] = y  # append target at the end
        df_export.to_csv(output_file, index=False)
        print(f"Dataset saved to {output_file} (target column: '{target}')")

        # Record the absolute CSV path so downstream scripts (nsga_exp.py)
        # can find the processed dataset without re-prompting.
        temp_path_file = "last_csv_path.txt"
        full_path = os.path.abspath(output_file)
        with open(temp_path_file, "w") as f:
            f.write(full_path)
        print(f"CSV path written to {temp_path_file}")


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
#!/bin/bash
#SBATCH --account=def-xander       # Replace with your account
#SBATCH --mem=30G                  # Memory allocation
#SBATCH --time=21:00:00            # Total run time limit (21 hours)
#SBATCH --cpus-per-task=8          # Number of CPU cores per task
#SBATCH --job-name=nsga            # Job name
#SBATCH --output=%x-%j.out         # Standard output log
#SBATCH --error=%x-%j.err          # Separate error log

# Load modules if the cluster requires them
#module load python/3.8

# Activate your virtual environment
#source /env0/bin/activate

# Run the Python script with the specified time parameter
srun python /mnt/data/src/nsga_exp.py

# Deactivate the virtual environment — commented out to match the disabled
# activation above; calling `deactivate` with no active venv errors out.
#deactivate
|
||||
|
||||
@@ -13,31 +13,33 @@ import shap
|
||||
from deap import base, creator, tools, algorithms
|
||||
from algorithms import lasso, random_forest, gradient_boosting, decision_tree_regressor, ridge_regressor, stacking_lasso
|
||||
|
||||
from combine_datasets import combine_datasets
|
||||
import argparse
|
||||
|
||||
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0)) # Minimize both objectives
|
||||
creator.create("Individual", list, fitness=creator.FitnessMin)
|
||||
|
||||
def load_dataset():
    """Load the processed dataset CSV produced by dataset.py.

    Reads the CSV path from last_csv_path.txt (written by dataset.py's
    export step) and loads it with pandas.

    Returns
    -------
    pd.DataFrame
        The full dataset, features plus target column.

    Raises
    ------
    FileNotFoundError
        If last_csv_path.txt does not exist, or the path it records
        no longer points to a file.
    """
    # Read the CSV path from the handoff file written by dataset.py.
    with open("last_csv_path.txt", "r") as f:
        csv_path = f.read().strip()

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    # Load the dataset
    data = pd.read_csv(csv_path)
    return data
|
||||
|
||||
# Preprocess the data and separate features and target
|
||||
def preprocess_data(input_data, pipeline, k_value, target_column='yield_t/ha'):
    """Split input_data into features/target and fit-transform the features.

    Parameters
    ----------
    input_data : pd.DataFrame
        Full dataset including the target column.
    pipeline : sklearn-style Pipeline
        Must contain a 'feature_selection' named step accepting a `k` param.
    k_value : int
        Number of features for the selection step.
    target_column : str
        Name of the target column. Defaults to 'yield_t/ha' so existing
        potato-yield callers keep working unchanged.

    Returns
    -------
    (array-like, pd.Series)
        Transformed features and the target.
    """
    pipeline.named_steps['feature_selection'].set_params(k=k_value)
    X = input_data.drop(columns=[target_column], errors='ignore')
    print(X.columns)
    y = input_data[target_column]
    X = pipeline.fit_transform(X, y)
    return X, y
|
||||
|
||||
# Load and combine datasets
|
||||
def load_and_combine_datasets(use_ndvi=True, data_dir="./data"):
    """Load the raw potato-yield CSVs and merge them into one DataFrame.

    Parameters
    ----------
    use_ndvi : bool
        Whether to merge the NDVI data as well (forwarded to
        combine_datasets as `ndvi=`).
    data_dir : str
        Directory holding the source CSVs. Defaults to "./data", which
        reproduces the previously hard-coded relative paths, while letting
        other datasets live elsewhere.
    """
    data = combine_datasets(
        f"{data_dir}/potatoes_dataset.csv",
        f"{data_dir}/updated_soil_data_with_awc.csv",
        f"{data_dir}/final_augmented_climate_data.csv",
        f"{data_dir}/NDVI.csv",
        ndvi=use_ndvi,
    )
    return data
|
||||
|
||||
# Function to compute SHAP values using KMeans clustering
|
||||
def compute_shap_values_with_kmeans(model, X_train, X_test, n_clusters=10):
|
||||
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_train)
|
||||
@@ -94,13 +96,12 @@ def evaluate_individual(individual, data, kf):
|
||||
return np.mean(mse_scores), np.mean(shap_stabilities), all_shap_values
|
||||
|
||||
# Run NSGA-II experiment with a time limit
|
||||
def run_nsga_experiment(use_ndvi=True,
|
||||
output_base_directory="./results_nsga",
|
||||
def run_nsga_experiment(output_base_directory="./results_nsga",
|
||||
population_size=30,
|
||||
n_generations=50,
|
||||
time_limit=36000):
|
||||
# Load and combine dataset
|
||||
data = load_and_combine_datasets(use_ndvi=use_ndvi)
|
||||
# Load the dataset
|
||||
data = load_dataset()
|
||||
|
||||
# Pipelines for preprocessing
|
||||
global pipelines
|
||||
@@ -199,7 +200,6 @@ def run_nsga_experiment(use_ndvi=True,
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_nsga_experiment(
|
||||
use_ndvi=True,
|
||||
output_base_directory="./results_nsga",
|
||||
population_size=80, # Larger population size for a comprehensive search
|
||||
n_generations=100, # Increased number of generations
|
||||
|
||||
Reference in New Issue
Block a user