From 59139bab3fe6854f84133b2c9564ecc0e7c6abaa Mon Sep 17 00:00:00 2001 From: Varyngoth Date: Mon, 24 Nov 2025 23:15:00 -0400 Subject: [PATCH] A new beginning --- ACENET_HPC_Guide.md | 135 ++++++++++++++++++++ README.md | 62 +++++++++ requirements.txt | 8 ++ run_deap.py | 170 +++++++++++++++++++++++++ src/data_openml.py | 16 +++ src/models.py | 55 ++++++++ src/objectives.py | 66 ++++++++++ src/preprocessing.py | 138 ++++++++++++++++++++ src/search/nsga_deap.py | 147 ++++++++++++++++++++++ src/stability.py | 273 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 1070 insertions(+) create mode 100644 ACENET_HPC_Guide.md create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 run_deap.py create mode 100644 src/data_openml.py create mode 100644 src/models.py create mode 100644 src/objectives.py create mode 100644 src/preprocessing.py create mode 100644 src/search/nsga_deap.py create mode 100644 src/stability.py diff --git a/ACENET_HPC_Guide.md b/ACENET_HPC_Guide.md new file mode 100644 index 0000000..c3cd357 --- /dev/null +++ b/ACENET_HPC_Guide.md @@ -0,0 +1,135 @@ +# Using Compute Canada / ACENET HPC + +This guide explains how to connect to the Digital Research Alliance of Canada (Compute Canada) or ACENET clusters, create a working directory in scratch, transfer files with Globus, and submit jobs using SLURM. + +## 1. Connect to the HPC via SSH + +1. Determine which cluster to use (examples): + - Graham: `graham.computecanada.ca` + - Cedar: `cedar.computecanada.ca` + - Beluga: `beluga.computecanada.ca` + - Niagara: `niagara.scinet.utoronto.ca` + - ACENET: `login1.acenet.ca` + +2. Open a terminal and connect via SSH: + + ```bash + ssh username@graham.computecanada.ca + ``` + +3. When prompted, confirm the host fingerprint and enter your password. + +--- + +## 2. Create a Folder in Scratch + +Your `$SCRATCH` directory is a temporary workspace for large data and computations. It is purged after 60 days of inactivity. + +After logging in: + +```bash +cd $SCRATCH +mkdir my_project +cd my_project +``` + +Confirm your path: + +```bash +pwd +# Example output: /scratch/username/my_project +``` + +--- + +## 3. Install and Use Globus for File Transfers + +Globus is a fast, reliable tool for large file transfers. It requires a small local agent called **Globus Connect Personal**. + +### Install Globus Connect Personal + +- **Linux:** + ```bash + wget https://downloads.globus.org/globus-connect-personal/linux/stable/globusconnectpersonal-latest.tgz + tar xzf globusconnectpersonal-latest.tgz + cd globusconnectpersonal* + ./globusconnectpersonal -setup + ``` + +- **macOS:** + Download and install from: [https://www.globus.org/globus-connect-personal](https://www.globus.org/globus-connect-personal) + +- **Windows:** + Download the installer from the same link and follow the setup wizard. + +After installation, your local computer will appear as a **Globus endpoint**. + +### Transfer Files + +1. Visit [https://app.globus.org](https://app.globus.org) and log in using **Compute Canada credentials**. +2. In the web app, choose two endpoints: + - **Source:** Your local computer or institutional storage. + - **Destination:** Your HPC endpoint (for example, *Compute Canada Graham Scratch*). +3. Navigate to your target scratch folder (`/scratch/username/my_project`). +4. Select files and click **Start Transfer**. + +Globus will handle transfers asynchronously and resume interrupted transfers automatically. + +--- + +## 4. Submit Jobs to ACENET with SLURM + +Job submissions use the SLURM scheduler. Create a batch file describing your job resources and commands. + +### Example job script (`job.slurm`) + +```bash +#!/bin/bash +#SBATCH --job-name=my_analysis +#SBATCH --account=def-yourprof +#SBATCH --time=2:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --mem=8G +#SBATCH --output=output_%j.log + +module load python/3.11 +source ~/myenv/bin/activate + +python my_script.py +``` + +### Submit and Monitor Jobs + +```bash +sbatch job.slurm # Submit job +squeue -u username # Check status +scancel job_id # Cancel job +``` + +### View Results + +After completion, check output logs: + +```bash +less output_.log +``` + +--- + +## 5. Useful Commands + +```bash +module avail # List available software modules +module load python/3.11 # Load a module +df -h $SCRATCH # Check scratch usage +quota -s # Check your disk quota +``` + +--- + +## 6. References + +- Alliance Docs: [https://docs.alliancecan.ca/wiki/Technical_documentation](https://docs.alliancecan.ca/wiki/Technical_documentation) +- ACENET Training: [https://www.ace-net.ca/training/](https://www.ace-net.ca/training/) +- Globus Setup: [https://www.globus.org/globus-connect-personal](https://www.globus.org/globus-connect-personal) diff --git a/README.md b/README.md new file mode 100644 index 0000000..3cc19f5 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +# Explanation-Aware Optimization and AutoML (DEAP + SHAP Stability) + +This project implements an **AutoML framework** that uses **DEAP’s NSGA-II** for multi-objective optimization, balancing **model accuracy** and **SHAP-based stability**. +It supports both **classification** and **regression** datasets via OpenML and sklearn. +All results are tracked with **MLflow**. + + +--- + +## 1. Environment Setup (macOS / Linux) + +### Create and activate a virtual environment +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install --upgrade pip wheel setuptools + +pip install \ + numpy==1.26.4 \ + pandas==1.5.3 \ + scikit-learn==1.3.2 \ + shap==0.45.0 \ + deap==1.4.1 \ + openml==0.14.2 \ + mlflow==2.11.3 \ + matplotlib==3.7.5 +``` +## 2. Running Experiments +Classification: Adult Dataset + ```bash +python run_deap.py \ + --dataset adult \ + --generations 5 \ + --pop-size 24 \ + --cv-folds 3 +``` +Regression: California Housing Dataset + +```bash +python run_deap.py \ + --dataset cal_housing \ + --generations 5 \ + --pop-size 24 \ + --cv-folds 3 +``` +Results are saved under: + +```bash +runs//pareto_front.csv +``` +## 3. Viewing Results in MLflow + +```bash +mlflow ui --backend-store-uri ./mlruns --host 0.0.0.0 --port 5000 +``` +Then open: http://localhost:5000 + +You can visualize: + +MSE-like score (lower is better) + +SHAP stability (higher is better) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f179b14 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +scikit-learn +openml +deap +mlflow +shap +numpy +pandas +matplotlib diff --git a/run_deap.py b/run_deap.py new file mode 100644 index 0000000..b647b76 --- /dev/null +++ b/run_deap.py @@ -0,0 +1,170 @@ +import argparse +import random +import pickle +from pathlib import Path + +import numpy as np +import mlflow +from deap import algorithms +from deap.tools.emo import sortNondominated +import pandas as pd + +from src.data_openml import load_dataset +from src.search.nsga_deap import build_toolbox, decode +from src.preprocessing import build_preprocessor +from src.models import make_model +from src.stability import compute_shap_matrix + + +def save_checkpoint(path, gen, pop, seed): + state = { + "gen": gen, + "pop": pop, + "py_random_state": random.getstate(), + "np_random_state": np.random.get_state(), + "seed": seed, + } + with open(path, "wb") as f: + pickle.dump(state, f) + + +def load_checkpoint(path): + with open(path, "rb") as f: + state = pickle.load(f) + random.setstate(state["py_random_state"]) + np.random.set_state(state["np_random_state"]) + return state["gen"], state["pop"], state["seed"] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"]) + ap.add_argument("--generations", type=int, default=10) + ap.add_argument("--pop-size", type=int, default=24) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--cv-folds", type=int, default=3) + ap.add_argument("--experiment", default="deap_nsga_shap") + ap.add_argument("--checkpoint-every", type=int, default=5) + ap.add_argument( + "--shap-pf-eval-rows", + type=int, + default=512, + help="Number of rows from the dataset to use when saving SHAP for Pareto models", + ) + args = ap.parse_args() + + # data and experiment + X, y, task = load_dataset(args.dataset, random_state=args.seed) + mlflow.set_experiment(args.experiment) + + outdir = Path("runs") / args.dataset + outdir.mkdir(parents=True, exist_ok=True) + ckpt_path = outdir / "checkpoint.pkl" + + # seed RNGs + random.seed(args.seed) + np.random.seed(args.seed) + + # toolbox for this run + toolbox = build_toolbox( + X, + y, + task, + seed=args.seed, + cv_folds=args.cv_folds, + mlflow_experiment=args.experiment, + ) + + # initial population or resume from checkpoint + if ckpt_path.exists(): + start_gen, pop, loaded_seed = load_checkpoint(ckpt_path) + if loaded_seed != args.seed: + print( + f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}" + ) + print(f"Resuming from checkpoint at generation {start_gen}") + else: + pop = toolbox.population(n=args.pop_size) + fits = list(map(toolbox.evaluate, pop)) + for ind, fit in zip(pop, fits): + ind.fitness.values = fit + start_gen = 0 + save_checkpoint(ckpt_path, start_gen, pop, args.seed) + print(f"Initial checkpoint saved at generation {start_gen}") + + # GA loop + for gen in range(start_gen, args.generations): + offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2) + fits = list(map(toolbox.evaluate, offspring)) + for ind, fit in zip(offspring, fits): + ind.fitness.values = fit + pop = toolbox.select(pop + offspring, k=args.pop_size) + + if (gen + 1) % args.checkpoint_every == 0: + save_checkpoint(ckpt_path, gen + 1, pop, args.seed) + print(f"Checkpoint saved at generation {gen + 1}") + + # final Pareto front + pf = sortNondominated(pop, len(pop), first_front_only=True)[0] + rows = [] + for ind in pf: + algo, model_params, pre_cfg = decode(ind) + rows.append( + { + "algo": algo, + "mse_like": ind.fitness.values[0], + "stability": ind.fitness.values[1], + **{f"m_{k}": v for k, v in model_params.items()}, + **{f"p_{k}": v for k, v in pre_cfg.items()}, + } + ) + + pareto_path = outdir / "pareto_front.csv" + pd.DataFrame(rows).to_csv(pareto_path, index=False) + print(f"Saved Pareto front to {pareto_path}") + + shap_dir = outdir / "shap" + shap_dir.mkdir(exist_ok=True) + + eval_rows = min(args.shap_pf_eval_rows, len(X)) + rng = np.random.RandomState(args.seed) + eval_idx = rng.choice(len(X), size=eval_rows, replace=False) + X_eval_shap = X.iloc[eval_idx] + y_full = y + + for i, ind in enumerate(pf): + algo, model_params, pre_cfg = decode(ind) + + fixed_poly_degree = pre_cfg.get("poly_degree", 1) + fixed_k = pre_cfg.get("select_k", None) + + preproc = build_preprocessor( + X, + task, + pre_cfg, + fixed_k=fixed_k, + fixed_poly_degree=fixed_poly_degree, + ) + model = make_model(task, algo, model_params, random_state=args.seed) + from sklearn.pipeline import Pipeline as SkPipeline + pipe = SkPipeline([("pre", preproc), ("model", model)]) + + shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix( + pipe, + X_fit=X, + y_fit=y_full, + X_eval=X_eval_shap, + task_type=task, + bg_size=128, + max_eval_rows=eval_rows, + rng_seed=args.seed, + ) + + np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals) + np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names)) + + print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}") + + +if __name__ == "__main__": + main() diff --git a/src/data_openml.py b/src/data_openml.py new file mode 100644 index 0000000..df2981f --- /dev/null +++ b/src/data_openml.py @@ -0,0 +1,16 @@ +from sklearn.datasets import fetch_california_housing, fetch_openml + +def load_dataset(name: str, random_state: int = 42): + name = name.lower() + if name == "cal_housing": + ds = fetch_california_housing(as_frame=True) + X = ds.data + y = ds.target + return X, y, "regression" + elif name == "adult": + ds = fetch_openml(data_id=1590, as_frame=True) # Adult + X = ds.data + y = (ds.target == ">50K").astype(int) + return X, y, "classification" + else: + raise ValueError("dataset must be adult or cal_housing") diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..a574565 --- /dev/null +++ b/src/models.py @@ -0,0 +1,55 @@ +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier +from sklearn.neural_network import MLPRegressor, MLPClassifier + +def make_model(task, algo, params, random_state=0): + if task == "regression": + if algo == "rf": + return RandomForestRegressor( + n_estimators=int(params["n_estimators"]), + max_depth=int(params["max_depth"]), + max_features=params["max_features"], + random_state=random_state, + n_jobs=1 + ) + elif algo == "gbt": + return GradientBoostingRegressor( + n_estimators=int(params["n_estimators"]), + learning_rate=float(params["learning_rate"]), + max_depth=int(params["max_depth"]), + random_state=random_state + ) + elif algo == "mlp": + return MLPRegressor( + hidden_layer_sizes=tuple(params["hidden_layers"]), + activation=params["activation"], + alpha=float(params["alpha"]), + learning_rate_init=float(params["lr_init"]), + max_iter=int(params.get("max_iter", 200)), + random_state=random_state + ) + else: + if algo == "rf": + return RandomForestClassifier( + n_estimators=int(params["n_estimators"]), + max_depth=int(params["max_depth"]), + max_features=params["max_features"], + random_state=random_state, + n_jobs=1 + ) + elif algo == "gbt": + return GradientBoostingClassifier( + n_estimators=int(params["n_estimators"]), + learning_rate=float(params["learning_rate"]), + max_depth=int(params["max_depth"]), + random_state=random_state + ) + elif algo == "mlp": + return MLPClassifier( + hidden_layer_sizes=tuple(params["hidden_layers"]), + activation=params["activation"], + alpha=float(params["alpha"]), + learning_rate_init=float(params["lr_init"]), + max_iter=int(params.get("max_iter", 200)), + random_state=random_state + ) + raise ValueError("Unknown algo") diff --git a/src/objectives.py b/src/objectives.py new file mode 100644 index 0000000..3e5b90d --- /dev/null +++ b/src/objectives.py @@ -0,0 +1,66 @@ +import numpy as np +from sklearn.pipeline import Pipeline +from sklearn.model_selection import KFold +from sklearn.metrics import mean_squared_error, brier_score_loss + +from .preprocessing import build_preprocessor +from .models import make_model +from .stability import compute_shap_matrix, shap_stability_from_matrices + + +def evaluate_config(X, y, task, algo, model_params, preproc_cfg, cv_folds=3, seed=42): + kf = KFold(n_splits=cv_folds, shuffle=True, random_state=seed) + losses = [] + + # this will store full SHAP matrices and feature names for stability + shap_mats_with_names = [] + + # pick a fixed evaluation pool for SHAP, same rows and order for all folds + rng = np.random.RandomState(seed) + max_eval_rows = 1024 + eval_size = min(max_eval_rows, len(X)) + eval_idx = rng.choice(len(X), size=eval_size, replace=False) + X_eval_fixed = X.iloc[eval_idx] + + # probe preprocessor to compute a safe cap for k + fixed_poly_degree = preproc_cfg.get("fixed_poly_degree", preproc_cfg.get("poly_degree", 1)) + probe_pre = build_preprocessor(X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree) + Xp = probe_pre.fit_transform(X, y) + n_after_prep = Xp.shape[1] + desired_k = preproc_cfg.get("select_k", None) + k_cap = None if desired_k is None else int(min(max(1, desired_k), n_after_prep)) + + for fold_idx, (tr, te) in enumerate(kf.split(X)): + preproc = build_preprocessor(X, task, preproc_cfg, fixed_k=k_cap, fixed_poly_degree=fixed_poly_degree) + model = make_model(task, algo, model_params, random_state=seed + fold_idx) + pipe = Pipeline([("pre", preproc), ("model", model)]) + + # 1) SHAP stability: always use the same X_eval_fixed for all folds + shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix( + pipe, + X_fit=X.iloc[tr], + y_fit=y.iloc[tr], + X_eval=X_eval_fixed, + task_type=task, + ) + shap_mats_with_names.append((shap_vals, feat_names)) + + # 2) Loss: still use standard CV split (te) for generalization + if task == "regression": + y_pred = pipe.predict(X.iloc[te]) + loss = float(mean_squared_error(y.iloc[te], y_pred)) + else: + if hasattr(pipe.named_steps["model"], "predict_proba"): + y_prob = pipe.predict_proba(X.iloc[te])[:, 1] + else: + scores = pipe.decision_function(X.iloc[te]) + scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8) + y_prob = scores + loss = float(brier_score_loss(y.iloc[te], y_prob)) + losses.append(loss) + + # instance level SHAP stability across folds + agg_std, stability, per_feat_std, per_inst_std = shap_stability_from_matrices(shap_mats_with_names) + + mse_like = float(np.mean(losses)) + return mse_like, float(stability), per_feat_std diff --git a/src/preprocessing.py b/src/preprocessing.py new file mode 100644 index 0000000..c68c03d --- /dev/null +++ b/src/preprocessing.py @@ -0,0 +1,138 @@ +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer, PolynomialFeatures +from sklearn.impute import SimpleImputer +from sklearn.feature_selection import SelectKBest, f_classif, f_regression +from sklearn.feature_selection import VarianceThreshold +from sklearn.base import BaseEstimator, TransformerMixin +import numpy as np + +class SafeSelectK(BaseEstimator, TransformerMixin): + def __init__(self, task: str, k=None): + self.task = task + self.k = k + self.selector_ = None + self.k_effective_ = None + self.support_mask_ = None + self.feature_names_in_ = None + self.feature_names_out_ = None + + def fit(self, X, y=None): + if self.k is None: + self.selector_ = "passthrough" + self.feature_names_out_ = self.feature_names_in_ + return self + n_feats = X.shape[1] + k_eff = int(min(max(1, self.k), n_feats)) + score_func = f_classif if self.task == "classification" else f_regression + sel = SelectKBest(score_func=score_func, k=k_eff).fit(X, y) + self.selector_ = sel + self.k_effective_ = k_eff + mask = np.zeros(n_feats, dtype=bool) + mask[sel.get_support(indices=True)] = True + self.support_mask_ = mask + if self.feature_names_in_ is not None: + self.feature_names_out_ = self.feature_names_in_[mask] + return self + + def set_feature_names_in(self, names): + self.feature_names_in_ = np.asarray(names) + + def transform(self, X): + if self.selector_ == "passthrough": + return X + return self.selector_.transform(X) + + def get_feature_names_out(self, input_features=None): + if getattr(self, "feature_names_out_", None) is not None: + return self.feature_names_out_ + if getattr(self, "support_mask_", None) is not None and input_features is not None: + input_features = np.asarray(input_features) + return input_features[self.support_mask_] + return None + + +class ConstantFilter(BaseEstimator, TransformerMixin): + def __init__(self, eps=0.0): + self.eps = eps + self.mask_ = None + self.feature_names_in_ = None + self.feature_names_out_ = None + + def fit(self, X, y=None): + X = np.asarray(X) + var = X.var(axis=0) + self.mask_ = var > self.eps + if self.feature_names_in_ is not None: + self.feature_names_out_ = np.asarray(self.feature_names_in_)[self.mask_] + return self + + def set_feature_names_in(self, names): + self.feature_names_in_ = np.asarray(names) + + def get_feature_names_out(self): + if self.feature_names_out_ is not None: + return self.feature_names_out_ + # fallback when names were not set + return np.array([f"f{i}" for i, keep in enumerate(self.mask_) if keep]) + + def transform(self, X): + X = np.asarray(X) + return X[:, self.mask_] + + +def build_preprocessor(X_full, task, cfg, fixed_k=None, fixed_poly_degree=None): + cat_cols = X_full.select_dtypes(include=["object", "category", "bool"]).columns.tolist() + num_cols = [c for c in X_full.columns if c not in cat_cols] + + num_imputer = SimpleImputer(strategy=cfg.get("num_impute_strategy", "median")) + cat_imputer = SimpleImputer(strategy=cfg.get("cat_impute_strategy", "most_frequent")) + + scaler_name = cfg.get("scaler", "standard") + if scaler_name == "standard": + num_scaler = StandardScaler(with_mean=True, with_std=True) + elif scaler_name == "robust": + num_scaler = RobustScaler() + elif scaler_name == "minmax": + num_scaler = MinMaxScaler() + elif scaler_name == "power": + num_scaler = PowerTransformer(method="yeo-johnson") + else: + num_scaler = "passthrough" + + poly_degree = fixed_poly_degree if fixed_poly_degree is not None else cfg.get("poly_degree", 1) + poly = PolynomialFeatures(degree=poly_degree, include_bias=False) if poly_degree > 1 else "passthrough" + + # always fix categories from the full dataset + fixed_categories = None + if len(cat_cols) > 0: + fixed_categories = {c: sorted(X_full[c].dropna().astype(str).unique()) for c in cat_cols} + + ohe_kwargs = dict(handle_unknown="ignore") + try: + ohe_kwargs["sparse_output"] = False + except TypeError: + ohe_kwargs["sparse"] = False + if fixed_categories is not None: + ohe_kwargs["categories"] = [fixed_categories[c] for c in cat_cols] + cat_encoder = OneHotEncoder(**ohe_kwargs) + + num_steps = [("impute", num_imputer), ("scale", num_scaler), ("poly", poly)] + if int(cfg.get("use_vt", 0)): + num_steps.append(("vt", VarianceThreshold(threshold=float(cfg.get("vt_thr", 0.0))))) + + ct = ColumnTransformer([ + ("num", Pipeline(steps=num_steps), num_cols), + ("cat", Pipeline(steps=[("impute", cat_imputer), ("oh", cat_encoder)]), cat_cols), + ]) + + select_k = fixed_k if fixed_k is not None else cfg.get("select_k", None) + selector = SafeSelectK(task=task, k=select_k) + + + pre = Pipeline([ + ("prep", ct), + ("drop_const", ConstantFilter(eps=0.0)), + ("select", selector), + ]) + return pre diff --git a/src/search/nsga_deap.py b/src/search/nsga_deap.py new file mode 100644 index 0000000..007a620 --- /dev/null +++ b/src/search/nsga_deap.py @@ -0,0 +1,147 @@ +import mlflow +from deap import base, creator, tools +from sklearn.utils import check_random_state + +from src.objectives import evaluate_config + +SCALERS = ["standard", "robust", "minmax", "power", "none"] +NUM_IMPUTE = ["median", "mean"] +CAT_IMPUTE = ["most_frequent"] +ALGOS = ["rf", "gbt", "mlp"] + + +def decode(ind): + i = 0 + algo = ALGOS[int(ind[i]) % len(ALGOS)] + i += 1 + scaler = SCALERS[int(ind[i]) % len(SCALERS)] + i += 1 + num_imp = NUM_IMPUTE[int(ind[i]) % len(NUM_IMPUTE)] + i += 1 + cat_imp = CAT_IMPUTE[int(ind[i]) % len(CAT_IMPUTE)] + i += 1 + poly_degree = 1 + int(ind[i]) % 2 + i += 1 + use_selectk = int(ind[i]) % 2 + i += 1 + select_k = [None, 16, 32, 64, 128][int(ind[i]) % 5] + i += 1 + if not use_selectk: + select_k = None + + pre_cfg = { + "num_impute_strategy": num_imp, + "cat_impute_strategy": cat_imp, + "scaler": scaler, + "poly_degree": poly_degree, + "select_k": select_k, + } + + if algo == "rf": + n_estimators = [100, 200, 300, 400, 500][int(ind[i]) % 5] + i += 1 + max_depth = [2, 4, 6, 8, 10, 12][int(ind[i]) % 6] + i += 1 + max_features = ["sqrt", "log2", None][int(ind[i]) % 3] + i += 1 + params = { + "n_estimators": n_estimators, + "max_depth": max_depth, + "max_features": max_features, + } + elif algo == "gbt": + n_estimators = [100, 200, 300, 400, 500][int(ind[i]) % 5] + i += 1 + max_depth = [2, 3, 4, 5][int(ind[i]) % 4] + i += 1 + lr = [0.01, 0.02, 0.05, 0.1, 0.2][int(ind[i]) % 5] + i += 1 + params = { + "n_estimators": n_estimators, + "max_depth": max_depth, + "learning_rate": lr, + } + else: + n_layers = [1, 2, 3][int(ind[i]) % 3] + i += 1 + h = [] + for _ in range(n_layers): + h.append([16, 32, 64, 128, 256][int(ind[i]) % 5]) + i += 1 + # skip unused gene slots if fewer than 3 layers + i += max(0, 3 - n_layers) + alpha = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2][int(ind[i]) % 5] + i += 1 + lr_init = [1e-4, 5e-4, 1e-3, 5e-3, 1e-2][int(ind[i]) % 5] + i += 1 + params = { + "hidden_layers": tuple(h), + "activation": "relu", + "alpha": alpha, + "lr_init": lr_init, + "max_iter": 200, + } + + return algo, params, pre_cfg + + +def build_toolbox(X, y, task, seed, cv_folds, mlflow_experiment): + rng = check_random_state(seed) + + # guard against duplicate class creation in the same process + if not hasattr(creator, "FitnessMSEStab"): + creator.create("FitnessMSEStab", base.Fitness, weights=(-1.0, 1.0)) + if not hasattr(creator, "Individual"): + creator.create("Individual", list, fitness=creator.FitnessMSEStab) + + toolbox = base.Toolbox() + toolbox.register("gene", rng.randint, 0, 1000000) + toolbox.register( + "individual", + tools.initRepeat, + creator.Individual, + toolbox.gene, + n=16, + ) + toolbox.register("population", tools.initRepeat, list, toolbox.individual) + + def eval_ind(individual): + algo, model_params, pre_cfg = decode(individual) + + # one run per individual; the outer script sets the experiment + with mlflow.start_run(run_name=f"{algo}", nested=True): + for gi, g in enumerate(individual): + mlflow.log_param(f"g{gi}", int(g)) + mlflow.log_param("algo", algo) + for k, v in model_params.items(): + mlflow.log_param(f"m_{k}", v) + for k, v in pre_cfg.items(): + mlflow.log_param(f"p_{k}", v) + + mse_like, stability, _ = evaluate_config( + X, + y, + task, + algo, + model_params, + pre_cfg, + cv_folds=cv_folds, + seed=seed, + ) + mlflow.log_metric("mse_like", mse_like) + mlflow.log_metric("stability", stability) + + return mse_like, stability + + toolbox.register("evaluate", eval_ind) + toolbox.register("mate", tools.cxTwoPoint) + toolbox.register( + "mutate", + tools.mutUniformInt, + low=0, + up=1000000, + indpb=0.2, + ) + toolbox.register("select", tools.selNSGA2) + + return toolbox diff --git a/src/stability.py b/src/stability.py new file mode 100644 index 0000000..ef3511c --- /dev/null +++ b/src/stability.py @@ -0,0 +1,273 @@ +import numpy as np +import pandas as pd +import shap +import time + + +def compute_shap_matrix( + pipe, + X_fit, + y_fit, + X_eval, + task_type, + bg_size=128, + max_eval_rows=1024, + rng_seed=0, +): + """ + Fit the pipeline on (X_fit, y_fit), then compute SHAP values on X_eval. + + Important: for stability, X_eval should be the same rows and order + across all folds or retrains that you want to compare. + """ + t0 = time.time() + pipe.fit(X_fit, y_fit) + t_fit = time.time() - t0 + + pre = pipe.named_steps["pre"] + model = pipe.named_steps["model"] + prep = pre.named_steps["prep"] + + # derive names after prep using TRAIN data, not eval + X_probe = prep.transform(X_fit[:1]) + names_after_prep = getattr( + prep, + "get_feature_names_out", + lambda: np.array([f"f{i}" for i in range(X_probe.shape[1])]), + )() + + # thread names through constant-dropper + if "drop_const" in pre.named_steps: + dropper = pre.named_steps["drop_const"] + if hasattr(dropper, "set_feature_names_in"): + dropper.set_feature_names_in(names_after_prep) + names_into_select = dropper.get_feature_names_out() + else: + names_into_select = names_after_prep + + # thread names into selector + selector = pre.named_steps["select"] + if hasattr(selector, "set_feature_names_in"): + selector.set_feature_names_in(names_into_select) + + # preprocess eval and train splits + X_eval_proc = pre.transform(X_eval) + X_train_proc = pre.transform(X_fit) + + # cap eval rows for speed, keep deterministic subsample + n_eval = X_eval_proc.shape[0] + if n_eval > max_eval_rows: + rng = np.random.RandomState(rng_seed) + idx = rng.choice(n_eval, size=max_eval_rows, replace=False) + X_eval_proc = X_eval_proc[idx] + + n_cols = X_eval_proc.shape[1] + + # resolve final feature names after selection + feat_names = None + if hasattr(selector, "get_feature_names_out"): + feat_names = selector.get_feature_names_out(input_features=names_into_select) + + if feat_names is None: + supp = getattr(selector, "support_mask_", None) + if ( + supp is not None + and len(names_into_select) == supp.shape[0] + and supp.sum() == n_cols + ): + feat_names = np.asarray(names_into_select)[supp] + else: + feat_names = np.array([f"f{i}" for i in range(n_cols)]) + else: + feat_names = np.asarray(feat_names) + if len(feat_names) != n_cols: + feat_names = np.array([f"f{i}" for i in range(n_cols)]) + + # build background from TRAIN split to avoid leakage + rng = np.random.RandomState(rng_seed) + n_bg_pool = X_train_proc.shape[0] + bg_n = min(bg_size, n_bg_pool) + bg_idx = rng.choice(n_bg_pool, size=bg_n, replace=False) + background = X_train_proc[bg_idx] + + # choose explainer by model type + def _is_tree_model(m): + # sklearn trees and ensembles + if hasattr(m, "tree_") or hasattr(m, "estimators_"): + return True + # xgboost and lightgbm wrappers + try: + import xgboost as _xgb + + if isinstance(m, _xgb.XGBModel): + return True + except Exception: + pass + try: + import lightgbm as _lgb + + if isinstance(getattr(m, "booster_", None), _lgb.basic.Booster): + return True + except Exception: + pass + return False + + # compute SHAP values + t1 = time.time() + vals = None + + if _is_tree_model(model): + # tree specific, interventional mode with background data + explainer = shap.TreeExplainer( + model, + data=background, + feature_perturbation="interventional", + ) + # disable strict additivity check to avoid ExplainerError + sv = explainer.shap_values(X_eval_proc, check_additivity=False) + # shap API can return list for classification. pick positive class if so. + if isinstance(sv, list): + cls_idx = 1 if len(sv) > 1 else 0 + vals = np.asarray(sv[cls_idx]) + else: + vals = np.asarray(sv) + + elif hasattr(model, "coef_"): + # linear models + explainer = shap.LinearExplainer(model, background) + vals = np.asarray(explainer.shap_values(X_eval_proc)) + + else: + # fallback generic with training background masker + masker = shap.maskers.Independent(background) + if task_type == "classification" and hasattr(model, "predict_proba"): + f = lambda M: model.predict_proba(M)[:, 1] + else: + f = lambda M: model.predict(M) + explainer = shap.Explainer(f, masker) + out = explainer(X_eval_proc) + vals = np.asarray(getattr(out, "values", out)) + + t_shap = time.time() - t1 + + # normalize shapes to (n_rows, n_features) + vals = np.asarray(vals) + vals = np.squeeze(vals) + + if vals.ndim == 3 and vals.shape[2] in (2,) and vals.shape[1] == len(feat_names): + vals = vals[..., -1] + if vals.ndim == 3 and vals.shape[-1] == len(feat_names): + vals = vals.reshape(-1, vals.shape[-1]) + if vals.ndim == 2 and vals.shape[0] == len(feat_names): + vals = vals.T + + return vals, t_fit, t_shap, feat_names + + +def mean_abs_shap(shap_matrix, feature_names): + """ + Global mean absolute SHAP per feature. + Still useful for descriptive plots, but not used for instance level stability. + """ + return pd.Series( + np.abs(shap_matrix).mean(axis=0), + index=np.asarray(feature_names), + ) + + +def _align_matrices_to_union(mats_with_names): + """ + mats_with_names: list of (shap_matrix, feature_names) + + Each shap_matrix has shape (n_instances, n_features_k). + feature_names is array-like of length n_features_k. + + Returns: + all_feats: list[str] + T: np.ndarray of shape (n_models, n_instances, n_all_feats) + """ + if not mats_with_names: + raise ValueError("No SHAP matrices provided") + + # check that all matrices have the same number of instances + n_instances = mats_with_names[0][0].shape[0] + for M, names in mats_with_names: + if M.shape[0] != n_instances: + raise ValueError( + f"All SHAP matrices must have same number of rows. " + f"Expected {n_instances}, got {M.shape[0]}" + ) + + # union of all feature names + all_feats = sorted( + set().union(*[set(np.asarray(names)) for _, names in mats_with_names]) + ) + n_models = len(mats_with_names) + n_feats = len(all_feats) + + T = np.zeros((n_models, n_instances, n_feats), dtype=float) + feat_index = {f: j for j, f in enumerate(all_feats)} + + for m_idx, (M, names) in enumerate(mats_with_names): + names = np.asarray(names) + col_map = {name: c for c, name in enumerate(names)} + for fname, j_global in feat_index.items(): + if fname in col_map: + j_local = col_map[fname] + T[m_idx, :, j_global] = M[:, j_local] + # if fname not present, leave zeros + + return all_feats, T + + +def shap_stability_from_matrices(mats_with_names): + """ + mats_with_names: list of tuples (shap_matrix, feature_names) + shap_matrix: np.ndarray of shape (n_instances, n_features_k) + feature_names: list or array of length n_features_k + + This measures instance level stability: + For each instance i and feature j we look at the SHAP values + across models and compute their standard deviation. + + Steps: + 1) Align all matrices on the union of feature names. + 2) Build tensor T of shape (n_models, n_instances, n_features_union). + 3) Compute std across models: per_inst_feat_std = T.std(axis=0). + 4) Aggregate: + agg_std = mean of per_inst_feat_std over instances and features. + stability_score = 1 / (1 + agg_std). + + Returns: + agg_std: float + stability_score: float + per_feat_std: pd.Series with mean std per feature over instances + per_inst_std: np.ndarray with mean std per instance over features + """ + if not mats_with_names: + raise ValueError("No SHAP matrices provided") + + if len(mats_with_names) < 2: + raise ValueError( + f"Need at least 2 models to estimate stability, got {len(mats_with_names)}" + ) + + feat_names_union, T = _align_matrices_to_union(mats_with_names) + + # std across models for each instance and feature + per_inst_feat_std = T.std(axis=0) # shape (n_instances, n_features) + + # aggregate + agg_std = float(per_inst_feat_std.mean()) + stability_score = 1.0 / (1.0 + agg_std) + + # per feature: average std over instances + per_feat_std = per_inst_feat_std.mean(axis=0) + per_feat_std_series = pd.Series(per_feat_std, index=feat_names_union) + + # per instance: average std over features + per_inst_std = per_inst_feat_std.mean(axis=1) + + return agg_std, stability_score, per_feat_std_series, per_inst_std + +