"""NSGA-II hyperparameter search with a SHAP-stability objective.

Runs a DEAP NSGA-II search over model/preprocessing configurations for one
OpenML dataset, checkpoints the full GA state (population + RNG states) so
interrupted runs can resume deterministically, and finally exports the
Pareto front as CSV plus per-model SHAP matrices as ``.npy`` files.
"""

import argparse
import pickle
import random
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
from deap import algorithms
from deap.tools.emo import sortNondominated
from sklearn.pipeline import Pipeline as SkPipeline

from src.data_openml import load_dataset
from src.models import make_model
from src.preprocessing import build_preprocessor
from src.search.nsga_deap import build_toolbox, decode
from src.stability import compute_shap_matrix


def save_checkpoint(path, gen, pop, seed):
    """Pickle the GA state to *path* so the run can be resumed later.

    Both the stdlib and NumPy global RNG states are captured so a resumed
    run replays the exact stochastic sequence an uninterrupted run would
    have produced.
    """
    state = {
        "gen": gen,
        "pop": pop,
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    with open(path, "wb") as f:
        pickle.dump(state, f)


def load_checkpoint(path):
    """Load GA state written by :func:`save_checkpoint`.

    Side effect: restores the stdlib and NumPy global RNG states from the
    checkpoint.  NOTE(security): ``pickle.load`` executes arbitrary code on
    malicious input — only load checkpoints this program wrote locally.

    Returns:
        Tuple ``(gen, pop, seed)`` from the checkpoint.
    """
    with open(path, "rb") as f:
        state = pickle.load(f)
    random.setstate(state["py_random_state"])
    np.random.set_state(state["np_random_state"])
    return state["gen"], state["pop"], state["seed"]


def main():
    """CLI entry point: run (or resume) the search and export results."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--cv-folds", type=int, default=3)
    ap.add_argument("--experiment", default="deap_nsga_shap")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument(
        "--shap-pf-eval-rows",
        type=int,
        default=512,
        help="Number of rows from the dataset to use when saving SHAP for Pareto models",
    )
    args = ap.parse_args()

    # data and experiment
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)
    outdir = Path("runs") / args.dataset
    outdir.mkdir(parents=True, exist_ok=True)
    ckpt_path = outdir / "checkpoint.pkl"

    # Seed the global RNGs; a resumed run overrides these below with the
    # exact states captured at checkpoint time.
    random.seed(args.seed)
    np.random.seed(args.seed)

    # toolbox for this run (must exist before unpickling DEAP individuals,
    # since it registers the creator classes they deserialize into)
    toolbox = build_toolbox(
        X,
        y,
        task,
        seed=args.seed,
        cv_folds=args.cv_folds,
        mlflow_experiment=args.experiment,
    )

    # initial population or resume from checkpoint
    if ckpt_path.exists():
        start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
        if loaded_seed != args.seed:
            print(
                f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}"
            )
        print(f"Resuming from checkpoint at generation {start_gen}")
    else:
        pop = toolbox.population(n=args.pop_size)
        fits = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fits):
            ind.fitness.values = fit
        start_gen = 0
        save_checkpoint(ckpt_path, start_gen, pop, args.seed)
        print(f"Initial checkpoint saved at generation {start_gen}")

    # GA loop
    for gen in range(start_gen, args.generations):
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        # varAnd only invalidates the fitness of clones it actually mated or
        # mutated, so re-evaluate just those; untouched clones keep their
        # parent's fitness and the (expensive, CV-based) evaluation is skipped.
        invalid = [ind for ind in offspring if not ind.fitness.valid]
        fits = list(map(toolbox.evaluate, invalid))
        for ind, fit in zip(invalid, fits):
            ind.fitness.values = fit
        pop = toolbox.select(pop + offspring, k=args.pop_size)
        if (gen + 1) % args.checkpoint_every == 0:
            save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
            print(f"Checkpoint saved at generation {gen + 1}")

    # Always persist the final state: without this, a rerun repeats the
    # trailing generations whenever generations % checkpoint_every != 0.
    save_checkpoint(ckpt_path, args.generations, pop, args.seed)

    # final Pareto front
    pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
    rows = []
    for ind in pf:
        algo, model_params, pre_cfg = decode(ind)
        rows.append(
            {
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            }
        )
    pareto_path = outdir / "pareto_front.csv"
    pd.DataFrame(rows).to_csv(pareto_path, index=False)
    print(f"Saved Pareto front to {pareto_path}")

    # SHAP export for every Pareto model on a fixed random row subset.
    shap_dir = outdir / "shap"
    shap_dir.mkdir(exist_ok=True)
    eval_rows = min(args.shap_pf_eval_rows, len(X))
    # Dedicated RandomState so the evaluation subset is reproducible and
    # independent of however far the GA advanced the global RNGs.
    rng = np.random.RandomState(args.seed)
    eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
    X_eval_shap = X.iloc[eval_idx]
    y_full = y
    for i, ind in enumerate(pf):
        algo, model_params, pre_cfg = decode(ind)
        fixed_poly_degree = pre_cfg.get("poly_degree", 1)
        fixed_k = pre_cfg.get("select_k", None)
        preproc = build_preprocessor(
            X,
            task,
            pre_cfg,
            fixed_k=fixed_k,
            fixed_poly_degree=fixed_poly_degree,
        )
        model = make_model(task, algo, model_params, random_state=args.seed)
        pipe = SkPipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X,
            y_fit=y_full,
            X_eval=X_eval_shap,
            task_type=task,
            bg_size=128,
            max_eval_rows=eval_rows,
            rng_seed=args.seed,
        )
        np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
        np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))
    print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}")


if __name__ == "__main__":
    main()