"""NSGA-II model/preprocessor search with SHAP-based stability objective.

Runs a DEAP NSGA-II loop over model + preprocessing configurations for one
dataset, checkpoints GA state periodically, and on completion saves the
Pareto front (CSV) plus per-model SHAP matrices (.npy) under runs/<dataset>/.
"""

import argparse
import pickle
import random
from datetime import datetime
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
from deap import algorithms
from deap.tools.emo import sortNondominated

from src.data_openml import load_dataset
from src.models import make_model
from src.preprocessing import build_preprocessor
from src.search.nsga_deap import build_toolbox, decode
from src.stability import compute_shap_matrix

# Main network
# mlflow.set_tracking_uri("http://192.168.2.169:5000")
# Cluster Subnet
mlflow.set_tracking_uri("http://10.10.0.5:5000")
# Network with DNS resolution (specified hosts or Tailnet)
# mlflow.set_tracking_uri("http://medea:5000")


def save_checkpoint(path, gen, pop, seed):
    """Pickle the full GA state to *path*.

    Captures the generation counter, the population (DEAP individuals carry
    their fitness values, so those are persisted too), both RNG states, and
    the seed — enough to resume a run deterministically.
    """
    state = {
        "gen": gen,
        "pop": pop,
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    with open(path, "wb") as f:
        pickle.dump(state, f)


def load_checkpoint(path):
    """Load GA state from *path* and restore both RNG streams in place.

    SECURITY NOTE: ``pickle.load`` executes arbitrary code from the file.
    This is acceptable only because checkpoints are produced by this same
    script on trusted storage — never load a checkpoint from an untrusted
    source.

    Returns:
        (gen, pop, seed) as written by :func:`save_checkpoint`.
    """
    with open(path, "rb") as f:
        state = pickle.load(f)
    random.setstate(state["py_random_state"])
    np.random.set_state(state["np_random_state"])
    return state["gen"], state["pop"], state["seed"]


def main():
    """CLI entry point: parse args, run/resume the GA, dump Pareto artifacts."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--cv-folds", type=int, default=3)
    # Timestamped default so repeated runs land in distinct MLflow experiments.
    experiment_name = f"deap_nsga_shap_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    ap.add_argument("--experiment", default=experiment_name)
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument(
        "--shap-pf-eval-rows",
        type=int,
        default=512,
        help="Number of rows from the dataset to use when saving SHAP for Pareto models",
    )
    args = ap.parse_args()

    # data and experiment
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)
    outdir = Path("runs") / args.dataset
    outdir.mkdir(parents=True, exist_ok=True)
    ckpt_path = outdir / "checkpoint.pkl"

    # seed RNGs (overwritten by the checkpoint's saved states when resuming)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # toolbox for this run
    toolbox = build_toolbox(
        X,
        y,
        task,
        seed=args.seed,
        cv_folds=args.cv_folds,
        mlflow_experiment=args.experiment,
    )

    # initial population or resume from checkpoint
    if ckpt_path.exists():
        start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
        if loaded_seed != args.seed:
            print(
                f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}"
            )
        print(f"Resuming from checkpoint at generation {start_gen}")
    else:
        pop = toolbox.population(n=args.pop_size)
        fits = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fits):
            ind.fitness.values = fit
        start_gen = 0
        save_checkpoint(ckpt_path, start_gen, pop, args.seed)
        print(f"Initial checkpoint saved at generation {start_gen}")

    # GA loop: varAnd clones, crosses, and mutates; every offspring is
    # re-evaluated (CV fitness may be noisy, so re-evaluation is deliberate).
    for gen in range(start_gen, args.generations):
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        fits = list(map(toolbox.evaluate, offspring))
        for ind, fit in zip(offspring, fits):
            ind.fitness.values = fit
        # (mu + lambda) NSGA-II selection over parents and offspring.
        pop = toolbox.select(pop + offspring, k=args.pop_size)
        if (gen + 1) % args.checkpoint_every == 0:
            save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
            print(f"Checkpoint saved at generation {gen + 1}")

    # final Pareto front (first non-dominated front of the last population)
    pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
    rows = []
    for ind in pf:
        algo, model_params, pre_cfg = decode(ind)
        rows.append(
            {
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            }
        )
    pareto_path = outdir / "pareto_front.csv"
    pd.DataFrame(rows).to_csv(pareto_path, index=False)
    print(f"Saved Pareto front to {pareto_path}")

    # SHAP matrices for each Pareto model, on a fixed random row subset so
    # all models are explained on the same evaluation rows.
    shap_dir = outdir / "shap"
    shap_dir.mkdir(exist_ok=True)
    eval_rows = min(args.shap_pf_eval_rows, len(X))
    rng = np.random.RandomState(args.seed)
    eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
    X_eval_shap = X.iloc[eval_idx]

    # Local import kept (matches the file's style) but hoisted out of the
    # per-model loop — the original re-imported it on every iteration.
    from sklearn.pipeline import Pipeline as SkPipeline

    for i, ind in enumerate(pf):
        algo, model_params, pre_cfg = decode(ind)
        fixed_poly_degree = pre_cfg.get("poly_degree", 1)
        fixed_k = pre_cfg.get("select_k", None)
        preproc = build_preprocessor(
            X,
            task,
            pre_cfg,
            fixed_k=fixed_k,
            fixed_poly_degree=fixed_poly_degree,
        )
        model = make_model(task, algo, model_params, random_state=args.seed)
        pipe = SkPipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X,
            y_fit=y,
            X_eval=X_eval_shap,
            task_type=task,
            bg_size=128,
            max_eval_rows=eval_rows,
            rng_seed=args.seed,
        )
        np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
        np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))

    print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}")


if __name__ == "__main__":
    main()