Files
deap-based-automl-experimen…/run_deap.py
2025-11-24 23:15:00 -04:00

171 lines
5.3 KiB
Python

import argparse
import random
import pickle
from pathlib import Path
import numpy as np
import mlflow
from deap import algorithms
from deap.tools.emo import sortNondominated
import pandas as pd
from src.data_openml import load_dataset
from src.search.nsga_deap import build_toolbox, decode
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix
def save_checkpoint(path, gen, pop, seed):
    """Write a resumable snapshot of the GA run to *path*.

    The snapshot captures the current generation number, the population,
    the seed, and the states of both the ``random`` and ``numpy`` RNGs so
    that a resumed run continues from an identical random stream.
    """
    snapshot = dict(
        gen=gen,
        pop=pop,
        py_random_state=random.getstate(),
        np_random_state=np.random.get_state(),
        seed=seed,
    )
    with open(path, "wb") as fh:
        pickle.dump(snapshot, fh)
def load_checkpoint(path):
    """Load a snapshot written by ``save_checkpoint`` and restore RNG state.

    Restores both the ``random`` and ``numpy`` RNG states as a side effect,
    then returns ``(gen, pop, seed)``.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file;
    only load checkpoints produced by this tool.
    """
    with open(path, "rb") as fh:
        snapshot = pickle.load(fh)
    random.setstate(snapshot["py_random_state"])
    np.random.set_state(snapshot["np_random_state"])
    return snapshot["gen"], snapshot["pop"], snapshot["seed"]
def main():
    """CLI entry point for the NSGA-II AutoML search.

    Runs an evolutionary search over model/preprocessing configurations for
    the chosen dataset, checkpointing periodically so the run can resume,
    then exports the final Pareto front as CSV and a SHAP value matrix for
    each Pareto-optimal model.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--cv-folds", type=int, default=3)
    ap.add_argument("--experiment", default="deap_nsga_shap")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument(
        "--shap-pf-eval-rows",
        type=int,
        default=512,
        help="Number of rows from the dataset to use when saving SHAP for Pareto models",
    )
    args = ap.parse_args()

    # Hoisted out of the per-Pareto-model loop below, where it was re-imported
    # on every iteration.
    from sklearn.pipeline import Pipeline as SkPipeline

    # data and experiment
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)
    outdir = Path("runs") / args.dataset
    outdir.mkdir(parents=True, exist_ok=True)
    ckpt_path = outdir / "checkpoint.pkl"

    # Seed RNGs. When resuming, load_checkpoint() below overwrites these with
    # the states saved in the checkpoint, so the random stream is continuous.
    random.seed(args.seed)
    np.random.seed(args.seed)

    # toolbox for this run
    toolbox = build_toolbox(
        X,
        y,
        task,
        seed=args.seed,
        cv_folds=args.cv_folds,
        mlflow_experiment=args.experiment,
    )

    # initial population or resume from checkpoint
    if ckpt_path.exists():
        start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
        if loaded_seed != args.seed:
            print(
                f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}"
            )
        print(f"Resuming from checkpoint at generation {start_gen}")
    else:
        pop = toolbox.population(n=args.pop_size)
        fits = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fits):
            ind.fitness.values = fit
        start_gen = 0
        save_checkpoint(ckpt_path, start_gen, pop, args.seed)
        print(f"Initial checkpoint saved at generation {start_gen}")

    # GA loop
    for gen in range(start_gen, args.generations):
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        fits = list(map(toolbox.evaluate, offspring))
        for ind, fit in zip(offspring, fits):
            ind.fitness.values = fit
        # mu+lambda style survivor selection (NSGA selector from the toolbox)
        pop = toolbox.select(pop + offspring, k=args.pop_size)
        if (gen + 1) % args.checkpoint_every == 0:
            save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
            print(f"Checkpoint saved at generation {gen + 1}")

    # Always persist the final population, even when --generations is not a
    # multiple of --checkpoint-every; otherwise a failure during the export
    # phase below would force recomputing the last generations on resume.
    save_checkpoint(ckpt_path, args.generations, pop, args.seed)

    # final Pareto front
    pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
    rows = []
    for ind in pf:
        algo, model_params, pre_cfg = decode(ind)
        rows.append(
            {
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            }
        )
    pareto_path = outdir / "pareto_front.csv"
    pd.DataFrame(rows).to_csv(pareto_path, index=False)
    print(f"Saved Pareto front to {pareto_path}")

    # SHAP export for each Pareto model on a fixed random subsample of rows
    shap_dir = outdir / "shap"
    shap_dir.mkdir(exist_ok=True)
    eval_rows = min(args.shap_pf_eval_rows, len(X))
    rng = np.random.RandomState(args.seed)
    eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
    X_eval_shap = X.iloc[eval_idx]
    for i, ind in enumerate(pf):
        algo, model_params, pre_cfg = decode(ind)
        fixed_poly_degree = pre_cfg.get("poly_degree", 1)
        fixed_k = pre_cfg.get("select_k", None)
        preproc = build_preprocessor(
            X,
            task,
            pre_cfg,
            fixed_k=fixed_k,
            fixed_poly_degree=fixed_poly_degree,
        )
        model = make_model(task, algo, model_params, random_state=args.seed)
        pipe = SkPipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X,
            y_fit=y,
            X_eval=X_eval_shap,
            task_type=task,
            bg_size=128,
            max_eval_rows=eval_rows,
            rng_seed=args.seed,
        )
        np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
        np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))
    print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}")
if __name__ == "__main__":
main()