"""NSGA-II hyperparameter search driver with checkpoint/resume support and
per-Pareto-model SHAP export."""
import argparse
import pickle
import random
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
from deap import algorithms
from deap.tools.emo import sortNondominated
from sklearn.pipeline import Pipeline as SkPipeline

from src.data_openml import load_dataset
from src.models import make_model
from src.preprocessing import build_preprocessor
from src.search.nsga_deap import build_toolbox, decode
from src.stability import compute_shap_matrix
|
def save_checkpoint(path, gen, pop, seed):
    """Atomically persist GA state (population + both RNG streams) to *path*.

    The state is pickled to a temporary sibling file first and then moved
    into place with an atomic rename, so a crash mid-write can never leave
    a truncated/corrupt checkpoint behind (the previous write-in-place
    version could).

    Args:
        path: Destination checkpoint path (str or Path).
        gen: Generation number this population corresponds to.
        pop: The DEAP population (list of individuals; must be picklable).
        seed: The run's base seed, stored for resume-time validation.
    """
    state = {
        "gen": gen,
        "pop": pop,
        # Capture both RNG streams so a resumed run continues the exact
        # same random sequence as an uninterrupted one.
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    path = Path(path)
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "wb") as f:
        pickle.dump(state, f)
    # Path.replace uses os.replace: atomic on POSIX, best-effort on Windows.
    tmp_path.replace(path)
|
def load_checkpoint(path):
    """Load a GA checkpoint and reseat both RNG streams from it.

    Returns:
        Tuple of (generation, population, seed) as stored by
        ``save_checkpoint``.
    """
    with open(path, "rb") as fh:
        saved = pickle.load(fh)
    # Reinstate the RNG states so the resumed run continues the exact
    # random sequence the interrupted run would have produced.
    random.setstate(saved["py_random_state"])
    np.random.set_state(saved["np_random_state"])
    return saved["gen"], saved["pop"], saved["seed"]
|
def main():
    """Run (or resume) an NSGA-II search and export Pareto results.

    Workflow:
      1. Parse CLI args, load the dataset, seed both RNG streams.
      2. Build the DEAP toolbox, then either resume the population from a
         checkpoint or create and evaluate an initial one.
      3. Run the GA loop with periodic (and final) checkpoints.
      4. Write the Pareto front to CSV and save SHAP arrays per model.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--cv-folds", type=int, default=3)
    ap.add_argument("--experiment", default="deap_nsga_shap")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument(
        "--shap-pf-eval-rows",
        type=int,
        default=512,
        help="Number of rows from the dataset to use when saving SHAP for Pareto models",
    )
    args = ap.parse_args()

    # data and experiment
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)

    outdir = Path("runs") / args.dataset
    outdir.mkdir(parents=True, exist_ok=True)
    ckpt_path = outdir / "checkpoint.pkl"

    # Seed both RNG streams for reproducibility of a fresh run; a resumed
    # run overwrites these states from the checkpoint below.
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Build the toolbox BEFORE any checkpoint load: unpickling individuals
    # requires the DEAP creator classes registered during toolbox setup.
    toolbox = build_toolbox(
        X,
        y,
        task,
        seed=args.seed,
        cv_folds=args.cv_folds,
        mlflow_experiment=args.experiment,
    )

    # Initial population, or resume from an existing checkpoint.
    if ckpt_path.exists():
        start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
        if loaded_seed != args.seed:
            print(
                f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}"
            )
        print(f"Resuming from checkpoint at generation {start_gen}")
    else:
        pop = toolbox.population(n=args.pop_size)
        fits = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fits):
            ind.fitness.values = fit
        start_gen = 0
        save_checkpoint(ckpt_path, start_gen, pop, args.seed)
        print(f"Initial checkpoint saved at generation {start_gen}")

    # GA loop
    for gen in range(start_gen, args.generations):
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        # varAnd invalidates fitness only on individuals actually changed by
        # crossover/mutation; untouched clones keep their parent's valid
        # fitness, so only re-evaluate the invalid ones (previously every
        # offspring was re-evaluated, wasting full CV runs).
        invalid = [ind for ind in offspring if not ind.fitness.valid]
        fits = list(map(toolbox.evaluate, invalid))
        for ind, fit in zip(invalid, fits):
            ind.fitness.values = fit
        pop = toolbox.select(pop + offspring, k=args.pop_size)

        if (gen + 1) % args.checkpoint_every == 0:
            save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
            print(f"Checkpoint saved at generation {gen + 1}")

    # Checkpoint the final population too, so the last generations are never
    # lost when --generations is not a multiple of --checkpoint-every.
    if args.generations % args.checkpoint_every != 0:
        save_checkpoint(ckpt_path, args.generations, pop, args.seed)
        print(f"Final checkpoint saved at generation {args.generations}")

    # final Pareto front
    pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
    rows = []
    for ind in pf:
        algo, model_params, pre_cfg = decode(ind)
        rows.append(
            {
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            }
        )

    pareto_path = outdir / "pareto_front.csv"
    pd.DataFrame(rows).to_csv(pareto_path, index=False)
    print(f"Saved Pareto front to {pareto_path}")

    shap_dir = outdir / "shap"
    shap_dir.mkdir(exist_ok=True)

    # One fixed evaluation subset (seeded) shared by every Pareto model so
    # their SHAP matrices are computed on identical rows.
    eval_rows = min(args.shap_pf_eval_rows, len(X))
    rng = np.random.RandomState(args.seed)
    eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
    X_eval_shap = X.iloc[eval_idx]
    y_full = y

    # Hoisted out of the per-model loop below (was re-imported each pass).
    from sklearn.pipeline import Pipeline as SkPipeline

    for i, ind in enumerate(pf):
        algo, model_params, pre_cfg = decode(ind)

        # Pin the preprocessing hyperparameters this individual was
        # evaluated with, so the refit pipeline matches its genome.
        fixed_poly_degree = pre_cfg.get("poly_degree", 1)
        fixed_k = pre_cfg.get("select_k", None)

        preproc = build_preprocessor(
            X,
            task,
            pre_cfg,
            fixed_k=fixed_k,
            fixed_poly_degree=fixed_poly_degree,
        )
        model = make_model(task, algo, model_params, random_state=args.seed)
        pipe = SkPipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X,
            y_fit=y_full,
            X_eval=X_eval_shap,
            task_type=task,
            bg_size=128,
            max_eval_rows=eval_rows,
            rng_seed=args.seed,
        )

        np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
        np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))

    print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}")
|
# Script entry point; guarded so the module can be imported without running.
if __name__ == "__main__":
    main()