A new beginning
This commit is contained in:
170
run_deap.py
Normal file
170
run_deap.py
Normal file
@@ -0,0 +1,170 @@
|
||||
import argparse
|
||||
import random
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import mlflow
|
||||
from deap import algorithms
|
||||
from deap.tools.emo import sortNondominated
|
||||
import pandas as pd
|
||||
|
||||
from src.data_openml import load_dataset
|
||||
from src.search.nsga_deap import build_toolbox, decode
|
||||
from src.preprocessing import build_preprocessor
|
||||
from src.models import make_model
|
||||
from src.stability import compute_shap_matrix
|
||||
|
||||
|
||||
def save_checkpoint(path, gen, pop, seed):
    """Persist GA progress so a run can be resumed bit-for-bit.

    The snapshot records the current generation, the population, the
    stdlib ``random`` state, NumPy's global RNG state, and the run seed.
    Written with pickle; read back by ``load_checkpoint``.
    """
    snapshot = dict(
        gen=gen,
        pop=pop,
        py_random_state=random.getstate(),
        np_random_state=np.random.get_state(),
        seed=seed,
    )
    with open(path, "wb") as fh:
        pickle.dump(snapshot, fh)
|
||||
|
||||
|
||||
def load_checkpoint(path):
    """Restore a checkpoint written by ``save_checkpoint``.

    Side effect: rewinds both the stdlib ``random`` module and NumPy's
    global RNG to the stored states so the resumed run replays the same
    random stream. Returns ``(gen, pop, seed)``.
    """
    with open(path, "rb") as fh:
        snapshot = pickle.load(fh)
    # Re-seed both RNG streams before handing control back to the caller.
    random.setstate(snapshot["py_random_state"])
    np.random.set_state(snapshot["np_random_state"])
    return snapshot["gen"], snapshot["pop"], snapshot["seed"]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: NSGA-II hyperparameter search with periodic checkpoints.

    Workflow:
      1. Parse CLI args and load the requested dataset.
      2. Seed the stdlib and NumPy RNGs; build the DEAP toolbox.
      3. Run (or resume from checkpoint) the GA loop, checkpointing every
         ``--checkpoint-every`` generations.
      4. Export the final Pareto front to CSV and save per-model SHAP arrays.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--cv-folds", type=int, default=3)
    ap.add_argument("--experiment", default="deap_nsga_shap")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument(
        "--shap-pf-eval-rows",
        type=int,
        default=512,
        help="Number of rows from the dataset to use when saving SHAP for Pareto models",
    )
    args = ap.parse_args()

    # data and experiment
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)

    outdir = Path("runs") / args.dataset
    outdir.mkdir(parents=True, exist_ok=True)
    ckpt_path = outdir / "checkpoint.pkl"

    # seed RNGs (both streams are also snapshotted into every checkpoint)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # toolbox for this run
    toolbox = build_toolbox(
        X,
        y,
        task,
        seed=args.seed,
        cv_folds=args.cv_folds,
        mlflow_experiment=args.experiment,
    )

    # initial population or resume from checkpoint
    if ckpt_path.exists():
        start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
        # A seed mismatch is only warned about: the checkpointed RNG states
        # take precedence, so the run still resumes deterministically.
        if loaded_seed != args.seed:
            print(
                f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}"
            )
        print(f"Resuming from checkpoint at generation {start_gen}")
    else:
        pop = toolbox.population(n=args.pop_size)
        fits = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fits):
            ind.fitness.values = fit
        start_gen = 0
        save_checkpoint(ckpt_path, start_gen, pop, args.seed)
        print(f"Initial checkpoint saved at generation {start_gen}")

    # GA loop: varAnd produces offspring, NSGA-II selection trims back to pop-size
    for gen in range(start_gen, args.generations):
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        # NOTE(review): every offspring is re-evaluated, including clones that
        # crossover/mutation left untouched. Evaluating only individuals with
        # invalid fitness (the DEAP convention) would be cheaper, but it would
        # change the RNG stream and break checkpoint reproducibility, so the
        # original behavior is kept.
        fits = list(map(toolbox.evaluate, offspring))
        for ind, fit in zip(offspring, fits):
            ind.fitness.values = fit
        pop = toolbox.select(pop + offspring, k=args.pop_size)

        if (gen + 1) % args.checkpoint_every == 0:
            save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
            print(f"Checkpoint saved at generation {gen + 1}")

    # final Pareto front (first non-dominated front of the last population)
    pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
    rows = []
    for ind in pf:
        algo, model_params, pre_cfg = decode(ind)
        rows.append(
            {
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            }
        )

    pareto_path = outdir / "pareto_front.csv"
    pd.DataFrame(rows).to_csv(pareto_path, index=False)
    print(f"Saved Pareto front to {pareto_path}")

    shap_dir = outdir / "shap"
    shap_dir.mkdir(exist_ok=True)

    # Fixed random subsample of rows used for SHAP evaluation of every model.
    eval_rows = min(args.shap_pf_eval_rows, len(X))
    rng = np.random.RandomState(args.seed)
    eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
    X_eval_shap = X.iloc[eval_idx]
    y_full = y

    # Hoisted out of the per-model loop below: the original imported sklearn
    # inside the loop body, re-running the import machinery for every Pareto
    # individual. Kept as a function-local import to preserve lazy loading.
    from sklearn.pipeline import Pipeline as SkPipeline

    for i, ind in enumerate(pf):
        algo, model_params, pre_cfg = decode(ind)

        fixed_poly_degree = pre_cfg.get("poly_degree", 1)
        fixed_k = pre_cfg.get("select_k", None)

        preproc = build_preprocessor(
            X,
            task,
            pre_cfg,
            fixed_k=fixed_k,
            fixed_poly_degree=fixed_poly_degree,
        )
        model = make_model(task, algo, model_params, random_state=args.seed)
        pipe = SkPipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X,
            y_fit=y_full,
            X_eval=X_eval_shap,
            task_type=task,
            bg_size=128,
            max_eval_rows=eval_rows,
            rng_seed=args.seed,
        )

        np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
        np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))

    print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}")
|
||||
|
||||
|
||||
# Script entry point: run the search only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user