A new beginning

This commit is contained in:
Varyngoth
2025-11-24 23:15:00 -04:00
commit 59139bab3f
10 changed files with 1070 additions and 0 deletions

170
run_deap.py Normal file
View File

@@ -0,0 +1,170 @@
import argparse
import random
import pickle
from pathlib import Path
import numpy as np
import mlflow
from deap import algorithms
from deap.tools.emo import sortNondominated
import pandas as pd
from src.data_openml import load_dataset
from src.search.nsga_deap import build_toolbox, decode
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix
def save_checkpoint(path, gen, pop, seed):
    """Persist GA progress plus both RNG states so a run can resume deterministically.

    Args:
        path: Destination file (str or Path) for the pickled state.
        gen: Generation number the population corresponds to.
        pop: Current population (list of DEAP individuals).
        seed: Seed the run was started with (checked against on resume).
    """
    state = {
        "gen": gen,
        "pop": pop,
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    # Write to a sibling temp file and atomically swap it in, so an
    # interrupted write can never leave a truncated/corrupt checkpoint
    # that would break every later resume attempt.
    path = Path(path)
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    with open(tmp_path, "wb") as f:
        pickle.dump(state, f)
    tmp_path.replace(path)
def load_checkpoint(path):
    """Read a pickled checkpoint and restore both RNG states as a side effect.

    Returns:
        Tuple ``(gen, pop, seed)`` as stored by the saver.
    """
    with open(path, "rb") as fh:
        snapshot = pickle.load(fh)
    # Restoring both generators lets the resumed run reproduce the exact
    # stochastic sequence the interrupted run would have produced.
    random.setstate(snapshot["py_random_state"])
    np.random.set_state(snapshot["np_random_state"])
    return snapshot["gen"], snapshot["pop"], snapshot["seed"]
def main():
    """Run an NSGA-II search over model/preprocessing configs with checkpointing.

    Command-line driven. Loads an OpenML dataset, evolves a population for
    ``--generations`` (resuming from ``runs/<dataset>/checkpoint.pkl`` if one
    exists), checkpoints every ``--checkpoint-every`` generations, then saves
    the final Pareto front as a CSV plus per-model SHAP arrays under
    ``runs/<dataset>/``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--cv-folds", type=int, default=3)
    ap.add_argument("--experiment", default="deap_nsga_shap")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument(
        "--shap-pf-eval-rows",
        type=int,
        default=512,
        help="Number of rows from the dataset to use when saving SHAP for Pareto models",
    )
    args = ap.parse_args()

    # Data and experiment setup.
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)
    outdir = Path("runs") / args.dataset
    outdir.mkdir(parents=True, exist_ok=True)
    ckpt_path = outdir / "checkpoint.pkl"

    # Seed both RNGs up-front; a checkpoint resume will overwrite these states.
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Toolbox (operators + evaluation) for this run.
    toolbox = build_toolbox(
        X,
        y,
        task,
        seed=args.seed,
        cv_folds=args.cv_folds,
        mlflow_experiment=args.experiment,
    )

    # Initial population, or resume from checkpoint.
    if ckpt_path.exists():
        start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
        if loaded_seed != args.seed:
            # Mismatched seed means the resumed trajectory will not match a
            # fresh run with args.seed; warn but keep going.
            print(
                f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}"
            )
        print(f"Resuming from checkpoint at generation {start_gen}")
    else:
        pop = toolbox.population(n=args.pop_size)
        fits = list(map(toolbox.evaluate, pop))
        for ind, fit in zip(pop, fits):
            ind.fitness.values = fit
        start_gen = 0
        save_checkpoint(ckpt_path, start_gen, pop, args.seed)
        print(f"Initial checkpoint saved at generation {start_gen}")

    # GA loop.
    for gen in range(start_gen, args.generations):
        offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
        # varAnd invalidates fitness only on individuals actually touched by
        # crossover/mutation; evaluating just those avoids re-running the
        # (expensive) cross-validated evaluation on unchanged clones.
        invalid = [ind for ind in offspring if not ind.fitness.valid]
        for ind, fit in zip(invalid, map(toolbox.evaluate, invalid)):
            ind.fitness.values = fit
        pop = toolbox.select(pop + offspring, k=args.pop_size)
        if (gen + 1) % args.checkpoint_every == 0:
            save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
            print(f"Checkpoint saved at generation {gen + 1}")

    # Final Pareto front -> CSV (one row per non-dominated individual).
    pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
    rows = []
    for ind in pf:
        algo, model_params, pre_cfg = decode(ind)
        rows.append(
            {
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            }
        )
    pareto_path = outdir / "pareto_front.csv"
    pd.DataFrame(rows).to_csv(pareto_path, index=False)
    print(f"Saved Pareto front to {pareto_path}")

    # SHAP arrays for each Pareto model, on a fixed random subsample of rows.
    shap_dir = outdir / "shap"
    shap_dir.mkdir(exist_ok=True)
    eval_rows = min(args.shap_pf_eval_rows, len(X))
    rng = np.random.RandomState(args.seed)
    eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
    X_eval_shap = X.iloc[eval_idx]
    y_full = y
    # Loop-invariant import: hoisted out of the per-model loop below.
    from sklearn.pipeline import Pipeline as SkPipeline
    for i, ind in enumerate(pf):
        algo, model_params, pre_cfg = decode(ind)
        fixed_poly_degree = pre_cfg.get("poly_degree", 1)
        fixed_k = pre_cfg.get("select_k", None)
        preproc = build_preprocessor(
            X,
            task,
            pre_cfg,
            fixed_k=fixed_k,
            fixed_poly_degree=fixed_poly_degree,
        )
        model = make_model(task, algo, model_params, random_state=args.seed)
        pipe = SkPipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X,
            y_fit=y_full,
            X_eval=X_eval_shap,
            task_type=task,
            bg_size=128,
            max_eval_rows=eval_rows,
            rng_seed=args.seed,
        )
        np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
        np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))
    print(f"Saved SHAP arrays for {len(pf)} Pareto models under {shap_dir}")
# Script entry point: run the NSGA-II search when executed directly.
if __name__ == "__main__":
    main()