New Start

This commit is contained in:
Varyngoth
2025-12-04 16:53:58 -04:00
parent 3657e8ea18
commit c5fb865583
9 changed files with 1043 additions and 13 deletions

View File

@@ -5,7 +5,6 @@ from pathlib import Path
import numpy as np
import mlflow
from datetime import datetime
from deap import algorithms
from deap.tools.emo import sortNondominated
import pandas as pd
@@ -16,16 +15,6 @@ from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix
# Main network
# mlflow.set_tracking_uri("http://192.168.2.169:5000")
# Cluster Subnet
mlflow.set_tracking_uri("http://10.10.0.5:5000")
# Network with DNS resolution (specified hosts or Tailnet)
#mlflow.set_tracking_uri("http://medea:5000")
def save_checkpoint(path, gen, pop, seed):
state = {
@@ -54,8 +43,7 @@ def main():
ap.add_argument("--pop-size", type=int, default=24)
ap.add_argument("--seed", type=int, default=42)
ap.add_argument("--cv-folds", type=int, default=3)
experiment_name = f"deap_nsga_shap_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
ap.add_argument("--experiment", default=experiment_name)
ap.add_argument("--experiment", default="deap_nsga_shap")
ap.add_argument("--checkpoint-every", type=int, default=5)
ap.add_argument(
"--shap-pf-eval-rows",

View File

@@ -0,0 +1,103 @@
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
def evaluate_config_protocol_aware(
    X,
    y,
    task,
    algo,
    model_params,
    pre_cfg,
    protocol_fn,
    protocol_params,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Evaluate one model/preprocessing configuration under a replication protocol.

    One pipeline is fit per replicate yielded by ``protocol_fn``; loss is MSE
    for regression and the Brier score for classification, and SHAP stability
    is aggregated across all replicates.

    Returns:
        (mean_loss, stability, meta) where ``meta`` holds loss spread,
        mean fit/SHAP timings, instance-feature std, and replicate count.
    """
    rng = np.random.RandomState(seed)
    # fixed SHAP evaluation pool per individual evaluation
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # freeze preprocessing dimensionality
    fixed_poly_degree = pre_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, pre_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    # probe fit learns how many features survive preprocessing so that
    # select_k can be clamped to a feasible value below
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = pre_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    losses = []
    fit_times = []
    shap_times = []
    # protocol_fn yields replicate descriptors: "cv"/"bootstrap" carry
    # train/test index arrays, anything else ("noise") carries a perturbed X
    replicates = protocol_fn(X, y, seed=seed, **protocol_params)
    for rep_id, rep in enumerate(replicates):
        if rep["type"] in ["cv", "bootstrap"]:
            tr, te = rep["train_idx"], rep["test_idx"]
            X_fit, y_fit = X.iloc[tr], y.iloc[tr]
            X_test, y_test = X.iloc[te], y.iloc[te]
        else:
            # noise protocol: train on the perturbed copy, score on clean X
            X_fit, y_fit = rep["X_noisy"], y
            X_test, y_test = X, y
        preproc = build_preprocessor(
            X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-replicate model seed so replicates are independent draws
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        # compute_shap_matrix receives the train split and reports a fit time;
        # the predict calls below rely on it having fitted `pipe` in place
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_fit,
            y_fit=y_fit,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        fit_times.append(t_fit)
        shap_times.append(t_shap)
        if task == "regression":
            y_pred = pipe.predict(X_test)
            loss = float(mean_squared_error(y_test, y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X_test)[:, 1]
            else:
                # min-max scale decision scores into [0, 1] pseudo-probabilities
                scores = pipe.decision_function(X_test)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y_test, y_prob))
        losses.append(loss)
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    mse_like = float(np.mean(losses))
    stability_val = float(stability)
    meta = {
        "loss_std": float(np.std(losses)),
        "fit_time_mean": float(np.mean(fit_times)) if fit_times else 0.0,
        "shap_time_mean": float(np.mean(shap_times)) if shap_times else 0.0,
        "inst_feat_std": float(agg_std),
        "n_replicates": len(replicates),
    }
    return mse_like, stability_val, meta

View File

@@ -0,0 +1,24 @@
from src.protocols_methodology.protocols import (
kfold_indices,
bootstrap_indices,
noise_perturbations,
)
def cv_protocol(X, y, n_folds=5, seed=0):
    """Describe k-fold CV replicates as a list of train/test index dicts."""
    return [
        {"type": "cv", "train_idx": train, "test_idx": test}
        for train, test in kfold_indices(len(X), n_folds, seed)
    ]
def bootstrap_protocol(X, y, n_bootstrap=30, seed=0):
    """Describe bootstrap replicates as a list of train/OOB-test index dicts."""
    return [
        {"type": "bootstrap", "train_idx": train, "test_idx": test}
        for train, test in bootstrap_indices(len(X), n_bootstrap, seed)
    ]
def noise_protocol(X, y, n_replicates=30, noise_std=0.01, seed=0):
    """Describe noise replicates; each dict carries a perturbed copy of X."""
    sigmas = [noise_std for _ in range(n_replicates)]
    return [
        {"type": "noise", "sigma": sigma, "X_noisy": perturbed}
        for sigma, perturbed in noise_perturbations(X, sigmas, seed)
    ]

View File

@@ -0,0 +1,195 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import bootstrap_indices
def run_bootstrap_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_bootstrap=30,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the bootstrap stability protocol for one fixed model family.

    Fits one pipeline per bootstrap resample, scores it on the out-of-bag
    rows, and tracks SHAP stability across replicates.

    Returns:
        (summary, rep_rows) — one summary dict for this seed and one row
        dict per replicate (loss, timings, running stability).
    """
    rng = np.random.RandomState(seed)
    # fixed SHAP evaluation pool shared by every replicate of this seed
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # freeze preprocessing dimensionality so SHAP matrices stay comparable
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    # probe fit learns the post-preprocessing feature count so select_k
    # can be clamped to a feasible value
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    for rep_id, (tr, te) in enumerate(bootstrap_indices(len(X), n_bootstrap, seed)):
        X_boot = X.iloc[tr]
        y_boot = y.iloc[tr]
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-replicate model seed so resamples are independent draws
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        # fits the pipeline on the bootstrap sample and computes SHAP values
        # on the fixed evaluation pool (predict below relies on the fit)
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_boot,
            y_fit=y_boot,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # OOB loss on te to match your earlier logic
        if task == "regression":
            y_pred = pipe.predict(X.iloc[te])
            loss = float(mean_squared_error(y.iloc[te], y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
            else:
                # min-max scale decision scores into [0, 1] pseudo-probabilities
                scores = pipe.decision_function(X.iloc[te])
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y.iloc[te], y_prob))
        # running stability over all replicates so far (recomputed each
        # iteration, so cost grows with replicate count — trajectory data)
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "bootstrap",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # final stability over all replicates (equals the last running value)
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "bootstrap",
        "n_replicates": n_bootstrap,
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the bootstrap protocol across seeds and save CSV/JSON outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    parser.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    parser.add_argument("--n-bootstrap", type=int, default=30)
    parser.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    parser.add_argument("--outdir", default="runs/protocol_bootstrap")
    args = parser.parse_args()

    X, y, task = load_dataset(args.dataset)

    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # fixed hyper-parameters per model family (frozen for the methodology study)
    params_by_algo = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = params_by_algo[args.algo]

    out_root = Path(args.outdir)
    out_root.mkdir(parents=True, exist_ok=True)

    summaries = []
    replicate_rows = []
    for seed in args.seeds:
        seed_summary, seed_rows = run_bootstrap_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_bootstrap=args.n_bootstrap,
            seed=seed,
        )
        summaries.append(seed_summary)
        replicate_rows.extend(seed_rows)

    stem = f"{args.dataset}_{args.algo}"
    summary_path = out_root / f"{stem}_bootstrap_summary.csv"
    reps_path = out_root / f"{stem}_bootstrap_replicates.csv"
    cfg_path = out_root / f"config_{stem}_bootstrap.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(replicate_rows).to_csv(reps_path, index=False)
    # persist the full run configuration next to the results
    run_config = {
        "dataset": args.dataset,
        "algo": args.algo,
        "task": task,
        "protocol": "bootstrap",
        "protocol_params": {"n_bootstrap": args.n_bootstrap, "seeds": args.seeds},
        "model_params": model_params,
        "preproc_cfg": preproc_cfg,
    }
    with open(cfg_path, "w") as f:
        json.dump(run_config, f, indent=2)
    print("Saved:")
    for written in (summary_path, reps_path, cfg_path):
        print(written)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,195 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import kfold_indices
def run_cv_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_folds=5,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the k-fold CV stability protocol for one fixed model family.

    Fits one pipeline per fold, scores it on the held-out fold, and tracks
    SHAP stability across folds.

    Returns:
        (summary, rep_rows) — one summary dict for this seed and one row
        dict per fold (loss, timings, running stability).
    """
    rng = np.random.RandomState(seed)
    # fixed SHAP evaluation pool per seed
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # freeze preprocessor dimensions for stability comparability
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    # probe fit learns the post-preprocessing feature count so select_k
    # can be clamped to a feasible value
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    # your exact old KFold generator
    for rep_id, (tr, te) in enumerate(kfold_indices(len(X), n_folds, seed)):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-fold model seed so folds are independent draws
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        # fits the pipeline on the fold's train split and computes SHAP values
        # on the fixed evaluation pool (predict below relies on the fit)
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X.iloc[tr],
            y_fit=y.iloc[tr],
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # loss on fold test
        if task == "regression":
            y_pred = pipe.predict(X.iloc[te])
            loss = float(mean_squared_error(y.iloc[te], y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
            else:
                # min-max scale decision scores into [0, 1] pseudo-probabilities
                scores = pipe.decision_function(X.iloc[te])
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y.iloc[te], y_prob))
        # running stability so replicates can be plotted as a trajectory
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "cv",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # final stability over all folds (equals the last running value)
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "cv",
        "n_replicates": n_folds,
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the CV protocol across seeds and save CSV/JSON outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    parser.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    parser.add_argument("--n-folds", type=int, default=5)
    parser.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    parser.add_argument("--outdir", default="runs/protocol_cv")
    args = parser.parse_args()

    X, y, task = load_dataset(args.dataset)

    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # fixed model family hyper-parameters for the methodology experiments
    params_by_algo = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = params_by_algo[args.algo]

    out_root = Path(args.outdir)
    out_root.mkdir(parents=True, exist_ok=True)

    summaries = []
    replicate_rows = []
    for seed in args.seeds:
        seed_summary, seed_rows = run_cv_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_folds=args.n_folds,
            seed=seed,
        )
        summaries.append(seed_summary)
        replicate_rows.extend(seed_rows)

    stem = f"{args.dataset}_{args.algo}"
    summary_path = out_root / f"{stem}_cv_summary.csv"
    reps_path = out_root / f"{stem}_cv_replicates.csv"
    cfg_path = out_root / f"config_{stem}_cv.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(replicate_rows).to_csv(reps_path, index=False)
    # persist the full run configuration next to the results
    run_config = {
        "dataset": args.dataset,
        "algo": args.algo,
        "task": task,
        "protocol": "cv",
        "protocol_params": {"n_folds": args.n_folds, "seeds": args.seeds},
        "model_params": model_params,
        "preproc_cfg": preproc_cfg,
    }
    with open(cfg_path, "w") as f:
        json.dump(run_config, f, indent=2)
    print("Saved:")
    for written in (summary_path, reps_path, cfg_path):
        print(written)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,206 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import noise_perturbations
def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the input-noise stability protocol for one fixed model family.

    Fits one pipeline per noise-perturbed copy of X, scores it on the clean
    data, and tracks SHAP stability across replicates.

    Returns:
        (summary, rep_rows) — one summary dict for this seed and one row
        dict per replicate (sigma, loss, timings, running stability).
    """
    rng = np.random.RandomState(seed)
    # fixed SHAP evaluation pool shared by every replicate of this seed
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # freeze preprocessing dimensionality so SHAP matrices stay comparable
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    # probe fit learns the post-preprocessing feature count so select_k
    # can be clamped to a feasible value
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    # use your old noise generator for consistency
    # it expects a list of levels, so repeat noise_std
    levels = [noise_std] * n_replicates
    noisy_sets = noise_perturbations(X, levels, seed)
    for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-replicate model seed so replicates are independent draws
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        # fits the pipeline on the noisy copy and computes SHAP values on the
        # fixed evaluation pool (predict below relies on the fit)
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # evaluate on clean X to measure robustness of noisy training
        if task == "regression":
            y_pred = pipe.predict(X)
            loss = float(mean_squared_error(y, y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                # min-max scale decision scores into [0, 1] pseudo-probabilities
                scores = pipe.decision_function(X)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y, y_prob))
        # running stability over all replicates so far (trajectory data)
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # final stability over all replicates (equals the last running value)
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the noise protocol across seeds and save CSV/JSON outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    parser.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    parser.add_argument("--n-replicates", type=int, default=30)
    parser.add_argument("--noise-std", type=float, default=0.01)
    parser.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    parser.add_argument("--outdir", default="runs/protocol_noise")
    args = parser.parse_args()

    X, y, task = load_dataset(args.dataset)

    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # fixed hyper-parameters per model family (frozen for the methodology study)
    params_by_algo = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = params_by_algo[args.algo]

    out_root = Path(args.outdir)
    out_root.mkdir(parents=True, exist_ok=True)

    summaries = []
    replicate_rows = []
    for seed in args.seeds:
        seed_summary, seed_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(seed_summary)
        replicate_rows.extend(seed_rows)

    stem = f"{args.dataset}_{args.algo}"
    summary_path = out_root / f"{stem}_noise_summary.csv"
    reps_path = out_root / f"{stem}_noise_replicates.csv"
    cfg_path = out_root / f"config_{stem}_noise.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(replicate_rows).to_csv(reps_path, index=False)
    # persist the full run configuration next to the results
    run_config = {
        "dataset": args.dataset,
        "algo": args.algo,
        "task": task,
        "protocol": "noise",
        "protocol_params": {
            "n_replicates": args.n_replicates,
            "noise_std": args.noise_std,
            "seeds": args.seeds,
        },
        "model_params": model_params,
        "preproc_cfg": preproc_cfg,
    }
    with open(cfg_path, "w") as f:
        json.dump(run_config, f, indent=2)
    print("Saved:")
    for written in (summary_path, reps_path, cfg_path):
        print(written)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,78 @@
import mlflow
from deap import base, creator, tools
from sklearn.utils import check_random_state
from src.search.nsga_deap import decode
from src.protocols_methodology.automl_evaluate import evaluate_config_protocol_aware
def build_toolbox_protocol_aware(
    X,
    y,
    task,
    seed,
    protocol_fn,
    protocol_params,
    mlflow_experiment,
):
    """Build a DEAP toolbox whose fitness evaluation is replication-protocol aware.

    Individuals are lists of 16 integer genes decoded by ``decode`` into
    (algo, model_params, pre_cfg).  The two fitness objectives are
    (minimize protocol loss, maximize SHAP stability) — weights (-1.0, 1.0).

    NOTE(review): ``mlflow_experiment`` is accepted but never used here; the
    caller appears responsible for setting the active experiment — confirm
    whether it should be wired into ``mlflow.set_experiment`` instead.
    """
    rng = check_random_state(seed)
    # DEAP creator classes are process-global: guard against double creation
    if not hasattr(creator, "FitnessMSEStab"):
        creator.create("FitnessMSEStab", base.Fitness, weights=(-1.0, 1.0))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMSEStab)
    toolbox = base.Toolbox()
    # NOTE(review): numpy randint draws from [0, 1000000) (exclusive upper
    # bound) while mutUniformInt below uses an inclusive up=1000000, so
    # mutation can produce a value initialization never does — harmless if
    # decode() wraps gene values, but worth confirming.
    toolbox.register("gene", rng.randint, 0, 1000000)
    toolbox.register(
        "individual",
        tools.initRepeat,
        creator.Individual,
        toolbox.gene,
        n=16,
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def eval_ind(individual):
        # Decode genes into a concrete configuration, log everything to a
        # nested MLflow run, and evaluate under the supplied protocol.
        algo, model_params, pre_cfg = decode(individual)
        with mlflow.start_run(run_name=f"{algo}", nested=True):
            # log raw genes plus decoded params for reproducibility
            for gi, g in enumerate(individual):
                mlflow.log_param(f"g{gi}", int(g))
            mlflow.log_param("algo", algo)
            for k, v in model_params.items():
                mlflow.log_param(f"m_{k}", v)
            for k, v in pre_cfg.items():
                mlflow.log_param(f"p_{k}", v)
            mse_like, stability, meta = evaluate_config_protocol_aware(
                X=X,
                y=y,
                task=task,
                algo=algo,
                model_params=model_params,
                pre_cfg=pre_cfg,
                protocol_fn=protocol_fn,
                protocol_params=protocol_params,
                seed=seed,
            )
            mlflow.log_metric("mse_like", mse_like)
            mlflow.log_metric("stability", stability)
            for mk, mv in meta.items():
                mlflow.log_metric(mk, mv)
        # DEAP expects a tuple matching the fitness weights
        return mse_like, stability

    toolbox.register("evaluate", eval_ind)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register(
        "mutate",
        tools.mutUniformInt,
        low=0,
        up=1000000,
        indpb=0.2,
    )
    toolbox.register("select", tools.selNSGA2)
    return toolbox

View File

@@ -0,0 +1,37 @@
# src/protocols.py
from typing import Iterable, Tuple, List
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import KFold
from sklearn.utils import resample
def kfold_indices(n: int, k: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield (train_idx, test_idx) pairs from a shuffled k-fold split of range(n)."""
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
    for train_part, test_part in splitter.split(range(n)):
        yield np.array(train_part), np.array(test_part)
def bootstrap_indices(n: int, B: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield B bootstrap (train_idx, oob_test_idx) pairs over range(n).

    The test set is the out-of-bag rows; if a resample happens to cover
    every row, a random ~20% subset is drawn as the test set instead.
    """
    rng = np.random.RandomState(seed)
    all_rows = np.arange(n)
    for _ in range(B):
        train_idx = resample(all_rows, replace=True, n_samples=n, random_state=rng)
        oob = np.setdiff1d(all_rows, train_idx)
        if oob.size == 0:
            oob = rng.choice(n, size=max(1, n // 5), replace=False)
        yield train_idx, oob
def noise_perturbations(X: pd.DataFrame, levels: List[float], seed: int):
    """Return ``[(sigma, X_noisy), ...]`` with Gaussian noise on numeric columns.

    For each sigma in ``levels``, a copy of ``X`` is made and every numeric
    column gets additive N(0, sigma * std(col)) noise; non-numeric columns
    are left untouched.

    Fix: columns with zero std (constant) *or* undefined std (fewer than two
    rows, all-NaN) now fall back to a scale of 1.0 — previously a NaN std
    silently produced all-NaN noise.
    """
    rng = np.random.RandomState(seed)
    Xn_list = []
    # compute std only on numeric columns
    num_cols = [c for c in X.columns if is_numeric_dtype(X[c])]
    # guard both degenerate cases: std == 0 and std == NaN
    std = X[num_cols].std().replace(0, 1.0).fillna(1.0)
    for sigma in levels:
        Xn = X.copy()
        for col in num_cols:
            Xn[col] = Xn[col] + rng.normal(0, sigma * std.get(col, 1.0), size=len(Xn))
        Xn_list.append((sigma, Xn))
    return Xn_list

View File

@@ -0,0 +1,204 @@
import argparse
import random
import pickle
from pathlib import Path
import numpy as np
import mlflow
from deap import algorithms
from deap.tools.emo import sortNondominated
import pandas as pd
from src.data_openml import load_dataset
from src.search.nsga_deap import decode
from src.protocols_methodology.nsga_toolbox_protocols import build_toolbox_protocol_aware
from src.protocols_methodology.automl_protocol_adapters import (
cv_protocol,
bootstrap_protocol,
noise_protocol,
)
def save_checkpoint(path, gen, pop, seed):
    """Pickle the GA population plus both RNG states so a run can resume exactly."""
    snapshot = {
        "gen": gen,
        "pop": pop,
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    with open(path, "wb") as handle:
        pickle.dump(snapshot, handle)
def load_checkpoint(path):
    """Load a checkpoint, restore both RNG states, and return (gen, pop, seed).

    NOTE: this unpickles the file, so only load checkpoints this program wrote.
    """
    with open(path, "rb") as handle:
        snapshot = pickle.load(handle)
    random.setstate(snapshot["py_random_state"])
    np.random.set_state(snapshot["np_random_state"])
    return snapshot["gen"], snapshot["pop"], snapshot["seed"]
def main():
    """Run the NSGA-II AutoML search once per replication protocol (cv, bootstrap, noise).

    For each protocol: resume from its checkpoint if present, evolve the
    population, save per-generation Pareto fronts, the final population, the
    final Pareto front, and SHAP arrays for each Pareto-front model.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--experiment", default="deap_nsga_protocol_study")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument("--shap-pf-eval-rows", type=int, default=512)
    ap.add_argument("--n-folds", type=int, default=3)
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--n-noise", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    args = ap.parse_args()
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)
    # protocol name -> (replicate generator, its keyword parameters)
    protocols = {
        "cv": (cv_protocol, {"n_folds": args.n_folds}),
        "bootstrap": (bootstrap_protocol, {"n_bootstrap": args.n_bootstrap}),
        "noise": (noise_protocol, {"n_replicates": args.n_noise, "noise_std": args.noise_std}),
    }
    base_outdir = Path("runs") / f"{args.dataset}_protocol_study"
    base_outdir.mkdir(parents=True, exist_ok=True)
    for pname, (pfn, pparams) in protocols.items():
        outdir = base_outdir / pname
        outdir.mkdir(parents=True, exist_ok=True)
        ckpt_path = outdir / "checkpoint.pkl"
        # reset both RNGs so every protocol starts from the same state
        random.seed(args.seed)
        np.random.seed(args.seed)
        toolbox = build_toolbox_protocol_aware(
            X=X,
            y=y,
            task=task,
            seed=args.seed,
            protocol_fn=pfn,
            protocol_params=pparams,
            mlflow_experiment=args.experiment,
        )
        if ckpt_path.exists():
            # resume: load_checkpoint also restores the RNG states just seeded above
            start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
            if loaded_seed != args.seed:
                # only warns — the run continues with the checkpoint's RNG states
                print(f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}")
            print(f"[{pname}] Resuming from generation {start_gen}")
        else:
            # fresh start: evaluate the initial population before generation 0
            pop = toolbox.population(n=args.pop_size)
            fits = list(map(toolbox.evaluate, pop))
            for ind, fit in zip(pop, fits):
                ind.fitness.values = fit
            start_gen = 0
            save_checkpoint(ckpt_path, start_gen, pop, args.seed)
            print(f"[{pname}] Initial checkpoint saved")
        for gen in range(start_gen, args.generations):
            # variation (crossover + mutation), then evaluate offspring
            offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
            fits = list(map(toolbox.evaluate, offspring))
            for ind, fit in zip(offspring, fits):
                ind.fitness.values = fit
            # (mu + lambda)-style NSGA-II selection back down to pop-size
            pop = toolbox.select(pop + offspring, k=args.pop_size)
            # save PF history for convergence plots
            pf_gen = sortNondominated(pop, len(pop), first_front_only=True)[0]
            rows_gen = []
            for ind in pf_gen:
                algo, model_params, pre_cfg = decode(ind)
                rows_gen.append({
                    "gen": gen + 1,
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                })
            pd.DataFrame(rows_gen).to_csv(outdir / f"pareto_gen_{gen + 1}.csv", index=False)
            if (gen + 1) % args.checkpoint_every == 0:
                save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
                print(f"[{pname}] Checkpoint saved at gen {gen + 1}")
        # save final full population for dominated region analysis
        all_rows = []
        for ind in pop:
            algo, model_params, pre_cfg = decode(ind)
            all_rows.append({
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            })
        pd.DataFrame(all_rows).to_csv(outdir / "final_population.csv", index=False)
        pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
        rows = []
        for ind in pf:
            algo, model_params, pre_cfg = decode(ind)
            rows.append(
                {
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                    **{f"m_{k}": v for k, v in model_params.items()},
                    **{f"p_{k}": v for k, v in pre_cfg.items()},
                }
            )
        pareto_path = outdir / "pareto_front.csv"
        pd.DataFrame(rows).to_csv(pareto_path, index=False)
        print(f"[{pname}] Saved Pareto front to {pareto_path}")
        # optional SHAP saving for PF models, same as run_deap
        shap_dir = outdir / "shap"
        shap_dir.mkdir(exist_ok=True)
        eval_rows = min(args.shap_pf_eval_rows, len(X))
        rng = np.random.RandomState(args.seed)
        eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
        X_eval_shap = X.iloc[eval_idx]
        # imports deferred to here: these are only needed for the PF SHAP export
        from src.preprocessing import build_preprocessor
        from src.models import make_model
        from src.stability import compute_shap_matrix
        from sklearn.pipeline import Pipeline as SkPipeline
        for i, ind in enumerate(pf):
            algo, model_params, pre_cfg = decode(ind)
            fixed_poly_degree = pre_cfg.get("poly_degree", 1)
            # NOTE(review): unlike the protocol runner scripts, select_k is not
            # clamped to the post-preprocessing feature count here — confirm
            # decode() guarantees a feasible value.
            fixed_k = pre_cfg.get("select_k", None)
            preproc = build_preprocessor(
                X,
                task,
                pre_cfg,
                fixed_k=fixed_k,
                fixed_poly_degree=fixed_poly_degree,
            )
            model = make_model(task, algo, model_params, random_state=args.seed)
            pipe = SkPipeline([("pre", preproc), ("model", model)])
            # refit on the full data and export SHAP values for this PF model
            shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
                pipe,
                X_fit=X,
                y_fit=y,
                X_eval=X_eval_shap,
                task_type=task,
                bg_size=128,
                max_eval_rows=eval_rows,
                rng_seed=args.seed,
            )
            np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
            np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))
        print(f"[{pname}] Saved SHAP arrays for {len(pf)} PF models")
    print(f"Done. All protocol AutoML runs in {base_outdir}")


if __name__ == "__main__":
    main()