New Start

This commit is contained in:
Varyngoth
2025-12-04 16:53:58 -04:00
parent 3657e8ea18
commit c5fb865583
9 changed files with 1043 additions and 13 deletions

View File

@@ -5,7 +5,6 @@ from pathlib import Path
import numpy as np
import mlflow
from datetime import datetime
from deap import algorithms
from deap.tools.emo import sortNondominated
import pandas as pd
@@ -16,16 +15,6 @@ from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix
# Main network
# mlflow.set_tracking_uri("http://192.168.2.169:5000")
# Cluster Subnet
mlflow.set_tracking_uri("http://10.10.0.5:5000")
# Network with DNS resolution (specified hosts or Tailnet)
#mlflow.set_tracking_uri("http://medea:5000")
def save_checkpoint(path, gen, pop, seed):
    state = {
@@ -54,8 +43,7 @@ def main():
ap.add_argument("--pop-size", type=int, default=24)
ap.add_argument("--seed", type=int, default=42)
ap.add_argument("--cv-folds", type=int, default=3)
experiment_name = f"deap_nsga_shap_{datetime.now().strftime('%Y%m%d_%H%M%S')}" ap.add_argument("--experiment", default="deap_nsga_shap")
ap.add_argument("--experiment", default=experiment_name)
ap.add_argument("--checkpoint-every", type=int, default=5)
ap.add_argument(
    "--shap-pf-eval-rows",

View File

@@ -0,0 +1,103 @@
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
def evaluate_config_protocol_aware(
    X,
    y,
    task,
    algo,
    model_params,
    pre_cfg,
    protocol_fn,
    protocol_params,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Score one pipeline configuration under a resampling protocol.

    For each replicate produced by ``protocol_fn`` the configured
    preprocessing + model pipeline is refit, SHAP values are computed on a
    fixed evaluation pool, and a predictive loss is recorded. The returned
    objectives are the mean loss (to minimize) and a SHAP-stability score
    across replicates (to maximize).

    Args:
        X, y: feature table and target; assumed pandas objects since row
            selection uses ``.iloc`` — TODO confirm all callers pass pandas.
        task: "regression" selects MSE loss; any other value is treated as
            binary classification and scored with the Brier loss.
        algo: model family name forwarded to ``make_model``.
        model_params: hyperparameters forwarded to ``make_model``.
        pre_cfg: preprocessing options; ``poly_degree`` and ``select_k`` are
            read here, everything is forwarded to ``build_preprocessor``.
        protocol_fn: callable ``(X, y, seed=..., **protocol_params)`` that
            returns replicate dicts with ``"type"`` in
            {"cv", "bootstrap", "noise"}.
        protocol_params: extra keyword arguments for ``protocol_fn``.
        seed: base RNG seed; each replicate model gets ``seed + rep_id``.
        max_eval_rows: cap on the size of the fixed SHAP evaluation pool.
        bg_size: SHAP background sample size.

    Returns:
        Tuple ``(mse_like, stability_val, meta)``: mean loss across
        replicates, stability score, and a dict of diagnostics.
    """
    rng = np.random.RandomState(seed)
    # Fixed SHAP evaluation pool per individual evaluation: every replicate
    # is explained on the same rows so stability is comparable.
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # Freeze preprocessing dimensionality: fit a probe preprocessor once on
    # the full data to learn the post-transform width, then clamp select_k
    # to that width so every replicate yields same-shaped SHAP matrices.
    fixed_poly_degree = pre_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, pre_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = pre_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    losses = []
    fit_times = []
    shap_times = []
    replicates = protocol_fn(X, y, seed=seed, **protocol_params)
    for rep_id, rep in enumerate(replicates):
        if rep["type"] in ["cv", "bootstrap"]:
            # Index-based replicates: train on the replicate rows, score on
            # the held-out rows.
            tr, te = rep["train_idx"], rep["test_idx"]
            X_fit, y_fit = X.iloc[tr], y.iloc[tr]
            X_test, y_test = X.iloc[te], y.iloc[te]
        else:
            # Noise replicates: train on the perturbed copy, score on the
            # clean data.
            X_fit, y_fit = rep["X_noisy"], y
            X_test, y_test = X, y
        preproc = build_preprocessor(
            X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # Vary the model seed per replicate so replicate models are
        # independent draws of the same configuration.
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        # compute_shap_matrix fits the pipeline and returns SHAP values plus
        # timings — TODO confirm it leaves `pipe` fitted, since predictions
        # below reuse it without an explicit fit.
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_fit,
            y_fit=y_fit,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        fit_times.append(t_fit)
        shap_times.append(t_shap)
        if task == "regression":
            y_pred = pipe.predict(X_test)
            loss = float(mean_squared_error(y_test, y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X_test)[:, 1]
            else:
                # No probability output: min-max squash decision scores into
                # [0, 1]; the epsilon guards a zero score range.
                scores = pipe.decision_function(X_test)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y_test, y_prob))
        losses.append(loss)
    # Aggregate stability once over all replicate SHAP matrices.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    mse_like = float(np.mean(losses))
    stability_val = float(stability)
    meta = {
        "loss_std": float(np.std(losses)),
        "fit_time_mean": float(np.mean(fit_times)) if fit_times else 0.0,
        "shap_time_mean": float(np.mean(shap_times)) if shap_times else 0.0,
        "inst_feat_std": float(agg_std),
        "n_replicates": len(replicates),
    }
    return mse_like, stability_val, meta

View File

@@ -0,0 +1,24 @@
from src.protocols_methodology.protocols import (
kfold_indices,
bootstrap_indices,
noise_perturbations,
)
def cv_protocol(X, y, n_folds=5, seed=0):
    """Wrap kfold_indices as a list of "cv" replicate dicts for the evaluator."""
    return [
        {"type": "cv", "train_idx": train_idx, "test_idx": test_idx}
        for train_idx, test_idx in kfold_indices(len(X), n_folds, seed)
    ]
def bootstrap_protocol(X, y, n_bootstrap=30, seed=0):
    """Wrap bootstrap_indices as a list of "bootstrap" replicate dicts."""
    return [
        {"type": "bootstrap", "train_idx": train_idx, "test_idx": test_idx}
        for train_idx, test_idx in bootstrap_indices(len(X), n_bootstrap, seed)
    ]
def noise_protocol(X, y, n_replicates=30, noise_std=0.01, seed=0):
    """Wrap noise_perturbations as "noise" replicate dicts.

    The same noise level is repeated n_replicates times; each replicate
    carries its sigma and the perturbed copy of X.
    """
    return [
        {"type": "noise", "sigma": sigma, "X_noisy": X_noisy}
        for sigma, X_noisy in noise_perturbations(X, [noise_std] * n_replicates, seed)
    ]

View File

@@ -0,0 +1,195 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import bootstrap_indices
def run_bootstrap_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_bootstrap=30,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the bootstrap stability protocol for one model family.

    Refits the pipeline on ``n_bootstrap`` bootstrap resamples, computes SHAP
    values on a fixed evaluation pool, and scores each replicate on its
    out-of-bag rows ("regression" → MSE, otherwise Brier loss).

    Returns:
        Tuple ``(summary, rep_rows)``: a per-seed summary dict and one dict
        per replicate (both suitable for ``pd.DataFrame``).
    """
    rng = np.random.RandomState(seed)
    # Fixed SHAP evaluation pool per seed so replicates are comparable.
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # Freeze preprocessor output width: probe once on the full data, then
    # clamp select_k so all replicates produce same-shaped SHAP matrices.
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    for rep_id, (tr, te) in enumerate(bootstrap_indices(len(X), n_bootstrap, seed)):
        X_boot = X.iloc[tr]
        y_boot = y.iloc[tr]
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # Per-replicate model seed so replicates are independent draws.
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_boot,
            y_fit=y_boot,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # OOB loss on te to match your earlier logic
        if task == "regression":
            y_pred = pipe.predict(X.iloc[te])
            loss = float(mean_squared_error(y.iloc[te], y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
            else:
                # No predict_proba: min-max squash decision scores to [0, 1].
                scores = pipe.decision_function(X.iloc[te])
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y.iloc[te], y_prob))
        # Running stability over the replicates seen so far (recomputed on a
        # growing list each iteration) so it can be plotted as a trajectory.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "bootstrap",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # Final stability over all replicates.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "bootstrap",
        "n_replicates": n_bootstrap,
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the bootstrap protocol over several seeds.

    Writes a per-seed summary CSV, a per-replicate CSV, and a JSON record of
    the exact configuration into --outdir.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_bootstrap")
    args = ap.parse_args()
    X, y, task = load_dataset(args.dataset)
    # Fixed preprocessing: this is a methodology study, not a search.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # Fixed hyperparameters per model family.
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        summary, rep_rows = run_bootstrap_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_bootstrap=args.n_bootstrap,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)
    summary_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_bootstrap.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
    # Record the exact configuration next to the results for reproducibility.
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "bootstrap",
                "protocol_params": {"n_bootstrap": args.n_bootstrap, "seeds": args.seeds},
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )
    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,195 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import kfold_indices
def run_cv_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_folds=5,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the k-fold CV stability protocol for one model family.

    Refits the pipeline once per fold, computes SHAP values on a fixed
    evaluation pool, and scores each fold on its test split
    ("regression" → MSE, otherwise Brier loss).

    Returns:
        Tuple ``(summary, rep_rows)``: a per-seed summary dict and one dict
        per fold (both suitable for ``pd.DataFrame``).
    """
    rng = np.random.RandomState(seed)
    # fixed SHAP evaluation pool per seed
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # freeze preprocessor dimensions for stability comparability:
    # probe once on the full data, then clamp select_k to the
    # post-transform width so all folds yield same-shaped SHAP matrices.
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    # your exact old KFold generator
    for rep_id, (tr, te) in enumerate(kfold_indices(len(X), n_folds, seed)):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # Per-fold model seed so folds are independent draws.
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X.iloc[tr],
            y_fit=y.iloc[tr],
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # loss on fold test
        if task == "regression":
            y_pred = pipe.predict(X.iloc[te])
            loss = float(mean_squared_error(y.iloc[te], y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
            else:
                # No predict_proba: min-max squash decision scores to [0, 1].
                scores = pipe.decision_function(X.iloc[te])
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y.iloc[te], y_prob))
        # running stability so replicates can be plotted as a trajectory
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "cv",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # Final stability over all folds.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "cv",
        "n_replicates": n_folds,
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the k-fold CV protocol over several seeds.

    Writes a per-seed summary CSV, a per-fold CSV, and a JSON record of the
    exact configuration into --outdir.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-folds", type=int, default=5)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_cv")
    args = ap.parse_args()
    X, y, task = load_dataset(args.dataset)
    # Fixed preprocessing: this is a methodology study, not a search.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # fixed family for methodology experiments
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        summary, rep_rows = run_cv_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_folds=args.n_folds,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)
    summary_path = outdir / f"{args.dataset}_{args.algo}_cv_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_cv_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_cv.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
    # Record the exact configuration next to the results for reproducibility.
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "cv",
                "protocol_params": {"n_folds": args.n_folds, "seeds": args.seeds},
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )
    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,206 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import noise_perturbations
def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the input-noise stability protocol for one model family.

    Refits the pipeline on ``n_replicates`` noise-perturbed copies of X,
    computes SHAP values on a fixed evaluation pool, and scores every
    replicate on the clean, unperturbed data
    ("regression" → MSE, otherwise Brier loss).

    Returns:
        Tuple ``(summary, rep_rows)``: a per-seed summary dict and one dict
        per replicate (both suitable for ``pd.DataFrame``).
    """
    rng = np.random.RandomState(seed)
    # Fixed SHAP evaluation pool per seed so replicates are comparable.
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    # Freeze preprocessor output width: probe once on the full data, then
    # clamp select_k so all replicates produce same-shaped SHAP matrices.
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    # use your old noise generator for consistency
    # it expects a list of levels, so repeat noise_std
    levels = [noise_std] * n_replicates
    noisy_sets = noise_perturbations(X, levels, seed)
    for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # Per-replicate model seed so replicates are independent draws.
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # evaluate on clean X to measure robustness of noisy training
        if task == "regression":
            y_pred = pipe.predict(X)
            loss = float(mean_squared_error(y, y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                # No predict_proba: min-max squash decision scores to [0, 1].
                scores = pipe.decision_function(X)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y, y_prob))
        # Running stability over the replicates seen so far, so the
        # trajectory can be plotted.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # Final stability over all replicates.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the input-noise protocol over several seeds.

    Writes a per-seed summary CSV, a per-replicate CSV, and a JSON record of
    the exact configuration into --outdir.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-replicates", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_noise")
    args = ap.parse_args()
    X, y, task = load_dataset(args.dataset)
    # Fixed preprocessing: this is a methodology study, not a search.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # Fixed hyperparameters per model family.
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        summary, rep_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)
    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
    # Record the exact configuration next to the results for reproducibility.
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "noise",
                "protocol_params": {
                    "n_replicates": args.n_replicates,
                    "noise_std": args.noise_std,
                    "seeds": args.seeds,
                },
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )
    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,78 @@
import mlflow
from deap import base, creator, tools
from sklearn.utils import check_random_state
from src.search.nsga_deap import decode
from src.protocols_methodology.automl_evaluate import evaluate_config_protocol_aware
def build_toolbox_protocol_aware(
    X,
    y,
    task,
    seed,
    protocol_fn,
    protocol_params,
    mlflow_experiment,
):
    """Build a DEAP toolbox whose fitness evaluation runs a stability protocol.

    Individuals are length-16 lists of integer genes decoded by ``decode``
    into (algo, model_params, pre_cfg). Fitness is the two-objective tuple
    (mse_like, stability) with weights (-1, +1): minimize loss, maximize
    stability. Selection is NSGA-II.

    Args:
        X, y, task: dataset, passed through to the evaluator.
        seed: seeds the gene RNG and the evaluator.
        protocol_fn, protocol_params: resampling protocol forwarded to
            ``evaluate_config_protocol_aware``.
        mlflow_experiment: NOTE(review) currently unused in this function —
            presumably the caller runs ``mlflow.set_experiment`` itself;
            confirm and either use or drop this parameter.

    Returns:
        A configured ``deap.base.Toolbox``.
    """
    rng = check_random_state(seed)
    # creator classes are process-global: guard against re-creation when
    # this builder is called more than once in the same process.
    if not hasattr(creator, "FitnessMSEStab"):
        creator.create("FitnessMSEStab", base.Fitness, weights=(-1.0, 1.0))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMSEStab)
    toolbox = base.Toolbox()
    # Genes drawn from [0, 1000000) — NOTE(review): mutUniformInt below uses
    # an inclusive upper bound of 1000000; presumably harmless if decode
    # reduces genes modulo its option counts — confirm against decode.
    toolbox.register("gene", rng.randint, 0, 1000000)
    toolbox.register(
        "individual",
        tools.initRepeat,
        creator.Individual,
        toolbox.gene,
        n=16,
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def eval_ind(individual):
        # One nested MLflow run per evaluated individual: logs the raw
        # genome, the decoded configuration, and both objectives plus the
        # evaluator's diagnostic metrics.
        algo, model_params, pre_cfg = decode(individual)
        with mlflow.start_run(run_name=f"{algo}", nested=True):
            for gi, g in enumerate(individual):
                mlflow.log_param(f"g{gi}", int(g))
            mlflow.log_param("algo", algo)
            for k, v in model_params.items():
                mlflow.log_param(f"m_{k}", v)
            for k, v in pre_cfg.items():
                mlflow.log_param(f"p_{k}", v)
            mse_like, stability, meta = evaluate_config_protocol_aware(
                X=X,
                y=y,
                task=task,
                algo=algo,
                model_params=model_params,
                pre_cfg=pre_cfg,
                protocol_fn=protocol_fn,
                protocol_params=protocol_params,
                seed=seed,
            )
            mlflow.log_metric("mse_like", mse_like)
            mlflow.log_metric("stability", stability)
            for mk, mv in meta.items():
                mlflow.log_metric(mk, mv)
        return mse_like, stability

    toolbox.register("evaluate", eval_ind)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register(
        "mutate",
        tools.mutUniformInt,
        low=0,
        up=1000000,
        indpb=0.2,
    )
    toolbox.register("select", tools.selNSGA2)
    return toolbox

View File

@@ -0,0 +1,37 @@
# src/protocols.py
from typing import Iterable, Tuple, List
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import KFold
from sklearn.utils import resample
def kfold_indices(n: int, k: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield (train_idx, test_idx) pairs for seeded, shuffled k-fold CV over n rows."""
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
    for train_ids, test_ids in splitter.split(range(n)):
        yield np.array(train_ids), np.array(test_ids)
def bootstrap_indices(n: int, B: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield B bootstrap (train_idx, oob_test_idx) pairs over n rows.

    Training indices are drawn with replacement; test indices are the
    out-of-bag rows (sorted). If a resample happens to cover every row,
    a random subset of roughly 20% of rows is used instead so the test
    set is never empty.
    """
    rng = np.random.RandomState(seed)
    all_rows = np.arange(n)
    for _ in range(B):
        train_idx = resample(all_rows, replace=True, n_samples=n, random_state=rng)
        test_idx = np.setdiff1d(all_rows, train_idx)
        if len(test_idx) == 0:
            test_idx = rng.choice(n, size=max(1, n // 5), replace=False)
        yield train_idx, test_idx
def noise_perturbations(X: pd.DataFrame, levels: List[float], seed: int):
    """Return noise-perturbed copies of X, one per entry in ``levels``.

    Gaussian noise with standard deviation ``sigma * column_std`` is added to
    each numeric column; non-numeric columns are left untouched.

    Args:
        X: input frame; only numeric columns are perturbed.
        levels: relative noise levels (sigma values), one perturbed copy each.
        seed: seeds the NumPy RNG so the perturbations are reproducible.

    Returns:
        List of ``(sigma, X_noisy)`` tuples in the same order as ``levels``.
    """
    rng = np.random.RandomState(seed)
    Xn_list = []
    # compute std only on numeric columns
    num_cols = [c for c in X.columns if is_numeric_dtype(X[c])]
    # Guard degenerate scales: constant columns give std == 0 and columns
    # with fewer than two non-NaN values give std == NaN; either would make
    # rng.normal produce invalid noise, so fall back to a unit scale.
    std = X[num_cols].std().replace(0, 1.0).fillna(1.0)
    for sigma in levels:
        Xn = X.copy()
        for col in num_cols:
            Xn[col] = Xn[col] + rng.normal(0, sigma * std.get(col, 1.0), size=len(Xn))
        Xn_list.append((sigma, Xn))
    return Xn_list

View File

@@ -0,0 +1,204 @@
import argparse
import random
import pickle
from pathlib import Path
import numpy as np
import mlflow
from deap import algorithms
from deap.tools.emo import sortNondominated
import pandas as pd
from src.data_openml import load_dataset
from src.search.nsga_deap import decode
from src.protocols_methodology.nsga_toolbox_protocols import build_toolbox_protocol_aware
from src.protocols_methodology.automl_protocol_adapters import (
cv_protocol,
bootstrap_protocol,
noise_protocol,
)
def save_checkpoint(path, gen, pop, seed):
    """Pickle the GA state (generation, population, RNG states, seed) to path."""
    # Capture both RNG streams so a resumed run replays identically.
    state = dict(
        gen=gen,
        pop=pop,
        py_random_state=random.getstate(),
        np_random_state=np.random.get_state(),
        seed=seed,
    )
    with open(path, "wb") as handle:
        pickle.dump(state, handle)
def load_checkpoint(path):
    """Restore GA state written by save_checkpoint.

    Re-seeds both the stdlib and NumPy global RNGs as a side effect so the
    resumed run continues the original random stream.

    Returns:
        Tuple (gen, pop, seed).
    """
    with open(path, "rb") as handle:
        # NOTE: pickle — only load checkpoints this program wrote itself.
        state = pickle.load(handle)
    random.setstate(state["py_random_state"])
    np.random.set_state(state["np_random_state"])
    gen, pop, seed = state["gen"], state["pop"], state["seed"]
    return gen, pop, seed
def main():
    """CLI entry point: run the NSGA-II AutoML search once per protocol.

    For each protocol (cv, bootstrap, noise) this evolves a population with
    a protocol-aware fitness, checkpointing periodically so interrupted runs
    can resume, and writes per-generation Pareto fronts, the final
    population, the final Pareto front, and SHAP arrays for the Pareto-front
    models under runs/<dataset>_protocol_study/<protocol>/.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--experiment", default="deap_nsga_protocol_study")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument("--shap-pf-eval-rows", type=int, default=512)
    ap.add_argument("--n-folds", type=int, default=3)
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--n-noise", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    args = ap.parse_args()
    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)
    # Protocol registry: name -> (replicate factory, its keyword arguments).
    protocols = {
        "cv": (cv_protocol, {"n_folds": args.n_folds}),
        "bootstrap": (bootstrap_protocol, {"n_bootstrap": args.n_bootstrap}),
        "noise": (noise_protocol, {"n_replicates": args.n_noise, "noise_std": args.noise_std}),
    }
    base_outdir = Path("runs") / f"{args.dataset}_protocol_study"
    base_outdir.mkdir(parents=True, exist_ok=True)
    for pname, (pfn, pparams) in protocols.items():
        outdir = base_outdir / pname
        outdir.mkdir(parents=True, exist_ok=True)
        ckpt_path = outdir / "checkpoint.pkl"
        # Reset both global RNGs per protocol so protocols are comparable.
        random.seed(args.seed)
        np.random.seed(args.seed)
        toolbox = build_toolbox_protocol_aware(
            X=X,
            y=y,
            task=task,
            seed=args.seed,
            protocol_fn=pfn,
            protocol_params=pparams,
            mlflow_experiment=args.experiment,
        )
        if ckpt_path.exists():
            # Resume: load_checkpoint also restores the saved RNG states.
            start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
            if loaded_seed != args.seed:
                print(f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}")
            print(f"[{pname}] Resuming from generation {start_gen}")
        else:
            # Fresh start: evaluate the initial population and checkpoint it.
            pop = toolbox.population(n=args.pop_size)
            fits = list(map(toolbox.evaluate, pop))
            for ind, fit in zip(pop, fits):
                ind.fitness.values = fit
            start_gen = 0
            save_checkpoint(ckpt_path, start_gen, pop, args.seed)
            print(f"[{pname}] Initial checkpoint saved")
        for gen in range(start_gen, args.generations):
            # (mu + lambda)-style step: vary, evaluate, NSGA-II select.
            offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
            fits = list(map(toolbox.evaluate, offspring))
            for ind, fit in zip(offspring, fits):
                ind.fitness.values = fit
            pop = toolbox.select(pop + offspring, k=args.pop_size)
            # save PF history for convergence plots
            pf_gen = sortNondominated(pop, len(pop), first_front_only=True)[0]
            rows_gen = []
            for ind in pf_gen:
                algo, model_params, pre_cfg = decode(ind)
                rows_gen.append({
                    "gen": gen + 1,
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                })
            pd.DataFrame(rows_gen).to_csv(outdir / f"pareto_gen_{gen + 1}.csv", index=False)
            if (gen + 1) % args.checkpoint_every == 0:
                save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
                print(f"[{pname}] Checkpoint saved at gen {gen + 1}")
        # save final full population for dominated region analysis
        all_rows = []
        for ind in pop:
            algo, model_params, pre_cfg = decode(ind)
            all_rows.append({
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            })
        pd.DataFrame(all_rows).to_csv(outdir / "final_population.csv", index=False)
        pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
        rows = []
        for ind in pf:
            algo, model_params, pre_cfg = decode(ind)
            rows.append(
                {
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                    **{f"m_{k}": v for k, v in model_params.items()},
                    **{f"p_{k}": v for k, v in pre_cfg.items()},
                }
            )
        pareto_path = outdir / "pareto_front.csv"
        pd.DataFrame(rows).to_csv(pareto_path, index=False)
        print(f"[{pname}] Saved Pareto front to {pareto_path}")
        # optional SHAP saving for PF models, same as run_deap
        shap_dir = outdir / "shap"
        shap_dir.mkdir(exist_ok=True)
        eval_rows = min(args.shap_pf_eval_rows, len(X))
        rng = np.random.RandomState(args.seed)
        eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
        X_eval_shap = X.iloc[eval_idx]
        # Local imports: only needed for this optional SHAP export step.
        from src.preprocessing import build_preprocessor
        from src.models import make_model
        from src.stability import compute_shap_matrix
        from sklearn.pipeline import Pipeline as SkPipeline
        for i, ind in enumerate(pf):
            # Refit each Pareto-front configuration on the full data and
            # persist its SHAP matrix + feature names for later analysis.
            algo, model_params, pre_cfg = decode(ind)
            fixed_poly_degree = pre_cfg.get("poly_degree", 1)
            fixed_k = pre_cfg.get("select_k", None)
            preproc = build_preprocessor(
                X,
                task,
                pre_cfg,
                fixed_k=fixed_k,
                fixed_poly_degree=fixed_poly_degree,
            )
            model = make_model(task, algo, model_params, random_state=args.seed)
            pipe = SkPipeline([("pre", preproc), ("model", model)])
            shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
                pipe,
                X_fit=X,
                y_fit=y,
                X_eval=X_eval_shap,
                task_type=task,
                bg_size=128,
                max_eval_rows=eval_rows,
                rng_seed=args.seed,
            )
            np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
            np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))
        print(f"[{pname}] Saved SHAP arrays for {len(pf)} PF models")
    print(f"Done. All protocol AutoML runs in {base_outdir}")


if __name__ == "__main__":
    main()