New Start
This commit is contained in:
14
run_deap.py
14
run_deap.py
@@ -5,7 +5,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import mlflow
|
import mlflow
|
||||||
from datetime import datetime
|
|
||||||
from deap import algorithms
|
from deap import algorithms
|
||||||
from deap.tools.emo import sortNondominated
|
from deap.tools.emo import sortNondominated
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -16,16 +15,6 @@ from src.preprocessing import build_preprocessor
|
|||||||
from src.models import make_model
|
from src.models import make_model
|
||||||
from src.stability import compute_shap_matrix
|
from src.stability import compute_shap_matrix
|
||||||
|
|
||||||
# Main network
|
|
||||||
# mlflow.set_tracking_uri("http://192.168.2.169:5000")
|
|
||||||
|
|
||||||
# Cluster Subnet
|
|
||||||
mlflow.set_tracking_uri("http://10.10.0.5:5000")
|
|
||||||
|
|
||||||
# Network with DNS resolution (specified hosts or Tailnet)
|
|
||||||
#mlflow.set_tracking_uri("http://medea:5000")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def save_checkpoint(path, gen, pop, seed):
|
def save_checkpoint(path, gen, pop, seed):
|
||||||
state = {
|
state = {
|
||||||
@@ -54,8 +43,7 @@ def main():
|
|||||||
ap.add_argument("--pop-size", type=int, default=24)
|
ap.add_argument("--pop-size", type=int, default=24)
|
||||||
ap.add_argument("--seed", type=int, default=42)
|
ap.add_argument("--seed", type=int, default=42)
|
||||||
ap.add_argument("--cv-folds", type=int, default=3)
|
ap.add_argument("--cv-folds", type=int, default=3)
|
||||||
experiment_name = f"deap_nsga_shap_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
ap.add_argument("--experiment", default="deap_nsga_shap")
|
||||||
ap.add_argument("--experiment", default=experiment_name)
|
|
||||||
ap.add_argument("--checkpoint-every", type=int, default=5)
|
ap.add_argument("--checkpoint-every", type=int, default=5)
|
||||||
ap.add_argument(
|
ap.add_argument(
|
||||||
"--shap-pf-eval-rows",
|
"--shap-pf-eval-rows",
|
||||||
|
|||||||
103
src/protocols_methodology/automl_evaluate.py
Normal file
103
src/protocols_methodology/automl_evaluate.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
import numpy as np
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||||
|
|
||||||
|
from src.preprocessing import build_preprocessor
|
||||||
|
from src.models import make_model
|
||||||
|
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_config_protocol_aware(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo,
|
||||||
|
model_params,
|
||||||
|
pre_cfg,
|
||||||
|
protocol_fn,
|
||||||
|
protocol_params,
|
||||||
|
seed=0,
|
||||||
|
max_eval_rows=1024,
|
||||||
|
bg_size=128,
|
||||||
|
):
|
||||||
|
rng = np.random.RandomState(seed)
|
||||||
|
|
||||||
|
# fixed SHAP evaluation pool per individual evaluation
|
||||||
|
eval_size = min(max_eval_rows, len(X))
|
||||||
|
eval_idx = rng.choice(len(X), size=eval_size, replace=False)
|
||||||
|
X_eval_fixed = X.iloc[eval_idx]
|
||||||
|
|
||||||
|
# freeze preprocessing dimensionality
|
||||||
|
fixed_poly_degree = pre_cfg.get("poly_degree", 1)
|
||||||
|
probe_pre = build_preprocessor(
|
||||||
|
X, task, pre_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
Xp = probe_pre.fit_transform(X, y)
|
||||||
|
n_after_prep = Xp.shape[1]
|
||||||
|
desired_k = pre_cfg.get("select_k", None)
|
||||||
|
fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
|
||||||
|
|
||||||
|
shap_mats_with_names = []
|
||||||
|
losses = []
|
||||||
|
fit_times = []
|
||||||
|
shap_times = []
|
||||||
|
|
||||||
|
replicates = protocol_fn(X, y, seed=seed, **protocol_params)
|
||||||
|
|
||||||
|
for rep_id, rep in enumerate(replicates):
|
||||||
|
if rep["type"] in ["cv", "bootstrap"]:
|
||||||
|
tr, te = rep["train_idx"], rep["test_idx"]
|
||||||
|
X_fit, y_fit = X.iloc[tr], y.iloc[tr]
|
||||||
|
X_test, y_test = X.iloc[te], y.iloc[te]
|
||||||
|
else:
|
||||||
|
X_fit, y_fit = rep["X_noisy"], y
|
||||||
|
X_test, y_test = X, y
|
||||||
|
|
||||||
|
preproc = build_preprocessor(
|
||||||
|
X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
model = make_model(task, algo, model_params, random_state=seed + rep_id)
|
||||||
|
pipe = Pipeline([("pre", preproc), ("model", model)])
|
||||||
|
|
||||||
|
shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
|
||||||
|
pipe,
|
||||||
|
X_fit=X_fit,
|
||||||
|
y_fit=y_fit,
|
||||||
|
X_eval=X_eval_fixed,
|
||||||
|
task_type=task,
|
||||||
|
bg_size=bg_size,
|
||||||
|
max_eval_rows=max_eval_rows,
|
||||||
|
rng_seed=seed,
|
||||||
|
)
|
||||||
|
shap_mats_with_names.append((shap_vals, feat_names))
|
||||||
|
fit_times.append(t_fit)
|
||||||
|
shap_times.append(t_shap)
|
||||||
|
|
||||||
|
if task == "regression":
|
||||||
|
y_pred = pipe.predict(X_test)
|
||||||
|
loss = float(mean_squared_error(y_test, y_pred))
|
||||||
|
else:
|
||||||
|
if hasattr(pipe.named_steps["model"], "predict_proba"):
|
||||||
|
y_prob = pipe.predict_proba(X_test)[:, 1]
|
||||||
|
else:
|
||||||
|
scores = pipe.decision_function(X_test)
|
||||||
|
scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
|
||||||
|
y_prob = scores
|
||||||
|
loss = float(brier_score_loss(y_test, y_prob))
|
||||||
|
|
||||||
|
losses.append(loss)
|
||||||
|
|
||||||
|
agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
mse_like = float(np.mean(losses))
|
||||||
|
stability_val = float(stability)
|
||||||
|
|
||||||
|
meta = {
|
||||||
|
"loss_std": float(np.std(losses)),
|
||||||
|
"fit_time_mean": float(np.mean(fit_times)) if fit_times else 0.0,
|
||||||
|
"shap_time_mean": float(np.mean(shap_times)) if shap_times else 0.0,
|
||||||
|
"inst_feat_std": float(agg_std),
|
||||||
|
"n_replicates": len(replicates),
|
||||||
|
}
|
||||||
|
|
||||||
|
return mse_like, stability_val, meta
|
||||||
24
src/protocols_methodology/automl_protocol_adapters.py
Normal file
24
src/protocols_methodology/automl_protocol_adapters.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from src.protocols_methodology.protocols import (
|
||||||
|
kfold_indices,
|
||||||
|
bootstrap_indices,
|
||||||
|
noise_perturbations,
|
||||||
|
)
|
||||||
|
|
||||||
|
def cv_protocol(X, y, n_folds=5, seed=0):
|
||||||
|
reps = []
|
||||||
|
for tr, te in kfold_indices(len(X), n_folds, seed):
|
||||||
|
reps.append({"type": "cv", "train_idx": tr, "test_idx": te})
|
||||||
|
return reps
|
||||||
|
|
||||||
|
def bootstrap_protocol(X, y, n_bootstrap=30, seed=0):
|
||||||
|
reps = []
|
||||||
|
for tr, te in bootstrap_indices(len(X), n_bootstrap, seed):
|
||||||
|
reps.append({"type": "bootstrap", "train_idx": tr, "test_idx": te})
|
||||||
|
return reps
|
||||||
|
|
||||||
|
def noise_protocol(X, y, n_replicates=30, noise_std=0.01, seed=0):
|
||||||
|
levels = [noise_std] * n_replicates
|
||||||
|
reps = []
|
||||||
|
for sigma, X_noisy in noise_perturbations(X, levels, seed):
|
||||||
|
reps.append({"type": "noise", "sigma": sigma, "X_noisy": X_noisy})
|
||||||
|
return reps
|
||||||
195
src/protocols_methodology/exp_bootstrap.py
Normal file
195
src/protocols_methodology/exp_bootstrap.py
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||||
|
|
||||||
|
from src.data_openml import load_dataset
|
||||||
|
from src.preprocessing import build_preprocessor
|
||||||
|
from src.models import make_model
|
||||||
|
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||||
|
from src.protocols_methodology.protocols import bootstrap_indices
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def run_bootstrap_protocol(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo,
|
||||||
|
model_params,
|
||||||
|
preproc_cfg,
|
||||||
|
n_bootstrap=30,
|
||||||
|
seed=0,
|
||||||
|
max_eval_rows=1024,
|
||||||
|
bg_size=128,
|
||||||
|
):
|
||||||
|
rng = np.random.RandomState(seed)
|
||||||
|
|
||||||
|
eval_size = min(max_eval_rows, len(X))
|
||||||
|
eval_idx = rng.choice(len(X), size=eval_size, replace=False)
|
||||||
|
X_eval_fixed = X.iloc[eval_idx]
|
||||||
|
|
||||||
|
fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
|
||||||
|
probe_pre = build_preprocessor(
|
||||||
|
X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
Xp = probe_pre.fit_transform(X, y)
|
||||||
|
n_after_prep = Xp.shape[1]
|
||||||
|
desired_k = preproc_cfg.get("select_k", None)
|
||||||
|
fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
|
||||||
|
|
||||||
|
shap_mats_with_names = []
|
||||||
|
rep_rows = []
|
||||||
|
|
||||||
|
for rep_id, (tr, te) in enumerate(bootstrap_indices(len(X), n_bootstrap, seed)):
|
||||||
|
X_boot = X.iloc[tr]
|
||||||
|
y_boot = y.iloc[tr]
|
||||||
|
|
||||||
|
preproc = build_preprocessor(
|
||||||
|
X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
model = make_model(task, algo, model_params, random_state=seed + rep_id)
|
||||||
|
pipe = Pipeline([("pre", preproc), ("model", model)])
|
||||||
|
|
||||||
|
shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
|
||||||
|
pipe,
|
||||||
|
X_fit=X_boot,
|
||||||
|
y_fit=y_boot,
|
||||||
|
X_eval=X_eval_fixed,
|
||||||
|
task_type=task,
|
||||||
|
bg_size=bg_size,
|
||||||
|
max_eval_rows=max_eval_rows,
|
||||||
|
rng_seed=seed,
|
||||||
|
)
|
||||||
|
shap_mats_with_names.append((shap_vals, feat_names))
|
||||||
|
|
||||||
|
# OOB loss on te to match your earlier logic
|
||||||
|
if task == "regression":
|
||||||
|
y_pred = pipe.predict(X.iloc[te])
|
||||||
|
loss = float(mean_squared_error(y.iloc[te], y_pred))
|
||||||
|
else:
|
||||||
|
if hasattr(pipe.named_steps["model"], "predict_proba"):
|
||||||
|
y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
|
||||||
|
else:
|
||||||
|
scores = pipe.decision_function(X.iloc[te])
|
||||||
|
scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
|
||||||
|
y_prob = scores
|
||||||
|
loss = float(brier_score_loss(y.iloc[te], y_prob))
|
||||||
|
|
||||||
|
agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
rep_rows.append(
|
||||||
|
{
|
||||||
|
"seed": seed,
|
||||||
|
"replicate_id": rep_id,
|
||||||
|
"protocol": "bootstrap",
|
||||||
|
"loss": loss,
|
||||||
|
"fit_time": float(t_fit),
|
||||||
|
"shap_time": float(t_shap),
|
||||||
|
"inst_feat_std_rep": float(agg_std_rep),
|
||||||
|
"stability_rep": float(stability_rep),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
summary = {
|
||||||
|
"seed": seed,
|
||||||
|
"protocol": "bootstrap",
|
||||||
|
"n_replicates": n_bootstrap,
|
||||||
|
"loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
|
||||||
|
"loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
|
||||||
|
"fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
|
||||||
|
"shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
|
||||||
|
"inst_feat_std": float(agg_std),
|
||||||
|
"stability": float(stability),
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary, rep_rows
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
ap = argparse.ArgumentParser()
|
||||||
|
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
|
||||||
|
ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
|
||||||
|
ap.add_argument("--n-bootstrap", type=int, default=30)
|
||||||
|
ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
|
||||||
|
ap.add_argument("--outdir", default="runs/protocol_bootstrap")
|
||||||
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
X, y, task = load_dataset(args.dataset)
|
||||||
|
|
||||||
|
preproc_cfg = {
|
||||||
|
"num_impute_strategy": "median",
|
||||||
|
"cat_impute_strategy": "most_frequent",
|
||||||
|
"scaler": "standard",
|
||||||
|
"poly_degree": 1,
|
||||||
|
"select_k": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
if args.algo == "rf":
|
||||||
|
model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
|
||||||
|
elif args.algo == "gbt":
|
||||||
|
model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
|
||||||
|
else:
|
||||||
|
model_params = {
|
||||||
|
"hidden_layers": (64, 64),
|
||||||
|
"activation": "relu",
|
||||||
|
"alpha": 1e-4,
|
||||||
|
"lr_init": 1e-3,
|
||||||
|
"max_iter": 200,
|
||||||
|
}
|
||||||
|
|
||||||
|
outdir = Path(args.outdir)
|
||||||
|
outdir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
summaries = []
|
||||||
|
all_rep_rows = []
|
||||||
|
|
||||||
|
for seed in args.seeds:
|
||||||
|
summary, rep_rows = run_bootstrap_protocol(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo=args.algo,
|
||||||
|
model_params=model_params,
|
||||||
|
preproc_cfg=preproc_cfg,
|
||||||
|
n_bootstrap=args.n_bootstrap,
|
||||||
|
seed=seed,
|
||||||
|
)
|
||||||
|
summaries.append(summary)
|
||||||
|
all_rep_rows.extend(rep_rows)
|
||||||
|
|
||||||
|
summary_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_summary.csv"
|
||||||
|
reps_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_replicates.csv"
|
||||||
|
cfg_path = outdir / f"config_{args.dataset}_{args.algo}_bootstrap.json"
|
||||||
|
|
||||||
|
pd.DataFrame(summaries).to_csv(summary_path, index=False)
|
||||||
|
pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
|
||||||
|
|
||||||
|
with open(cfg_path, "w") as f:
|
||||||
|
json.dump(
|
||||||
|
{
|
||||||
|
"dataset": args.dataset,
|
||||||
|
"algo": args.algo,
|
||||||
|
"task": task,
|
||||||
|
"protocol": "bootstrap",
|
||||||
|
"protocol_params": {"n_bootstrap": args.n_bootstrap, "seeds": args.seeds},
|
||||||
|
"model_params": model_params,
|
||||||
|
"preproc_cfg": preproc_cfg,
|
||||||
|
},
|
||||||
|
f,
|
||||||
|
indent=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Saved:")
|
||||||
|
print(summary_path)
|
||||||
|
print(reps_path)
|
||||||
|
print(cfg_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
195
src/protocols_methodology/exp_cv.py
Normal file
195
src/protocols_methodology/exp_cv.py
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||||
|
|
||||||
|
from src.data_openml import load_dataset
|
||||||
|
from src.preprocessing import build_preprocessor
|
||||||
|
from src.models import make_model
|
||||||
|
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||||
|
from src.protocols_methodology.protocols import kfold_indices
|
||||||
|
|
||||||
|
def run_cv_protocol(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo,
|
||||||
|
model_params,
|
||||||
|
preproc_cfg,
|
||||||
|
n_folds=5,
|
||||||
|
seed=0,
|
||||||
|
max_eval_rows=1024,
|
||||||
|
bg_size=128,
|
||||||
|
):
|
||||||
|
rng = np.random.RandomState(seed)
|
||||||
|
|
||||||
|
# fixed SHAP evaluation pool per seed
|
||||||
|
eval_size = min(max_eval_rows, len(X))
|
||||||
|
eval_idx = rng.choice(len(X), size=eval_size, replace=False)
|
||||||
|
X_eval_fixed = X.iloc[eval_idx]
|
||||||
|
|
||||||
|
# freeze preprocessor dimensions for stability comparability
|
||||||
|
fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
|
||||||
|
probe_pre = build_preprocessor(
|
||||||
|
X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
Xp = probe_pre.fit_transform(X, y)
|
||||||
|
n_after_prep = Xp.shape[1]
|
||||||
|
desired_k = preproc_cfg.get("select_k", None)
|
||||||
|
fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
|
||||||
|
|
||||||
|
shap_mats_with_names = []
|
||||||
|
rep_rows = []
|
||||||
|
|
||||||
|
# your exact old KFold generator
|
||||||
|
for rep_id, (tr, te) in enumerate(kfold_indices(len(X), n_folds, seed)):
|
||||||
|
preproc = build_preprocessor(
|
||||||
|
X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
model = make_model(task, algo, model_params, random_state=seed + rep_id)
|
||||||
|
pipe = Pipeline([("pre", preproc), ("model", model)])
|
||||||
|
|
||||||
|
shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
|
||||||
|
pipe,
|
||||||
|
X_fit=X.iloc[tr],
|
||||||
|
y_fit=y.iloc[tr],
|
||||||
|
X_eval=X_eval_fixed,
|
||||||
|
task_type=task,
|
||||||
|
bg_size=bg_size,
|
||||||
|
max_eval_rows=max_eval_rows,
|
||||||
|
rng_seed=seed,
|
||||||
|
)
|
||||||
|
shap_mats_with_names.append((shap_vals, feat_names))
|
||||||
|
|
||||||
|
# loss on fold test
|
||||||
|
if task == "regression":
|
||||||
|
y_pred = pipe.predict(X.iloc[te])
|
||||||
|
loss = float(mean_squared_error(y.iloc[te], y_pred))
|
||||||
|
else:
|
||||||
|
if hasattr(pipe.named_steps["model"], "predict_proba"):
|
||||||
|
y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
|
||||||
|
else:
|
||||||
|
scores = pipe.decision_function(X.iloc[te])
|
||||||
|
scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
|
||||||
|
y_prob = scores
|
||||||
|
loss = float(brier_score_loss(y.iloc[te], y_prob))
|
||||||
|
|
||||||
|
# running stability so replicates can be plotted as a trajectory
|
||||||
|
agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
rep_rows.append(
|
||||||
|
{
|
||||||
|
"seed": seed,
|
||||||
|
"replicate_id": rep_id,
|
||||||
|
"protocol": "cv",
|
||||||
|
"loss": loss,
|
||||||
|
"fit_time": float(t_fit),
|
||||||
|
"shap_time": float(t_shap),
|
||||||
|
"inst_feat_std_rep": float(agg_std_rep),
|
||||||
|
"stability_rep": float(stability_rep),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
summary = {
|
||||||
|
"seed": seed,
|
||||||
|
"protocol": "cv",
|
||||||
|
"n_replicates": n_folds,
|
||||||
|
"loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
|
||||||
|
"loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
|
||||||
|
"fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
|
||||||
|
"shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
|
||||||
|
"inst_feat_std": float(agg_std),
|
||||||
|
"stability": float(stability),
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary, rep_rows
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
ap = argparse.ArgumentParser()
|
||||||
|
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
|
||||||
|
ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
|
||||||
|
ap.add_argument("--n-folds", type=int, default=5)
|
||||||
|
ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
|
||||||
|
ap.add_argument("--outdir", default="runs/protocol_cv")
|
||||||
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
X, y, task = load_dataset(args.dataset)
|
||||||
|
|
||||||
|
preproc_cfg = {
|
||||||
|
"num_impute_strategy": "median",
|
||||||
|
"cat_impute_strategy": "most_frequent",
|
||||||
|
"scaler": "standard",
|
||||||
|
"poly_degree": 1,
|
||||||
|
"select_k": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
# fixed family for methodology experiments
|
||||||
|
if args.algo == "rf":
|
||||||
|
model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
|
||||||
|
elif args.algo == "gbt":
|
||||||
|
model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
|
||||||
|
else:
|
||||||
|
model_params = {
|
||||||
|
"hidden_layers": (64, 64),
|
||||||
|
"activation": "relu",
|
||||||
|
"alpha": 1e-4,
|
||||||
|
"lr_init": 1e-3,
|
||||||
|
"max_iter": 200,
|
||||||
|
}
|
||||||
|
|
||||||
|
outdir = Path(args.outdir)
|
||||||
|
outdir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
summaries = []
|
||||||
|
all_rep_rows = []
|
||||||
|
|
||||||
|
for seed in args.seeds:
|
||||||
|
summary, rep_rows = run_cv_protocol(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo=args.algo,
|
||||||
|
model_params=model_params,
|
||||||
|
preproc_cfg=preproc_cfg,
|
||||||
|
n_folds=args.n_folds,
|
||||||
|
seed=seed,
|
||||||
|
)
|
||||||
|
summaries.append(summary)
|
||||||
|
all_rep_rows.extend(rep_rows)
|
||||||
|
|
||||||
|
summary_path = outdir / f"{args.dataset}_{args.algo}_cv_summary.csv"
|
||||||
|
reps_path = outdir / f"{args.dataset}_{args.algo}_cv_replicates.csv"
|
||||||
|
cfg_path = outdir / f"config_{args.dataset}_{args.algo}_cv.json"
|
||||||
|
|
||||||
|
pd.DataFrame(summaries).to_csv(summary_path, index=False)
|
||||||
|
pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
|
||||||
|
|
||||||
|
with open(cfg_path, "w") as f:
|
||||||
|
json.dump(
|
||||||
|
{
|
||||||
|
"dataset": args.dataset,
|
||||||
|
"algo": args.algo,
|
||||||
|
"task": task,
|
||||||
|
"protocol": "cv",
|
||||||
|
"protocol_params": {"n_folds": args.n_folds, "seeds": args.seeds},
|
||||||
|
"model_params": model_params,
|
||||||
|
"preproc_cfg": preproc_cfg,
|
||||||
|
},
|
||||||
|
f,
|
||||||
|
indent=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Saved:")
|
||||||
|
print(summary_path)
|
||||||
|
print(reps_path)
|
||||||
|
print(cfg_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
206
src/protocols_methodology/exp_noise.py
Normal file
206
src/protocols_methodology/exp_noise.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||||
|
|
||||||
|
from src.data_openml import load_dataset
|
||||||
|
from src.preprocessing import build_preprocessor
|
||||||
|
from src.models import make_model
|
||||||
|
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||||
|
from src.protocols_methodology.protocols import noise_perturbations
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def run_noise_protocol(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo,
|
||||||
|
model_params,
|
||||||
|
preproc_cfg,
|
||||||
|
n_replicates=30,
|
||||||
|
noise_std=0.01,
|
||||||
|
seed=0,
|
||||||
|
max_eval_rows=1024,
|
||||||
|
bg_size=128,
|
||||||
|
):
|
||||||
|
rng = np.random.RandomState(seed)
|
||||||
|
|
||||||
|
eval_size = min(max_eval_rows, len(X))
|
||||||
|
eval_idx = rng.choice(len(X), size=eval_size, replace=False)
|
||||||
|
X_eval_fixed = X.iloc[eval_idx]
|
||||||
|
|
||||||
|
fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
|
||||||
|
probe_pre = build_preprocessor(
|
||||||
|
X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
Xp = probe_pre.fit_transform(X, y)
|
||||||
|
n_after_prep = Xp.shape[1]
|
||||||
|
desired_k = preproc_cfg.get("select_k", None)
|
||||||
|
fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
|
||||||
|
|
||||||
|
shap_mats_with_names = []
|
||||||
|
rep_rows = []
|
||||||
|
|
||||||
|
# use your old noise generator for consistency
|
||||||
|
# it expects a list of levels, so repeat noise_std
|
||||||
|
levels = [noise_std] * n_replicates
|
||||||
|
noisy_sets = noise_perturbations(X, levels, seed)
|
||||||
|
|
||||||
|
for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
|
||||||
|
preproc = build_preprocessor(
|
||||||
|
X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
|
||||||
|
)
|
||||||
|
model = make_model(task, algo, model_params, random_state=seed + rep_id)
|
||||||
|
pipe = Pipeline([("pre", preproc), ("model", model)])
|
||||||
|
|
||||||
|
shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
|
||||||
|
pipe,
|
||||||
|
X_fit=X_noisy,
|
||||||
|
y_fit=y,
|
||||||
|
X_eval=X_eval_fixed,
|
||||||
|
task_type=task,
|
||||||
|
bg_size=bg_size,
|
||||||
|
max_eval_rows=max_eval_rows,
|
||||||
|
rng_seed=seed,
|
||||||
|
)
|
||||||
|
shap_mats_with_names.append((shap_vals, feat_names))
|
||||||
|
|
||||||
|
# evaluate on clean X to measure robustness of noisy training
|
||||||
|
if task == "regression":
|
||||||
|
y_pred = pipe.predict(X)
|
||||||
|
loss = float(mean_squared_error(y, y_pred))
|
||||||
|
else:
|
||||||
|
if hasattr(pipe.named_steps["model"], "predict_proba"):
|
||||||
|
y_prob = pipe.predict_proba(X)[:, 1]
|
||||||
|
else:
|
||||||
|
scores = pipe.decision_function(X)
|
||||||
|
scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
|
||||||
|
y_prob = scores
|
||||||
|
loss = float(brier_score_loss(y, y_prob))
|
||||||
|
|
||||||
|
agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
rep_rows.append(
|
||||||
|
{
|
||||||
|
"seed": seed,
|
||||||
|
"replicate_id": rep_id,
|
||||||
|
"protocol": "noise",
|
||||||
|
"sigma": float(sigma),
|
||||||
|
"loss": loss,
|
||||||
|
"fit_time": float(t_fit),
|
||||||
|
"shap_time": float(t_shap),
|
||||||
|
"inst_feat_std_rep": float(agg_std_rep),
|
||||||
|
"stability_rep": float(stability_rep),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
|
||||||
|
|
||||||
|
summary = {
|
||||||
|
"seed": seed,
|
||||||
|
"protocol": "noise",
|
||||||
|
"n_replicates": n_replicates,
|
||||||
|
"noise_std": float(noise_std),
|
||||||
|
"loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
|
||||||
|
"loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
|
||||||
|
"fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
|
||||||
|
"shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
|
||||||
|
"inst_feat_std": float(agg_std),
|
||||||
|
"stability": float(stability),
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary, rep_rows
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
ap = argparse.ArgumentParser()
|
||||||
|
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
|
||||||
|
ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
|
||||||
|
ap.add_argument("--n-replicates", type=int, default=30)
|
||||||
|
ap.add_argument("--noise-std", type=float, default=0.01)
|
||||||
|
ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
|
||||||
|
ap.add_argument("--outdir", default="runs/protocol_noise")
|
||||||
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
X, y, task = load_dataset(args.dataset)
|
||||||
|
|
||||||
|
preproc_cfg = {
|
||||||
|
"num_impute_strategy": "median",
|
||||||
|
"cat_impute_strategy": "most_frequent",
|
||||||
|
"scaler": "standard",
|
||||||
|
"poly_degree": 1,
|
||||||
|
"select_k": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
if args.algo == "rf":
|
||||||
|
model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
|
||||||
|
elif args.algo == "gbt":
|
||||||
|
model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
|
||||||
|
else:
|
||||||
|
model_params = {
|
||||||
|
"hidden_layers": (64, 64),
|
||||||
|
"activation": "relu",
|
||||||
|
"alpha": 1e-4,
|
||||||
|
"lr_init": 1e-3,
|
||||||
|
"max_iter": 200,
|
||||||
|
}
|
||||||
|
|
||||||
|
outdir = Path(args.outdir)
|
||||||
|
outdir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
summaries = []
|
||||||
|
all_rep_rows = []
|
||||||
|
|
||||||
|
for seed in args.seeds:
|
||||||
|
summary, rep_rows = run_noise_protocol(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
algo=args.algo,
|
||||||
|
model_params=model_params,
|
||||||
|
preproc_cfg=preproc_cfg,
|
||||||
|
n_replicates=args.n_replicates,
|
||||||
|
noise_std=args.noise_std,
|
||||||
|
seed=seed,
|
||||||
|
)
|
||||||
|
summaries.append(summary)
|
||||||
|
all_rep_rows.extend(rep_rows)
|
||||||
|
|
||||||
|
summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
|
||||||
|
reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
|
||||||
|
cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"
|
||||||
|
|
||||||
|
pd.DataFrame(summaries).to_csv(summary_path, index=False)
|
||||||
|
pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
|
||||||
|
|
||||||
|
with open(cfg_path, "w") as f:
|
||||||
|
json.dump(
|
||||||
|
{
|
||||||
|
"dataset": args.dataset,
|
||||||
|
"algo": args.algo,
|
||||||
|
"task": task,
|
||||||
|
"protocol": "noise",
|
||||||
|
"protocol_params": {
|
||||||
|
"n_replicates": args.n_replicates,
|
||||||
|
"noise_std": args.noise_std,
|
||||||
|
"seeds": args.seeds,
|
||||||
|
},
|
||||||
|
"model_params": model_params,
|
||||||
|
"preproc_cfg": preproc_cfg,
|
||||||
|
},
|
||||||
|
f,
|
||||||
|
indent=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Saved:")
|
||||||
|
print(summary_path)
|
||||||
|
print(reps_path)
|
||||||
|
print(cfg_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
78
src/protocols_methodology/nsga_toolbox_protocols.py
Normal file
78
src/protocols_methodology/nsga_toolbox_protocols.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import mlflow
|
||||||
|
from deap import base, creator, tools
|
||||||
|
from sklearn.utils import check_random_state
|
||||||
|
|
||||||
|
from src.search.nsga_deap import decode
|
||||||
|
from src.protocols_methodology.automl_evaluate import evaluate_config_protocol_aware
|
||||||
|
|
||||||
|
|
||||||
|
def build_toolbox_protocol_aware(
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
task,
|
||||||
|
seed,
|
||||||
|
protocol_fn,
|
||||||
|
protocol_params,
|
||||||
|
mlflow_experiment,
|
||||||
|
):
|
||||||
|
rng = check_random_state(seed)
|
||||||
|
|
||||||
|
if not hasattr(creator, "FitnessMSEStab"):
|
||||||
|
creator.create("FitnessMSEStab", base.Fitness, weights=(-1.0, 1.0))
|
||||||
|
if not hasattr(creator, "Individual"):
|
||||||
|
creator.create("Individual", list, fitness=creator.FitnessMSEStab)
|
||||||
|
|
||||||
|
toolbox = base.Toolbox()
|
||||||
|
toolbox.register("gene", rng.randint, 0, 1000000)
|
||||||
|
toolbox.register(
|
||||||
|
"individual",
|
||||||
|
tools.initRepeat,
|
||||||
|
creator.Individual,
|
||||||
|
toolbox.gene,
|
||||||
|
n=16,
|
||||||
|
)
|
||||||
|
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
|
||||||
|
|
||||||
|
def eval_ind(individual):
|
||||||
|
algo, model_params, pre_cfg = decode(individual)
|
||||||
|
|
||||||
|
with mlflow.start_run(run_name=f"{algo}", nested=True):
|
||||||
|
for gi, g in enumerate(individual):
|
||||||
|
mlflow.log_param(f"g{gi}", int(g))
|
||||||
|
mlflow.log_param("algo", algo)
|
||||||
|
for k, v in model_params.items():
|
||||||
|
mlflow.log_param(f"m_{k}", v)
|
||||||
|
for k, v in pre_cfg.items():
|
||||||
|
mlflow.log_param(f"p_{k}", v)
|
||||||
|
|
||||||
|
mse_like, stability, meta = evaluate_config_protocol_aware(
|
||||||
|
X=X,
|
||||||
|
y=y,
|
||||||
|
task=task,
|
||||||
|
algo=algo,
|
||||||
|
model_params=model_params,
|
||||||
|
pre_cfg=pre_cfg,
|
||||||
|
protocol_fn=protocol_fn,
|
||||||
|
protocol_params=protocol_params,
|
||||||
|
seed=seed,
|
||||||
|
)
|
||||||
|
|
||||||
|
mlflow.log_metric("mse_like", mse_like)
|
||||||
|
mlflow.log_metric("stability", stability)
|
||||||
|
for mk, mv in meta.items():
|
||||||
|
mlflow.log_metric(mk, mv)
|
||||||
|
|
||||||
|
return mse_like, stability
|
||||||
|
|
||||||
|
toolbox.register("evaluate", eval_ind)
|
||||||
|
toolbox.register("mate", tools.cxTwoPoint)
|
||||||
|
toolbox.register(
|
||||||
|
"mutate",
|
||||||
|
tools.mutUniformInt,
|
||||||
|
low=0,
|
||||||
|
up=1000000,
|
||||||
|
indpb=0.2,
|
||||||
|
)
|
||||||
|
toolbox.register("select", tools.selNSGA2)
|
||||||
|
|
||||||
|
return toolbox
|
||||||
37
src/protocols_methodology/protocols.py
Normal file
37
src/protocols_methodology/protocols.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# src/protocols_methodology/protocols.py
|
||||||
|
|
||||||
|
from typing import Iterable, Tuple, List
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from pandas.api.types import is_numeric_dtype
|
||||||
|
from sklearn.model_selection import KFold
|
||||||
|
from sklearn.utils import resample
|
||||||
|
|
||||||
|
def kfold_indices(n: int, k: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield (train_idx, test_idx) array pairs for a shuffled k-fold split
    over ``n`` samples, deterministic for a given ``seed``."""
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
    for train_part, test_part in splitter.split(range(n)):
        yield np.array(train_part), np.array(test_part)
|
||||||
|
|
||||||
|
def bootstrap_indices(n: int, B: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield ``B`` bootstrap (train_idx, test_idx) pairs over ``n`` samples.

    Each train set is drawn with replacement; the test set is the
    out-of-bag complement. If a draw happens to cover every index, a
    random ~20% subset is used instead so the test set is never empty.
    """
    rng = np.random.RandomState(seed)
    for _ in range(B):
        train_idx = resample(np.arange(n), replace=True, n_samples=n, random_state=rng)
        out_of_bag = np.ones(n, dtype=bool)
        out_of_bag[train_idx] = False
        test_idx = np.where(out_of_bag)[0]
        if test_idx.size == 0:
            # Degenerate resample covered all indices; fall back to a
            # small random test subset (may overlap the train set).
            test_idx = rng.choice(n, size=max(1, n // 5), replace=False)
        yield train_idx, test_idx
|
||||||
|
|
||||||
|
def noise_perturbations(X: pd.DataFrame, levels: List[float], seed: int):
    """Return ``[(sigma, X_noisy), ...]`` with per-column Gaussian noise.

    For each ``sigma`` in ``levels`` a copy of ``X`` is built where every
    numeric column receives N(0, (sigma * column_std)^2) noise; a zero
    column std is replaced by 1.0 so the perturbation is never silenced.
    Non-numeric columns are left untouched.
    """
    rng = np.random.RandomState(seed)
    numeric_cols = [c for c in X.columns if is_numeric_dtype(X[c])]
    # Zero spread would make the noise vanish regardless of sigma.
    col_std = X[numeric_cols].std().replace(0, 1.0)
    perturbed = []
    for sigma in levels:
        noisy = X.copy()
        for col in numeric_cols:
            noisy[col] = noisy[col] + rng.normal(0, sigma * col_std.get(col, 1.0), size=len(noisy))
        perturbed.append((sigma, noisy))
    return perturbed
|
||||||
204
src/protocols_methodology/run_nsga_protocols.py
Normal file
204
src/protocols_methodology/run_nsga_protocols.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
import argparse
|
||||||
|
import random
|
||||||
|
import pickle
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import mlflow
|
||||||
|
from deap import algorithms
|
||||||
|
from deap.tools.emo import sortNondominated
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from src.data_openml import load_dataset
|
||||||
|
from src.search.nsga_deap import decode
|
||||||
|
from src.protocols_methodology.nsga_toolbox_protocols import build_toolbox_protocol_aware
|
||||||
|
from src.protocols_methodology.automl_protocol_adapters import (
|
||||||
|
cv_protocol,
|
||||||
|
bootstrap_protocol,
|
||||||
|
noise_protocol,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint(path, gen, pop, seed):
    """Pickle the GA state (generation, population, both global RNG states,
    and the seed) to ``path`` so a run can be resumed deterministically."""
    snapshot = dict(
        gen=gen,
        pop=pop,
        py_random_state=random.getstate(),
        np_random_state=np.random.get_state(),
        seed=seed,
    )
    with open(path, "wb") as fh:
        pickle.dump(snapshot, fh)
|
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint(path):
    """Restore GA state written by ``save_checkpoint``.

    Re-seats both the stdlib and NumPy global RNG states so evolution
    resumes with the exact random stream it was interrupted in.
    Returns (generation, population, seed).
    """
    with open(path, "rb") as fh:
        state = pickle.load(fh)
    random.setstate(state["py_random_state"])
    np.random.set_state(state["np_random_state"])
    return state["gen"], state["pop"], state["seed"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run one NSGA-II search per resampling protocol.

    For each protocol (cv, bootstrap, noise) the evolution is restarted
    from the same seed, checkpointed for resumability, and its per-generation
    Pareto fronts, final population, and per-PF-model SHAP matrices are
    written under runs/<dataset>_protocol_study/<protocol>/.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--experiment", default="deap_nsga_protocol_study")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument("--shap-pf-eval-rows", type=int, default=512)
    ap.add_argument("--n-folds", type=int, default=3)
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--n-noise", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)

    # Protocol name -> (adapter function, its keyword parameters).
    protocols = {
        "cv": (cv_protocol, {"n_folds": args.n_folds}),
        "bootstrap": (bootstrap_protocol, {"n_bootstrap": args.n_bootstrap}),
        "noise": (noise_protocol, {"n_replicates": args.n_noise, "noise_std": args.noise_std}),
    }

    base_outdir = Path("runs") / f"{args.dataset}_protocol_study"
    base_outdir.mkdir(parents=True, exist_ok=True)

    for pname, (pfn, pparams) in protocols.items():
        outdir = base_outdir / pname
        outdir.mkdir(parents=True, exist_ok=True)
        ckpt_path = outdir / "checkpoint.pkl"

        # Re-seed per protocol so each search starts from the same RNG state
        # and results are comparable across protocols.
        random.seed(args.seed)
        np.random.seed(args.seed)

        toolbox = build_toolbox_protocol_aware(
            X=X,
            y=y,
            task=task,
            seed=args.seed,
            protocol_fn=pfn,
            protocol_params=pparams,
            mlflow_experiment=args.experiment,
        )

        if ckpt_path.exists():
            # Resume an interrupted run; load_checkpoint also restores RNG state.
            start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
            if loaded_seed != args.seed:
                print(f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}")
            print(f"[{pname}] Resuming from generation {start_gen}")
        else:
            # Fresh start: evaluate the initial population before evolving.
            pop = toolbox.population(n=args.pop_size)
            fits = list(map(toolbox.evaluate, pop))
            for ind, fit in zip(pop, fits):
                ind.fitness.values = fit
            start_gen = 0
            save_checkpoint(ckpt_path, start_gen, pop, args.seed)
            print(f"[{pname}] Initial checkpoint saved")

        for gen in range(start_gen, args.generations):
            # One NSGA-II step: vary, evaluate offspring, select from parents+offspring.
            offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
            fits = list(map(toolbox.evaluate, offspring))
            for ind, fit in zip(offspring, fits):
                ind.fitness.values = fit
            pop = toolbox.select(pop + offspring, k=args.pop_size)
            # save PF history for convergence plots
            pf_gen = sortNondominated(pop, len(pop), first_front_only=True)[0]
            rows_gen = []
            for ind in pf_gen:
                algo, model_params, pre_cfg = decode(ind)
                rows_gen.append({
                    "gen": gen + 1,
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                })
            pd.DataFrame(rows_gen).to_csv(outdir / f"pareto_gen_{gen + 1}.csv", index=False)

            if (gen + 1) % args.checkpoint_every == 0:
                save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
                print(f"[{pname}] Checkpoint saved at gen {gen + 1}")

        # save final full population for dominated region analysis
        all_rows = []
        for ind in pop:
            algo, model_params, pre_cfg = decode(ind)
            all_rows.append({
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            })
        pd.DataFrame(all_rows).to_csv(outdir / "final_population.csv", index=False)

        # Final first Pareto front, with decoded hyper-parameters per model.
        pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
        rows = []
        for ind in pf:
            algo, model_params, pre_cfg = decode(ind)
            rows.append(
                {
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                    **{f"m_{k}": v for k, v in model_params.items()},
                    **{f"p_{k}": v for k, v in pre_cfg.items()},
                }
            )

        pareto_path = outdir / "pareto_front.csv"
        pd.DataFrame(rows).to_csv(pareto_path, index=False)
        print(f"[{pname}] Saved Pareto front to {pareto_path}")

        # optional SHAP saving for PF models, same as run_deap
        shap_dir = outdir / "shap"
        shap_dir.mkdir(exist_ok=True)

        # Fixed evaluation subset shared by all PF models, so SHAP matrices
        # are comparable row-for-row.
        eval_rows = min(args.shap_pf_eval_rows, len(X))
        rng = np.random.RandomState(args.seed)
        eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
        X_eval_shap = X.iloc[eval_idx]

        # Function-local imports: only needed for this optional SHAP step.
        from src.preprocessing import build_preprocessor
        from src.models import make_model
        from src.stability import compute_shap_matrix
        from sklearn.pipeline import Pipeline as SkPipeline

        for i, ind in enumerate(pf):
            algo, model_params, pre_cfg = decode(ind)

            # NOTE(review): assumes pre_cfg may carry "poly_degree"/"select_k"
            # overrides for the preprocessor — confirm against build_preprocessor.
            fixed_poly_degree = pre_cfg.get("poly_degree", 1)
            fixed_k = pre_cfg.get("select_k", None)

            preproc = build_preprocessor(
                X,
                task,
                pre_cfg,
                fixed_k=fixed_k,
                fixed_poly_degree=fixed_poly_degree,
            )
            model = make_model(task, algo, model_params, random_state=args.seed)
            pipe = SkPipeline([("pre", preproc), ("model", model)])

            shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
                pipe,
                X_fit=X,
                y_fit=y,
                X_eval=X_eval_shap,
                task_type=task,
                bg_size=128,
                max_eval_rows=eval_rows,
                rng_seed=args.seed,
            )

            np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
            np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))

        print(f"[{pname}] Saved SHAP arrays for {len(pf)} PF models")

    print(f"Done. All protocol AutoML runs in {base_outdir}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point guard: run the protocol study only when executed directly.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user