New Start
This commit is contained in:
14
run_deap.py
14
run_deap.py
@@ -5,7 +5,6 @@ from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import mlflow
|
||||
from datetime import datetime
|
||||
from deap import algorithms
|
||||
from deap.tools.emo import sortNondominated
|
||||
import pandas as pd
|
||||
@@ -16,16 +15,6 @@ from src.preprocessing import build_preprocessor
|
||||
from src.models import make_model
|
||||
from src.stability import compute_shap_matrix
|
||||
|
||||
# Main network
|
||||
# mlflow.set_tracking_uri("http://192.168.2.169:5000")
|
||||
|
||||
# Cluster Subnet
|
||||
mlflow.set_tracking_uri("http://10.10.0.5:5000")
|
||||
|
||||
# Network with DNS resolution (specified hosts or Tailnet)
|
||||
#mlflow.set_tracking_uri("http://medea:5000")
|
||||
|
||||
|
||||
|
||||
def save_checkpoint(path, gen, pop, seed):
|
||||
state = {
|
||||
@@ -54,8 +43,7 @@ def main():
|
||||
ap.add_argument("--pop-size", type=int, default=24)
|
||||
ap.add_argument("--seed", type=int, default=42)
|
||||
ap.add_argument("--cv-folds", type=int, default=3)
|
||||
experiment_name = f"deap_nsga_shap_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
||||
ap.add_argument("--experiment", default=experiment_name)
|
||||
ap.add_argument("--experiment", default="deap_nsga_shap")
|
||||
ap.add_argument("--checkpoint-every", type=int, default=5)
|
||||
ap.add_argument(
|
||||
"--shap-pf-eval-rows",
|
||||
|
||||
103
src/protocols_methodology/automl_evaluate.py
Normal file
103
src/protocols_methodology/automl_evaluate.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import numpy as np
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||
|
||||
from src.preprocessing import build_preprocessor
|
||||
from src.models import make_model
|
||||
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||
|
||||
|
||||
def evaluate_config_protocol_aware(
    X,
    y,
    task,
    algo,
    model_params,
    pre_cfg,
    protocol_fn,
    protocol_params,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Evaluate one AutoML configuration under a pluggable replication protocol.

    Fits the (preprocessor + model) pipeline once per replicate produced by
    ``protocol_fn`` (CV folds, bootstrap resamples, or noise perturbations),
    computes a SHAP matrix per replicate on a fixed evaluation pool, and
    aggregates loss and SHAP stability across replicates.

    Args:
        X, y: full dataset (pandas objects; ``.iloc`` indexing is used).
        task: "regression" or a classification task string (anything else
            takes the Brier-score branch).
        algo, model_params: passed through to ``make_model``.
        pre_cfg: preprocessing config dict; ``poly_degree`` and ``select_k``
            are read here to freeze output dimensionality across replicates.
        protocol_fn: callable ``(X, y, seed=..., **protocol_params)`` returning
            a list of replicate dicts with a "type" key ("cv"/"bootstrap"
            carry index arrays, anything else carries "X_noisy").
        protocol_params: extra kwargs for ``protocol_fn``.
        seed: base RNG seed; also offset by replicate id for the model.
        max_eval_rows: cap on the SHAP evaluation pool size.
        bg_size: SHAP background-sample size (forwarded to
            ``compute_shap_matrix``).

    Returns:
        (mse_like, stability_val, meta): mean loss across replicates, scalar
        stability from ``shap_stability_from_matrices``, and a meta dict with
        loss spread, timing means, and replicate count.
    """
    rng = np.random.RandomState(seed)

    # fixed SHAP evaluation pool per individual evaluation — every replicate
    # is explained on the same rows so SHAP matrices are comparable
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]

    # freeze preprocessing dimensionality: probe-fit once on the full data to
    # learn the post-preprocessing width, then clamp select_k to it so every
    # replicate's SHAP matrix has identical columns
    fixed_poly_degree = pre_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, pre_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = pre_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))

    shap_mats_with_names = []
    losses = []
    fit_times = []
    shap_times = []

    replicates = protocol_fn(X, y, seed=seed, **protocol_params)

    for rep_id, rep in enumerate(replicates):
        # index-based replicates (cv/bootstrap) split rows; otherwise the
        # replicate supplies a perturbed copy of X and is scored on clean data
        if rep["type"] in ["cv", "bootstrap"]:
            tr, te = rep["train_idx"], rep["test_idx"]
            X_fit, y_fit = X.iloc[tr], y.iloc[tr]
            X_test, y_test = X.iloc[te], y.iloc[te]
        else:
            X_fit, y_fit = rep["X_noisy"], y
            X_test, y_test = X, y

        preproc = build_preprocessor(
            X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-replicate model seed so replicates are not identical fits
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        # compute_shap_matrix fits the pipeline and returns SHAP values plus
        # timings and the post-preprocessing feature names
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_fit,
            y_fit=y_fit,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        fit_times.append(t_fit)
        shap_times.append(t_shap)

        if task == "regression":
            y_pred = pipe.predict(X_test)
            loss = float(mean_squared_error(y_test, y_pred))
        else:
            # classification: Brier score on positive-class probability;
            # decision_function scores are min-max squashed as a fallback
            # (NOTE(review): not calibrated probabilities — Brier on these is
            # only a rough proxy)
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X_test)[:, 1]
            else:
                scores = pipe.decision_function(X_test)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y_test, y_prob))

        losses.append(loss)

    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    mse_like = float(np.mean(losses))
    stability_val = float(stability)

    meta = {
        "loss_std": float(np.std(losses)),
        "fit_time_mean": float(np.mean(fit_times)) if fit_times else 0.0,
        "shap_time_mean": float(np.mean(shap_times)) if shap_times else 0.0,
        "inst_feat_std": float(agg_std),
        "n_replicates": len(replicates),
    }

    return mse_like, stability_val, meta
|
||||
24
src/protocols_methodology/automl_protocol_adapters.py
Normal file
24
src/protocols_methodology/automl_protocol_adapters.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from src.protocols_methodology.protocols import (
|
||||
kfold_indices,
|
||||
bootstrap_indices,
|
||||
noise_perturbations,
|
||||
)
|
||||
|
||||
def cv_protocol(X, y, n_folds=5, seed=0):
    """Adapt k-fold CV splits to the generic replicate-dict format.

    Returns a list of ``{"type": "cv", "train_idx": ..., "test_idx": ...}``.
    """
    return [
        {"type": "cv", "train_idx": train, "test_idx": test}
        for train, test in kfold_indices(len(X), n_folds, seed)
    ]
|
||||
|
||||
def bootstrap_protocol(X, y, n_bootstrap=30, seed=0):
    """Adapt bootstrap resamples to the generic replicate-dict format.

    Returns a list of ``{"type": "bootstrap", "train_idx": ..., "test_idx": ...}``.
    """
    return [
        {"type": "bootstrap", "train_idx": train, "test_idx": test}
        for train, test in bootstrap_indices(len(X), n_bootstrap, seed)
    ]
|
||||
|
||||
def noise_protocol(X, y, n_replicates=30, noise_std=0.01, seed=0):
    """Adapt Gaussian-noise perturbations to the generic replicate-dict format.

    All replicates share the same noise level; returns a list of
    ``{"type": "noise", "sigma": ..., "X_noisy": ...}``.
    """
    sigmas = [noise_std] * n_replicates
    return [
        {"type": "noise", "sigma": sigma, "X_noisy": X_noisy}
        for sigma, X_noisy in noise_perturbations(X, sigmas, seed)
    ]
|
||||
195
src/protocols_methodology/exp_bootstrap.py
Normal file
195
src/protocols_methodology/exp_bootstrap.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||
|
||||
from src.data_openml import load_dataset
|
||||
from src.preprocessing import build_preprocessor
|
||||
from src.models import make_model
|
||||
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||
from src.protocols_methodology.protocols import bootstrap_indices
|
||||
|
||||
|
||||
|
||||
def run_bootstrap_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_bootstrap=30,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the bootstrap replication protocol for one model configuration.

    Fits the pipeline on each bootstrap resample, computes SHAP on a fixed
    evaluation pool, and scores out-of-bag loss per replicate.

    Returns:
        (summary, rep_rows): one summary dict for this seed, and one row dict
        per bootstrap replicate (suitable for a tidy DataFrame).
    """
    rng = np.random.RandomState(seed)

    # fixed SHAP evaluation pool so all replicates are explained on the
    # same rows
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]

    # probe-fit once on the full data to freeze the post-preprocessing width,
    # then clamp select_k so SHAP matrices align across replicates
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))

    shap_mats_with_names = []
    rep_rows = []

    for rep_id, (tr, te) in enumerate(bootstrap_indices(len(X), n_bootstrap, seed)):
        X_boot = X.iloc[tr]
        y_boot = y.iloc[tr]

        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-replicate model seed so fits differ across replicates
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_boot,
            y_fit=y_boot,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))

        # OOB loss on te to match your earlier logic
        if task == "regression":
            y_pred = pipe.predict(X.iloc[te])
            loss = float(mean_squared_error(y.iloc[te], y_pred))
        else:
            # Brier score on positive-class probability; min-max-squashed
            # decision scores as a fallback for models without predict_proba
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
            else:
                scores = pipe.decision_function(X.iloc[te])
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y.iloc[te], y_prob))

        # running stability over replicates seen so far (trajectory view)
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "bootstrap",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    # final stability over all replicates
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    summary = {
        "seed": seed,
        "protocol": "bootstrap",
        "n_replicates": n_bootstrap,
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows
|
||||
|
||||
|
||||
def main():
    """CLI entry: run the bootstrap protocol over several seeds and save CSV/JSON outputs."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_bootstrap")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    # fixed preprocessing config for the methodology study
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # fixed hyper-parameters per model family
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []

    # one protocol run per seed; seeds also offset the model random_state
    for seed in args.seeds:
        summary, rep_rows = run_bootstrap_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_bootstrap=args.n_bootstrap,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_bootstrap.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    # record the full experiment configuration next to the results
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "bootstrap",
                "protocol_params": {"n_bootstrap": args.n_bootstrap, "seeds": args.seeds},
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
195
src/protocols_methodology/exp_cv.py
Normal file
195
src/protocols_methodology/exp_cv.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||
|
||||
from src.data_openml import load_dataset
|
||||
from src.preprocessing import build_preprocessor
|
||||
from src.models import make_model
|
||||
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||
from src.protocols_methodology.protocols import kfold_indices
|
||||
|
||||
def run_cv_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_folds=5,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the k-fold CV replication protocol for one model configuration.

    Fits the pipeline per fold, computes SHAP on a fixed evaluation pool,
    and scores loss on each fold's held-out rows.

    Returns:
        (summary, rep_rows): one summary dict for this seed, and one row dict
        per fold (suitable for a tidy DataFrame).
    """
    rng = np.random.RandomState(seed)

    # fixed SHAP evaluation pool per seed
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]

    # freeze preprocessor dimensions for stability comparability: probe-fit
    # once on the full data and clamp select_k to the resulting width
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))

    shap_mats_with_names = []
    rep_rows = []

    # your exact old KFold generator
    for rep_id, (tr, te) in enumerate(kfold_indices(len(X), n_folds, seed)):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-fold model seed so fits differ across folds
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X.iloc[tr],
            y_fit=y.iloc[tr],
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))

        # loss on fold test
        if task == "regression":
            y_pred = pipe.predict(X.iloc[te])
            loss = float(mean_squared_error(y.iloc[te], y_pred))
        else:
            # Brier score on positive-class probability; min-max-squashed
            # decision scores as a fallback for models without predict_proba
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X.iloc[te])[:, 1]
            else:
                scores = pipe.decision_function(X.iloc[te])
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y.iloc[te], y_prob))

        # running stability so replicates can be plotted as a trajectory
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "cv",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    # final stability over all folds
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    summary = {
        "seed": seed,
        "protocol": "cv",
        "n_replicates": n_folds,
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows
|
||||
|
||||
|
||||
def main():
    """CLI entry: run the CV protocol over several seeds and save CSV/JSON outputs."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-folds", type=int, default=5)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_cv")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    # fixed preprocessing config for the methodology study
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # fixed family for methodology experiments
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []

    # one protocol run per seed; seeds also offset the model random_state
    for seed in args.seeds:
        summary, rep_rows = run_cv_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_folds=args.n_folds,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_cv_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_cv_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_cv.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    # record the full experiment configuration next to the results
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "cv",
                "protocol_params": {"n_folds": args.n_folds, "seeds": args.seeds},
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
206
src/protocols_methodology/exp_noise.py
Normal file
206
src/protocols_methodology/exp_noise.py
Normal file
@@ -0,0 +1,206 @@
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.metrics import mean_squared_error, brier_score_loss
|
||||
|
||||
from src.data_openml import load_dataset
|
||||
from src.preprocessing import build_preprocessor
|
||||
from src.models import make_model
|
||||
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
||||
from src.protocols_methodology.protocols import noise_perturbations
|
||||
|
||||
|
||||
|
||||
def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the input-noise replication protocol for one model configuration.

    Trains each replicate on a Gaussian-noised copy of X and scores it on the
    clean data, measuring robustness of both loss and SHAP attributions.

    Returns:
        (summary, rep_rows): one summary dict for this seed, and one row dict
        per noisy replicate (suitable for a tidy DataFrame).
    """
    rng = np.random.RandomState(seed)

    # fixed SHAP evaluation pool (clean rows) shared by all replicates
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]

    # probe-fit once on the full data to freeze the post-preprocessing width,
    # then clamp select_k so SHAP matrices align across replicates
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))

    shap_mats_with_names = []
    rep_rows = []

    # use your old noise generator for consistency
    # it expects a list of levels, so repeat noise_std
    levels = [noise_std] * n_replicates
    noisy_sets = noise_perturbations(X, levels, seed)

    for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # per-replicate model seed so fits differ across replicates
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))

        # evaluate on clean X to measure robustness of noisy training
        # (NOTE(review): this is in-sample relative to the clean data — the
        # loss here measures noise robustness, not generalization)
        if task == "regression":
            y_pred = pipe.predict(X)
            loss = float(mean_squared_error(y, y_pred))
        else:
            # Brier score on positive-class probability; min-max-squashed
            # decision scores as a fallback for models without predict_proba
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                scores = pipe.decision_function(X)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y, y_prob))

        # running stability over replicates seen so far (trajectory view)
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    # final stability over all replicates
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows
|
||||
|
||||
|
||||
def main():
    """CLI entry: run the noise protocol over several seeds and save CSV/JSON outputs."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-replicates", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_noise")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    # fixed preprocessing config for the methodology study
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # fixed hyper-parameters per model family
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []

    # one protocol run per seed; seeds also offset the model random_state
    for seed in args.seeds:
        summary, rep_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    # record the full experiment configuration next to the results
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "noise",
                "protocol_params": {
                    "n_replicates": args.n_replicates,
                    "noise_std": args.noise_std,
                    "seeds": args.seeds,
                },
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
78
src/protocols_methodology/nsga_toolbox_protocols.py
Normal file
78
src/protocols_methodology/nsga_toolbox_protocols.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import mlflow
|
||||
from deap import base, creator, tools
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
from src.search.nsga_deap import decode
|
||||
from src.protocols_methodology.automl_evaluate import evaluate_config_protocol_aware
|
||||
|
||||
|
||||
def build_toolbox_protocol_aware(
    X,
    y,
    task,
    seed,
    protocol_fn,
    protocol_params,
    mlflow_experiment,
):
    """Build a DEAP toolbox whose evaluation uses a pluggable replication protocol.

    Individuals are lists of 16 integer genes in [0, 1000000), decoded by
    ``decode`` into (algo, model_params, pre_cfg). Fitness is two-objective:
    minimize loss, maximize SHAP stability (weights (-1.0, 1.0)).

    NOTE(review): ``mlflow_experiment`` is accepted but never used in this
    function — presumably the caller sets the experiment before running;
    confirm, or wire it in here.
    """
    rng = check_random_state(seed)

    # register DEAP creator classes once per process; creator.create raises
    # warnings/conflicts on re-creation, hence the hasattr guards
    # (NOTE(review): if another module already created "Individual" with a
    # different fitness, this guard silently reuses it — verify)
    if not hasattr(creator, "FitnessMSEStab"):
        creator.create("FitnessMSEStab", base.Fitness, weights=(-1.0, 1.0))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMSEStab)

    toolbox = base.Toolbox()
    # each gene is a uniform random int; 16 genes per individual
    toolbox.register("gene", rng.randint, 0, 1000000)
    toolbox.register(
        "individual",
        tools.initRepeat,
        creator.Individual,
        toolbox.gene,
        n=16,
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def eval_ind(individual):
        # Decode the genome, log everything to a nested MLflow run, and
        # return the two fitness objectives (loss, stability).
        algo, model_params, pre_cfg = decode(individual)

        with mlflow.start_run(run_name=f"{algo}", nested=True):
            # log raw genes plus the decoded config for reproducibility
            for gi, g in enumerate(individual):
                mlflow.log_param(f"g{gi}", int(g))
            mlflow.log_param("algo", algo)
            for k, v in model_params.items():
                mlflow.log_param(f"m_{k}", v)
            for k, v in pre_cfg.items():
                mlflow.log_param(f"p_{k}", v)

            mse_like, stability, meta = evaluate_config_protocol_aware(
                X=X,
                y=y,
                task=task,
                algo=algo,
                model_params=model_params,
                pre_cfg=pre_cfg,
                protocol_fn=protocol_fn,
                protocol_params=protocol_params,
                seed=seed,
            )

            mlflow.log_metric("mse_like", mse_like)
            mlflow.log_metric("stability", stability)
            for mk, mv in meta.items():
                mlflow.log_metric(mk, mv)

        return mse_like, stability

    # standard NSGA-II operator set over integer genomes
    toolbox.register("evaluate", eval_ind)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register(
        "mutate",
        tools.mutUniformInt,
        low=0,
        up=1000000,
        indpb=0.2,
    )
    toolbox.register("select", tools.selNSGA2)

    return toolbox
|
||||
37
src/protocols_methodology/protocols.py
Normal file
37
src/protocols_methodology/protocols.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# src/protocols.py
|
||||
|
||||
from typing import Iterable, Tuple, List
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.utils import resample
|
||||
|
||||
def kfold_indices(n: int, k: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield (train_idx, test_idx) array pairs for a shuffled k-fold split of range(n)."""
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
    for train_idx, test_idx in splitter.split(range(n)):
        yield np.array(train_idx), np.array(test_idx)
|
||||
|
||||
def bootstrap_indices(n: int, B: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield B bootstrap (train_idx, test_idx) pairs over range(n).

    The test indices are the out-of-bag rows; if a bag happens to cover every
    row, a random subset of ~n/5 rows is drawn as the test set instead.
    """
    rng = np.random.RandomState(seed)
    for _ in range(B):
        train_idx = resample(np.arange(n), replace=True, n_samples=n, random_state=rng)
        in_bag = np.zeros(n, dtype=bool)
        in_bag[train_idx] = True
        test_idx = np.flatnonzero(~in_bag)
        if test_idx.size == 0:
            # degenerate bag touched every row — fall back to a random holdout
            test_idx = rng.choice(n, size=max(1, n // 5), replace=False)
        yield train_idx, test_idx
|
||||
|
||||
def noise_perturbations(X: pd.DataFrame, levels: List[float], seed: int):
    """Return [(sigma, noisy_X), ...] — one Gaussian-perturbed copy of X per level.

    Noise is added only to numeric columns, scaled per column by its standard
    deviation (zero stds are replaced by 1.0 so constant columns still receive
    sigma-scale noise). Non-numeric columns are left untouched.
    """
    rng = np.random.RandomState(seed)
    numeric_cols = [col for col in X.columns if is_numeric_dtype(X[col])]
    # per-column noise scale; guard against zero-variance columns
    col_scale = X[numeric_cols].std().replace(0, 1.0)
    n_rows = len(X)
    perturbed = []
    for sigma in levels:
        noisy = X.copy()
        for col in numeric_cols:
            noisy[col] = noisy[col] + rng.normal(0, sigma * col_scale.get(col, 1.0), size=n_rows)
        perturbed.append((sigma, noisy))
    return perturbed
|
||||
204
src/protocols_methodology/run_nsga_protocols.py
Normal file
204
src/protocols_methodology/run_nsga_protocols.py
Normal file
@@ -0,0 +1,204 @@
|
||||
import argparse
|
||||
import random
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import mlflow
|
||||
from deap import algorithms
|
||||
from deap.tools.emo import sortNondominated
|
||||
import pandas as pd
|
||||
|
||||
from src.data_openml import load_dataset
|
||||
from src.search.nsga_deap import decode
|
||||
from src.protocols_methodology.nsga_toolbox_protocols import build_toolbox_protocol_aware
|
||||
from src.protocols_methodology.automl_protocol_adapters import (
|
||||
cv_protocol,
|
||||
bootstrap_protocol,
|
||||
noise_protocol,
|
||||
)
|
||||
|
||||
|
||||
def save_checkpoint(path, gen, pop, seed):
    """Pickle the GA population plus both RNG states so a run can resume exactly."""
    snapshot = {
        "gen": gen,
        "pop": pop,
        # capture both generators: DEAP uses `random`, our code uses numpy
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    with open(path, "wb") as fh:
        pickle.dump(snapshot, fh)
|
||||
|
||||
|
||||
def load_checkpoint(path):
    """Unpickle a checkpoint, restore both RNG states, and return (gen, pop, seed)."""
    with open(path, "rb") as fh:
        snapshot = pickle.load(fh)
    # restore the generators as a side effect before resuming evolution
    random.setstate(snapshot["py_random_state"])
    np.random.set_state(snapshot["np_random_state"])
    return snapshot["gen"], snapshot["pop"], snapshot["seed"]
|
||||
|
||||
|
||||
def main():
    """Run the NSGA-II protocol-comparison study from the command line.

    For each evaluation protocol (CV, bootstrap, noise replication) this
    runs an independent, checkpointed NSGA-II search over model/preprocessor
    configurations on the chosen dataset, then writes per-generation Pareto
    fronts, the final population, the final Pareto front, and SHAP arrays
    for each Pareto-front model under ``runs/<dataset>_protocol_study/<protocol>/``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--experiment", default="deap_nsga_protocol_study")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument("--shap-pf-eval-rows", type=int, default=512)
    ap.add_argument("--n-folds", type=int, default=3)
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--n-noise", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)

    # Each protocol is a (callable, keyword-params) pair consumed by the
    # protocol-aware toolbox below.
    protocols = {
        "cv": (cv_protocol, {"n_folds": args.n_folds}),
        "bootstrap": (bootstrap_protocol, {"n_bootstrap": args.n_bootstrap}),
        "noise": (noise_protocol, {"n_replicates": args.n_noise, "noise_std": args.noise_std}),
    }

    base_outdir = Path("runs") / f"{args.dataset}_protocol_study"
    base_outdir.mkdir(parents=True, exist_ok=True)

    for pname, (pfn, pparams) in protocols.items():
        outdir = base_outdir / pname
        outdir.mkdir(parents=True, exist_ok=True)
        ckpt_path = outdir / "checkpoint.pkl"

        # Reset both global RNG streams per protocol so each run is
        # independently reproducible from the same seed.
        random.seed(args.seed)
        np.random.seed(args.seed)

        toolbox = build_toolbox_protocol_aware(
            X=X,
            y=y,
            task=task,
            seed=args.seed,
            protocol_fn=pfn,
            protocol_params=pparams,
            mlflow_experiment=args.experiment,
        )

        if ckpt_path.exists():
            # Resume: load_checkpoint also restores the saved RNG states,
            # overriding the re-seeding above.
            start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
            if loaded_seed != args.seed:
                print(f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}")
            print(f"[{pname}] Resuming from generation {start_gen}")
        else:
            # Fresh start: evaluate the initial population and checkpoint it.
            pop = toolbox.population(n=args.pop_size)
            fits = list(map(toolbox.evaluate, pop))
            for ind, fit in zip(pop, fits):
                ind.fitness.values = fit
            start_gen = 0
            save_checkpoint(ckpt_path, start_gen, pop, args.seed)
            print(f"[{pname}] Initial checkpoint saved")

        for gen in range(start_gen, args.generations):
            # Standard (mu + lambda)-style step: vary, evaluate, select.
            offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
            fits = list(map(toolbox.evaluate, offspring))
            for ind, fit in zip(offspring, fits):
                ind.fitness.values = fit
            pop = toolbox.select(pop + offspring, k=args.pop_size)
            # save PF history for convergence plots
            pf_gen = sortNondominated(pop, len(pop), first_front_only=True)[0]
            rows_gen = []
            for ind in pf_gen:
                algo, model_params, pre_cfg = decode(ind)
                rows_gen.append({
                    "gen": gen + 1,
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                })
            pd.DataFrame(rows_gen).to_csv(outdir / f"pareto_gen_{gen + 1}.csv", index=False)

            if (gen + 1) % args.checkpoint_every == 0:
                save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
                print(f"[{pname}] Checkpoint saved at gen {gen + 1}")
        # save final full population for dominated region analysis
        all_rows = []
        for ind in pop:
            algo, model_params, pre_cfg = decode(ind)
            all_rows.append({
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            })
        pd.DataFrame(all_rows).to_csv(outdir / "final_population.csv", index=False)

        pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
        rows = []
        for ind in pf:
            algo, model_params, pre_cfg = decode(ind)
            rows.append(
                {
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                    **{f"m_{k}": v for k, v in model_params.items()},
                    **{f"p_{k}": v for k, v in pre_cfg.items()},
                }
            )

        pareto_path = outdir / "pareto_front.csv"
        pd.DataFrame(rows).to_csv(pareto_path, index=False)
        print(f"[{pname}] Saved Pareto front to {pareto_path}")

        # optional SHAP saving for PF models, same as run_deap
        shap_dir = outdir / "shap"
        shap_dir.mkdir(exist_ok=True)

        # Fixed evaluation subsample for SHAP so all PF models are
        # explained on the same rows.
        eval_rows = min(args.shap_pf_eval_rows, len(X))
        rng = np.random.RandomState(args.seed)
        eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
        X_eval_shap = X.iloc[eval_idx]

        # Local imports keep the heavy SHAP/sklearn machinery off the
        # module import path until it is actually needed.
        from src.preprocessing import build_preprocessor
        from src.models import make_model
        from src.stability import compute_shap_matrix
        from sklearn.pipeline import Pipeline as SkPipeline

        for i, ind in enumerate(pf):
            algo, model_params, pre_cfg = decode(ind)

            # Pin the preprocessor's searched hyperparameters to the
            # values encoded in this individual.
            fixed_poly_degree = pre_cfg.get("poly_degree", 1)
            fixed_k = pre_cfg.get("select_k", None)

            preproc = build_preprocessor(
                X,
                task,
                pre_cfg,
                fixed_k=fixed_k,
                fixed_poly_degree=fixed_poly_degree,
            )
            model = make_model(task, algo, model_params, random_state=args.seed)
            pipe = SkPipeline([("pre", preproc), ("model", model)])

            shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
                pipe,
                X_fit=X,
                y_fit=y,
                X_eval=X_eval_shap,
                task_type=task,
                bg_size=128,
                max_eval_rows=eval_rows,
                rng_seed=args.seed,
            )

            np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
            np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))

        print(f"[{pname}] Saved SHAP arrays for {len(pf)} PF models")

    print(f"Done. All protocol AutoML runs in {base_outdir}")
||||
# CLI entry point: run the full protocol-comparison study.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user