"""Noise-perturbation protocol: train model replicates on noise-corrupted data
and measure both predictive loss on the clean data and the stability of SHAP
explanations across replicates.

Results are written as per-seed summary CSVs, per-replicate CSVs, and a JSON
snapshot of the configuration (see ``main``).
"""

import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss

from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import noise_perturbations


def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Run the noise protocol for one seed.

    Trains ``n_replicates`` pipelines, each on a noise-perturbed copy of ``X``
    (all at the same noise level ``noise_std``), computes SHAP values for every
    replicate on one fixed evaluation subset, and aggregates a SHAP-stability
    score across replicates.

    Parameters
    ----------
    X, y : pandas objects (``X.iloc`` is used, so X must support positional
        row indexing).
    task : str — ``"regression"`` selects MSE loss; anything else is treated
        as binary classification and scored with the Brier score.
    algo : str — model identifier forwarded to ``make_model``.
    model_params : dict — hyperparameters forwarded to ``make_model``.
    preproc_cfg : dict — read here for ``"poly_degree"`` and ``"select_k"``;
        otherwise forwarded to ``build_preprocessor``.
    n_replicates : int — number of noisy training sets / model replicates.
    noise_std : float — noise level applied to every replicate.
    seed : int — seeds the evaluation-row sampler, the noise generator, and
        (offset by replicate id) each model.
    max_eval_rows : int — cap on the size of the fixed SHAP evaluation set.
    bg_size : int — SHAP background-sample size (forwarded to
        ``compute_shap_matrix``).

    Returns
    -------
    (summary, rep_rows) : a per-seed summary dict and a list of one dict per
        replicate.
    """
    rng = np.random.RandomState(seed)
    # Fixed evaluation subset: SHAP values of every replicate are computed on
    # the same rows so that per-row, per-feature variation across replicates
    # is meaningful.
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    # Probe fit on the clean data to learn the post-preprocessing width, so
    # that a requested select_k can be clamped to [1, n_after_prep] and then
    # held fixed for every replicate (keeps feature spaces comparable).
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
    shap_mats_with_names = []
    rep_rows = []
    # use your old noise generator for consistency
    # it expects a list of levels, so repeat noise_std
    levels = [noise_std] * n_replicates
    # noise_perturbations presumably yields one (sigma, X_noisy) pair per
    # level — TODO confirm against src.protocols_methodology.protocols.
    noisy_sets = noise_perturbations(X, levels, seed)
    for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        # Per-replicate model seed so replicates differ in both data noise and
        # model randomness.
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])
        # NOTE(review): the later pipe.predict/predict_proba calls rely on
        # compute_shap_matrix fitting `pipe` in place on (X_noisy, y) —
        # confirm in src.stability.
        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        # evaluate on clean X to measure robustness of noisy training
        if task == "regression":
            y_pred = pipe.predict(X)
            loss = float(mean_squared_error(y, y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                # Fallback for margin-only classifiers: min-max rescale the
                # decision scores into [0, 1] (epsilon guards a constant
                # score vector). NOTE(review): these are not calibrated
                # probabilities, so the Brier score is only a rough proxy
                # here.
                scores = pipe.decision_function(X)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y, y_prob))
        # NOTE(review): stability is recomputed on the *cumulative* list of
        # matrices, so at rep_id == 0 it is derived from a single matrix —
        # confirm that shap_stability_from_matrices is well-defined there and
        # that a running (not per-replicate-pair) metric is intended.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )
    # Final stability over all replicates (the last cumulative value).
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)
    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(pd.Series([r["loss"] for r in rep_rows]).mean()),
        # Population std (ddof=0): replicates are the full set, not a sample.
        "loss_std": float(pd.Series([r["loss"] for r in rep_rows]).std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows


def main():
    """CLI entry point: run the noise protocol for each seed and save results.

    Writes three files under ``--outdir``: a per-seed summary CSV, a
    per-replicate CSV, and a JSON snapshot of the full configuration.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-replicates", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_noise")
    args = ap.parse_args()
    X, y, task = load_dataset(args.dataset)
    # Fixed preprocessing config shared by every run of this protocol.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }
    # Per-algorithm hyperparameters; keys are interpreted by make_model.
    if args.algo == "rf":
        model_params = {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"}
    elif args.algo == "gbt":
        model_params = {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05}
    else:  # mlp
        model_params = {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        }
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    summaries = []
    all_rep_rows = []
    # One protocol run per seed; each contributes one summary row and
    # n_replicates replicate rows.
    for seed in args.seeds:
        summary, rep_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)
    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"
    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)
    # Persist the full configuration so results can be reproduced later.
    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "noise",
                "protocol_params": {
                    "n_replicates": args.n_replicates,
                    "noise_std": args.noise_std,
                    "seeds": args.seeds,
                },
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )
    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()