New Start

This commit is contained in:
Varyngoth
2025-12-04 16:53:58 -04:00
parent 3657e8ea18
commit c5fb865583
9 changed files with 1043 additions and 13 deletions

View File

@@ -0,0 +1,206 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import noise_perturbations
def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Train ``n_replicates`` models on noise-perturbed copies of ``X`` and
    track SHAP stability together with predictive loss on the clean data.

    Returns
    -------
    (summary, rep_rows)
        ``summary`` is one aggregate dict for this seed; ``rep_rows`` is a
        list of per-replicate dicts (one row per noisy refit).
    """
    rand = np.random.RandomState(seed)

    # Fixed evaluation subset: SHAP values are always computed on the same
    # clean rows so replicates are comparable.
    n_eval = min(max_eval_rows, len(X))
    chosen = rand.choice(len(X), size=n_eval, replace=False)
    eval_block = X.iloc[chosen]

    # Probe fit on clean data to learn the post-preprocessing feature count,
    # then clamp the requested select_k into a valid range.
    poly_deg = preproc_cfg.get("poly_degree", 1)
    probe = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=poly_deg
    )
    n_features = probe.fit_transform(X, y).shape[1]
    wanted_k = preproc_cfg.get("select_k", None)
    if wanted_k is None:
        clamped_k = None
    else:
        clamped_k = int(min(max(1, wanted_k), n_features))

    shap_records = []
    rep_rows = []

    # Reuse the existing noise generator for consistency; it takes a list of
    # noise levels, so the single level is simply repeated per replicate.
    for rep_id, (sigma, X_noisy) in enumerate(
        noise_perturbations(X, [noise_std] * n_replicates, seed)
    ):
        pre = build_preprocessor(
            X, task, preproc_cfg, fixed_k=clamped_k, fixed_poly_degree=poly_deg
        )
        est = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", pre), ("model", est)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=eval_block,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_records.append((shap_vals, feat_names))

        # Loss is measured on the *clean* X: how robust is noisy training?
        if task == "regression":
            loss = float(mean_squared_error(y, pipe.predict(X)))
        else:
            final_model = pipe.named_steps["model"]
            if hasattr(final_model, "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                # Min-max squash decision scores into [0, 1] for Brier loss.
                raw = pipe.decision_function(X)
                y_prob = (raw - raw.min()) / (raw.max() - raw.min() + 1e-8)
            loss = float(brier_score_loss(y, y_prob))

        # Running stability over every replicate collected so far.
        run_std, run_stab, _, _ = shap_stability_from_matrices(shap_records)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(run_std),
                "stability_rep": float(run_stab),
            }
        )

    # Final stability over the full set of replicate SHAP matrices.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_records)
    losses = pd.Series([r["loss"] for r in rep_rows])
    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(losses.mean()),
        "loss_std": float(losses.std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the noise protocol over several seeds and save
    summary/replicate CSVs plus a JSON config snapshot."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    parser.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    parser.add_argument("--n-replicates", type=int, default=30)
    parser.add_argument("--noise-std", type=float, default=0.01)
    parser.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    parser.add_argument("--outdir", default="runs/protocol_noise")
    args = parser.parse_args()

    X, y, task = load_dataset(args.dataset)

    # Preprocessing is fixed across algorithms for this protocol.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # Per-algorithm hyperparameters, dispatched by name.
    param_table = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = param_table[args.algo]

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        seed_summary, seed_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(seed_summary)
        all_rep_rows.extend(seed_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    # Persist the full run configuration next to the results for provenance.
    run_config = {
        "dataset": args.dataset,
        "algo": args.algo,
        "task": task,
        "protocol": "noise",
        "protocol_params": {
            "n_replicates": args.n_replicates,
            "noise_std": args.noise_std,
            "seeds": args.seeds,
        },
        "model_params": model_params,
        "preproc_cfg": preproc_cfg,
    }
    with open(cfg_path, "w") as f:
        json.dump(run_config, f, indent=2)

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)
# Script entry point: run the noise-perturbation protocol from the command line.
if __name__ == "__main__":
    main()