New Start

This commit is contained in:
Varyngoth
2025-12-04 16:53:58 -04:00
parent 3657e8ea18
commit c5fb865583
9 changed files with 1043 additions and 13 deletions

View File

@@ -0,0 +1,206 @@
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import noise_perturbations
def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Train ``n_replicates`` models on noise-perturbed copies of ``X`` and
    track SHAP stability together with predictive loss on the clean data.

    Returns
    -------
    (summary, rep_rows)
        ``summary`` is one aggregate dict for this seed; ``rep_rows`` is a
        list of per-replicate dicts (one row per noisy refit).
    """
    rand = np.random.RandomState(seed)

    # Fixed evaluation subset: SHAP values are always computed on the same
    # clean rows so replicates are comparable.
    n_eval = min(max_eval_rows, len(X))
    chosen = rand.choice(len(X), size=n_eval, replace=False)
    eval_block = X.iloc[chosen]

    # Probe fit on clean data to learn the post-preprocessing feature count,
    # then clamp the requested select_k into a valid range.
    poly_deg = preproc_cfg.get("poly_degree", 1)
    probe = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=poly_deg
    )
    n_features = probe.fit_transform(X, y).shape[1]
    wanted_k = preproc_cfg.get("select_k", None)
    if wanted_k is None:
        clamped_k = None
    else:
        clamped_k = int(min(max(1, wanted_k), n_features))

    shap_records = []
    rep_rows = []

    # Reuse the existing noise generator for consistency; it takes a list of
    # noise levels, so the single level is simply repeated per replicate.
    for rep_id, (sigma, X_noisy) in enumerate(
        noise_perturbations(X, [noise_std] * n_replicates, seed)
    ):
        pre = build_preprocessor(
            X, task, preproc_cfg, fixed_k=clamped_k, fixed_poly_degree=poly_deg
        )
        est = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", pre), ("model", est)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=eval_block,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_records.append((shap_vals, feat_names))

        # Loss is measured on the *clean* X: how robust is noisy training?
        if task == "regression":
            loss = float(mean_squared_error(y, pipe.predict(X)))
        else:
            final_model = pipe.named_steps["model"]
            if hasattr(final_model, "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                # Min-max squash decision scores into [0, 1] for Brier loss.
                raw = pipe.decision_function(X)
                y_prob = (raw - raw.min()) / (raw.max() - raw.min() + 1e-8)
            loss = float(brier_score_loss(y, y_prob))

        # Running stability over every replicate collected so far.
        run_std, run_stab, _, _ = shap_stability_from_matrices(shap_records)
        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(run_std),
                "stability_rep": float(run_stab),
            }
        )

    # Final stability over the full set of replicate SHAP matrices.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_records)
    losses = pd.Series([r["loss"] for r in rep_rows])
    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(losses.mean()),
        "loss_std": float(losses.std(ddof=0)),
        "fit_time_mean": float(pd.Series([r["fit_time"] for r in rep_rows]).mean()),
        "shap_time_mean": float(pd.Series([r["shap_time"] for r in rep_rows]).mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }
    return summary, rep_rows
def main():
    """CLI entry point: run the noise protocol over several seeds and save
    summary/replicate CSVs plus a JSON config snapshot."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    parser.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    parser.add_argument("--n-replicates", type=int, default=30)
    parser.add_argument("--noise-std", type=float, default=0.01)
    parser.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    parser.add_argument("--outdir", default="runs/protocol_noise")
    args = parser.parse_args()

    X, y, task = load_dataset(args.dataset)

    # Preprocessing is fixed across algorithms for this protocol.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # Per-algorithm hyperparameters, dispatched by name.
    param_table = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = param_table[args.algo]

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        seed_summary, seed_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(seed_summary)
        all_rep_rows.extend(seed_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    # Persist the full run configuration next to the results for provenance.
    run_config = {
        "dataset": args.dataset,
        "algo": args.algo,
        "task": task,
        "protocol": "noise",
        "protocol_params": {
            "n_replicates": args.n_replicates,
            "noise_std": args.noise_std,
            "seeds": args.seeds,
        },
        "model_params": model_params,
        "preproc_cfg": preproc_cfg,
    }
    with open(cfg_path, "w") as f:
        json.dump(run_config, f, indent=2)

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)
# Script entry point: run the noise-perturbation protocol from the command line.
if __name__ == "__main__":
    main()