"""Run the noise-perturbation SHAP-stability protocol and save summary/replicate CSVs."""
import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, brier_score_loss
from sklearn.pipeline import Pipeline

from src.data_openml import load_dataset
from src.models import make_model
from src.preprocessing import build_preprocessor
from src.protocols_methodology.protocols import noise_perturbations
from src.stability import compute_shap_matrix, shap_stability_from_matrices
|
def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Train on noise-perturbed copies of ``X`` and measure SHAP stability.

    For each of ``n_replicates`` noisy training sets, a fresh preprocessing +
    model pipeline is fit, SHAP values are computed on one fixed evaluation
    subsample, and a loss is evaluated on the clean ``X``. Stability metrics
    are aggregated across the collected SHAP matrices.

    Parameters
    ----------
    X, y : training features (DataFrame) and target.
    task : "regression" or a classification task string.
    algo : model identifier understood by ``make_model``.
    model_params : hyper-parameter dict for ``make_model``.
    preproc_cfg : configuration dict for ``build_preprocessor``.
    n_replicates : number of noisy training replicates.
    noise_std : noise level applied to every replicate.
    seed : base random seed (replicate ``i`` uses ``seed + i`` for the model).
    max_eval_rows : cap on the SHAP evaluation subsample size.
    bg_size : SHAP background-sample size.

    Returns
    -------
    (summary, rep_rows) : one summary dict plus one record dict per replicate.
    """
    rng = np.random.RandomState(seed)

    # One fixed evaluation subsample shared by every replicate, so the SHAP
    # matrices are comparable row-for-row across replicates.
    n_eval = min(max_eval_rows, len(X))
    eval_rows = rng.choice(len(X), size=n_eval, replace=False)
    X_eval_fixed = X.iloc[eval_rows]

    # Probe fit: learn the post-preprocessing feature count so that any
    # requested select_k can be clamped to a feasible value up front.
    poly_degree = preproc_cfg.get("poly_degree", 1)
    probe = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=poly_degree
    )
    n_features = probe.fit_transform(X, y).shape[1]
    k_requested = preproc_cfg.get("select_k", None)
    if k_requested is None:
        fixed_k = None
    else:
        fixed_k = int(min(max(1, k_requested), n_features))

    def _loss_on_clean(pipe):
        # Evaluate on the clean X to measure robustness of noisy training:
        # MSE for regression, Brier score for classification. Models without
        # predict_proba fall back to min-max-normalized decision scores.
        if task == "regression":
            return float(mean_squared_error(y, pipe.predict(X)))
        model = pipe.named_steps["model"]
        if hasattr(model, "predict_proba"):
            y_prob = pipe.predict_proba(X)[:, 1]
        else:
            scores = pipe.decision_function(X)
            y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
        return float(brier_score_loss(y, y_prob))

    shap_matrices = []
    rep_rows = []

    # noise_perturbations expects a list of levels (kept for consistency with
    # the original noise generator), so the single noise_std is repeated.
    noisy_sets = noise_perturbations(X, [noise_std] * n_replicates, seed)

    for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
        pre = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=poly_degree
        )
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", pre), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_matrices.append((shap_vals, feat_names))

        loss = _loss_on_clean(pipe)

        # Running stability over all matrices accumulated so far.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_matrices)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    # Final stability over the full set of replicates.
    agg_std, stability, _, _ = shap_stability_from_matrices(shap_matrices)

    losses = pd.Series([r["loss"] for r in rep_rows])
    fit_times = pd.Series([r["fit_time"] for r in rep_rows])
    shap_times = pd.Series([r["shap_time"] for r in rep_rows])

    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(losses.mean()),
        "loss_std": float(losses.std(ddof=0)),
        "fit_time_mean": float(fit_times.mean()),
        "shap_time_mean": float(shap_times.mean()),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows
def main():
    """CLI entry point: run the noise protocol for each seed and save results.

    Writes a per-seed summary CSV, a per-replicate CSV, and a JSON config
    snapshot into ``--outdir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-replicates", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_noise")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    # Shared preprocessing configuration for every seed/replicate.
    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # Fixed hyper-parameters per algorithm (choices are limited by argparse).
    per_algo_params = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = per_algo_params[args.algo]

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []

    for seed in args.seeds:
        summary, rep_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    # Snapshot the full run configuration next to the results.
    config = {
        "dataset": args.dataset,
        "algo": args.algo,
        "task": task,
        "protocol": "noise",
        "protocol_params": {
            "n_replicates": args.n_replicates,
            "noise_std": args.noise_std,
            "seeds": args.seeds,
        },
        "model_params": model_params,
        "preproc_cfg": preproc_cfg,
    }
    with open(cfg_path, "w") as f:
        json.dump(config, f, indent=2)

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)
# Script entry point: only runs when executed directly, not on import.
if __name__ == "__main__":
    main()