# --- src/protocols_methodology/automl_evaluate.py ---
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss

from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices


def evaluate_config_protocol_aware(
    X,
    y,
    task,
    algo,
    model_params,
    pre_cfg,
    protocol_fn,
    protocol_params,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Evaluate one AutoML configuration under a pluggable resampling protocol.

    For each replicate produced by ``protocol_fn`` (CV fold, bootstrap
    resample, or noise perturbation), a fresh preprocessing+model pipeline is
    fitted, SHAP values are computed on one fixed evaluation pool, and a loss
    is measured (MSE for regression, Brier score for classification).

    Returns:
        (mse_like, stability, meta) where ``mse_like`` is the mean replicate
        loss, ``stability`` the SHAP-stability score aggregated over all
        replicate SHAP matrices, and ``meta`` a dict of auxiliary metrics.

    Raises:
        ValueError: if the protocol yields no replicates.
    """
    rng = np.random.RandomState(seed)

    # Fixed SHAP evaluation pool per individual evaluation so that SHAP
    # matrices from different replicates are comparable row-for-row.
    eval_size = min(max_eval_rows, len(X))
    eval_idx = rng.choice(len(X), size=eval_size, replace=False)
    X_eval_fixed = X.iloc[eval_idx]

    # Freeze preprocessing dimensionality: probe-fit once on the full data to
    # learn the post-preprocessing width, then clamp select_k to it.
    fixed_poly_degree = pre_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, pre_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    Xp = probe_pre.fit_transform(X, y)
    n_after_prep = Xp.shape[1]
    desired_k = pre_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))

    shap_mats_with_names = []
    losses = []
    fit_times = []
    shap_times = []

    # Materialize the replicates up front: protocol_fn may hand back a
    # generator, which would break len() below and can only be iterated once.
    replicates = list(protocol_fn(X, y, seed=seed, **protocol_params))
    if not replicates:
        raise ValueError("protocol_fn produced no replicates")

    for rep_id, rep in enumerate(replicates):
        if rep["type"] in ("cv", "bootstrap"):
            tr, te = rep["train_idx"], rep["test_idx"]
            X_fit, y_fit = X.iloc[tr], y.iloc[tr]
            X_test, y_test = X.iloc[te], y.iloc[te]
        else:
            # Noise protocol: train on perturbed features, score on clean data.
            X_fit, y_fit = rep["X_noisy"], y
            X_test, y_test = X, y

        preproc = build_preprocessor(
            X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_fit,
            y_fit=y_fit,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))
        fit_times.append(t_fit)
        shap_times.append(t_shap)

        if task == "regression":
            y_pred = pipe.predict(X_test)
            loss = float(mean_squared_error(y_test, y_pred))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X_test)[:, 1]
            else:
                # Min-max squash raw decision scores into [0, 1] so the Brier
                # score is defined; epsilon avoids division by zero when all
                # scores are identical.
                scores = pipe.decision_function(X_test)
                scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
                y_prob = scores
            loss = float(brier_score_loss(y_test, y_prob))

        losses.append(loss)

    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    mse_like = float(np.mean(losses))
    stability_val = float(stability)

    meta = {
        "loss_std": float(np.std(losses)),
        "fit_time_mean": float(np.mean(fit_times)) if fit_times else 0.0,
        "shap_time_mean": float(np.mean(shap_times)) if shap_times else 0.0,
        "inst_feat_std": float(agg_std),
        "n_replicates": len(replicates),
    }

    return mse_like, stability_val, meta


# --- src/protocols_methodology/automl_protocol_adapters.py ---
from src.protocols_methodology.protocols import (
    kfold_indices,
    bootstrap_indices,
    noise_perturbations,
)


def cv_protocol(X, y, n_folds=5, seed=0):
    """K-fold CV replicates as dicts consumed by evaluate_config_protocol_aware."""
    return [
        {"type": "cv", "train_idx": tr, "test_idx": te}
        for tr, te in kfold_indices(len(X), n_folds, seed)
    ]


def bootstrap_protocol(X, y, n_bootstrap=30, seed=0):
    """Bootstrap (train) / out-of-bag (test) replicates."""
    return [
        {"type": "bootstrap", "train_idx": tr, "test_idx": te}
        for tr, te in bootstrap_indices(len(X), n_bootstrap, seed)
    ]


def noise_protocol(X, y, n_replicates=30, noise_std=0.01, seed=0):
    """Replicates that perturb numeric features with Gaussian noise of one level."""
    levels = [noise_std] * n_replicates
    return [
        {"type": "noise", "sigma": sigma, "X_noisy": X_noisy}
        for sigma, X_noisy in noise_perturbations(X, levels, seed)
    ]
# --- src/protocols_methodology/exp_bootstrap.py ---
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss

from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import bootstrap_indices


def run_bootstrap_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_bootstrap=30,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Bootstrap stability protocol for one fixed model configuration.

    Trains a fresh pipeline per bootstrap resample, computes SHAP values on a
    fixed evaluation pool, and scores each replicate on its out-of-bag rows.
    Returns (summary_dict, per_replicate_rows).
    """
    rng = np.random.RandomState(seed)

    # Fixed SHAP evaluation pool: identical rows across replicates make the
    # per-row SHAP matrices directly comparable.
    pool_size = min(max_eval_rows, len(X))
    pool_idx = rng.choice(len(X), size=pool_size, replace=False)
    X_eval_fixed = X.iloc[pool_idx]

    # Probe-fit the preprocessor once to freeze the feature-space width, then
    # clamp the requested select_k to what is actually available.
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    width = probe_pre.fit_transform(X, y).shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), width))

    shap_mats_with_names = []
    rep_rows = []

    for rep_id, (tr, te) in enumerate(bootstrap_indices(len(X), n_bootstrap, seed)):
        X_boot, y_boot = X.iloc[tr], y.iloc[tr]

        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_boot,
            y_fit=y_boot,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))

        # Out-of-bag loss on the held-out indices.
        X_te, y_te = X.iloc[te], y.iloc[te]
        if task == "regression":
            loss = float(mean_squared_error(y_te, pipe.predict(X_te)))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X_te)[:, 1]
            else:
                # Squash raw decision scores into [0, 1] for the Brier score.
                scores = pipe.decision_function(X_te)
                y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
            loss = float(brier_score_loss(y_te, y_prob))

        # Running stability over all replicates seen so far.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "bootstrap",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    loss_vals = [r["loss"] for r in rep_rows]
    summary = {
        "seed": seed,
        "protocol": "bootstrap",
        "n_replicates": n_bootstrap,
        "loss_mean": float(np.mean(loss_vals)),
        "loss_std": float(np.std(loss_vals)),
        "fit_time_mean": float(np.mean([r["fit_time"] for r in rep_rows])),
        "shap_time_mean": float(np.mean([r["shap_time"] for r in rep_rows])),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows


def main():
    """CLI entry: run the bootstrap protocol over several seeds and save CSV/JSON."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_bootstrap")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # Fixed model families for the methodology experiments.
    presets = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = presets[args.algo]

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        summary, rep_rows = run_bootstrap_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_bootstrap=args.n_bootstrap,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_bootstrap_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_bootstrap.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "bootstrap",
                "protocol_params": {"n_bootstrap": args.n_bootstrap, "seeds": args.seeds},
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()
# --- src/protocols_methodology/exp_cv.py ---
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss

from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import kfold_indices


def run_cv_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_folds=5,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """K-fold CV stability protocol for one fixed model configuration.

    One pipeline is trained per fold; SHAP values are computed on a fixed
    evaluation pool and the loss is measured on the fold's test split.
    Returns (summary_dict, per_replicate_rows).
    """
    rng = np.random.RandomState(seed)

    # Fixed SHAP evaluation pool per seed so fold-wise SHAP matrices align.
    pool_size = min(max_eval_rows, len(X))
    pool_idx = rng.choice(len(X), size=pool_size, replace=False)
    X_eval_fixed = X.iloc[pool_idx]

    # Probe-fit once to freeze the post-preprocessing width for comparability.
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    width = probe_pre.fit_transform(X, y).shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), width))

    shap_mats_with_names = []
    rep_rows = []

    for rep_id, (tr, te) in enumerate(kfold_indices(len(X), n_folds, seed)):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X.iloc[tr],
            y_fit=y.iloc[tr],
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))

        # Loss on the fold's held-out split.
        X_te, y_te = X.iloc[te], y.iloc[te]
        if task == "regression":
            loss = float(mean_squared_error(y_te, pipe.predict(X_te)))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X_te)[:, 1]
            else:
                # Squash raw decision scores into [0, 1] for the Brier score.
                scores = pipe.decision_function(X_te)
                y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
            loss = float(brier_score_loss(y_te, y_prob))

        # Running stability so replicates can be plotted as a trajectory.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "cv",
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    loss_vals = [r["loss"] for r in rep_rows]
    summary = {
        "seed": seed,
        "protocol": "cv",
        "n_replicates": n_folds,
        "loss_mean": float(np.mean(loss_vals)),
        "loss_std": float(np.std(loss_vals)),
        "fit_time_mean": float(np.mean([r["fit_time"] for r in rep_rows])),
        "shap_time_mean": float(np.mean([r["shap_time"] for r in rep_rows])),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows


def main():
    """CLI entry: run the CV protocol over several seeds and save CSV/JSON."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-folds", type=int, default=5)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_cv")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # Fixed family for methodology experiments.
    presets = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = presets[args.algo]

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        summary, rep_rows = run_cv_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_folds=args.n_folds,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_cv_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_cv_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_cv.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "cv",
                "protocol_params": {"n_folds": args.n_folds, "seeds": args.seeds},
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()
# --- src/protocols_methodology/exp_noise.py ---
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss

from src.data_openml import load_dataset
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.protocols import noise_perturbations


def run_noise_protocol(
    X,
    y,
    task,
    algo,
    model_params,
    preproc_cfg,
    n_replicates=30,
    noise_std=0.01,
    seed=0,
    max_eval_rows=1024,
    bg_size=128,
):
    """Noise-perturbation stability protocol for one fixed model configuration.

    Each replicate retrains the pipeline on a noisy copy of X (numeric columns
    perturbed at a single sigma level) and is scored on the clean data, so the
    loss reflects robustness of noisy training. Returns
    (summary_dict, per_replicate_rows).
    """
    rng = np.random.RandomState(seed)

    # Fixed SHAP evaluation pool so replicate SHAP matrices align row-for-row.
    pool_size = min(max_eval_rows, len(X))
    pool_idx = rng.choice(len(X), size=pool_size, replace=False)
    X_eval_fixed = X.iloc[pool_idx]

    # Probe-fit once to freeze the post-preprocessing width for comparability.
    fixed_poly_degree = preproc_cfg.get("poly_degree", 1)
    probe_pre = build_preprocessor(
        X, task, preproc_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
    )
    width = probe_pre.fit_transform(X, y).shape[1]
    desired_k = preproc_cfg.get("select_k", None)
    fixed_k = None if desired_k is None else int(min(max(1, desired_k), width))

    shap_mats_with_names = []
    rep_rows = []

    # Reuse the shared noise generator for consistency; it expects a list of
    # sigma levels, so the single level is repeated n_replicates times.
    levels = [noise_std] * n_replicates
    noisy_sets = noise_perturbations(X, levels, seed)

    for rep_id, (sigma, X_noisy) in enumerate(noisy_sets):
        preproc = build_preprocessor(
            X, task, preproc_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
        )
        model = make_model(task, algo, model_params, random_state=seed + rep_id)
        pipe = Pipeline([("pre", preproc), ("model", model)])

        shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_noisy,
            y_fit=y,
            X_eval=X_eval_fixed,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_mats_with_names.append((shap_vals, feat_names))

        # Evaluate on clean X to measure robustness of noisy training
        # (deliberately in-sample with respect to the clean data).
        if task == "regression":
            loss = float(mean_squared_error(y, pipe.predict(X)))
        else:
            if hasattr(pipe.named_steps["model"], "predict_proba"):
                y_prob = pipe.predict_proba(X)[:, 1]
            else:
                # Squash raw decision scores into [0, 1] for the Brier score.
                scores = pipe.decision_function(X)
                y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
            loss = float(brier_score_loss(y, y_prob))

        # Running stability over replicates seen so far.
        agg_std_rep, stability_rep, _, _ = shap_stability_from_matrices(shap_mats_with_names)

        rep_rows.append(
            {
                "seed": seed,
                "replicate_id": rep_id,
                "protocol": "noise",
                "sigma": float(sigma),
                "loss": loss,
                "fit_time": float(t_fit),
                "shap_time": float(t_shap),
                "inst_feat_std_rep": float(agg_std_rep),
                "stability_rep": float(stability_rep),
            }
        )

    agg_std, stability, _, _ = shap_stability_from_matrices(shap_mats_with_names)

    loss_vals = [r["loss"] for r in rep_rows]
    summary = {
        "seed": seed,
        "protocol": "noise",
        "n_replicates": n_replicates,
        "noise_std": float(noise_std),
        "loss_mean": float(np.mean(loss_vals)),
        "loss_std": float(np.std(loss_vals)),
        "fit_time_mean": float(np.mean([r["fit_time"] for r in rep_rows])),
        "shap_time_mean": float(np.mean([r["shap_time"] for r in rep_rows])),
        "inst_feat_std": float(agg_std),
        "stability": float(stability),
    }

    return summary, rep_rows


def main():
    """CLI entry: run the noise protocol over several seeds and save CSV/JSON."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--algo", default="rf", choices=["rf", "gbt", "mlp"])
    ap.add_argument("--n-replicates", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    ap.add_argument("--seeds", type=int, nargs="+", default=[0, 1, 2, 3, 4])
    ap.add_argument("--outdir", default="runs/protocol_noise")
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset)

    preproc_cfg = {
        "num_impute_strategy": "median",
        "cat_impute_strategy": "most_frequent",
        "scaler": "standard",
        "poly_degree": 1,
        "select_k": None,
    }

    # Fixed model families for the methodology experiments.
    presets = {
        "rf": {"n_estimators": 300, "max_depth": 8, "max_features": "sqrt"},
        "gbt": {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        "mlp": {
            "hidden_layers": (64, 64),
            "activation": "relu",
            "alpha": 1e-4,
            "lr_init": 1e-3,
            "max_iter": 200,
        },
    }
    model_params = presets[args.algo]

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    summaries = []
    all_rep_rows = []
    for seed in args.seeds:
        summary, rep_rows = run_noise_protocol(
            X,
            y,
            task,
            algo=args.algo,
            model_params=model_params,
            preproc_cfg=preproc_cfg,
            n_replicates=args.n_replicates,
            noise_std=args.noise_std,
            seed=seed,
        )
        summaries.append(summary)
        all_rep_rows.extend(rep_rows)

    summary_path = outdir / f"{args.dataset}_{args.algo}_noise_summary.csv"
    reps_path = outdir / f"{args.dataset}_{args.algo}_noise_replicates.csv"
    cfg_path = outdir / f"config_{args.dataset}_{args.algo}_noise.json"

    pd.DataFrame(summaries).to_csv(summary_path, index=False)
    pd.DataFrame(all_rep_rows).to_csv(reps_path, index=False)

    with open(cfg_path, "w") as f:
        json.dump(
            {
                "dataset": args.dataset,
                "algo": args.algo,
                "task": task,
                "protocol": "noise",
                "protocol_params": {
                    "n_replicates": args.n_replicates,
                    "noise_std": args.noise_std,
                    "seeds": args.seeds,
                },
                "model_params": model_params,
                "preproc_cfg": preproc_cfg,
            },
            f,
            indent=2,
        )

    print("Saved:")
    print(summary_path)
    print(reps_path)
    print(cfg_path)


if __name__ == "__main__":
    main()
# --- src/protocols_methodology/nsga_toolbox_protocols.py ---
import mlflow
from deap import base, creator, tools
from sklearn.utils import check_random_state

from src.search.nsga_deap import decode
from src.protocols_methodology.automl_evaluate import evaluate_config_protocol_aware


def build_toolbox_protocol_aware(
    X,
    y,
    task,
    seed,
    protocol_fn,
    protocol_params,
    mlflow_experiment,
):
    """Build a DEAP toolbox whose fitness uses a pluggable resampling protocol.

    Two objectives: minimize the protocol's mean loss ("mse_like") and
    maximize SHAP stability (fitness weights (-1.0, 1.0)). Each individual's
    evaluation is logged as a nested MLflow run.
    """
    rng = check_random_state(seed)

    # Route nested evaluation runs to the requested experiment. Previously
    # this parameter was accepted but never used, so runs could land in
    # whatever experiment happened to be active in the process.
    if mlflow_experiment:
        mlflow.set_experiment(mlflow_experiment)

    # creator classes are process-global; guard against double creation when
    # several toolboxes are built in the same process.
    if not hasattr(creator, "FitnessMSEStab"):
        creator.create("FitnessMSEStab", base.Fitness, weights=(-1.0, 1.0))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMSEStab)

    toolbox = base.Toolbox()
    # NOTE(review): randint samples [0, 1000000) (upper bound exclusive)
    # while mutUniformInt below mutates over [0, 1000000] inclusive; harmless
    # if decode() reduces genes modulo a range — confirm against decode().
    toolbox.register("gene", rng.randint, 0, 1000000)
    toolbox.register(
        "individual",
        tools.initRepeat,
        creator.Individual,
        toolbox.gene,
        n=16,
    )
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def eval_ind(individual):
        """Decode a 16-gene genome, evaluate it under the protocol, log to MLflow."""
        algo, model_params, pre_cfg = decode(individual)

        with mlflow.start_run(run_name=f"{algo}", nested=True):
            for gi, g in enumerate(individual):
                mlflow.log_param(f"g{gi}", int(g))
            mlflow.log_param("algo", algo)
            for k, v in model_params.items():
                mlflow.log_param(f"m_{k}", v)
            for k, v in pre_cfg.items():
                mlflow.log_param(f"p_{k}", v)

            mse_like, stability, meta = evaluate_config_protocol_aware(
                X=X,
                y=y,
                task=task,
                algo=algo,
                model_params=model_params,
                pre_cfg=pre_cfg,
                protocol_fn=protocol_fn,
                protocol_params=protocol_params,
                seed=seed,
            )

            mlflow.log_metric("mse_like", mse_like)
            mlflow.log_metric("stability", stability)
            for mk, mv in meta.items():
                mlflow.log_metric(mk, mv)

        return mse_like, stability

    toolbox.register("evaluate", eval_ind)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register(
        "mutate",
        tools.mutUniformInt,
        low=0,
        up=1000000,
        indpb=0.2,
    )
    toolbox.register("select", tools.selNSGA2)

    return toolbox


# --- src/protocols_methodology/protocols.py ---
from typing import Iterable, Tuple, List
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import KFold
from sklearn.utils import resample


def kfold_indices(n: int, k: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield (train_idx, test_idx) pairs from shuffled k-fold CV over n rows."""
    kf = KFold(n_splits=k, shuffle=True, random_state=seed)
    for tr, te in kf.split(range(n)):
        yield np.array(tr), np.array(te)


def bootstrap_indices(n: int, B: int, seed: int) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield (bootstrap_train_idx, out_of_bag_idx) pairs, B times.

    If a resample happens to cover every row (no OOB rows left), fall back to
    a random 20% subset so the caller always receives a non-empty test set.
    """
    rng = np.random.RandomState(seed)
    for _ in range(B):
        train_idx = resample(np.arange(n), replace=True, n_samples=n, random_state=rng)
        mask = np.ones(n, dtype=bool)
        mask[train_idx] = False
        test_idx = np.where(mask)[0]
        if len(test_idx) == 0:
            test_idx = rng.choice(n, size=max(1, n // 5), replace=False)
        yield train_idx, test_idx
# --- src/protocols_methodology/protocols.py (continued) ---
import argparse
import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype


def noise_perturbations(X, levels, seed):
    """Return [(sigma, X_noisy), ...] with numeric columns Gaussian-perturbed.

    Noise for each numeric column is drawn with scale sigma * column_std, so
    the perturbation is relative to each feature's spread. Non-numeric
    columns are left untouched.
    """
    rng = np.random.RandomState(seed)
    perturbed = []
    num_cols = [c for c in X.columns if is_numeric_dtype(X[c])]
    # Per-column noise scale. Zero-variance columns would receive no noise,
    # and a single-row frame yields NaN stds that would poison the data with
    # NaN noise — fall back to 1.0 in both cases.
    std = X[num_cols].std().replace(0, 1.0).fillna(1.0)
    for sigma in levels:
        Xn = X.copy()
        for col in num_cols:
            Xn[col] = Xn[col] + rng.normal(0, sigma * std.get(col, 1.0), size=len(Xn))
        perturbed.append((sigma, Xn))
    return perturbed


# --- src/protocols_methodology/run_nsga_protocols.py ---

def save_checkpoint(path, gen, pop, seed):
    """Pickle GA state (generation, population, both RNG states) for resume."""
    state = {
        "gen": gen,
        "pop": pop,
        "py_random_state": random.getstate(),
        "np_random_state": np.random.get_state(),
        "seed": seed,
    }
    with open(path, "wb") as f:
        pickle.dump(state, f)


def load_checkpoint(path):
    """Restore state saved by save_checkpoint; returns (gen, pop, seed).

    Side effect: re-seeds both the stdlib and NumPy global RNGs so the run
    continues exactly where it left off.
    """
    with open(path, "rb") as f:
        state = pickle.load(f)
    random.setstate(state["py_random_state"])
    np.random.set_state(state["np_random_state"])
    return state["gen"], state["pop"], state["seed"]


def main():
    """Run the NSGA-II AutoML search once per resampling protocol (cv/bootstrap/noise)."""
    # Heavy experiment-only dependencies are imported lazily so that the
    # checkpoint/noise helpers above stay importable without the full stack.
    import mlflow
    from deap import algorithms
    from deap.tools.emo import sortNondominated

    from src.data_openml import load_dataset
    from src.search.nsga_deap import decode
    from src.protocols_methodology.nsga_toolbox_protocols import build_toolbox_protocol_aware
    from src.protocols_methodology.automl_protocol_adapters import (
        cv_protocol,
        bootstrap_protocol,
        noise_protocol,
    )

    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
    ap.add_argument("--generations", type=int, default=10)
    ap.add_argument("--pop-size", type=int, default=24)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--experiment", default="deap_nsga_protocol_study")
    ap.add_argument("--checkpoint-every", type=int, default=5)
    ap.add_argument("--shap-pf-eval-rows", type=int, default=512)
    ap.add_argument("--n-folds", type=int, default=3)
    ap.add_argument("--n-bootstrap", type=int, default=30)
    ap.add_argument("--n-noise", type=int, default=30)
    ap.add_argument("--noise-std", type=float, default=0.01)
    args = ap.parse_args()

    X, y, task = load_dataset(args.dataset, random_state=args.seed)
    mlflow.set_experiment(args.experiment)

    protocols = {
        "cv": (cv_protocol, {"n_folds": args.n_folds}),
        "bootstrap": (bootstrap_protocol, {"n_bootstrap": args.n_bootstrap}),
        "noise": (noise_protocol, {"n_replicates": args.n_noise, "noise_std": args.noise_std}),
    }

    base_outdir = Path("runs") / f"{args.dataset}_protocol_study"
    base_outdir.mkdir(parents=True, exist_ok=True)

    for pname, (pfn, pparams) in protocols.items():
        outdir = base_outdir / pname
        outdir.mkdir(parents=True, exist_ok=True)
        ckpt_path = outdir / "checkpoint.pkl"

        # Re-seed per protocol so the three searches are comparable.
        random.seed(args.seed)
        np.random.seed(args.seed)

        toolbox = build_toolbox_protocol_aware(
            X=X,
            y=y,
            task=task,
            seed=args.seed,
            protocol_fn=pfn,
            protocol_params=pparams,
            mlflow_experiment=args.experiment,
        )

        if ckpt_path.exists():
            start_gen, pop, loaded_seed = load_checkpoint(ckpt_path)
            if loaded_seed != args.seed:
                print(f"Warning: checkpoint seed {loaded_seed} differs from current seed {args.seed}")
            print(f"[{pname}] Resuming from generation {start_gen}")
        else:
            pop = toolbox.population(n=args.pop_size)
            fits = list(map(toolbox.evaluate, pop))
            for ind, fit in zip(pop, fits):
                ind.fitness.values = fit
            start_gen = 0
            save_checkpoint(ckpt_path, start_gen, pop, args.seed)
            print(f"[{pname}] Initial checkpoint saved")

        for gen in range(start_gen, args.generations):
            offspring = algorithms.varAnd(pop, toolbox, cxpb=0.7, mutpb=0.2)
            fits = list(map(toolbox.evaluate, offspring))
            for ind, fit in zip(offspring, fits):
                ind.fitness.values = fit
            pop = toolbox.select(pop + offspring, k=args.pop_size)

            # Save PF history for convergence plots.
            pf_gen = sortNondominated(pop, len(pop), first_front_only=True)[0]
            rows_gen = []
            for ind in pf_gen:
                algo, model_params, pre_cfg = decode(ind)
                rows_gen.append({
                    "gen": gen + 1,
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                })
            pd.DataFrame(rows_gen).to_csv(outdir / f"pareto_gen_{gen + 1}.csv", index=False)

            if (gen + 1) % args.checkpoint_every == 0:
                save_checkpoint(ckpt_path, gen + 1, pop, args.seed)
                print(f"[{pname}] Checkpoint saved at gen {gen + 1}")

        # Save final full population for dominated-region analysis.
        all_rows = []
        for ind in pop:
            algo, model_params, pre_cfg = decode(ind)
            all_rows.append({
                "algo": algo,
                "mse_like": ind.fitness.values[0],
                "stability": ind.fitness.values[1],
                **{f"m_{k}": v for k, v in model_params.items()},
                **{f"p_{k}": v for k, v in pre_cfg.items()},
            })
        pd.DataFrame(all_rows).to_csv(outdir / "final_population.csv", index=False)

        pf = sortNondominated(pop, len(pop), first_front_only=True)[0]
        rows = []
        for ind in pf:
            algo, model_params, pre_cfg = decode(ind)
            rows.append(
                {
                    "algo": algo,
                    "mse_like": ind.fitness.values[0],
                    "stability": ind.fitness.values[1],
                    **{f"m_{k}": v for k, v in model_params.items()},
                    **{f"p_{k}": v for k, v in pre_cfg.items()},
                }
            )

        pareto_path = outdir / "pareto_front.csv"
        pd.DataFrame(rows).to_csv(pareto_path, index=False)
        print(f"[{pname}] Saved Pareto front to {pareto_path}")

        # Optional SHAP saving for PF models, same as run_deap.
        shap_dir = outdir / "shap"
        shap_dir.mkdir(exist_ok=True)

        eval_rows = min(args.shap_pf_eval_rows, len(X))
        rng = np.random.RandomState(args.seed)
        eval_idx = rng.choice(len(X), size=eval_rows, replace=False)
        X_eval_shap = X.iloc[eval_idx]

        from src.preprocessing import build_preprocessor
        from src.models import make_model
        from src.stability import compute_shap_matrix
        from sklearn.pipeline import Pipeline as SkPipeline

        for i, ind in enumerate(pf):
            algo, model_params, pre_cfg = decode(ind)

            fixed_poly_degree = pre_cfg.get("poly_degree", 1)
            fixed_k = pre_cfg.get("select_k", None)

            preproc = build_preprocessor(
                X,
                task,
                pre_cfg,
                fixed_k=fixed_k,
                fixed_poly_degree=fixed_poly_degree,
            )
            model = make_model(task, algo, model_params, random_state=args.seed)
            pipe = SkPipeline([("pre", preproc), ("model", model)])

            shap_vals, t_fit, t_shap, feat_names = compute_shap_matrix(
                pipe,
                X_fit=X,
                y_fit=y,
                X_eval=X_eval_shap,
                task_type=task,
                bg_size=128,
                max_eval_rows=eval_rows,
                rng_seed=args.seed,
            )

            np.save(shap_dir / f"pf_{i}_shap_vals.npy", shap_vals)
            np.save(shap_dir / f"pf_{i}_feat_names.npy", np.asarray(feat_names))

        print(f"[{pname}] Saved SHAP arrays for {len(pf)} PF models")

    print(f"Done. All protocol AutoML runs in {base_outdir}")


if __name__ == "__main__":
    main()