Add code for new scalarized unimodal experiments (CMA-ES-TC, EMNA-TC, PSO-TC)

This commit is contained in:
Varyngoth
2025-12-30 16:45:57 -04:00
parent 7737997e7a
commit aa0d17aaef
6 changed files with 778 additions and 0 deletions

View File

@@ -0,0 +1,194 @@
import argparse
import json
import time
from pathlib import Path
import numpy as np
import mlflow
from src.data_openml import load_dataset
from .search_space_rf import (
PARAM_NAMES,
decode_rf,
clip_to_bounds,
sample_uniform,
)
from .scalarized_evaluate import evaluate_scalarized
def run_cmaes_tc(
dataset,
alpha,
seed=0,
pop_size=20,
generations=50,
elite_frac=0.5,
n_folds=3,
sigma0=0.2,
tc_k=5,
eps_scalar=1e-4,
reinject_factor=1.5,
experiment_name="cmaes_tc_unimodal",
outdir="runs/unimodal_cmaes_tc",
):
"""Simplified CMA-ES-TC.
We keep a diagonal covariance adaptation via elite std.
Label this as simplified CMA-ES for teaching unless you later add full cov updates.
"""
X, y, task = load_dataset(dataset, random_state=seed)
pre_cfg = {
"num_impute_strategy": "median",
"cat_impute_strategy": "most_frequent",
"scaler": "standard",
"poly_degree": 1,
"select_k": None,
}
rng = np.random.RandomState(seed)
dim = len(PARAM_NAMES)
n_elite = max(1, int(pop_size * elite_frac))
outdir = Path(outdir) / f"{dataset}_alpha{alpha}_seed{seed}"
outdir.mkdir(parents=True, exist_ok=True)
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=f"{dataset}_alpha{alpha}_seed{seed}"):
mlflow.log_params({
"dataset": dataset,
"alpha": alpha,
"seed": seed,
"pop_size": pop_size,
"generations": generations,
"elite_frac": elite_frac,
"n_folds": n_folds,
"sigma0": sigma0,
"tc_k": tc_k,
"eps_scalar": eps_scalar,
})
# Initialize mean from random uniform sample
init_pop = np.stack([sample_uniform(rng) for _ in range(pop_size)])
mean = np.mean(init_pop, axis=0)
sigma = np.ones(dim) * sigma0
log = []
std_log = []
best_vec = None
best_scalar = float("inf")
best_meta = None
start = time.time()
for gen in range(generations):
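# Sample the new population from an axis-aligned Gaussian around the current mean (diagonal sigma).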
samples = rng.normal(loc=mean, scale=sigma, size=(pop_size, dim))
samples = [clip_to_bounds(s) for s in samples]
fits = []
for s in samples:
mp = decode_rf(s)
scalar, meta = evaluate_scalarized(
X=X,
y=y,
task=task,
algo="rf",
model_params=mp,
pre_cfg=pre_cfg,
alpha=alpha,
seed=seed,
n_folds=n_folds,
)
fits.append((scalar, s, meta))
fits.sort(key=lambda x: x[0])
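# Recenter the mean on the elite subset and reset the per-dimension sigma to the elite spread (the diagonal-only update noted in the docstring).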
elites = [f[1] for f in fits[:n_elite]]
elites = np.stack(elites)
mean = np.mean(elites, axis=0)
sigma = np.std(elites, axis=0) + 1e-6
scalars = [f[0] for f in fits]
pop_std = float(np.std(scalars))
std_log.append(pop_std)
best_scalar_gen, best_vec_gen, best_meta_gen = fits[0]
if best_scalar_gen < best_scalar:
best_scalar = float(best_scalar_gen)
best_vec = best_vec_gen.copy()
best_meta = best_meta_gen.copy()
row = {
"gen": gen,
"best_scalar": float(best_scalar_gen),
"best_mse_like": best_meta_gen["mse_like"],
"best_shap_std": best_meta_gen["shap_std"],
"best_stability_score": best_meta_gen["stability_score"],
"pop_std_scalar": pop_std,
"params": decode_rf(best_vec_gen),
"elapsed_s": time.time() - start,
}
log.append(row)
mlflow.log_metrics({
"best_scalar": row["best_scalar"],
"best_mse_like": row["best_mse_like"],
"best_shap_std": row["best_shap_std"],
"best_stability_score": row["best_stability_score"],
"pop_std_scalar": row["pop_std_scalar"],
}, step=gen)
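# Threshold convergence: if the population fitness spread stayed below eps_scalar for tc_k generations, inflate sigma to reinject diversity.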
if gen >= tc_k and all(s < eps_scalar for s in std_log[-tc_k:]):
sigma *= reinject_factor
mlflow.log_metric("tc_reinject", 1.0, step=gen)
hall = {
"best_params": decode_rf(best_vec),
"best_scalar": best_scalar,
**best_meta,
}
(outdir / "log.json").write_text(json.dumps(log, indent=2))
(outdir / "hall_of_fame.json").write_text(json.dumps(hall, indent=2))
mlflow.log_dict(log, "log.json")
mlflow.log_dict(hall, "hall_of_fame.json")
return hall
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
ap.add_argument("--alpha", type=float, required=True)
ap.add_argument("--seed", type=int, default=0)
ap.add_argument("--pop-size", type=int, default=20)
ap.add_argument("--generations", type=int, default=50)
ap.add_argument("--elite-frac", type=float, default=0.5)
ap.add_argument("--n-folds", type=int, default=3)
ap.add_argument("--sigma0", type=float, default=0.2)
ap.add_argument("--tc-k", type=int, default=5)
ap.add_argument("--eps-scalar", type=float, default=1e-4)
ap.add_argument("--experiment-name", default="cmaes_tc_unimodal")
ap.add_argument("--outdir", default="runs/unimodal_cmaes_tc")
args = ap.parse_args()
run_cmaes_tc(
dataset=args.dataset,
alpha=args.alpha,
seed=args.seed,
pop_size=args.pop_size,
generations=args.generations,
elite_frac=args.elite_frac,
n_folds=args.n_folds,
sigma0=args.sigma0,
tc_k=args.tc_k,
eps_scalar=args.eps_scalar,
experiment_name=args.experiment_name,
outdir=args.outdir,
)
if __name__ == "__main__":
main()
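
For a quick end-to-end check, the optimizer above can also be called directly from Python rather than through the CLI. A minimal sketch, assuming the module name cmaes_tc used by the runner script further down:

from src.experiments_unimodal_scalarized.cmaes_tc import run_cmaes_tc

# Tiny budget: just exercises sampling, evaluation, logging, and the TC check.
hall = run_cmaes_tc(dataset="adult", alpha=0.5, seed=0, pop_size=8, generations=3)
print(hall["best_scalar"], hall["best_params"])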

View File

@@ -0,0 +1,183 @@
import argparse
import json
import time
from pathlib import Path
import numpy as np
import mlflow
from src.data_openml import load_dataset
from .search_space_rf import (
PARAM_NAMES,
sample_uniform,
decode_rf,
clip_to_bounds,
)
from .scalarized_evaluate import evaluate_scalarized
def run_emna_tc(
dataset,
alpha,
seed=0,
pop_size=20,
generations=50,
elite_frac=0.5,
n_folds=3,
tc_k=5,
eps_scalar=1e-4,
reinject_factor=1.5,
experiment_name="emna_tc_unimodal",
outdir="runs/unimodal_emna_tc",
):
X, y, task = load_dataset(dataset, random_state=seed)
pre_cfg = {
"num_impute_strategy": "median",
"cat_impute_strategy": "most_frequent",
"scaler": "standard",
"poly_degree": 1,
"select_k": None,
}
rng = np.random.RandomState(seed)
dim = len(PARAM_NAMES)
n_elite = max(1, int(pop_size * elite_frac))
outdir = Path(outdir) / f"{dataset}_alpha{alpha}_seed{seed}"
outdir.mkdir(parents=True, exist_ok=True)
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=f"{dataset}_alpha{alpha}_seed{seed}"):
mlflow.log_params({
"dataset": dataset,
"alpha": alpha,
"seed": seed,
"pop_size": pop_size,
"generations": generations,
"elite_frac": elite_frac,
"n_folds": n_folds,
"tc_k": tc_k,
"eps_scalar": eps_scalar,
})
pop = [sample_uniform(rng) for _ in range(pop_size)]
mean = np.mean(pop, axis=0)
cov = np.cov(np.stack(pop).T) + 1e-6 * np.eye(dim)
log = []
std_log = []
best_vec = None
best_scalar = float("inf")
best_meta = None
start = time.time()
for gen in range(generations):
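# Sample candidates from the full multivariate Gaussian estimated in the previous generation.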
samples = rng.multivariate_normal(mean, cov, size=pop_size)
samples = [clip_to_bounds(s) for s in samples]
fits = []
for s in samples:
mp = decode_rf(s)
scalar, meta = evaluate_scalarized(
X=X,
y=y,
task=task,
algo="rf",
model_params=mp,
pre_cfg=pre_cfg,
alpha=alpha,
seed=seed,
n_folds=n_folds,
)
fits.append((scalar, s, meta))
fits.sort(key=lambda x: x[0])
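# Refit the mean and full covariance to the elite fraction; the small jitter keeps the covariance positive definite.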
elites = [f[1] for f in fits[:n_elite]]
mean = np.mean(elites, axis=0)
cov = np.cov(np.stack(elites).T) + 1e-6 * np.eye(dim)
scalars = [f[0] for f in fits]
pop_std = float(np.std(scalars))
std_log.append(pop_std)
best_scalar_gen, best_vec_gen, best_meta_gen = fits[0]
if best_scalar_gen < best_scalar:
best_scalar = float(best_scalar_gen)
best_vec = best_vec_gen.copy()
best_meta = best_meta_gen.copy()
row = {
"gen": gen,
"best_scalar": float(best_scalar_gen),
"best_mse_like": best_meta_gen["mse_like"],
"best_shap_std": best_meta_gen["shap_std"],
"best_stability_score": best_meta_gen["stability_score"],
"pop_std_scalar": pop_std,
"params": decode_rf(best_vec_gen),
"elapsed_s": time.time() - start,
}
log.append(row)
mlflow.log_metrics({
"best_scalar": row["best_scalar"],
"best_mse_like": row["best_mse_like"],
"best_shap_std": row["best_shap_std"],
"best_stability_score": row["best_stability_score"],
"pop_std_scalar": row["pop_std_scalar"],
}, step=gen)
# Threshold convergence reinjection
if gen >= tc_k and all(s < eps_scalar for s in std_log[-tc_k:]):
cov *= reinject_factor
mlflow.log_metric("tc_reinject", 1.0, step=gen)
hall = {
"best_params": decode_rf(best_vec),
"best_scalar": best_scalar,
**best_meta,
}
(outdir / "log.json").write_text(json.dumps(log, indent=2))
(outdir / "hall_of_fame.json").write_text(json.dumps(hall, indent=2))
mlflow.log_dict(log, "log.json")
mlflow.log_dict(hall, "hall_of_fame.json")
return hall
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
ap.add_argument("--alpha", type=float, required=True)
ap.add_argument("--seed", type=int, default=0)
ap.add_argument("--pop-size", type=int, default=20)
ap.add_argument("--generations", type=int, default=50)
ap.add_argument("--elite-frac", type=float, default=0.5)
ap.add_argument("--n-folds", type=int, default=3)
ap.add_argument("--tc-k", type=int, default=5)
ap.add_argument("--eps-scalar", type=float, default=1e-4)
ap.add_argument("--experiment-name", default="emna_tc_unimodal")
ap.add_argument("--outdir", default="runs/unimodal_emna_tc")
args = ap.parse_args()
run_emna_tc(
dataset=args.dataset,
alpha=args.alpha,
seed=args.seed,
pop_size=args.pop_size,
generations=args.generations,
elite_frac=args.elite_frac,
n_folds=args.n_folds,
tc_k=args.tc_k,
eps_scalar=args.eps_scalar,
experiment_name=args.experiment_name,
outdir=args.outdir,
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,196 @@
import argparse
import json
import time
from pathlib import Path
import numpy as np
import mlflow
from src.data_openml import load_dataset
from .search_space_rf import (
PARAM_NAMES,
sample_uniform,
decode_rf,
clip_to_bounds,
)
from .scalarized_evaluate import evaluate_scalarized
def run_pso_tc(
dataset,
alpha,
seed=0,
swarm_size=20,
iterations=50,
n_folds=3,
w=0.7,
c1=1.4,
c2=1.4,
tc_k=5,
eps_scalar=1e-4,
reinject_factor=1.5,
experiment_name="pso_tc_unimodal",
outdir="runs/unimodal_pso_tc",
):
X, y, task = load_dataset(dataset, random_state=seed)
pre_cfg = {
"num_impute_strategy": "median",
"cat_impute_strategy": "most_frequent",
"scaler": "standard",
"poly_degree": 1,
"select_k": None,
}
rng = np.random.RandomState(seed)
dim = len(PARAM_NAMES)
outdir = Path(outdir) / f"{dataset}_alpha{alpha}_seed{seed}"
outdir.mkdir(parents=True, exist_ok=True)
mlflow.set_experiment(experiment_name)
with mlflow.start_run(run_name=f"{dataset}_alpha{alpha}_seed{seed}"):
mlflow.log_params({
"dataset": dataset,
"alpha": alpha,
"seed": seed,
"swarm_size": swarm_size,
"iterations": iterations,
"n_folds": n_folds,
"w": w,
"c1": c1,
"c2": c2,
"tc_k": tc_k,
"eps_scalar": eps_scalar,
})
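# Initialize positions uniformly over the search space with small random velocities.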
pos = np.array([sample_uniform(rng) for _ in range(swarm_size)])
vel = rng.normal(0, 1, size=(swarm_size, dim)) * 0.1
pbest_pos = pos.copy()
pbest_fit = np.full(swarm_size, np.inf)
gbest_pos = None
gbest_fit = np.inf
gbest_meta = None
log = []
std_log = []
start = time.time()
for it in range(iterations):
fits = []
for i in range(swarm_size):
mp = decode_rf(pos[i])
scalar, meta = evaluate_scalarized(
X=X,
y=y,
task=task,
algo="rf",
model_params=mp,
pre_cfg=pre_cfg,
alpha=alpha,
seed=seed,
n_folds=n_folds,
)
fits.append((scalar, meta))
if scalar < pbest_fit[i]:
pbest_fit[i] = scalar
pbest_pos[i] = pos[i].copy()
if scalar < gbest_fit:
gbest_fit = scalar
gbest_pos = pos[i].copy()
gbest_meta = meta.copy()
scalars = [f[0] for f in fits]
pop_std = float(np.std(scalars))
std_log.append(pop_std)
row = {
"iter": it,
"best_scalar": float(gbest_fit),
"best_mse_like": gbest_meta["mse_like"],
"best_shap_std": gbest_meta["shap_std"],
"best_stability_score": gbest_meta["stability_score"],
"pop_std_scalar": pop_std,
"params": decode_rf(gbest_pos),
"elapsed_s": time.time() - start,
}
log.append(row)
mlflow.log_metrics({
"best_scalar": row["best_scalar"],
"best_mse_like": row["best_mse_like"],
"best_shap_std": row["best_shap_std"],
"best_stability_score": row["best_stability_score"],
"pop_std_scalar": row["pop_std_scalar"],
}, step=it)
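# Velocity update: inertia term plus cognitive (personal best) and social (global best) attraction.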
r1 = rng.rand(swarm_size, dim)
r2 = rng.rand(swarm_size, dim)
vel = (
w * vel
+ c1 * r1 * (pbest_pos - pos)
+ c2 * r2 * (gbest_pos - pos)
)
pos = pos + vel
pos = np.array([clip_to_bounds(p) for p in pos])
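# Threshold convergence: if the swarm fitness spread stayed below eps_scalar for tc_k iterations, inflate velocities to reinject diversity.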
if it >= tc_k and all(s < eps_scalar for s in std_log[-tc_k:]):
vel *= reinject_factor
mlflow.log_metric("tc_reinject", 1.0, step=it)
hall = {
"best_params": decode_rf(gbest_pos),
"best_scalar": float(gbest_fit),
**gbest_meta,
}
(outdir / "log.json").write_text(json.dumps(log, indent=2))
(outdir / "hall_of_fame.json").write_text(json.dumps(hall, indent=2))
mlflow.log_dict(log, "log.json")
mlflow.log_dict(hall, "hall_of_fame.json")
return hall
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
ap.add_argument("--alpha", type=float, required=True)
ap.add_argument("--seed", type=int, default=0)
ap.add_argument("--swarm-size", type=int, default=20)
ap.add_argument("--iterations", type=int, default=50)
ap.add_argument("--n-folds", type=int, default=3)
ap.add_argument("--w", type=float, default=0.7)
ap.add_argument("--c1", type=float, default=1.4)
ap.add_argument("--c2", type=float, default=1.4)
ap.add_argument("--tc-k", type=int, default=5)
ap.add_argument("--eps-scalar", type=float, default=1e-4)
ap.add_argument("--experiment-name", default="pso_tc_unimodal")
ap.add_argument("--outdir", default="runs/unimodal_pso_tc")
args = ap.parse_args()
run_pso_tc(
dataset=args.dataset,
alpha=args.alpha,
seed=args.seed,
swarm_size=args.swarm_size,
iterations=args.iterations,
n_folds=args.n_folds,
w=args.w,
c1=args.c1,
c2=args.c2,
tc_k=args.tc_k,
eps_scalar=args.eps_scalar,
experiment_name=args.experiment_name,
outdir=args.outdir,
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,55 @@
import argparse
import subprocess
import sys
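# Sweep alpha over 0.1..0.9; alpha trades off predictive loss against SHAP instability in the scalarized objective.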
ALPHAS = [round(0.1 * i, 1) for i in range(1, 10)]
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"])
ap.add_argument("--seed", type=int, default=0)
ap.add_argument("--emna-pop", type=int, default=20)
ap.add_argument("--emna-gens", type=int, default=50)
ap.add_argument("--cmaes-pop", type=int, default=20)
ap.add_argument("--cmaes-gens", type=int, default=50)
ap.add_argument("--pso-swarm", type=int, default=20)
ap.add_argument("--pso-iters", type=int, default=50)
args = ap.parse_args()
py = sys.executable # uses your current venv python
for alpha in ALPHAS:
subprocess.run([
py, "-m", "src.experiments_unimodal_scalarized.emna_tc",
"--dataset", args.dataset,
"--alpha", str(alpha),
"--seed", str(args.seed),
"--pop-size", str(args.emna_pop),
"--generations", str(args.emna_gens),
], check=True)
subprocess.run([
py, "-m", "src.experiments_unimodal_scalarized.cmaes_tc",
"--dataset", args.dataset,
"--alpha", str(alpha),
"--seed", str(args.seed),
"--pop-size", str(args.cmaes_pop),
"--generations", str(args.cmaes_gens),
], check=True)
subprocess.run([
py, "-m", "src.experiments_unimodal_scalarized.pso_tc",
"--dataset", args.dataset,
"--alpha", str(alpha),
"--seed", str(args.seed),
"--swarm-size", str(args.pso_swarm),
"--iterations", str(args.pso_iters),
], check=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,102 @@
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss
from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.automl_protocol_adapters import cv_protocol
def evaluate_scalarized(
X,
y,
task,
algo,
model_params,
pre_cfg,
alpha,
seed=0,
n_folds=3,
max_eval_rows=512,
bg_size=128,
):
"""Single objective scalarization.
Returns:
scalar: alpha * loss_mean + (1-alpha) * shap_std_mean (minimize)
meta: dict with mse_like, shap_std, stability_score
"""
rng = np.random.RandomState(seed)
# Fixed SHAP evaluation pool for this evaluation.
eval_size = min(max_eval_rows, len(X))
eval_idx = rng.choice(len(X), size=eval_size, replace=False)
X_eval_fixed = X.iloc[eval_idx]
# Freeze preprocessing dimensionality.
fixed_poly_degree = pre_cfg.get("poly_degree", 1)
probe_pre = build_preprocessor(
X, task, pre_cfg, fixed_k=None, fixed_poly_degree=fixed_poly_degree
)
Xp = probe_pre.fit_transform(X, y)
n_after_prep = Xp.shape[1]
desired_k = pre_cfg.get("select_k", None)
fixed_k = None if desired_k is None else int(min(max(1, desired_k), n_after_prep))
shap_mats_with_names = []
losses = []
reps = cv_protocol(X, y, n_folds=n_folds, seed=seed)
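# One preprocessing + model fit per CV repetition; SHAP is computed on the same fixed evaluation pool each time, so repetition-to-repetition differences reflect attribution instability.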
for rep_id, rep in enumerate(reps):
tr, te = rep["train_idx"], rep["test_idx"]
X_fit, y_fit = X.iloc[tr], y.iloc[tr]
X_test, y_test = X.iloc[te], y.iloc[te]
preproc = build_preprocessor(
X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=fixed_poly_degree
)
model = make_model(task, algo, model_params, random_state=seed + rep_id)
pipe = Pipeline([("pre", preproc), ("model", model)])
shap_vals, _, _, feat_names = compute_shap_matrix(
pipe,
X_fit=X_fit,
y_fit=y_fit,
X_eval=X_eval_fixed,
task_type=task,
bg_size=bg_size,
max_eval_rows=max_eval_rows,
rng_seed=seed,
)
shap_mats_with_names.append((shap_vals, feat_names))
if task == "regression":
y_pred = pipe.predict(X_test)
loss = float(mean_squared_error(y_test, y_pred))
else:
if hasattr(pipe.named_steps["model"], "predict_proba"):
y_prob = pipe.predict_proba(X_test)[:, 1]
else:
scores = pipe.decision_function(X_test)
scores = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
y_prob = scores
loss = float(brier_score_loss(y_test, y_prob))
losses.append(loss)
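# Aggregate SHAP instability across repetitions, then combine with the mean loss via the alpha-weighted scalarization.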
agg_std, stability_score, _, _ = shap_stability_from_matrices(shap_mats_with_names)
mse_like = float(np.mean(losses))
shap_std = float(agg_std)
scalar = float(alpha * mse_like + (1.0 - alpha) * shap_std)
meta = {
"mse_like": mse_like,
"shap_std": shap_std,
"stability_score": float(stability_score),
}
return scalar, meta

View File

@@ -0,0 +1,48 @@
import numpy as np
# Continuous-ish search space for RandomForest.
# We fix model family to keep the landscape close to unimodal once scalarized.
BOUNDS = {
"n_estimators": (50, 400), # int
"max_depth": (2, 15), # int
"max_features": (0.2, 1.0), # float (fraction of features)
"min_samples_split": (2, 20), # int
"min_samples_leaf": (1, 10), # int
}
PARAM_NAMES = list(BOUNDS.keys())
def sample_uniform(rng: np.random.RandomState):
vec = []
for k in PARAM_NAMES:
low, high = BOUNDS[k]
if k == "max_features":
vec.append(rng.uniform(low, high))
else:
vec.append(rng.randint(low, high + 1))
return np.array(vec, dtype=float)
def clip_to_bounds(vec):
out = []
for i, k in enumerate(PARAM_NAMES):
low, high = BOUNDS[k]
v = float(vec[i])
if k == "max_features":
v = float(np.clip(v, low, high))
else:
v = int(np.clip(round(v), low, high))
out.append(v)
return np.array(out, dtype=float)
def decode_rf(vec):
v = clip_to_bounds(vec)
return {
"n_estimators": int(v[0]),
"max_depth": int(v[1]),
"max_features": float(v[2]),
"min_samples_split": int(v[3]),
"min_samples_leaf": int(v[4]),
}
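
To make the encoding concrete, a short round-trip sketch, assuming this module is importable as src.experiments_unimodal_scalarized.search_space_rf (the path implied by the relative imports and the runner's -m invocations):

import numpy as np
from src.experiments_unimodal_scalarized.search_space_rf import sample_uniform, decode_rf

rng = np.random.RandomState(0)
vec = sample_uniform(rng)                            # raw float vector within BOUNDS
noisy = vec + rng.normal(0.0, 0.5, size=vec.shape)   # e.g. after an optimizer step
params = decode_rf(noisy)                            # clipped, rounded, correctly typed
print(params)                                        # usable as model_params for make_model(task, "rf", ...)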