From aa0d17aaefdc375676ac44e3e5ac49a387ee7395 Mon Sep 17 00:00:00 2001 From: Varyngoth Date: Tue, 30 Dec 2025 16:45:57 -0400 Subject: [PATCH] Really updated to add code for new experiments --- .../cmaes_tc.py | 194 +++++++++++++++++ .../emna_tc.py | 183 ++++++++++++++++ src/experiments_unimodal_scalarized/pso_tc.py | 196 ++++++++++++++++++ .../run_all_alphas.py | 55 +++++ .../scalarized_evaluate.py | 102 +++++++++ .../search_space_rf.py | 48 +++++ 6 files changed, 778 insertions(+) create mode 100644 src/experiments_unimodal_scalarized/cmaes_tc.py create mode 100644 src/experiments_unimodal_scalarized/emna_tc.py create mode 100644 src/experiments_unimodal_scalarized/pso_tc.py create mode 100644 src/experiments_unimodal_scalarized/run_all_alphas.py create mode 100644 src/experiments_unimodal_scalarized/scalarized_evaluate.py create mode 100644 src/experiments_unimodal_scalarized/search_space_rf.py diff --git a/src/experiments_unimodal_scalarized/cmaes_tc.py b/src/experiments_unimodal_scalarized/cmaes_tc.py new file mode 100644 index 0000000..e832dfa --- /dev/null +++ b/src/experiments_unimodal_scalarized/cmaes_tc.py @@ -0,0 +1,194 @@ +import argparse +import json +import time +from pathlib import Path + +import numpy as np +import mlflow + +from src.data_openml import load_dataset +from .search_space_rf import ( + PARAM_NAMES, + decode_rf, + clip_to_bounds, + sample_uniform, +) +from .scalarized_evaluate import evaluate_scalarized + + +def run_cmaes_tc( + dataset, + alpha, + seed=0, + pop_size=20, + generations=50, + elite_frac=0.5, + n_folds=3, + sigma0=0.2, + tc_k=5, + eps_scalar=1e-4, + reinject_factor=1.5, + experiment_name="cmaes_tc_unimodal", + outdir="runs/unimodal_cmaes_tc", +): + """Simplified CMA-ES-TC. + + We keep a diagonal covariance adaptation via elite std. + Label this as simplified CMA-ES for teaching unless you later add full cov updates. 
+ """ + + X, y, task = load_dataset(dataset, random_state=seed) + + pre_cfg = { + "num_impute_strategy": "median", + "cat_impute_strategy": "most_frequent", + "scaler": "standard", + "poly_degree": 1, + "select_k": None, + } + + rng = np.random.RandomState(seed) + dim = len(PARAM_NAMES) + n_elite = max(1, int(pop_size * elite_frac)) + + outdir = Path(outdir) / f"{dataset}_alpha{alpha}_seed{seed}" + outdir.mkdir(parents=True, exist_ok=True) + + mlflow.set_experiment(experiment_name) + with mlflow.start_run(run_name=f"{dataset}_alpha{alpha}_seed{seed}"): + mlflow.log_params({ + "dataset": dataset, + "alpha": alpha, + "seed": seed, + "pop_size": pop_size, + "generations": generations, + "elite_frac": elite_frac, + "n_folds": n_folds, + "sigma0": sigma0, + "tc_k": tc_k, + "eps_scalar": eps_scalar, + }) + + # Initialize mean from random uniform sample + init_pop = np.stack([sample_uniform(rng) for _ in range(pop_size)]) + mean = np.mean(init_pop, axis=0) + sigma = np.ones(dim) * sigma0 + + log = [] + std_log = [] + best_vec = None + best_scalar = float("inf") + best_meta = None + + start = time.time() + + for gen in range(generations): + samples = rng.normal(loc=mean, scale=sigma, size=(pop_size, dim)) + samples = [clip_to_bounds(s) for s in samples] + + fits = [] + for s in samples: + mp = decode_rf(s) + scalar, meta = evaluate_scalarized( + X=X, + y=y, + task=task, + algo="rf", + model_params=mp, + pre_cfg=pre_cfg, + alpha=alpha, + seed=seed, + n_folds=n_folds, + ) + fits.append((scalar, s, meta)) + + fits.sort(key=lambda x: x[0]) + elites = [f[1] for f in fits[:n_elite]] + elites = np.stack(elites) + + mean = np.mean(elites, axis=0) + sigma = np.std(elites, axis=0) + 1e-6 + + scalars = [f[0] for f in fits] + pop_std = float(np.std(scalars)) + std_log.append(pop_std) + + best_scalar_gen, best_vec_gen, best_meta_gen = fits[0] + if best_scalar_gen < best_scalar: + best_scalar = float(best_scalar_gen) + best_vec = best_vec_gen.copy() + best_meta = best_meta_gen.copy() + 
+ row = { + "gen": gen, + "best_scalar": float(best_scalar_gen), + "best_mse_like": best_meta_gen["mse_like"], + "best_shap_std": best_meta_gen["shap_std"], + "best_stability_score": best_meta_gen["stability_score"], + "pop_std_scalar": pop_std, + "params": decode_rf(best_vec_gen), + "elapsed_s": time.time() - start, + } + log.append(row) + + mlflow.log_metrics({ + "best_scalar": row["best_scalar"], + "best_mse_like": row["best_mse_like"], + "best_shap_std": row["best_shap_std"], + "best_stability_score": row["best_stability_score"], + "pop_std_scalar": row["pop_std_scalar"], + }, step=gen) + + if gen >= tc_k and all(s < eps_scalar for s in std_log[-tc_k:]): + sigma *= reinject_factor + mlflow.log_metric("tc_reinject", 1.0, step=gen) + + hall = { + "best_params": decode_rf(best_vec), + "best_scalar": best_scalar, + **best_meta, + } + + (outdir / "log.json").write_text(json.dumps(log, indent=2)) + (outdir / "hall_of_fame.json").write_text(json.dumps(hall, indent=2)) + + mlflow.log_dict(log, "log.json") + mlflow.log_dict(hall, "hall_of_fame.json") + + return hall + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"]) + ap.add_argument("--alpha", type=float, required=True) + ap.add_argument("--seed", type=int, default=0) + ap.add_argument("--pop-size", type=int, default=20) + ap.add_argument("--generations", type=int, default=50) + ap.add_argument("--elite-frac", type=float, default=0.5) + ap.add_argument("--n-folds", type=int, default=3) + ap.add_argument("--sigma0", type=float, default=0.2) + ap.add_argument("--tc-k", type=int, default=5) + ap.add_argument("--eps-scalar", type=float, default=1e-4) + ap.add_argument("--experiment-name", default="cmaes_tc_unimodal") + ap.add_argument("--outdir", default="runs/unimodal_cmaes_tc") + args = ap.parse_args() + + run_cmaes_tc( + dataset=args.dataset, + alpha=args.alpha, + seed=args.seed, + pop_size=args.pop_size, + generations=args.generations, + 
elite_frac=args.elite_frac, + n_folds=args.n_folds, + sigma0=args.sigma0, + tc_k=args.tc_k, + eps_scalar=args.eps_scalar, + experiment_name=args.experiment_name, + outdir=args.outdir, + ) + + +if __name__ == "__main__": + main() diff --git a/src/experiments_unimodal_scalarized/emna_tc.py b/src/experiments_unimodal_scalarized/emna_tc.py new file mode 100644 index 0000000..5242a57 --- /dev/null +++ b/src/experiments_unimodal_scalarized/emna_tc.py @@ -0,0 +1,183 @@ +import argparse +import json +import time +from pathlib import Path + +import numpy as np +import mlflow + +from src.data_openml import load_dataset +from .search_space_rf import ( + PARAM_NAMES, + sample_uniform, + decode_rf, + clip_to_bounds, +) +from .scalarized_evaluate import evaluate_scalarized + + +def run_emna_tc( + dataset, + alpha, + seed=0, + pop_size=20, + generations=50, + elite_frac=0.5, + n_folds=3, + tc_k=5, + eps_scalar=1e-4, + reinject_factor=1.5, + experiment_name="emna_tc_unimodal", + outdir="runs/unimodal_emna_tc", +): + X, y, task = load_dataset(dataset, random_state=seed) + + pre_cfg = { + "num_impute_strategy": "median", + "cat_impute_strategy": "most_frequent", + "scaler": "standard", + "poly_degree": 1, + "select_k": None, + } + + rng = np.random.RandomState(seed) + dim = len(PARAM_NAMES) + n_elite = max(1, int(pop_size * elite_frac)) + + outdir = Path(outdir) / f"{dataset}_alpha{alpha}_seed{seed}" + outdir.mkdir(parents=True, exist_ok=True) + + mlflow.set_experiment(experiment_name) + with mlflow.start_run(run_name=f"{dataset}_alpha{alpha}_seed{seed}"): + mlflow.log_params({ + "dataset": dataset, + "alpha": alpha, + "seed": seed, + "pop_size": pop_size, + "generations": generations, + "elite_frac": elite_frac, + "n_folds": n_folds, + "tc_k": tc_k, + "eps_scalar": eps_scalar, + }) + + pop = [sample_uniform(rng) for _ in range(pop_size)] + mean = np.mean(pop, axis=0) + cov = np.cov(np.stack(pop).T) + 1e-6 * np.eye(dim) + + log = [] + std_log = [] + best_vec = None + best_scalar = 
float("inf") + best_meta = None + + start = time.time() + + for gen in range(generations): + samples = rng.multivariate_normal(mean, cov, size=pop_size) + samples = [clip_to_bounds(s) for s in samples] + + fits = [] + for s in samples: + mp = decode_rf(s) + scalar, meta = evaluate_scalarized( + X=X, + y=y, + task=task, + algo="rf", + model_params=mp, + pre_cfg=pre_cfg, + alpha=alpha, + seed=seed, + n_folds=n_folds, + ) + fits.append((scalar, s, meta)) + + fits.sort(key=lambda x: x[0]) + elites = [f[1] for f in fits[:n_elite]] + + mean = np.mean(elites, axis=0) + cov = np.cov(np.stack(elites).T) + 1e-6 * np.eye(dim) + + scalars = [f[0] for f in fits] + pop_std = float(np.std(scalars)) + std_log.append(pop_std) + + best_scalar_gen, best_vec_gen, best_meta_gen = fits[0] + if best_scalar_gen < best_scalar: + best_scalar = float(best_scalar_gen) + best_vec = best_vec_gen.copy() + best_meta = best_meta_gen.copy() + + row = { + "gen": gen, + "best_scalar": float(best_scalar_gen), + "best_mse_like": best_meta_gen["mse_like"], + "best_shap_std": best_meta_gen["shap_std"], + "best_stability_score": best_meta_gen["stability_score"], + "pop_std_scalar": pop_std, + "params": decode_rf(best_vec_gen), + "elapsed_s": time.time() - start, + } + log.append(row) + + mlflow.log_metrics({ + "best_scalar": row["best_scalar"], + "best_mse_like": row["best_mse_like"], + "best_shap_std": row["best_shap_std"], + "best_stability_score": row["best_stability_score"], + "pop_std_scalar": row["pop_std_scalar"], + }, step=gen) + + # Threshold convergence reinjection + if gen >= tc_k and all(s < eps_scalar for s in std_log[-tc_k:]): + cov *= reinject_factor + mlflow.log_metric("tc_reinject", 1.0, step=gen) + + hall = { + "best_params": decode_rf(best_vec), + "best_scalar": best_scalar, + **best_meta, + } + + (outdir / "log.json").write_text(json.dumps(log, indent=2)) + (outdir / "hall_of_fame.json").write_text(json.dumps(hall, indent=2)) + + mlflow.log_dict(log, "log.json") + mlflow.log_dict(hall, 
"hall_of_fame.json") + + return hall + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"]) + ap.add_argument("--alpha", type=float, required=True) + ap.add_argument("--seed", type=int, default=0) + ap.add_argument("--pop-size", type=int, default=20) + ap.add_argument("--generations", type=int, default=50) + ap.add_argument("--elite-frac", type=float, default=0.5) + ap.add_argument("--n-folds", type=int, default=3) + ap.add_argument("--tc-k", type=int, default=5) + ap.add_argument("--eps-scalar", type=float, default=1e-4) + ap.add_argument("--experiment-name", default="emna_tc_unimodal") + ap.add_argument("--outdir", default="runs/unimodal_emna_tc") + args = ap.parse_args() + + run_emna_tc( + dataset=args.dataset, + alpha=args.alpha, + seed=args.seed, + pop_size=args.pop_size, + generations=args.generations, + elite_frac=args.elite_frac, + n_folds=args.n_folds, + tc_k=args.tc_k, + eps_scalar=args.eps_scalar, + experiment_name=args.experiment_name, + outdir=args.outdir, + ) + + +if __name__ == "__main__": + main() diff --git a/src/experiments_unimodal_scalarized/pso_tc.py b/src/experiments_unimodal_scalarized/pso_tc.py new file mode 100644 index 0000000..ab9dc5d --- /dev/null +++ b/src/experiments_unimodal_scalarized/pso_tc.py @@ -0,0 +1,196 @@ +import argparse +import json +import time +from pathlib import Path + +import numpy as np +import mlflow + +from src.data_openml import load_dataset +from .search_space_rf import ( + PARAM_NAMES, + sample_uniform, + decode_rf, + clip_to_bounds, +) +from .scalarized_evaluate import evaluate_scalarized + + +def run_pso_tc( + dataset, + alpha, + seed=0, + swarm_size=20, + iterations=50, + n_folds=3, + w=0.7, + c1=1.4, + c2=1.4, + tc_k=5, + eps_scalar=1e-4, + reinject_factor=1.5, + experiment_name="pso_tc_unimodal", + outdir="runs/unimodal_pso_tc", +): + X, y, task = load_dataset(dataset, random_state=seed) + + pre_cfg = { + "num_impute_strategy": 
"median", + "cat_impute_strategy": "most_frequent", + "scaler": "standard", + "poly_degree": 1, + "select_k": None, + } + + rng = np.random.RandomState(seed) + dim = len(PARAM_NAMES) + + outdir = Path(outdir) / f"{dataset}_alpha{alpha}_seed{seed}" + outdir.mkdir(parents=True, exist_ok=True) + + mlflow.set_experiment(experiment_name) + with mlflow.start_run(run_name=f"{dataset}_alpha{alpha}_seed{seed}"): + mlflow.log_params({ + "dataset": dataset, + "alpha": alpha, + "seed": seed, + "swarm_size": swarm_size, + "iterations": iterations, + "n_folds": n_folds, + "w": w, + "c1": c1, + "c2": c2, + "tc_k": tc_k, + "eps_scalar": eps_scalar, + }) + + pos = np.array([sample_uniform(rng) for _ in range(swarm_size)]) + vel = rng.normal(0, 1, size=(swarm_size, dim)) * 0.1 + + pbest_pos = pos.copy() + pbest_fit = np.full(swarm_size, np.inf) + + gbest_pos = None + gbest_fit = np.inf + gbest_meta = None + + log = [] + std_log = [] + start = time.time() + + for it in range(iterations): + fits = [] + for i in range(swarm_size): + mp = decode_rf(pos[i]) + scalar, meta = evaluate_scalarized( + X=X, + y=y, + task=task, + algo="rf", + model_params=mp, + pre_cfg=pre_cfg, + alpha=alpha, + seed=seed, + n_folds=n_folds, + ) + fits.append((scalar, meta)) + + if scalar < pbest_fit[i]: + pbest_fit[i] = scalar + pbest_pos[i] = pos[i].copy() + + if scalar < gbest_fit: + gbest_fit = scalar + gbest_pos = pos[i].copy() + gbest_meta = meta.copy() + + scalars = [f[0] for f in fits] + pop_std = float(np.std(scalars)) + std_log.append(pop_std) + + row = { + "iter": it, + "best_scalar": float(gbest_fit), + "best_mse_like": gbest_meta["mse_like"], + "best_shap_std": gbest_meta["shap_std"], + "best_stability_score": gbest_meta["stability_score"], + "pop_std_scalar": pop_std, + "params": decode_rf(gbest_pos), + "elapsed_s": time.time() - start, + } + log.append(row) + + mlflow.log_metrics({ + "best_scalar": row["best_scalar"], + "best_mse_like": row["best_mse_like"], + "best_shap_std": 
row["best_shap_std"], + "best_stability_score": row["best_stability_score"], + "pop_std_scalar": row["pop_std_scalar"], + }, step=it) + + r1 = rng.rand(swarm_size, dim) + r2 = rng.rand(swarm_size, dim) + + vel = ( + w * vel + + c1 * r1 * (pbest_pos - pos) + + c2 * r2 * (gbest_pos - pos) + ) + pos = pos + vel + pos = np.array([clip_to_bounds(p) for p in pos]) + + if it >= tc_k and all(s < eps_scalar for s in std_log[-tc_k:]): + vel *= reinject_factor + mlflow.log_metric("tc_reinject", 1.0, step=it) + + hall = { + "best_params": decode_rf(gbest_pos), + "best_scalar": float(gbest_fit), + **gbest_meta, + } + + (outdir / "log.json").write_text(json.dumps(log, indent=2)) + (outdir / "hall_of_fame.json").write_text(json.dumps(hall, indent=2)) + + mlflow.log_dict(log, "log.json") + mlflow.log_dict(hall, "hall_of_fame.json") + + return hall + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"]) + ap.add_argument("--alpha", type=float, required=True) + ap.add_argument("--seed", type=int, default=0) + ap.add_argument("--swarm-size", type=int, default=20) + ap.add_argument("--iterations", type=int, default=50) + ap.add_argument("--n-folds", type=int, default=3) + ap.add_argument("--w", type=float, default=0.7) + ap.add_argument("--c1", type=float, default=1.4) + ap.add_argument("--c2", type=float, default=1.4) + ap.add_argument("--tc-k", type=int, default=5) + ap.add_argument("--eps-scalar", type=float, default=1e-4) + ap.add_argument("--experiment-name", default="pso_tc_unimodal") + ap.add_argument("--outdir", default="runs/unimodal_pso_tc") + args = ap.parse_args() + + run_pso_tc( + dataset=args.dataset, + alpha=args.alpha, + seed=args.seed, + swarm_size=args.swarm_size, + iterations=args.iterations, + n_folds=args.n_folds, + w=args.w, + c1=args.c1, + c2=args.c2, + tc_k=args.tc_k, + eps_scalar=args.eps_scalar, + experiment_name=args.experiment_name, + outdir=args.outdir, + ) + + +if __name__ == 
"__main__": + main() diff --git a/src/experiments_unimodal_scalarized/run_all_alphas.py b/src/experiments_unimodal_scalarized/run_all_alphas.py new file mode 100644 index 0000000..93a2221 --- /dev/null +++ b/src/experiments_unimodal_scalarized/run_all_alphas.py @@ -0,0 +1,55 @@ +import argparse +import subprocess +import sys + +ALPHAS = [round(0.1 * i, 1) for i in range(1, 10)] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--dataset", required=True, choices=["adult", "cal_housing"]) + ap.add_argument("--seed", type=int, default=0) + + ap.add_argument("--emna-pop", type=int, default=20) + ap.add_argument("--emna-gens", type=int, default=50) + + ap.add_argument("--cmaes-pop", type=int, default=20) + ap.add_argument("--cmaes-gens", type=int, default=50) + + ap.add_argument("--pso-swarm", type=int, default=20) + ap.add_argument("--pso-iters", type=int, default=50) + args = ap.parse_args() + + py = sys.executable # uses your current venv python + + for alpha in ALPHAS: + subprocess.run([ + py, "-m", "src.experiments_unimodal_scalarized.emna_tc", + "--dataset", args.dataset, + "--alpha", str(alpha), + "--seed", str(args.seed), + "--pop-size", str(args.emna_pop), + "--generations", str(args.emna_gens), + ], check=True) + + subprocess.run([ + py, "-m", "src.experiments_unimodal_scalarized.cmaes_tc", + "--dataset", args.dataset, + "--alpha", str(alpha), + "--seed", str(args.seed), + "--pop-size", str(args.cmaes_pop), + "--generations", str(args.cmaes_gens), + ], check=True) + + subprocess.run([ + py, "-m", "src.experiments_unimodal_scalarized.pso_tc", + "--dataset", args.dataset, + "--alpha", str(alpha), + "--seed", str(args.seed), + "--swarm-size", str(args.pso_swarm), + "--iterations", str(args.pso_iters), + ], check=True) + + +if __name__ == "__main__": + main() diff --git a/src/experiments_unimodal_scalarized/scalarized_evaluate.py b/src/experiments_unimodal_scalarized/scalarized_evaluate.py new file mode 100644 index 0000000..eb6873d --- 
# /dev/null
# +++ b/src/experiments_unimodal_scalarized/scalarized_evaluate.py
# @@ -0,0 +1,102 @@
"""Scalarized (single-objective) evaluation of one model configuration."""
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, brier_score_loss

from src.preprocessing import build_preprocessor
from src.models import make_model
from src.stability import compute_shap_matrix, shap_stability_from_matrices
from src.protocols_methodology.automl_protocol_adapters import cv_protocol


def evaluate_scalarized(
    X,
    y,
    task,
    algo,
    model_params,
    pre_cfg,
    alpha,
    seed=0,
    n_folds=3,
    max_eval_rows=512,
    bg_size=128,
):
    """Score one configuration as a single scalar to minimize.

    scalar = alpha * loss_mean + (1 - alpha) * shap_std_mean

    Returns:
        (scalar, meta) where meta holds "mse_like" (mean CV loss: MSE for
        regression, Brier score otherwise), "shap_std" and "stability_score".
    """
    sample_rng = np.random.RandomState(seed)

    # One fixed row pool for SHAP, shared by every fold, so attribution
    # variance across folds is not confounded by differing evaluation rows.
    pool_size = min(max_eval_rows, len(X))
    pool_idx = sample_rng.choice(len(X), size=pool_size, replace=False)
    X_pool = X.iloc[pool_idx]

    # Probe fit to freeze the post-preprocessing dimensionality; the requested
    # select_k is capped to the number of columns actually produced.
    # NOTE(review): the probe is fit on all rows (incl. future test folds);
    # it is only used to size select_k, but confirm that is acceptable.
    poly_deg = pre_cfg.get("poly_degree", 1)
    probe = build_preprocessor(
        X, task, pre_cfg, fixed_k=None, fixed_poly_degree=poly_deg
    )
    n_cols = probe.fit_transform(X, y).shape[1]
    want_k = pre_cfg.get("select_k", None)
    fixed_k = None if want_k is None else int(min(max(1, want_k), n_cols))

    shap_per_fold = []
    fold_losses = []

    for rep_id, rep in enumerate(cv_protocol(X, y, n_folds=n_folds, seed=seed)):
        train_idx, test_idx = rep["train_idx"], rep["test_idx"]
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_te, y_te = X.iloc[test_idx], y.iloc[test_idx]

        pipe = Pipeline([
            ("pre", build_preprocessor(
                X, task, pre_cfg, fixed_k=fixed_k, fixed_poly_degree=poly_deg
            )),
            ("model", make_model(task, algo, model_params, random_state=seed + rep_id)),
        ])

        # Assumes compute_shap_matrix fits `pipe` on (X_fit, y_fit) before
        # attribution — TODO confirm; `pipe` is used for prediction below.
        shap_vals, _, _, feat_names = compute_shap_matrix(
            pipe,
            X_fit=X_tr,
            y_fit=y_tr,
            X_eval=X_pool,
            task_type=task,
            bg_size=bg_size,
            max_eval_rows=max_eval_rows,
            rng_seed=seed,
        )
        shap_per_fold.append((shap_vals, feat_names))

        if task == "regression":
            fold_losses.append(float(mean_squared_error(y_te, pipe.predict(X_te))))
        else:
            # Binary classification assumed (column 1 of predict_proba) —
            # verify against the dataset loaders before adding multiclass.
            model = pipe.named_steps["model"]
            if hasattr(model, "predict_proba"):
                y_prob = pipe.predict_proba(X_te)[:, 1]
            else:
                # Min-max squash of decision scores into [0, 1] so the Brier
                # score is computable; these are not calibrated probabilities.
                raw = pipe.decision_function(X_te)
                y_prob = (raw - raw.min()) / (raw.max() - raw.min() + 1e-8)
            fold_losses.append(float(brier_score_loss(y_te, y_prob)))

    agg_std, stability_score, _, _ = shap_stability_from_matrices(shap_per_fold)

    mse_like = float(np.mean(fold_losses))
    shap_std = float(agg_std)
    scalar = float(alpha * mse_like + (1.0 - alpha) * shap_std)

    meta = {
        "mse_like": mse_like,
        "shap_std": shap_std,
        "stability_score": float(stability_score),
    }
    return scalar, meta
# diff --git a/src/experiments_unimodal_scalarized/search_space_rf.py
# b/src/experiments_unimodal_scalarized/search_space_rf.py
# new file mode 100644  index 0000000..dfb52fb
# --- /dev/null
# +++ b/src/experiments_unimodal_scalarized/search_space_rf.py
# @@ -0,0 +1,48 @@
"""Continuous-ish search space for RandomForest.

The model family is fixed to keep the landscape close to unimodal once
scalarized. All parameters are carried as floats in a vector ordered by
PARAM_NAMES; integer parameters are rounded and clipped on decode.
"""
import numpy as np

# (low, high) inclusive bounds per parameter.
BOUNDS = {
    "n_estimators": (50, 400),        # int
    "max_depth": (2, 15),             # int
    "max_features": (0.2, 1.0),       # float (fraction of features)
    "min_samples_split": (2, 20),     # int
    "min_samples_leaf": (1, 10),      # int
}

PARAM_NAMES = list(BOUNDS.keys())

# Single source of truth for the int-vs-float distinction; previously this
# dispatch (and the parameter order) was hard-coded separately in
# sample_uniform, clip_to_bounds and decode_rf.
_FLOAT_PARAMS = frozenset({"max_features"})


def sample_uniform(rng: np.random.RandomState) -> np.ndarray:
    """Draw one uniform-random point in BOUNDS, ordered by PARAM_NAMES.

    Integer parameters are sampled over the inclusive integer range; float
    parameters over the continuous interval. Returned as a float vector.
    """
    vec = []
    for name in PARAM_NAMES:
        low, high = BOUNDS[name]
        if name in _FLOAT_PARAMS:
            vec.append(rng.uniform(low, high))
        else:
            # RandomState.randint's upper bound is exclusive, hence +1.
            vec.append(rng.randint(low, high + 1))
    return np.array(vec, dtype=float)


def clip_to_bounds(vec) -> np.ndarray:
    """Project a candidate vector back into BOUNDS.

    Integer parameters are rounded then clipped; float parameters are only
    clipped. The result is a float vector (so optimizers can keep operating
    on a homogeneous dtype).
    """
    out = []
    for i, name in enumerate(PARAM_NAMES):
        low, high = BOUNDS[name]
        v = float(vec[i])
        if name in _FLOAT_PARAMS:
            v = float(np.clip(v, low, high))
        else:
            v = int(np.clip(round(v), low, high))
        out.append(v)
    return np.array(out, dtype=float)


def decode_rf(vec) -> dict:
    """Decode a vector into RandomForest kwargs, clipping to BOUNDS first.

    Keys and order are derived from PARAM_NAMES instead of hard-coded
    positions, so adding a parameter to BOUNDS needs no change here.
    """
    v = clip_to_bounds(vec)
    return {
        name: (float(v[i]) if name in _FLOAT_PARAMS else int(v[i]))
        for i, name in enumerate(PARAM_NAMES)
    }