"""Preprocessing pipeline builders: safe k-best selection, constant-column
filtering, and a ColumnTransformer factory driven by a config dict."""

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import (
    SelectKBest,
    VarianceThreshold,
    f_classif,
    f_regression,
)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    PolynomialFeatures,
    PowerTransformer,
    RobustScaler,
    StandardScaler,
)


class SafeSelectK(BaseEstimator, TransformerMixin):
    """SelectKBest wrapper that never asks for more features than exist.

    When ``k`` is None the transformer is a pure passthrough. Otherwise ``k``
    is clamped to ``[1, n_features]`` before fitting SelectKBest, so a config
    value larger than the post-encoding feature count cannot crash the fit.

    Parameters
    ----------
    task : str
        ``"classification"`` selects ``f_classif`` as the score function;
        any other value selects ``f_regression``.
    k : int or None
        Requested number of features to keep; None disables selection.
    """

    def __init__(self, task: str, k=None):
        self.task = task
        self.k = k
        # Fitted state; populated by fit().
        self.selector_ = None        # fitted SelectKBest or the string "passthrough"
        self.k_effective_ = None     # k after clamping to the feature count
        self.support_mask_ = None    # boolean keep-mask over input features
        self.feature_names_in_ = None
        self.feature_names_out_ = None

    def fit(self, X, y=None):
        """Fit the underlying selector (or mark passthrough when k is None)."""
        if self.k is None:
            self.selector_ = "passthrough"
            self.feature_names_out_ = self.feature_names_in_
            return self
        n_feats = X.shape[1]
        # Clamp the requested k into the valid range [1, n_feats].
        k_eff = int(min(max(1, self.k), n_feats))
        score_func = f_classif if self.task == "classification" else f_regression
        sel = SelectKBest(score_func=score_func, k=k_eff).fit(X, y)
        self.selector_ = sel
        self.k_effective_ = k_eff
        mask = np.zeros(n_feats, dtype=bool)
        mask[sel.get_support(indices=True)] = True
        self.support_mask_ = mask
        if self.feature_names_in_ is not None:
            self.feature_names_out_ = self.feature_names_in_[mask]
        return self

    def set_feature_names_in(self, names):
        """Record the input feature names so fit() can derive output names."""
        self.feature_names_in_ = np.asarray(names)

    def transform(self, X):
        """Apply the fitted selection; identity when configured passthrough."""
        if self.selector_ == "passthrough":
            return X
        return self.selector_.transform(X)

    def get_feature_names_out(self, input_features=None):
        """Return the surviving feature names, or None when unknown."""
        if getattr(self, "feature_names_out_", None) is not None:
            return self.feature_names_out_
        # Fall back to masking caller-supplied names when we only have a mask.
        if getattr(self, "support_mask_", None) is not None and input_features is not None:
            input_features = np.asarray(input_features)
            return input_features[self.support_mask_]
        return None


class ConstantFilter(BaseEstimator, TransformerMixin):
    """Drop columns whose variance is <= ``eps`` (constant columns by default).

    Parameters
    ----------
    eps : float
        Variance threshold; a column is kept iff ``var > eps``.
    """

    def __init__(self, eps=0.0):
        self.eps = eps
        self.mask_ = None                # boolean keep-mask, set by fit()
        self.feature_names_in_ = None
        self.feature_names_out_ = None

    def fit(self, X, y=None):
        """Compute the keep-mask from per-column variance."""
        X = np.asarray(X)
        var = X.var(axis=0)
        mask = var > self.eps
        # Robustness: if EVERY column is constant, an all-False mask would
        # produce a zero-column matrix and crash downstream steps (e.g.
        # SelectKBest). Keep everything in that degenerate case.
        if not mask.any():
            mask = np.ones_like(mask, dtype=bool)
        self.mask_ = mask
        if self.feature_names_in_ is not None:
            self.feature_names_out_ = np.asarray(self.feature_names_in_)[self.mask_]
        return self

    def set_feature_names_in(self, names):
        """Record the input feature names so fit() can derive output names."""
        self.feature_names_in_ = np.asarray(names)

    def get_feature_names_out(self):
        """Return surviving names; synthesize f<i> names when none were set."""
        if self.feature_names_out_ is not None:
            return self.feature_names_out_
        # fallback when names were not set
        return np.array([f"f{i}" for i, keep in enumerate(self.mask_) if keep])

    def transform(self, X):
        """Drop the columns masked out during fit."""
        X = np.asarray(X)
        return X[:, self.mask_]


def build_preprocessor(X_full, task, cfg, fixed_k=None, fixed_poly_degree=None):
    """Build the full preprocessing Pipeline from a config dict.

    Parameters
    ----------
    X_full : pandas.DataFrame
        Full dataset, used both to split columns into numeric/categorical and
        to pin the one-hot category sets.
    task : str
        Passed to :class:`SafeSelectK` ("classification" vs anything else).
    cfg : dict
        Keys used: ``num_impute_strategy``, ``cat_impute_strategy``,
        ``scaler`` (standard|robust|minmax|power|other=passthrough),
        ``poly_degree``, ``use_vt``, ``vt_thr``, ``select_k``.
    fixed_k, fixed_poly_degree : optional
        Overrides for ``cfg``'s ``select_k`` / ``poly_degree``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ColumnTransformer -> ConstantFilter -> SafeSelectK.
    """
    cat_cols = X_full.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = [c for c in X_full.columns if c not in cat_cols]

    num_imputer = SimpleImputer(strategy=cfg.get("num_impute_strategy", "median"))
    cat_imputer = SimpleImputer(strategy=cfg.get("cat_impute_strategy", "most_frequent"))

    scaler_name = cfg.get("scaler", "standard")
    if scaler_name == "standard":
        num_scaler = StandardScaler(with_mean=True, with_std=True)
    elif scaler_name == "robust":
        num_scaler = RobustScaler()
    elif scaler_name == "minmax":
        num_scaler = MinMaxScaler()
    elif scaler_name == "power":
        num_scaler = PowerTransformer(method="yeo-johnson")
    else:
        num_scaler = "passthrough"

    poly_degree = fixed_poly_degree if fixed_poly_degree is not None else cfg.get("poly_degree", 1)
    poly = (
        PolynomialFeatures(degree=poly_degree, include_bias=False)
        if poly_degree > 1
        else "passthrough"
    )

    # Always fix categories from the full dataset so train/valid splits see
    # identical encodings.
    # NOTE(review): categories are cast via astype(str); if the imputed column
    # values are not already strings (e.g. bool or numeric-coded categoricals),
    # they will not match these string categories and every row would be
    # encoded as all-unknown — confirm the upstream columns are string-typed.
    fixed_categories = None
    if len(cat_cols) > 0:
        fixed_categories = {
            c: sorted(X_full[c].dropna().astype(str).unique()) for c in cat_cols
        }

    ohe_kwargs = dict(handle_unknown="ignore")
    if fixed_categories is not None:
        ohe_kwargs["categories"] = [fixed_categories[c] for c in cat_cols]
    # BUGFIX: the original wrapped a plain dict assignment in try/except
    # TypeError, which can never raise, so the sklearn<1.2 ``sparse`` fallback
    # was dead code. Unknown keyword args only raise TypeError when the
    # estimator is CONSTRUCTED, so probe by constructing it.
    try:
        cat_encoder = OneHotEncoder(sparse_output=False, **ohe_kwargs)  # sklearn >= 1.2
    except TypeError:
        cat_encoder = OneHotEncoder(sparse=False, **ohe_kwargs)  # sklearn < 1.2

    num_steps = [("impute", num_imputer), ("scale", num_scaler), ("poly", poly)]
    if int(cfg.get("use_vt", 0)):
        num_steps.append(("vt", VarianceThreshold(threshold=float(cfg.get("vt_thr", 0.0)))))

    ct = ColumnTransformer(
        [
            ("num", Pipeline(steps=num_steps), num_cols),
            ("cat", Pipeline(steps=[("impute", cat_imputer), ("oh", cat_encoder)]), cat_cols),
        ]
    )

    select_k = fixed_k if fixed_k is not None else cfg.get("select_k", None)
    selector = SafeSelectK(task=task, k=select_k)

    pre = Pipeline(
        [
            ("prep", ct),
            ("drop_const", ConstantFilter(eps=0.0)),
            ("select", selector),
        ]
    )
    return pre