# OpenML dataset loader: interactively select a dataset, pick a target,
# exclude features, and export the result to CSV.
import os
|
|
import pandas as pd
|
|
import openml
|
|
|
|
# --- CACHE SETUP ---
# Route all OpenML downloads into a per-user cache directory so repeated
# runs reuse previously fetched datasets instead of hitting the server.
CACHE_DIR = os.path.expanduser("~/openml_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
openml.config.cache_directory = CACHE_DIR

# --- Dataset IDs ---
# OpenML *task* IDs: a task bundles a dataset with its designated target
# column. These are treated as classification problems by load_dataset().
TASKS = {
    "adult": 7592,
    "spambase": 43,
    "optdigits": 28,
}

# OpenML *dataset* IDs, loaded directly; the target comes from the
# dataset's default_target_attribute. Treated as regression problems.
DATASETS = {
    "cal_housing": 44025
}
|
|
|
|
# --- Load functions ---
|
|
def _load_task_dataframe(task_id: int):
    """Fetch an OpenML task's dataset as (X, y) DataFrames.

    Rows whose target value is missing are dropped from both X and y.
    """
    task = openml.tasks.get_task(task_id)
    dataset = openml.datasets.get_dataset(task.dataset_id)
    X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=task.target_name)
    # Keep only rows with a known label.
    keep = y.notna()
    return X.loc[keep], y.loc[keep]
|
|
|
|
def load_dataset(name: str):
    """Load an OpenML dataset by its short name.

    Parameters
    ----------
    name : str
        A key of TASKS (classification) or DATASETS (regression).

    Returns
    -------
    tuple
        ``(X, y, task_type)`` — feature DataFrame, target Series (rows
        with missing targets removed), and either ``"classification"``
        or ``"regression"``. NOTE: for regression datasets X still
        contains the target column (main() drops it afterwards),
        matching the original behavior.

    Raises
    ------
    ValueError
        If ``name`` is not a known dataset.
    """
    if name in TASKS:
        X, y = _load_task_dataframe(TASKS[name])
        task_type = "classification"
    elif name in DATASETS:
        ds = openml.datasets.get_dataset(DATASETS[name])
        target_col = ds.default_target_attribute
        # target=None: fetch the full frame; the target stays inside X.
        X, _, _, _ = ds.get_data(dataset_format="dataframe", target=None)
        # FIX: the original filtered X first and then re-applied the
        # full-length boolean mask to the already-filtered column
        # (X[target_col].loc[mask]), relying on pandas re-aligning an
        # over-long boolean indexer. Select y once, before filtering,
        # so each mask is applied exactly one time.
        mask = ~X[target_col].isna()
        y = X.loc[mask, target_col]
        X = X.loc[mask]
        task_type = "regression"
    else:
        raise ValueError(f"Unknown dataset {name}")
    return X, y, task_type
|
|
|
|
# --- Interactive main ---
|
|
def main():
    """Interactive driver: pick a dataset, choose a target column, prune
    features, preview the result, and optionally export it to CSV (also
    recording the CSV's absolute path in last_csv_path.txt)."""
    dataset_names = list(TASKS.keys()) + list(DATASETS.keys())

    print("Available datasets:")
    for idx, ds_name in enumerate(dataset_names, start=1):
        print(f"{idx}. {ds_name}")

    choice = input("Enter the dataset name: ").strip()
    if choice not in dataset_names:
        raise ValueError(f"Dataset '{choice}' not recognized.")

    X, y, task_type = load_dataset(choice)

    # The target Series carries its source column name.
    default_target = y.name
    print(f"\nDefault target column: {default_target}")

    # List every feature with its dtype (no descriptions available here).
    print("\nFeatures in the dataset:")
    for col in X.columns.unique():
        print(f"- {col} ({X[col].dtype})")

    # --- Target selection: empty input keeps the default target. ---
    target = input("\nEnter the target feature (or press Enter to use default): ").strip()
    if not target:
        target = default_target
    elif target not in X.columns:
        raise ValueError(f"Target feature '{target}' not found.")
    else:
        y = X[target]
    # errors="ignore": for classification datasets the default target is
    # already absent from X.
    X = X.drop(columns=[target], errors="ignore")

    # --- Feature exclusion (comma-separated, warn on unknown names). ---
    exclude_input = input("\nEnter features to exclude (comma-separated), or press Enter to skip: ").strip()
    if exclude_input:
        for raw_name in exclude_input.split(","):
            col = raw_name.strip()
            if col not in X.columns:
                print(f"Warning: '{col}' not found in dataset and cannot be excluded.")
                continue
            X = X.drop(columns=[col])

    # --- Preview the final frame and target. ---
    print("\nFinal dataset preview (first 5 rows):")
    print(X.head())
    print("\nTarget preview (first 5 rows):")
    print(y.head())
    print(f"\nTask type: {task_type}")
    print(f"Target column: {target}")
    print(f"Number of features: {len(X.columns)}")

    # --- Optional CSV export; empty filename skips it. ---
    output_file = input("\nEnter filename to save dataset as CSV (e.g., dataset.csv): ").strip()
    if output_file:
        export_frame = X.copy()
        export_frame[target] = y  # target appended as the last column
        export_frame.to_csv(output_file, index=False)
        print(f"Dataset saved to {output_file} (target column: '{target}')")

        # Record where the CSV landed so downstream tools can find it.
        temp_path_file = "last_csv_path.txt"
        with open(temp_path_file, "w") as f:
            f.write(os.path.abspath(output_file))
        print(f"CSV path written to {temp_path_file}")


if __name__ == "__main__":
    main()