# OpenML dataset loader: interactively select a dataset, pick a target,
# exclude features, and export the result to CSV.
import os
|
|
import pandas as pd
|
|
import openml
|
|
|
|
# --- CACHE SETUP ---
# Route all OpenML downloads into a per-user cache directory so repeated
# runs reuse previously fetched datasets instead of hitting the server.
CACHE_DIR = os.path.expanduser("~/openml_cache")
os.makedirs(CACHE_DIR, exist_ok=True)
openml.config.cache_directory = CACHE_DIR

# --- Dataset IDs ---
# OpenML *task* IDs: a task bundles a dataset with its designated target
# column. These are treated as classification problems by load_dataset().
TASKS = {
    "adult": 7592,
    "spambase": 43,
    "optdigits": 28,
}

# OpenML *dataset* IDs, loaded directly; the target comes from the
# dataset's default_target_attribute. Treated as regression problems.
DATASETS = {
    "cal_housing": 44025
}
|
|
|
|
# --- Load functions ---
|
|
def _load_task_dataframe(task_id: int):
    """Fetch an OpenML task's dataset as (X, y) DataFrames.

    Rows whose target value is missing are dropped from both X and y.
    """
    task = openml.tasks.get_task(task_id)
    dataset = openml.datasets.get_dataset(task.dataset_id)
    X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=task.target_name)
    # Keep only rows with a known label.
    keep = y.notna()
    return X.loc[keep], y.loc[keep]
|
|
|
|
def load_dataset(name: str):
    """Load an OpenML dataset by its short name.

    Parameters
    ----------
    name : str
        A key of TASKS (classification) or DATASETS (regression).

    Returns
    -------
    tuple
        ``(X, y, task_type)`` — feature DataFrame, target Series (rows
        with missing targets removed), and either ``"classification"``
        or ``"regression"``. NOTE: for regression datasets X still
        contains the target column (main() drops it afterwards),
        matching the original behavior.

    Raises
    ------
    ValueError
        If ``name`` is not a known dataset.
    """
    if name in TASKS:
        X, y = _load_task_dataframe(TASKS[name])
        task_type = "classification"
    elif name in DATASETS:
        ds = openml.datasets.get_dataset(DATASETS[name])
        target_col = ds.default_target_attribute
        # target=None: fetch the full frame; the target stays inside X.
        X, _, _, _ = ds.get_data(dataset_format="dataframe", target=None)
        # FIX: the original filtered X first and then re-applied the
        # full-length boolean mask to the already-filtered column
        # (X[target_col].loc[mask]), relying on pandas re-aligning an
        # over-long boolean indexer. Select y once, before filtering,
        # so each mask is applied exactly one time.
        mask = ~X[target_col].isna()
        y = X.loc[mask, target_col]
        X = X.loc[mask]
        task_type = "regression"
    else:
        raise ValueError(f"Unknown dataset {name}")
    return X, y, task_type
|
|
|
|
# --- Interactive main ---
|
|
def main():
    """Interactive driver: pick a dataset, choose a target column, prune
    features, preview the result, and optionally export it to CSV (also
    recording the CSV's absolute path in last_csv_path.txt)."""
    dataset_names = list(TASKS.keys()) + list(DATASETS.keys())

    print("Available datasets:")
    for idx, ds_name in enumerate(dataset_names, start=1):
        print(f"{idx}. {ds_name}")

    choice = input("Enter the dataset name: ").strip()
    if choice not in dataset_names:
        raise ValueError(f"Dataset '{choice}' not recognized.")

    X, y, task_type = load_dataset(choice)

    # The target Series carries its source column name.
    default_target = y.name
    print(f"\nDefault target column: {default_target}")

    # List every feature with its dtype (no descriptions available here).
    print("\nFeatures in the dataset:")
    for col in X.columns.unique():
        print(f"- {col} ({X[col].dtype})")

    # --- Target selection: empty input keeps the default target. ---
    target = input("\nEnter the target feature (or press Enter to use default): ").strip()
    if not target:
        target = default_target
    elif target not in X.columns:
        raise ValueError(f"Target feature '{target}' not found.")
    else:
        y = X[target]
    # errors="ignore": for classification datasets the default target is
    # already absent from X.
    X = X.drop(columns=[target], errors="ignore")

    # --- Feature exclusion (comma-separated, warn on unknown names). ---
    exclude_input = input("\nEnter features to exclude (comma-separated), or press Enter to skip: ").strip()
    if exclude_input:
        for raw_name in exclude_input.split(","):
            col = raw_name.strip()
            if col not in X.columns:
                print(f"Warning: '{col}' not found in dataset and cannot be excluded.")
                continue
            X = X.drop(columns=[col])

    # --- Preview the final frame and target. ---
    print("\nFinal dataset preview (first 5 rows):")
    print(X.head())
    print("\nTarget preview (first 5 rows):")
    print(y.head())
    print(f"\nTask type: {task_type}")
    print(f"Target column: {target}")
    print(f"Number of features: {len(X.columns)}")

    # --- Optional CSV export; empty filename skips it. ---
    output_file = input("\nEnter filename to save dataset as CSV (e.g., dataset.csv): ").strip()
    if output_file:
        export_frame = X.copy()
        export_frame[target] = y  # target appended as the last column
        export_frame.to_csv(output_file, index=False)
        print(f"Dataset saved to {output_file} (target column: '{target}')")

        # Record where the CSV landed so downstream tools can find it.
        temp_path_file = "last_csv_path.txt"
        with open(temp_path_file, "w") as f:
            f.write(os.path.abspath(output_file))
        print(f"CSV path written to {temp_path_file}")


if __name__ == "__main__":
    main()