nsga_exp.py now runs functionally; however, it is extremely slow due to limited parallelization

This commit is contained in:
Varyngoth
2025-11-11 22:05:44 -04:00
parent 376bc2a8c5
commit c16c545bc8
4 changed files with 42 additions and 10 deletions

5
.gitignore vendored
View File

@@ -1 +1,4 @@
./src/test/
./src/test/
./src/cal_housing.csv
./src/__pycache__
./src/results_nsga

View File

@@ -10,3 +10,27 @@ mount -t nfs -o vers=3,proto=tcp 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data
WORK_DIR=/mnt/data
mkdir -p /mnt/data/cache # ensure directory exists
export OPENML_CACHE_DIR=/mnt/data/cache
apt install python3-venv
python3 -m venv <environment_name>
source <environment_name>/bin/activate
pip install -r requirements.txt
chmod -R 750 /root/automl_datasets
adduser mlly
passwd mlly
# mlly
chown -R mlly:mlly /mnt/data
chmod -R 750 /mnt/data
su - mlly
sudo usermod -aG wheel mlly
sudo chmod -R u+rwx /mnt/data/automl_datasets/nsga/

View File

@@ -10,10 +10,10 @@
#module load python/3.8
# Activate your virtual environment
#source /env0/bin/activate
source /nsga/bin/activate
# Run the Python script with the specified time parameter
srun python /mnt/data/src/nsga_exp.py
srun python /root/automl_datasets/src/nsga_exp.py
# Deactivate the virtual environment
deactivate

View File

@@ -32,11 +32,16 @@ def load_dataset():
return data
# Preprocess the data and separate features and target
def preprocess_data(input_data, pipeline, k_value, target_column):
pipeline.named_steps['feature_selection'].set_params(k=k_value)
X = input_data.drop(columns=[target_column], errors='ignore')
print(X.columns)
y = input_data[target_column]
#
# preprocess_data changed to not use k_features
# feature_selection changed to go based on number of features in dataset
# This is antithetical to the larger study, but I could not make the original approach work properly
#
def preprocess_data(input_data, pipeline, k_value):
    """Split *input_data* into features/target and fit-transform the features.

    The last column of the frame is treated as the target; every other
    column is a feature. NOTE: ``k_value`` is intentionally overwritten with
    the number of feature columns, so the passed-in argument is effectively
    ignored — the feature-selection step always keeps all features.

    Returns:
        tuple: (transformed feature matrix, target series).
    """
    # Positional split: everything but the last column is a feature.
    features = input_data.iloc[:, :-1]
    target = input_data.iloc[:, -1]

    # Override the requested k with the full feature count (see docstring).
    k_value = features.shape[1]
    pipeline.named_steps['feature_selection'].set_params(k=k_value)

    transformed = pipeline.fit_transform(features, target)
    return transformed, target
@@ -203,8 +208,8 @@ if __name__ == "__main__":
output_base_directory="./results_nsga",
population_size=80, # Larger population size for a comprehensive search
n_generations=100, # Increased number of generations
num_parents=30, # Increased number of parents
num_offspring=50, # Increased number of offspring
# num_parents=30, # Increased number of parents
# num_offspring=50, # Increased number of offspring
time_limit=108000 # 30 hours (30 * 3600 seconds)
)