Runs functionally from nsga_exp.py; however, it is extremely slow due to limited parallelization
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1 +1,4 @@
|
|||||||
./src/test/
|
./src/test/
|
||||||
|
./src/cal_housing.csv
|
||||||
|
./src/__pycache__
|
||||||
|
./src/results_nsga
|
||||||
24
setup.sh
24
setup.sh
@@ -10,3 +10,27 @@ mount -t nfs -o vers=3,proto=tcp 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data
|
|||||||
WORK_DIR=/mnt/data
|
WORK_DIR=/mnt/data
|
||||||
mkdir -p /mnt/data/cache # ensure directory exists
|
mkdir -p /mnt/data/cache # ensure directory exists
|
||||||
export OPENML_CACHE_DIR=/mnt/data/cache
|
export OPENML_CACHE_DIR=/mnt/data/cache
|
||||||
|
|
||||||
|
|
||||||
|
apt install python3-venv
|
||||||
|
|
||||||
|
python3 -m venv <environment_name>
|
||||||
|
|
||||||
|
source <environment_name>/bin/activate
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
chmod -R 750 /root/automl_datasets
|
||||||
|
|
||||||
|
|
||||||
|
adduser mlly
|
||||||
|
passwd mlly
|
||||||
|
# mlly
|
||||||
|
chown -R mlly:mlly /mnt/data
|
||||||
|
chmod -R 750 /mnt/data
|
||||||
|
|
||||||
|
su - mlly
|
||||||
|
|
||||||
|
sudo usermod -aG wheel mlly
|
||||||
|
|
||||||
|
sudo chmod -R u+rwx /mnt/data/automl_datasets/nsga/
|
||||||
|
|||||||
@@ -10,10 +10,10 @@
|
|||||||
#module load python/3.8
|
#module load python/3.8
|
||||||
|
|
||||||
# Activate your virtual environment
|
# Activate your virtual environment
|
||||||
#source /env0/bin/activate
|
source /nsga/bin/activate
|
||||||
|
|
||||||
# Run the Python script with the specified time parameter
|
# Run the Python script with the specified time parameter
|
||||||
srun python /mnt/data/src/nsga_exp.py
|
srun python /root/automl_datasets/src/nsga_exp.py
|
||||||
|
|
||||||
# Deactivate the virtual environment
|
# Deactivate the virtual environment
|
||||||
deactivate
|
deactivate
|
||||||
|
|||||||
@@ -32,11 +32,16 @@ def load_dataset():
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
# Preprocess the data and separate features and target
|
# Preprocess the data and separate features and target
|
||||||
def preprocess_data(input_data, pipeline, k_value, target_column):
|
#
|
||||||
|
# preprocess_data changed to not use k_features
|
||||||
|
# feature_selection changed to go based on number of features in dataset
|
||||||
|
# This is antithetical to the larger study, but I am not smart enough to make it work properly
|
||||||
|
#
|
||||||
|
def preprocess_data(input_data, pipeline, k_value):
|
||||||
|
X = input_data.iloc[:, :-1]
|
||||||
|
y = input_data.iloc[:, -1]
|
||||||
|
k_value = X.shape[1]
|
||||||
pipeline.named_steps['feature_selection'].set_params(k=k_value)
|
pipeline.named_steps['feature_selection'].set_params(k=k_value)
|
||||||
X = input_data.drop(columns=[target_column], errors='ignore')
|
|
||||||
print(X.columns)
|
|
||||||
y = input_data[target_column]
|
|
||||||
X = pipeline.fit_transform(X, y)
|
X = pipeline.fit_transform(X, y)
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
@@ -203,8 +208,8 @@ if __name__ == "__main__":
|
|||||||
output_base_directory="./results_nsga",
|
output_base_directory="./results_nsga",
|
||||||
population_size=80, # Larger population size for a comprehensive search
|
population_size=80, # Larger population size for a comprehensive search
|
||||||
n_generations=100, # Increased number of generations
|
n_generations=100, # Increased number of generations
|
||||||
num_parents=30, # Increased number of parents
|
# num_parents=30, # Increased number of parents
|
||||||
num_offspring=50, # Increased number of offspring
|
# num_offspring=50, # Increased number of offspring
|
||||||
time_limit=108000 # 30 hours (30 * 3600 seconds)
|
time_limit=108000 # 30 hours (30 * 3600 seconds)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user