nsga_exp.py now runs successfully; however, it is extremely slow due to limited parallelization

2025-11-11 22:05:44 -04:00
parent 376bc2a8c5
commit c16c545bc8
4 changed files with 42 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,4 @@
 ./src/test/
+./src/cal_housing.csv
+./src/__pycache__
+./src/results_nsga
--- a/setup.sh
+++ b/setup.sh
@@ -10,3 +10,27 @@ mount -t nfs -o vers=3,proto=tcp 192.168.2.69:/mnt/user/ml_datasets0 /mnt/data
 WORK_DIR=/mnt/data
 mkdir -p /mnt/data/cache       # ensure directory exists
 export OPENML_CACHE_DIR=/mnt/data/cache
+
+
+apt install python3-venv
+
+python3 -m venv <environment_name>
+
+source <environment_name>/bin/activate
+
+pip install -r requirements.txt
+
+chmod -R 750 /root/automl_datasets
+
+
+adduser mlly
+passwd mlly
+# mlly
+chown -R mlly:mlly /mnt/data
+chmod -R 750 /mnt/data
+
+su - mlly
+
+sudo usermod -aG wheel mlly
+
+sudo chmod -R u+rwx /mnt/data/automl_datasets/nsga/
--- a/src/nsga_batch.sh
+++ b/src/nsga_batch.sh
@@ -10,10 +10,10 @@
 #module load python/3.8

 # Activate your virtual environment
-#source /env0/bin/activate
+source /nsga/bin/activate

 # Run the Python script with the specified time parameter
-srun python /mnt/data/src/nsga_exp.py
+srun python /root/automl_datasets/src/nsga_exp.py

 # Deactivate the virtual environment
 deactivate
--- a/src/nsga_exp.py
+++ b/src/nsga_exp.py
@@ -32,11 +32,16 @@ def load_dataset():
    return data

 # Preprocess the data and separate features and target
-def preprocess_data(input_data, pipeline, k_value, target_column):
+#
+# preprocess_data changed to not use k_features (the k_value argument is overwritten).
+# feature_selection's k is now set from the number of feature columns in the dataset.
+# This is antithetical to the larger study, but I could not get the original approach to work properly.
+#
+def preprocess_data(input_data, pipeline, k_value):
+    X = input_data.iloc[:, :-1]
+    y = input_data.iloc[:, -1]
+    k_value = X.shape[1]
    pipeline.named_steps['feature_selection'].set_params(k=k_value)    
-    X = input_data.drop(columns=[target_column], errors='ignore')
-    print(X.columns)    
-    y = input_data[target_column]
    X = pipeline.fit_transform(X, y)
    return X, y

@@ -203,8 +208,8 @@ if __name__ == "__main__":
        output_base_directory="./results_nsga",
        population_size=80,  # Larger population size for a comprehensive search
        n_generations=100,  # Increased number of generations
-        num_parents=30,  # Increased number of parents
-        num_offspring=50,  # Increased number of offspring
+#        num_parents=30,  # Increased number of parents
+#        num_offspring=50,  # Increased number of offspring
        time_limit=108000  # 30 hours (30 * 3600 seconds); NOTE(review): comment previously said 20 hours (72000 s) - confirm intended limit
    )