
Commit 0f9dff2

fix: obtain list of gpus available based on user
1 parent 526a8cf commit 0f9dff2

File tree

5 files changed (+24, −22 lines)


nebula/controller.py

Lines changed: 8 additions & 9 deletions
@@ -113,9 +113,9 @@ async def get_least_memory_gpu():
     }
 
 
-@app.get("/available_gpu")
+@app.get("/available_gpus/")
 async def get_available_gpu():
-    available_gpu_index = None
+    available_gpus = []
 
     if importlib.util.find_spec("pynvml") is not None:
         try:
@@ -130,16 +130,15 @@ async def get_available_gpu():
                 memory_used_percent = (memory_info.used / memory_info.total) * 100
 
                 # Obtain available GPUs
-                if memory_used_percent < 5 and available_gpu_index is None:
-                    available_gpu_index = i
-
+                if memory_used_percent < 5:
+                    available_gpus.append(i)
+
+            return {
+                "available_gpus": available_gpus,
+            }
         except Exception:  # noqa: S110
             pass
 
-    return {
-        "available_gpu_index": available_gpu_index,
-    }
-
 
 class NebulaEventHandler(PatternMatchingEventHandler):
     """

nebula/core/training/lightning.py

Lines changed: 2 additions & 2 deletions
@@ -173,10 +173,10 @@ def create_logger(self):
     def create_trainer(self):
         # Create a new trainer and logger for each round
         self.create_logger()
-        num_gpus = torch.cuda.device_count()
+        num_gpus = len(self.config.participant["device_args"]["gpu_id"])
         if self.config.participant["device_args"]["accelerator"] == "gpu" and num_gpus > 0:
             # Use all available GPUs
-            if self.config.participant["device_args"]["gpu_id"] == -1:
+            if num_gpus > 1:
                 gpu_index = self.config.participant["device_args"]["idx"] % num_gpus
             # Use the selected GPU
             else:
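
With gpu_id now a list, num_gpus counts the GPUs assigned to this participant, and when more than one is assigned the node spreads participants over them by index. A small standalone sketch of that arithmetic, with made-up values in place of config.participant["device_args"]:

# Made-up values; in the trainer they come from config.participant["device_args"].
assigned_gpus = [0, 1, 2]   # "gpu_id": GPUs handed to this participant
participant_idx = 4         # "idx": the participant's position in the scenario

num_gpus = len(assigned_gpus)
if num_gpus > 1:
    # Same modulo as the diff above: participant 4 with 3 GPUs lands on slot 1.
    gpu_index = participant_idx % num_gpus
else:
    # With a single assigned GPU, the "use the selected GPU" branch applies instead.
    gpu_index = 0
print(gpu_index)  # 1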

nebula/frontend/app.py

Lines changed: 11 additions & 8 deletions
@@ -442,8 +442,8 @@ async def get_host_resources():
     return None
 
 
-async def get_available_gpu():
-    url = f"http://{settings.controller_host}:{settings.controller_port}/available_gpu"
+async def get_available_gpus():
+    url = f"http://{settings.controller_host}:{settings.controller_port}/available_gpus"
     async with aiohttp.ClientSession() as session:
         async with session.get(url) as response:
             if response.status == 200:
@@ -1229,15 +1229,18 @@ async def node_stopped(scenario_name: str, request: Request):
 
 async def assign_available_gpu(scenario_data, role):
     if scenario_data["accelerator"] == "cpu":
-        scenario_data["gpu_id"] = None
+        scenario_data["gpu_id"] = []
     else:
+        available_gpus = await get_available_gpus()
+
         if role == "user":
-            gpu = await get_available_gpu()
-            scenario_data["gpu_id"] = gpu.get("available_gpu_index")
+            json_available_gpus = available_gpus.pop()
+            scenario_data["gpu_id"] = json_available_gpus
         elif role == "admin":
-            scenario_data["gpu_id"] = -1
+            json_available_gpus = available_gpus
+            scenario_data["gpu_id"] = json_available_gpus
         else:
-            scenario_data["gpu_id"] = None
+            scenario_data["gpu_id"] = []
 
     return scenario_data
 
@@ -1266,7 +1269,7 @@ async def run_scenario(scenario_data, role, user):
         dataset=scenario_data["dataset"],
         rounds=scenario_data["rounds"],
         role=role,
-        gpu_id=scenario_data["gpu_id"]
+        gpu_id=json.dumps(scenario_data["gpu_id"])
     )
 
     # Run the actual scenario
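
In short, a cpu scenario gets an empty gpu_id, a user gets one entry popped from the controller's report, an admin keeps the whole thing, and whatever is chosen is JSON-encoded before it reaches the database. An illustrative sketch, assuming available_gpus ends up as a plain list of GPU indices:

import json

# Assumption for illustration: the controller reported GPUs 0 and 1 as free.
available_gpus = [0, 1]

user_gpu_id = available_gpus.pop()   # a single popped entry, e.g. 1
admin_gpu_id = available_gpus        # everything that remains, e.g. [0]

# run_scenario() serializes the chosen value before storing it.
print(json.dumps(user_gpu_id))    # 1
print(json.dumps(admin_gpu_id))   # [0]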

nebula/frontend/database.py

Lines changed: 2 additions & 2 deletions
@@ -110,7 +110,7 @@ async def initialize_databases():
                 rounds TEXT,
                 role TEXT,
                 username TEXT,
-                gpu_id INTEGER
+                gpu_id TEXT
             );
         """
         )
@@ -127,7 +127,7 @@ async def initialize_databases():
             "rounds": "TEXT",
             "role": "TEXT",
             "username": "TEXT",
-            "gpu_id" : "INTEGER",
+            "gpu_id" : "TEXT",
         }
         await ensure_columns(conn, "scenarios", desired_columns)
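
The column switches to TEXT because gpu_id is now stored as a JSON-encoded list rather than a single integer. A self-contained sketch of the round trip with an in-memory SQLite table (the schema fragment is illustrative, not the project's full scenarios table):

import json
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE scenarios (name TEXT, gpu_id TEXT)")

# Write the list the same way the frontend does: json.dumps(...) into a TEXT column.
conn.execute("INSERT INTO scenarios VALUES (?, ?)", ("demo", json.dumps([0, 1])))

# Read it back and decode the JSON string into a Python list.
(stored,) = conn.execute("SELECT gpu_id FROM scenarios WHERE name = ?", ("demo",)).fetchone()
print(json.loads(stored))  # [0, 1]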

nebula/scenarios.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def __init__(
             logginglevel (str): Logging level.
             report_status_data_queue (bool): Indicator to report information about the nodes of the scenario
             accelerator (str): Accelerator used.
-            gpu_id (int) : Id of the used gpu
+            gpu_id (list) : Id list of the used gpu
             network_subnet (str): Network subnet.
             network_gateway (str): Network gateway.
             epochs (int): Number of epochs.

0 commit comments
