fix: Docker IPs and GPU lists

FerTV · FerTV · commit 6c4ec136de31 · 2024-12-13T19:16:12.000+01:00
diff --git a/nebula/core/training/lightning.py b/nebula/core/training/lightning.py
@@ -177,7 +177,7 @@ def create_trainer(self):
         if self.config.participant["device_args"]["accelerator"] == "gpu" and num_gpus > 0:
             # Use all available GPUs
             if num_gpus > 1:
-                gpu_index = self.config.participant["device_args"]["idx"] % num_gpus
+                gpu_index = [self.config.participant["device_args"]["idx"] % num_gpus]
             # Use the selected GPU
             else:
                 gpu_index = self.config.participant["device_args"]["gpu_id"]
@@ -186,7 +186,7 @@ def create_trainer(self):
                 callbacks=[ModelSummary(max_depth=1), NebulaProgressBar()],
                 max_epochs=self.epochs,
                 accelerator=self.config.participant["device_args"]["accelerator"],
-                devices=[gpu_index],
+                devices=gpu_index,
                 logger=self._logger,
                 enable_checkpointing=False,
                 enable_model_summary=False,
diff --git a/nebula/frontend/app.py b/nebula/frontend/app.py
@@ -1233,12 +1233,13 @@ async def assign_available_gpu(scenario_data, role):
     else:
         response = await get_available_gpus()
         available_gpus = response.get("available_gpus")
-        if role == "user":
-            scenario_data["gpu_id"] = available_gpus.pop()
-        elif role == "admin":
-            scenario_data["gpu_id"] = available_gpus
-        else:
-            scenario_data["gpu_id"] = []
+        if len(available_gpus) > 0:
+            if role == "user":
+                scenario_data["gpu_id"] = [available_gpus.pop()]
+            elif role == "admin":
+                scenario_data["gpu_id"] = available_gpus
+            else:
+                scenario_data["gpu_id"] = []
     
     return scenario_data
 
diff --git a/nebula/scenarios.py b/nebula/scenarios.py
@@ -666,13 +666,15 @@ def start_nodes_docker(self):
                     binds=[f"{self.root_path}:/nebula", "/var/run/docker.sock:/var/run/docker.sock"],
                     privileged=True,
                     device_requests=[docker.types.DeviceRequest(driver="nvidia", count=-1, capabilities=[["gpu"]])],
+                    extra_hosts={"host.docker.internal": "host-gateway"},
                 )
             else:
                 environment = ""
                 host_config = client.api.create_host_config(
                     binds=[f"{self.root_path}:/nebula", "/var/run/docker.sock:/var/run/docker.sock"],
                     privileged=True,
                     device_requests=[],
+                    extra_hosts={"host.docker.internal": "host-gateway"},
                 )
 
             volumes = ["/nebula", "/var/run/docker.sock"]
diff --git a/nebula/utils.py b/nebula/utils.py
@@ -67,7 +67,7 @@ def create_docker_network(cls, network_name, subnet=None, prefix=24):
 
             # If no subnet is provided or it exists, find the next available one
             if not subnet or subnet in existing_subnets:
-                for i in range(2, 255):  # Iterate over 192.168.2.0 to 192.168.254.0
+                for i in range(50, 255):  # Iterate over 192.168.50.0 to 192.168.254.0
                     subnet = f"{base_subnet}.{i}.0/{prefix}"
                     potential_base = f"{base_subnet}.{i}"
                     if subnet not in existing_subnets: