Skip to content

Commit 6c4ec13

Browse files
committed
fix: docker ips and gpus lists
1 parent 773f56f commit 6c4ec13

File tree

4 files changed

+12
-9
lines changed

4 files changed

+12
-9
lines changed

nebula/core/training/lightning.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ def create_trainer(self):
177177
if self.config.participant["device_args"]["accelerator"] == "gpu" and num_gpus > 0:
178178
# Use all available GPUs
179179
if num_gpus > 1:
180-
gpu_index = self.config.participant["device_args"]["idx"] % num_gpus
180+
gpu_index = [self.config.participant["device_args"]["idx"] % num_gpus]
181181
# Use the selected GPU
182182
else:
183183
gpu_index = self.config.participant["device_args"]["gpu_id"]
@@ -186,7 +186,7 @@ def create_trainer(self):
186186
callbacks=[ModelSummary(max_depth=1), NebulaProgressBar()],
187187
max_epochs=self.epochs,
188188
accelerator=self.config.participant["device_args"]["accelerator"],
189-
devices=[gpu_index],
189+
devices=gpu_index,
190190
logger=self._logger,
191191
enable_checkpointing=False,
192192
enable_model_summary=False,

nebula/frontend/app.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,12 +1233,13 @@ async def assign_available_gpu(scenario_data, role):
12331233
else:
12341234
response = await get_available_gpus()
12351235
available_gpus = response.get("available_gpus")
1236-
if role == "user":
1237-
scenario_data["gpu_id"] = available_gpus.pop()
1238-
elif role == "admin":
1239-
scenario_data["gpu_id"] = available_gpus
1240-
else:
1241-
scenario_data["gpu_id"] = []
1236+
if len(available_gpus) > 0:
1237+
if role == "user":
1238+
scenario_data["gpu_id"] = [available_gpus.pop()]
1239+
elif role == "admin":
1240+
scenario_data["gpu_id"] = available_gpus
1241+
else:
1242+
scenario_data["gpu_id"] = []
12421243

12431244
return scenario_data
12441245

nebula/scenarios.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,13 +666,15 @@ def start_nodes_docker(self):
666666
binds=[f"{self.root_path}:/nebula", "/var/run/docker.sock:/var/run/docker.sock"],
667667
privileged=True,
668668
device_requests=[docker.types.DeviceRequest(driver="nvidia", count=-1, capabilities=[["gpu"]])],
669+
extra_hosts={"host.docker.internal": "host-gateway"},
669670
)
670671
else:
671672
environment = ""
672673
host_config = client.api.create_host_config(
673674
binds=[f"{self.root_path}:/nebula", "/var/run/docker.sock:/var/run/docker.sock"],
674675
privileged=True,
675676
device_requests=[],
677+
extra_hosts={"host.docker.internal": "host-gateway"},
676678
)
677679

678680
volumes = ["/nebula", "/var/run/docker.sock"]

nebula/utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,7 @@ def create_docker_network(cls, network_name, subnet=None, prefix=24):
6767

6868
# If no subnet is provided or it exists, find the next available one
6969
if not subnet or subnet in existing_subnets:
70-
for i in range(2, 255): # Iterate over 192.168.2.0 to 192.168.254.0
70+
for i in range(50, 255): # Iterate over 192.168.50.0 to 192.168.254.0
7171
subnet = f"{base_subnet}.{i}.0/{prefix}"
7272
potential_base = f"{base_subnet}.{i}"
7373
if subnet not in existing_subnets:

0 commit comments

Comments
 (0)