Skip to content

Commit 6fee315

Browse files
committed
feature: centralize databases for improved consistency
fix: resolve GPU-related bugs when launching scenarios
1 parent aebcbbb commit 6fee315

File tree

9 files changed

+96
-51
lines changed

9 files changed

+96
-51
lines changed

app/main.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,14 @@
7676
help="Config directory path",
7777
)
7878

79+
argparser.add_argument(
80+
"-d",
81+
"--database",
82+
dest="databases",
83+
default="/opt/nebula",
84+
help="Nebula databases path",
85+
)
86+
7987
argparser.add_argument(
8088
"-l",
8189
"--logs",

nebula/addons/waf/Dockerfile-waf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ RUN cp nginx-modules/nginx-1.24.0/objs/ngx_http_geoip2_module.so /usr/lib/nginx/
3636
RUN cp nginx-modules/nginx-1.24.0/objs/ngx_stream_geoip2_module.so /usr/lib/nginx/modules
3737

3838
# geoip2 database downloaded
39-
RUN wget https://git.io/GeoLite2-Country.mmdb -P /usr/share/GeoIP/
39+
RUN wget https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-Country.mmdb -P /usr/share/GeoIP/
4040

4141
# nginx configuration files
4242
COPY default.conf /etc/nginx/templates/conf.d/default.conf.template

nebula/addons/waf/default.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ server {
1212
listen 80 default_server;
1313

1414
server_name localhost;
15-
set $upstream http://nebula-nebula-frontend; # Change this
15+
set $upstream http://nebula_nebula-frontend; # Change this
1616
set $always_redirect off;
1717
modsecurity on;
1818
location /nebula {

nebula/controller.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ def __init__(self, args):
339339
self.statistics_port = args.statsport if hasattr(args, "statsport") else 8080
340340
self.simulation = args.simulation
341341
self.config_dir = args.config
342+
self.db_dir = args.databases if hasattr(args, "databases") else "/opt/nebula"
342343
self.test = args.test if hasattr(args, "test") else False
343344
self.log_dir = args.logs
344345
self.cert_dir = args.certs
@@ -453,6 +454,7 @@ def start(self):
453454
else:
454455
self.run_frontend()
455456
logging.info(f"NEBULA Frontend is running at http://localhost:{self.frontend_port}")
457+
logging.info(f"NEBULA Databases created in {self.db_dir}")
456458

457459
# Watchdog for running additional scripts in the host machine (i.e. during the execution of a federation)
458460
event_handler = NebulaEventHandler()
@@ -516,7 +518,7 @@ def run_controller_api(self):
516518
)
517519

518520
def run_waf(self):
519-
network_name = f"{os.environ['USER']}-nebula-net-base"
521+
network_name = f"{os.environ['USER']}_nebula-net-base"
520522
base = DockerUtils.create_docker_network(network_name)
521523

522524
client = docker.from_env()
@@ -537,7 +539,7 @@ def run_waf(self):
537539

538540
container_id_waf = client.api.create_container(
539541
image="nebula-waf",
540-
name=f"{os.environ['USER']}-nebula-waf",
542+
name=f"{os.environ['USER']}_nebula-waf",
541543
detach=True,
542544
volumes=volumes_waf,
543545
host_config=host_config_waf,
@@ -571,7 +573,7 @@ def run_waf(self):
571573

572574
container_id = client.api.create_container(
573575
image="nebula-waf-grafana",
574-
name=f"{os.environ['USER']}-nebula-waf-grafana",
576+
name=f"{os.environ['USER']}_nebula-waf-grafana",
575577
detach=True,
576578
environment=environment,
577579
host_config=host_config,
@@ -595,7 +597,7 @@ def run_waf(self):
595597

596598
container_id_loki = client.api.create_container(
597599
image="nebula-waf-loki",
598-
name=f"{os.environ['USER']}-nebula-waf-loki",
600+
name=f"{os.environ['USER']}_nebula-waf-loki",
599601
detach=True,
600602
command=command,
601603
host_config=host_config_loki,
@@ -619,7 +621,7 @@ def run_waf(self):
619621

620622
container_id_promtail = client.api.create_container(
621623
image="nebula-waf-promtail",
622-
name=f"{os.environ['USER']}-nebula-waf-promtail",
624+
name=f"{os.environ['USER']}_nebula-waf-promtail",
623625
detach=True,
624626
volumes=volumes_promtail,
625627
host_config=host_config_promtail,
@@ -646,7 +648,7 @@ def run_frontend(self):
646648
except Exception:
647649
logging.info("No GPU available for the frontend, nodes will be deploy in CPU mode")
648650

649-
network_name = f"{os.environ['USER']}-nebula-net-base"
651+
network_name = f"{os.environ['USER']}_nebula-net-base"
650652

651653
# Create the Docker network
652654
base = DockerUtils.create_docker_network(network_name)
@@ -681,6 +683,7 @@ def run_frontend(self):
681683
f"{self.root_path}:/nebula",
682684
"/var/run/docker.sock:/var/run/docker.sock",
683685
f"{self.root_path}/nebula/frontend/config/nebula:/etc/nginx/sites-available/default",
686+
f"{self.db_dir}/databases:/nebula/nebula/frontend/databases"
684687
],
685688
extra_hosts={"host.docker.internal": "host-gateway"},
686689
port_bindings={80: self.frontend_port, 8080: self.statistics_port},
@@ -692,7 +695,7 @@ def run_frontend(self):
692695

693696
container_id = client.api.create_container(
694697
image="nebula-frontend",
695-
name=f"{os.environ['USER']}-nebula-frontend",
698+
name=f"{os.environ['USER']}_nebula-frontend",
696699
detach=True,
697700
environment=environment,
698701
volumes=volumes,
@@ -708,16 +711,17 @@ def run_test(self):
708711

709712
@staticmethod
710713
def stop_waf():
711-
DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}-nebula-waf")
714+
DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}_nebula-waf")
712715

713716
@staticmethod
714717
def stop():
715718
logging.info("Closing NEBULA (exiting from components)... Please wait")
716-
DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}")
719+
DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}_nebula")
720+
DockerUtils.remove_containers_by_prefix(f"nebula_")
717721
ScenarioManagement.stop_blockchain()
718722
ScenarioManagement.stop_participants()
719723
Controller.stop_waf()
720-
DockerUtils.remove_docker_networks_by_prefix(f"{os.environ['USER']}")
724+
DockerUtils.remove_docker_networks_by_prefix(f"{os.environ['USER']}_")
721725
controller_pid_file = os.path.join(os.path.dirname(__file__), "controller.pid")
722726
try:
723727
with open(controller_pid_file) as f:

nebula/core/training/lightning.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def create_trainer(self):
185185
self._trainer = Trainer(
186186
callbacks=[ModelSummary(max_depth=1), NebulaProgressBar()],
187187
max_epochs=self.epochs,
188-
accelerator=self.config.participant["device_args"]["accelerator"],
188+
accelerator="gpu",
189189
devices=gpu_index,
190190
logger=self._logger,
191191
enable_checkpointing=False,
@@ -197,7 +197,7 @@ def create_trainer(self):
197197
self._trainer = Trainer(
198198
callbacks=[ModelSummary(max_depth=1), NebulaProgressBar()],
199199
max_epochs=self.epochs,
200-
accelerator=self.config.participant["device_args"]["accelerator"],
200+
accelerator="cpu",
201201
devices="auto",
202202
logger=self._logger,
203203
enable_checkpointing=False,

nebula/frontend/app.py

Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -472,15 +472,15 @@ async def check_enough_resources():
472472
resources = await get_host_resources()
473473

474474
mem_percent = resources.get("memory_percent")
475-
gpu_memory_percent = resources.get("gpu_memory_percent", [])
475+
# gpu_memory_percent = resources.get("gpu_memory_percent", [])
476476

477477
# if cpu_percent >= settings.resources_threshold or mem_percent >= settings.resources_threshold:
478478
if mem_percent >= settings.resources_threshold:
479479
return False
480-
elif len(gpu_memory_percent) > 0:
481-
for gpu_mem in gpu_memory_percent:
482-
if gpu_mem >= settings.resources_threshold:
483-
return False
480+
# elif len(gpu_memory_percent) > 0:
481+
# for gpu_mem in gpu_memory_percent:
482+
# if gpu_mem >= settings.resources_threshold:
483+
# return False
484484

485485
return True
486486

@@ -493,17 +493,17 @@ async def monitor_resources(user):
493493
if not enough_resources:
494494
running_scenario = get_running_scenario(user)
495495
if running_scenario:
496-
# Which card has high memory consumption
497-
gpu = await get_least_memory_gpu()
498-
# Stop scenario if is using the high memory gpu
496+
# # Which card has high memory consumption
497+
# gpu = await get_least_memory_gpu()
498+
# # Stop scenario if is using the high memory gpu
499499
running_scenario_as_dict = dict(running_scenario)
500-
if running_scenario_as_dict["gpu_id"] == gpu.get("available_gpu_index"):
501-
scenario_name = running_scenario_as_dict["name"]
502-
stop_scenario(scenario_name, user)
503-
user_data.scenarios_list_length -= 1
504-
user_data.finish_scenario_event.set()
500+
scenario_name = running_scenario_as_dict["name"]
501+
# if running_scenario_as_dict["gpu_id"] == gpu.get("available_gpu_index"):
502+
stop_scenario(scenario_name, user)
503+
user_data.scenarios_list_length -= 1
504+
user_data.finish_scenario_event.set()
505505

506-
await asyncio.sleep(5)
506+
await asyncio.sleep(20)
507507

508508

509509

@@ -876,9 +876,9 @@ def stop_scenario(scenario_name, user):
876876
from nebula.scenarios import ScenarioManagement
877877

878878
ScenarioManagement.stop_participants(scenario_name)
879-
DockerUtils.remove_containers_by_prefix(f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-{user}-participant")
879+
DockerUtils.remove_containers_by_prefix(f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{user}-participant")
880880
DockerUtils.remove_docker_network(
881-
f"{(os.environ.get('NEBULA_CONTROLLER_NAME'))}-{str(user).lower()}-nebula-net-scenario"
881+
f"{(os.environ.get('NEBULA_CONTROLLER_NAME'))}_{str(user).lower()}-nebula-net-scenario"
882882
)
883883
ScenarioManagement.stop_blockchain()
884884
scenario_set_status_to_finished(scenario_name)
@@ -1227,19 +1227,42 @@ async def node_stopped(scenario_name: str, request: Request):
12271227
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST)
12281228

12291229

1230-
async def assign_available_gpu(scenario_data, role):
1230+
async def assign_available_gpu(scenario_data, role):
1231+
available_gpus = []
1232+
12311233
if scenario_data["accelerator"] == "cpu":
12321234
scenario_data["gpu_id"] = []
12331235
else:
12341236
response = await get_available_gpus()
1235-
available_gpus = response.get("available_gpus")
1236-
if len(available_gpus) > 0:
1237-
if role == "user":
1238-
scenario_data["gpu_id"] = [available_gpus.pop()]
1239-
elif role == "admin":
1240-
scenario_data["gpu_id"] = available_gpus
1241-
else:
1242-
scenario_data["gpu_id"] = []
1237+
# Obtain available system_gpus
1238+
available_system_gpus = response.get("available_gpus")
1239+
running_scenarios = get_running_scenario(get_all=True)
1240+
# Obtain currently used gpus
1241+
if running_scenarios:
1242+
running_gpus = []
1243+
for scenario in running_scenarios:
1244+
scenario_gpus = json.loads(scenario["gpu_id"])
1245+
for gpu in scenario_gpus:
1246+
if gpu not in running_gpus:
1247+
running_gpus.append(gpu)
1248+
1249+
# Obtain gpus that are not in use
1250+
for gpu in available_system_gpus:
1251+
if gpu not in running_gpus:
1252+
available_gpus.append(gpu)
1253+
else:
1254+
available_gpus = available_system_gpus
1255+
1256+
# Assign gpus based on user role
1257+
if len(available_gpus) > 0:
1258+
if role == "user":
1259+
scenario_data["gpu_id"] = [available_gpus.pop()]
1260+
elif role == "admin":
1261+
scenario_data["gpu_id"] = available_gpus
1262+
else:
1263+
scenario_data["gpu_id"] = []
1264+
else:
1265+
scenario_data["gpu_id"] = []
12431266

12441267
return scenario_data
12451268

nebula/frontend/database.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
import datetime
3+
import logging
34
import sqlite3
45

56
import aiosqlite
@@ -23,10 +24,15 @@
2324

2425

2526
async def setup_database(db_file_location):
26-
async with aiosqlite.connect(db_file_location) as db:
27-
for pragma in PRAGMA_SETTINGS:
28-
await db.execute(pragma)
29-
await db.commit()
27+
try:
28+
async with aiosqlite.connect(db_file_location) as db:
29+
for pragma in PRAGMA_SETTINGS:
30+
await db.execute(pragma)
31+
await db.commit()
32+
except PermissionError:
33+
logging.info("No permission to create the databases. Change the default databases directory")
34+
except Exception as e:
35+
logging.exception(f"An error has ocurred during setup_database: {e}")
3036

3137

3238
async def ensure_columns(conn, table_name, desired_columns):
@@ -545,7 +551,7 @@ def scenario_set_status_to_completed(scenario_name):
545551
print(f"Database error: {e}")
546552

547553

548-
def get_running_scenario(username=None):
554+
def get_running_scenario(username=None, get_all=False):
549555
with sqlite3.connect(scenario_db_file_location) as conn:
550556
conn.row_factory = sqlite3.Row
551557
c = conn.cursor()
@@ -556,11 +562,15 @@ def get_running_scenario(username=None):
556562
WHERE (status = ? OR status = ?) AND username = ?;
557563
"""
558564
c.execute(command, ("running", "completed", username))
565+
566+
result = c.fetchone()
559567
else:
560568
command = "SELECT * FROM scenarios WHERE status = ? OR status = ?;"
561569
c.execute(command, ("running", "completed"))
562-
563-
result = c.fetchone()
570+
if get_all:
571+
result = c.fetchall()
572+
else:
573+
result = c.fetchone()
564574

565575
return result
566576

nebula/frontend/databases/__init__.py

Whitespace-only changes.

nebula/scenarios.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def __init__(self, scenario, user=None):
253253

254254
# Assign the controller endpoint
255255
if self.scenario.deployment == "docker":
256-
self.controller = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-nebula-frontend"
256+
self.controller = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_nebula-frontend"
257257
else:
258258
self.controller = f"127.0.0.1:{os.environ.get('NEBULA_FRONTEND_PORT')}"
259259

@@ -646,7 +646,7 @@ def start_nodes_docker(self):
646646
logging.info("Starting nodes using Docker Compose...")
647647
logging.info(f"env path: {self.env_path}")
648648

649-
network_name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-{str(self.user).lower()}-nebula-net-scenario"
649+
network_name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{str(self.user).lower()}-nebula-net-scenario"
650650

651651
# Create the Docker network
652652
base = DockerUtils.create_docker_network(network_name)
@@ -658,7 +658,7 @@ def start_nodes_docker(self):
658658
container_ids = []
659659
for idx, node in enumerate(self.config.participants):
660660
image = "nebula-core"
661-
name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-{self.user}-participant{node['device_args']['idx']}"
661+
name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{self.user}-participant{node['device_args']['idx']}"
662662

663663
if node["device_args"]["accelerator"] == "gpu":
664664
environment = {"NVIDIA_DISABLE_REQUIRE": True}
@@ -691,15 +691,15 @@ def start_nodes_docker(self):
691691
f"{network_name}": client.api.create_endpoint_config(
692692
ipv4_address=f"{base}.{i}",
693693
),
694-
f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-nebula-net-base": client.api.create_endpoint_config(),
694+
f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_nebula-net-base": client.api.create_endpoint_config(),
695695
"chainnet": client.api.create_endpoint_config(),
696696
})
697697
else:
698698
networking_config = client.api.create_networking_config({
699699
f"{network_name}": client.api.create_endpoint_config(
700700
ipv4_address=f"{base}.{i}",
701701
),
702-
f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-nebula-net-base": client.api.create_endpoint_config(),
702+
f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_nebula-net-base": client.api.create_endpoint_config(),
703703
})
704704

705705
node["tracking_args"]["log_dir"] = "/nebula/app/logs"

0 commit comments

Comments
 (0)