Skip to content

Commit 453941e

Browse files
author
Naor Livne
committed
adding fail hard to the main work loop
1 parent 89881f2 commit 453941e

File tree

1 file changed

+134
-126
lines changed

1 file changed

+134
-126
lines changed

worker.py

Lines changed: 134 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -189,136 +189,144 @@ def get_device_group_info(nebula_connection_object, device_group_to_get_info):
189189

190190
if __name__ == "__main__":
191191

192-
# read config file and config envvars at startup, order preference is envvar>config file>default value (if exists)
193-
if os.path.exists("config/conf.json"):
194-
print("reading config file")
195-
auth_file = json.load(open("config/conf.json"))
196-
else:
197-
print("config file not found - skipping reading it and checking if needed params are given from envvars")
198-
auth_file = {}
199-
print("reading config variables")
200-
nebula_manager_auth_user = get_conf_setting("nebula_manager_auth_user", auth_file, None)
201-
nebula_manager_auth_password = get_conf_setting("nebula_manager_auth_password", auth_file, None)
202-
nebula_manager_host = get_conf_setting("nebula_manager_host", auth_file, "127.0.0.1")
203-
nebula_manager_port = int(get_conf_setting("nebula_manager_port", auth_file, "80"))
204-
nebula_manager_protocol = get_conf_setting("nebula_manager_protocol", auth_file, "http")
205-
nebula_manager_request_timeout = int(get_conf_setting("nebula_manager_request_timeout", auth_file, "60"))
206-
nebula_manager_check_in_time = int(get_conf_setting("nebula_manager_check_in_time", auth_file, "30"))
207-
registry_auth_user = get_conf_setting("registry_auth_user", auth_file, None)
208-
registry_auth_password = get_conf_setting("registry_auth_password", auth_file, None)
209-
registry_host = get_conf_setting("registry_host", auth_file, "https://index.docker.io/v1/")
210-
max_restart_wait_in_seconds = int(get_conf_setting("max_restart_wait_in_seconds", auth_file, 0))
211-
device_group = get_conf_setting("device_group", auth_file)
212-
213-
# get number of cpu cores on host
214-
cpu_cores = get_number_of_cpu_cores()
215-
216-
# work against docker socket
217-
docker_socket = DockerFunctions()
218-
219-
# ensure default "nebula" named network exists
220-
docker_socket.create_docker_network("nebula", "bridge")
221-
222-
# login to the docker registry - if no registry login details are configured will just print a message stating that
223-
docker_socket.registry_login(registry_host=registry_host, registry_user=registry_auth_user,
224-
registry_pass=registry_auth_password)
225-
226-
# login to the nebula manager
227-
nebula_connection = Nebula(username=nebula_manager_auth_user, password=nebula_manager_auth_password,
228-
host=nebula_manager_host, port=nebula_manager_port, protocol=nebula_manager_protocol,
229-
request_timeout=nebula_manager_request_timeout)
230-
231-
# make sure the nebula manager connects properly
232192
try:
233-
print("checking nebula manager connection")
234-
api_check = nebula_connection.check_api()
235-
if api_check["status_code"] == 200 and api_check["reply"]["api_available"] is True:
236-
print("nebula manager connection ok")
193+
# read config file/envvars at startup, order preference is envvar>config file>default value (if exists)
194+
if os.path.exists("config/conf.json"):
195+
print("reading config file")
196+
auth_file = json.load(open("config/conf.json"))
237197
else:
238-
print("nebula manager initial connection check failure, dropping container")
239-
except Exception as e:
240-
print >> sys.stderr, e
241-
print("error confirming connection to nebula manager - please check connection & authentication params and "
242-
"that the manager is online")
243-
os._exit(2)
244-
245-
# stop all nebula managed containers on start to ensure a clean slate to work on
246-
print("stopping all preexisting nebula manager containers in order to ensure a clean slate on boot")
247-
stop_containers({"app_name": ""})
198+
print("config file not found - skipping reading it and checking if needed params are given from envvars")
199+
auth_file = {}
200+
print("reading config variables")
201+
nebula_manager_auth_user = get_conf_setting("nebula_manager_auth_user", auth_file, None)
202+
nebula_manager_auth_password = get_conf_setting("nebula_manager_auth_password", auth_file, None)
203+
nebula_manager_host = get_conf_setting("nebula_manager_host", auth_file, "127.0.0.1")
204+
nebula_manager_port = int(get_conf_setting("nebula_manager_port", auth_file, "80"))
205+
nebula_manager_protocol = get_conf_setting("nebula_manager_protocol", auth_file, "http")
206+
nebula_manager_request_timeout = int(get_conf_setting("nebula_manager_request_timeout", auth_file, "60"))
207+
nebula_manager_check_in_time = int(get_conf_setting("nebula_manager_check_in_time", auth_file, "30"))
208+
registry_auth_user = get_conf_setting("registry_auth_user", auth_file, None)
209+
registry_auth_password = get_conf_setting("registry_auth_password", auth_file, None)
210+
registry_host = get_conf_setting("registry_host", auth_file, "https://index.docker.io/v1/")
211+
max_restart_wait_in_seconds = int(get_conf_setting("max_restart_wait_in_seconds", auth_file, 0))
212+
device_group = get_conf_setting("device_group", auth_file)
213+
214+
# get number of cpu cores on host
215+
cpu_cores = get_number_of_cpu_cores()
216+
217+
# work against docker socket
218+
docker_socket = DockerFunctions()
219+
220+
# ensure default "nebula" named network exists
221+
docker_socket.create_docker_network("nebula", "bridge")
222+
223+
# login to the docker registry - if no registry login details are configured will just print a message stating
224+
# that
225+
docker_socket.registry_login(registry_host=registry_host, registry_user=registry_auth_user,
226+
registry_pass=registry_auth_password)
227+
228+
# login to the nebula manager
229+
nebula_connection = Nebula(username=nebula_manager_auth_user, password=nebula_manager_auth_password,
230+
host=nebula_manager_host, port=nebula_manager_port, protocol=nebula_manager_protocol,
231+
request_timeout=nebula_manager_request_timeout)
232+
233+
# make sure the nebula manager connects properly
234+
try:
235+
print("checking nebula manager connection")
236+
api_check = nebula_connection.check_api()
237+
if api_check["status_code"] == 200 and api_check["reply"]["api_available"] is True:
238+
print("nebula manager connection ok")
239+
else:
240+
print("nebula manager initial connection check failure, dropping container")
241+
os._exit(2)
242+
except Exception as e:
243+
print >> sys.stderr, e
244+
print("error confirming connection to nebula manager - please check connection & authentication params and "
245+
"that the manager is online")
246+
os._exit(2)
247+
248+
# stop all nebula managed containers on start to ensure a clean slate to work on
249+
print("stopping all preexisting nebula manager containers in order to ensure a clean slate on boot")
250+
stop_containers({"app_name": ""})
251+
252+
# get the initial device_group configuration and store it in memory
253+
local_device_group_info = get_device_group_info(nebula_connection, device_group)
248254

249-
# get the initial device_group configuration and store it in memory
250-
local_device_group_info = get_device_group_info(nebula_connection, device_group)
255+
# make sure the device_group exists in the nebula cluster
256+
while local_device_group_info["status_code"] == 403 and \
257+
local_device_group_info["reply"]["device_group_exists"] is False:
258+
print("device_group " + device_group + " doesn't exist in nebula cluster, waiting for it to be created")
259+
local_device_group_info = get_device_group_info(nebula_connection, device_group)
260+
time.sleep(nebula_manager_check_in_time)
261+
262+
# start all apps that are set to running on boot
263+
for nebula_app in local_device_group_info["reply"]["apps"]:
264+
if nebula_app["running"] is True:
265+
print("initial start of " + nebula_app["app_name"] + " app")
266+
start_containers(nebula_app)
267+
print("completed initial start of " + nebula_app["app_name"] + " app")
268+
269+
# open a thread which is in charge of restarting any containers whose healthcheck shows them as unhealthy
270+
print("starting work container health checking thread")
271+
Thread(target=restart_unhealthy_containers).start()
272+
273+
# loop forever
274+
print("starting device_group " + device_group + " /info check loop, configured to check for changes every "
275+
+ str(nebula_manager_check_in_time) + " seconds")
276+
while True:
251277

252-
# make sure the device_group exists in the nebula cluster
253-
while local_device_group_info["status_code"] == 403 and \
254-
local_device_group_info["reply"]["device_group_exists"] is False:
255-
print("device_group " + device_group + " doesn't exist in nebula cluster, waiting for it to be created")
256-
local_device_group_info = get_device_group_info(nebula_connection, device_group)
257-
time.sleep(nebula_manager_check_in_time)
258-
259-
# start all apps that are set to running on boot
260-
for nebula_app in local_device_group_info["reply"]["apps"]:
261-
if nebula_app["running"] is True:
262-
print("initial start of " + nebula_app["app_name"] + " app")
263-
start_containers(nebula_app)
264-
print("completed initial start of " + nebula_app["app_name"] + " app")
265-
266-
# open a thread which is in charge of restarting any containers whose healthcheck shows them as unhealthy
267-
print("starting work container health checking thread")
268-
Thread(target=restart_unhealthy_containers).start()
269-
270-
# loop forever
271-
print("starting device_group " + device_group + " /info check loop, configured to check for changes every "
272-
+ str(nebula_manager_check_in_time) + " seconds")
273-
while True:
274-
275-
# wait the configurable time before checking the device_group info page again
276-
time.sleep(nebula_manager_check_in_time)
277-
278-
monotonic_id_increase = False
279-
280-
# get the device_group configuration
281-
remote_device_group_info = get_device_group_info(nebula_connection, device_group)
282-
283-
# logic that checks whether each app_id was increased and updates the app containers if the answer is yes
284-
# the logic also starts containers of newly added apps to the device_group
285-
for remote_nebula_app in remote_device_group_info["reply"]["apps"]:
286-
if remote_nebula_app["app_name"] in local_device_group_info["reply"]["apps_list"]:
287-
local_app_index = local_device_group_info["reply"]["apps_list"].index(remote_nebula_app["app_name"])
288-
if remote_nebula_app["app_id"] > local_device_group_info["reply"]["apps"][local_app_index]["app_id"]:
278+
# wait the configurable time before checking the device_group info page again
279+
time.sleep(nebula_manager_check_in_time)
280+
281+
monotonic_id_increase = False
282+
283+
# get the device_group configuration
284+
remote_device_group_info = get_device_group_info(nebula_connection, device_group)
285+
286+
# logic that checks whether each app_id was increased and updates the app containers if the answer is yes
287+
# the logic also starts containers of newly added apps to the device_group
288+
for remote_nebula_app in remote_device_group_info["reply"]["apps"]:
289+
if remote_nebula_app["app_name"] in local_device_group_info["reply"]["apps_list"]:
290+
local_app_index = local_device_group_info["reply"]["apps_list"].index(remote_nebula_app["app_name"])
291+
if remote_nebula_app["app_id"] > local_device_group_info["reply"]["apps"][local_app_index]["app_id"]:
292+
monotonic_id_increase = True
293+
if remote_nebula_app["running"] is False:
294+
print("stopping app " + remote_nebula_app["app_name"] +
295+
" do to changes in the app configuration")
296+
stop_containers(remote_nebula_app)
297+
elif remote_nebula_app["rolling_restart"] is True and \
298+
local_device_group_info["reply"]["apps"][local_app_index]["running"] is True:
299+
print("rolling app " + remote_nebula_app["app_name"] +
300+
" do to changes in the app configuration")
301+
roll_containers(remote_nebula_app)
302+
else:
303+
print("restarting app " + remote_nebula_app["app_name"] +
304+
" do to changes in the app configuration")
305+
restart_containers(remote_nebula_app)
306+
else:
307+
print("restarting app " + remote_nebula_app["app_name"] + " do to changes in the app configuration")
289308
monotonic_id_increase = True
290-
if remote_nebula_app["running"] is False:
291-
print("stopping app " + remote_nebula_app["app_name"] +
292-
" do to changes in the app configuration")
293-
stop_containers(remote_nebula_app)
294-
elif remote_nebula_app["rolling_restart"] is True and \
295-
local_device_group_info["reply"]["apps"][local_app_index]["running"] is True:
296-
print("rolling app " + remote_nebula_app["app_name"] +
297-
" do to changes in the app configuration")
298-
roll_containers(remote_nebula_app)
299-
else:
300-
print("restarting app " + remote_nebula_app["app_name"] +
309+
restart_containers(remote_nebula_app)
310+
311+
# logic that removes containers of apps that were removed from the device_group
312+
if remote_device_group_info["reply"]["device_group_id"] > local_device_group_info["reply"]["device_group_id"]:
313+
monotonic_id_increase = True
314+
for local_nebula_app in local_device_group_info["reply"]["apps"]:
315+
if local_nebula_app["app_name"] not in remote_device_group_info["reply"]["apps_list"]:
316+
print("removing app " + local_nebula_app["app_name"] +
301317
" do to changes in the app configuration")
302-
restart_containers(remote_nebula_app)
303-
else:
304-
print("restarting app " + remote_nebula_app["app_name"] + " do to changes in the app configuration")
318+
stop_containers(local_nebula_app)
319+
320+
# logic that runs image pruning if prune_id increased
321+
if remote_device_group_info["reply"]["prune_id"] > local_device_group_info["reply"]["prune_id"]:
322+
print("pruning images do to changes in the app configuration")
305323
monotonic_id_increase = True
306-
restart_containers(remote_nebula_app)
307-
308-
# logic that removes containers of apps that were removed from the device_group
309-
if remote_device_group_info["reply"]["device_group_id"] > local_device_group_info["reply"]["device_group_id"]:
310-
monotonic_id_increase = True
311-
for local_nebula_app in local_device_group_info["reply"]["apps"]:
312-
if local_nebula_app["app_name"] not in remote_device_group_info["reply"]["apps_list"]:
313-
print("removing app " + local_nebula_app["app_name"] + " do to changes in the app configuration")
314-
stop_containers(local_nebula_app)
315-
316-
# logic that runs image pruning if prune_id increased
317-
if remote_device_group_info["reply"]["prune_id"] > local_device_group_info["reply"]["prune_id"]:
318-
print("pruning images do to changes in the app configuration")
319-
monotonic_id_increase = True
320-
prune_images()
321-
322-
# set the in memory device_group info to be the one recently received if any id increased
323-
if monotonic_id_increase is True:
324-
local_device_group_info = remote_device_group_info
324+
prune_images()
325+
326+
# set the in memory device_group info to be the one recently received if any id increased
327+
if monotonic_id_increase is True:
328+
local_device_group_info = remote_device_group_info
329+
except Exception as e:
330+
print >> sys.stderr, e
331+
print("failed main loop - exiting")
332+
os._exit(2)

0 commit comments

Comments
 (0)