Skip to content

Commit 60fae9e

Browse files
committed
Refactor dask config checking
1 parent a3a4de7 commit 60fae9e

File tree

2 files changed

+187
-92
lines changed

2 files changed

+187
-92
lines changed

src/lsst/rsp/startup/services/labrunner/labrunner.py

Lines changed: 180 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -539,124 +539,212 @@ def _merge_pgpass(self) -> None:
539539
def _setup_dask(self) -> None:
540540
self._logger.debug("Setting up dask dashboard proxy information")
541541
cfgdir = self._home / ".config" / "dask"
542-
replaced = False
542+
good_dashboard_config = False
543543
if cfgdir.is_dir():
544-
replaced = self._replace_outdated_proxy(cfgdir)
545-
if not replaced:
544+
good_dashboard_config = self._tidy_extant_config(cfgdir)
545+
# If we found and replaced the dashboard config, or if it was
546+
# already correct, we do not need to write a new file.
547+
#
548+
# If there is no config dir, there's nothing to tidy.
549+
if not good_dashboard_config:
550+
# We need to write a new file with the correct config.
546551
self._inject_new_proxy(cfgdir / "dashboard.yaml")
547552

548-
def _replace_outdated_proxy(self, cfgdir: Path) -> bool:
553+
def _tidy_extant_config(self, cfgdir: Path) -> bool:
554+
#
555+
# This is the controversial method. We have had (at least) four
556+
# regimes of dask usage in the RSP.
557+
#
558+
# 1) Back in the mists of time (2018-ish), dask was present, and
559+
# all configuration was left to the user.
560+
# 2) For a while in 2019-2021-ish, we had a pretty sophisticated
561+
# system that allowed users to spawn whole additional pods, and we used
562+
# this for a really cool demo with Gaia DR1 data. But then we moved to
563+
# the Nublado controller from KubeSpawner, and that no longer worked...
564+
# but we didn't do anything about the user config, so users had broken
565+
# config left over.
566+
# 3) from 2022-ish-to-2025 dask was not present. The broken config
567+
# thus didn't cause any harm.
568+
# 4) in 2025, we added lsdb to the RSP. lsdb relies on dask. Suddenly
569+
# the abandoned config could cause harm, and without config, the wrong
570+
# dashboard information is presented to the user, which makes the lsdb
571+
# tutorial for Rubin DP1 data needlessly confusing.
572+
#
573+
# This is an attempt to clean that mess up.
574+
#
575+
# First we check for any files that don't do anything. We know the
576+
# config will be YAML (dask config can also be JSON, but the RSP
577+
# machinery never wrote any such files, so we assume any JSON is
578+
# user-generated and not directly our problem), and those files will
579+
# be named with "yaml" or "yml" suffixes (both exist in extant user
580+
# config) per https://github.com/dask/dask/blob/main/dask/config.py .
581+
#
582+
# "Don't do anything" means that when deserialized to a Python object,
583+
# that object is None or empty, or it's a dictionary that contains only
584+
# empty objects as its leaves. We move these files aside, with a date
585+
# suffix so that dask will no longer try to load them.
586+
#
587+
# Second, assuming the file survived that process, we check
588+
# specifically for the dashboard link, and correct it from its old,
589+
# non-user-domain-aware form, to a form that will be correct whether or
590+
# not user domains are enabled. We save the original file with a date
591+
# suffix; again, dask will no longer try to load it.
592+
#
593+
# Other settings should stay the same; this may mean that the user has
594+
# settings for in-cluster kubernetes-driven workers, and those will
595+
# fail to spawn, but we haven't yet figured out how to safely remove
596+
# that configuration.
597+
#
598+
# If, after doing all of this, at least one file contains the correct
599+
# dashboard config, return True. Otherwise, return False.
600+
549601
retval = False
602+
550603
for suffix in ("yaml", "yml"):
551-
try:
552-
files = cfgdir.glob(f"*.{suffix}")
604+
files = cfgdir.glob(f"*.{suffix}")
605+
if files:
553606
for fl in files:
554-
obj = yaml.safe_load(fl.read_text())
555-
flensed = self._flense_dict(obj) if obj else None
556-
if not flensed:
557-
self._logger.warning(
558-
f"{fl} is empty after flensing; removing"
559-
)
560-
fl.unlink()
561-
continue
562-
563-
# Look for "distributed.dashboard.link"
564-
# It may have an older, non-user-domain-aware link
565-
# in it, and if so, then we need to replace it
566-
# with the newer, user-domain-aware one.
567-
#
568-
# Dask does the template-from-environment substitution
569-
# so these are just strings. The point is that "old"
570-
# is not correct in a user-domain-aware world, but
571-
# "new" works in either case (and also is something
572-
# JupyterHub gives us for free, and does not rely on our
573-
# very-RSP-specific-and-going-away-with-service-discovery
574-
# EXTERNAL_INSTANCE_URL variable).
575-
576-
old = "{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}"
577-
new = "{JUPYTERHUB_PUBLIC_URL}"
578-
try:
579-
val = flensed["distributed"]["dashboard"]["link"]
580-
if not isinstance(val, str):
581-
self._logger.warning(
582-
"distributed.dashboard.link is not a string"
583-
)
584-
continue
585-
except KeyError:
586-
# We don't have the structure
587-
self._logger.debug(
588-
f"{fl!s} does not contain"
589-
" distributed.dashboard.link"
590-
)
591-
continue
592-
if val.find(old) < 0:
593-
# The structure is there but doesn't have the
594-
# old-style link.
595-
self._logger.debug(f"{val} does not contain {old}")
596-
continue
597-
newval = val.replace(old, new)
598-
flensed["distributed"]["dashboard"]["link"] = newval
599-
self._logger.info(f"Replaced link in {fl!s}: {old}->{new}")
600-
fl.write_text(yaml.dump(flensed, default_flow_style=False))
601-
retval = True
602-
except Exception:
603-
if fl:
604-
self._logger.exception(f"Failed to read/write {fl!s}")
605-
else:
606-
self._logger.exception(f"Failed to load yaml from {files}")
607+
today = (
608+
datetime.datetime.now(tz=datetime.UTC)
609+
.date()
610+
.isoformat()
611+
)
612+
bk = Path(f"{fl!s}.{today}")
613+
newcfg = self._clean_empty_config(fl, bk)
614+
if not newcfg:
615+
continue # next file
616+
retval = self._fix_dashboard(newcfg, fl, bk)
607617
return retval
608618

619+
def _clean_empty_config(self, fl: Path, bk: Path) -> dict[str, Any] | None:
620+
# returns the deserialized yaml object if 1) it was deserializable
621+
# in the first place, and 2) it survived flensing.
622+
try:
623+
obj = yaml.safe_load(fl.read_text())
624+
except Exception:
625+
self._logger.exception(
626+
f"Failed to deserialize {fl!s} as yaml; moving to {bk}"
627+
)
628+
obj = None
629+
flensed = self._flense_dict(obj) if obj else None
630+
if not flensed:
631+
self._logger.warning(
632+
f"{fl} is empty after flensing; moving to {bk}"
633+
)
634+
shutil.move(fl, bk)
635+
return None
636+
# It's legal YAML and it's not empty
637+
return flensed
638+
639+
def _fix_dashboard(self, cfg: dict[str, Any], fl: Path, bk: Path) -> bool:
640+
# Look for "distributed.dashboard.link".
641+
# It may have an older, non-user-domain-aware link in it,
642+
# and if so, then we need to replace it with the newer,
643+
# user-domain-aware one.
644+
645+
# Dask does the template-from-environment substitution so these are
646+
# just strings. The point is that "old" is not correct in a
647+
# user-domain-aware world, but "new" works in either case (and also
648+
# is something JupyterHub gives us for free, and does not rely on our
649+
# very-RSP-specific-and-going-away-with-service-discovery
650+
# EXTERNAL_INSTANCE_URL variable).
651+
652+
# We return True if the deserialized contents of the file named by fl
653+
# (which will be passed to us as cfg) is a dashboard config with the
654+
# new template (whether initially or after correction) and False
655+
# otherwise.
656+
657+
old = "{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}"
658+
new = "{JUPYTERHUB_PUBLIC_URL}"
659+
660+
try:
661+
val = cfg["distributed"]["dashboard"]["link"]
662+
if not isinstance(val, str):
663+
# Pretty sure this is an error, but leave it as the user's
664+
# problem.
665+
self._logger.warning(
666+
"distributed.dashboard.link is not a string"
667+
)
668+
return False
669+
except KeyError:
670+
# We don't have the structure. This file is not our problem.
671+
self._logger.debug(
672+
f"{fl!s} does not contain `distributed.dashboard.link`"
673+
)
674+
return False
675+
if val.find(new) > -1:
676+
# The structure is there and is already correct.
677+
# Return True and don't update anything.
678+
return True
679+
if val.find(old) < 0:
680+
# The structure is there but doesn't have the old-style link.
681+
# Assume, again, that's intentional.
682+
self._logger.debug(f"{val} does not contain {old}")
683+
return False
684+
685+
# At this point, we have found distributed.dashboard.link.
686+
# It is a string, and it contains the old-style template so we want
687+
# to copy the original file to something without a yaml/yml suffix,
688+
# and replace the contents of the file with the old data but the
689+
# corrected link.
690+
try:
691+
# Make a backup.
692+
shutil.copy2(fl, bk)
693+
except Exception:
694+
self._logger.exception(f"Failed to back up {fl!s} to {bk!s}")
695+
return False
696+
newval = val.replace(old, new)
697+
if newval == val:
698+
self._logger.warning(
699+
f"Replacing '{old}' with '{new}' in '{val}' had no effect"
700+
)
701+
return False
702+
cfg["distributed"]["dashboard"]["link"] = newval
703+
self._logger.info(f"Replaced link in {fl!s}: {old}->{new}")
704+
try:
705+
fl.write_text(yaml.dump(cfg, default_flow_style=False))
706+
except Exception:
707+
self._logger.exception(f"Failed to write '{cfg}' to {fl!s}")
708+
return False
709+
return True
710+
609711
def _inject_new_proxy(self, tgt: Path) -> None:
610712
# Conventional for RSP.
611713
parent = tgt.parent
612714
try:
613715
parent.mkdir(parents=True, exist_ok=True)
614716
except FileExistsError:
615717
self._logger.exception(
616-
f"{tgt!s} exists and is not a directory; aborting"
718+
f"{parent!s} exists and is not a directory; aborting"
617719
)
618720
return
619721
newlink = "{JUPYTERHUB_PUBLIC_URL}proxy/{port}/status"
620722
goodlink = {"distributed": {"dashboard": {"link": newlink}}}
621723
if tgt.exists():
622724
try:
623-
obj = yaml.safe_load(tgt.read_text())
624-
newobj = self._replace_proxy_link(obj, goodlink)
625-
if newobj:
626-
tgt.write_text(yaml.dump(newobj, default_flow_style=False))
725+
obj = self._flense_dict(yaml.safe_load(tgt.read_text()))
726+
if obj is None:
727+
obj = {}
728+
# We'll turn it into an empty dict and get it in the
729+
# update. Why was there an empty dashboard.yaml? Weird.
730+
elif obj == goodlink:
731+
# This is the expected case. There's only one entry in
732+
# the target dashboard.yaml, and it's already correct.
733+
return
627734
else:
628-
self._logger.warning(f"Removing {tgt!s} because empty")
629-
tgt.unlink()
735+
self._logger.warning(
736+
f"{tgt!s} exists; contains '{obj}'"
737+
f" not just '{goodlink}'"
738+
)
739+
obj.update(goodlink)
630740
except Exception:
631741
self._logger.exception(f"Failed to load {tgt!s}")
742+
else:
743+
obj = goodlink
632744
try:
633-
tgt.write_text(yaml.dump(goodlink, default_flow_style=False))
634-
except Exception:
635-
self._logger.exception(f"Failed to write {tgt!s}")
636-
637-
def _replace_proxy_link(
638-
self,
639-
obj: dict[str, Any] | None,
640-
goodlink: dict[str, dict[str, dict[str, str]]],
641-
) -> dict[str, Any]:
642-
flensed = self._flense_dict(obj)
643-
if flensed is None:
644-
flensed = goodlink
645-
newlink = goodlink["distributed"]["dashboard"]["link"]
646-
try:
647-
current = flensed["distributed"]["dashboard"]["link"]
648-
if current == newlink:
649-
self._logger.debug("dask dashboard link is already correct")
650-
return flensed # Nothing to do.
651-
self._logger.warning(
652-
f"Current link is {current}; replacing with {newlink}"
653-
)
654-
except KeyError:
655-
pass # Setting doesn't currently exist.
745+
tgt.write_text(yaml.dump(obj, default_flow_style=False))
656746
except Exception:
657-
self._logger.exception(f"Failed to parse {flensed}")
658-
flensed.update(goodlink)
659-
return flensed
747+
self._logger.exception(f"Failed to write '{obj}' to {tgt!s}")
660748

661749
def _flense_dict(
662750
self, obj: dict[str, Any] | None

tests/startup_test.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ def test_dask_config() -> None:
310310

311311
fl_file = dask_dir / "flense.yaml"
312312
assert not fl_file.exists()
313+
313314
fl_file.write_text(yaml.dump(nullobj, default_flow_style=False))
314315
assert fl_file.exists()
315316

@@ -327,6 +328,12 @@ def test_dask_config() -> None:
327328
assert not cm_file.exists()
328329
assert def_file.exists()
329330

331+
# Test that we created a backup of the null file and the commentary
332+
fl_bk = dask_dir.glob("flense.yaml.*")
333+
assert len(list(fl_bk)) == 1
334+
cm_bk = dask_dir.glob("Comment.yaml.*")
335+
assert len(list(cm_bk)) == 1
336+
330337

331338
@pytest.mark.usefixtures("_rsp_env")
332339
def test_copy_logging_profile(monkeypatch: pytest.MonkeyPatch) -> None:

0 commit comments

Comments
 (0)