Skip to content

Commit 2e49301

Browse files
authored
Merge pull request #94 from lsst-sqre/tickets/DM-52022
tickets/DM-52022: fix dask dashboard config
2 parents 16802e5 + 3821d75 commit 2e49301

File tree

4 files changed

+307
-0
lines changed

4 files changed

+307
-0
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<!-- Delete the sections that don't apply -->
2+
3+
### New features
4+
5+
- We now attempt to detect leftover, no longer correct dask config and correct it so the dask dashboard doesn't present incorrect information.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ provision-landing-page = "lsst.rsp.startup.cli:provision_landing_page"
4545
dev = [
4646
# Typing
4747
"types-deprecated",
48+
"types-pyyaml",
4849
"types-requests",
4950
# Testing
5051
"coverage[toml]",

src/lsst/rsp/startup/services/labrunner/labrunner.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from urllib.parse import parse_qsl, urlparse
1717

1818
import structlog
19+
import yaml
1920

2021
from .... import get_access_token, get_digest
2122
from ....utils import get_jupyterlab_config_dir, get_runtime_mounts_dir
@@ -468,6 +469,7 @@ def _try_emergency_cleanup(self) -> None:
468469
def _copy_files_to_user_homedir(self) -> None:
469470
self._logger.debug("Copying files to user home directory")
470471
self._copy_butler_credentials()
472+
self._setup_dask()
471473
self._copy_logging_profile()
472474
self._copy_dircolors()
473475
self._copy_etc_skel()
@@ -534,6 +536,237 @@ def _merge_pgpass(self) -> None:
534536
for connection in config:
535537
f.write(f"{connection}:{config[connection]}\n")
536538

539+
def _setup_dask(self) -> None:
540+
self._logger.debug("Setting up dask dashboard proxy information")
541+
cfgdir = self._home / ".config" / "dask"
542+
good_dashboard_config = False
543+
if cfgdir.is_dir():
544+
good_dashboard_config = self._tidy_extant_config(cfgdir)
545+
# If we found and replaced the dashboard config, or if it was
546+
# already correct, we do not need to write a new file.
547+
#
548+
# If there is no config dir, there's nothing to tidy.
549+
if not good_dashboard_config:
550+
# We need to write a new file with the correct config.
551+
self._inject_new_proxy(cfgdir / "dashboard.yaml")
552+
553+
def _tidy_extant_config(self, cfgdir: Path) -> bool:
554+
#
555+
# This is the controversial method. We have had (at least) four
556+
# regimes of dask usage in the RSP.
557+
#
558+
# 1) Back in the mists of time (2018-ish), dask was present, and
559+
# all configuration was left to the user.
560+
# 2) For a while in 2019-2021-ish, we had a pretty sophisticated
561+
# system that allowed users to spawn whole additional pods, and we used
562+
# this for a really cool demo with Gaia DR1 data. But then we moved to
563+
# the Nublado controller from KubeSpawner, and that no longer worked...
564+
# but we didn't do anything about the user config, so users had broken
565+
# config left over.
566+
# 3) from 2022-ish-to-2025 dask was not present. The broken config
567+
# thus didn't cause any harm.
568+
# 4) in 2025, we added lsdb to the RSP. lsdb relies on dask. Suddenly
569+
# the abandoned config could cause harm, and without config, the wrong
570+
# dashboard information is presented to the user, which makes the lsdb
571+
# tutorial for Rubin DP1 data needlessly confusing.
572+
#
573+
# This is an attempt to clean that mess up.
574+
#
575+
# First we check for any files that don't do anything. We know the
576+
# config will be YAML (dask config can also be JSON, but the RSP
577+
# machinery never wrote any such files, so we assume any JSON is
578+
# user-generated and not directly our problem), and those files will
579+
# be named with "yaml" or "yml" suffixes (both exist in extant user
580+
# config) per https://github.com/dask/dask/blob/main/dask/config.py .
581+
#
582+
# "Don't do anything" means that when deserialized to a Python object,
583+
# that object is None or empty, or it's a dictionary that contains only
584+
# empty objects as its leaves. We move these files aside, with a date
585+
# suffix so that dask will no longer try to load them.
586+
#
587+
# Second, assuming the file survived that process, we check
588+
# specifically for the dashboard link, and correct it from its old,
589+
# non-user-domain-aware form, to a form that will be correct whether or
590+
# not user domains are enabled. We save the original file with a date
591+
# suffix; again, dask will no longer try to load it.
592+
#
593+
# Other settings should stay the same; this may mean that the user has
594+
# settings for in-cluster kubernetes-driven workers, and those will
595+
# fail to spawn, but we haven't yet figured out how to safely remove
596+
# that configuration.
597+
#
598+
# If, after doing all of this, at least one file contains the correct
599+
# dashboard config, return True. Otherwise, return False.
600+
601+
retval = False
602+
603+
for suffix in ("yaml", "yml"):
604+
files = list(cfgdir.glob(f"*.{suffix}"))
605+
if files:
606+
for fl in files:
607+
today = (
608+
datetime.datetime.now(tz=datetime.UTC)
609+
.date()
610+
.isoformat()
611+
)
612+
bk = Path(f"{fl!s}.{today}")
613+
newcfg = self._clean_empty_config(fl, bk)
614+
if not newcfg:
615+
continue # next file
616+
retval = self._fix_dashboard(newcfg, fl, bk)
617+
return retval
618+
619+
def _clean_empty_config(self, fl: Path, bk: Path) -> dict[str, Any] | None:
620+
# returns the deserialized yaml object if 1) it was deserializable
621+
# in the first place, and 2) it survived flensing.
622+
try:
623+
obj = yaml.safe_load(fl.read_text())
624+
except Exception:
625+
self._logger.exception(
626+
f"Failed to deserialize {fl!s} as yaml; moving to {bk}"
627+
)
628+
obj = None
629+
flensed = self._flense_dict(obj) if obj else None
630+
if not flensed:
631+
self._logger.warning(
632+
f"{fl} is empty after flensing; moving to {bk}"
633+
)
634+
shutil.move(fl, bk)
635+
return None
636+
# It's legal YAML and it's not empty
637+
return flensed
638+
639+
def _fix_dashboard(self, cfg: dict[str, Any], fl: Path, bk: Path) -> bool:
640+
# Look for "distributed.dashboard.link".
641+
# It may have an older, non-user-domain-aware link in it,
642+
# and if so, then we need to replace it with the newer,
643+
# user-domain-aware one.
644+
645+
# Dask does the template-from-environment substitution so these are
646+
# just strings. The point is that "old" is not correct in a
647+
# user-domain-aware world, but "new" works in either case (and also
648+
# is something JupyterHub gives us for free, and does not rely on our
649+
# very-RSP-specific-and-going-away-with-service-discovery
650+
# EXTERNAL_INSTANCE_URL variable).
651+
652+
# We return True if the deserialized contents of the file named by fl
653+
# (which will be passed to us as cfg) is a dashboard config with the
654+
# new template (whether initially or after correction) and False
655+
# otherwise.
656+
657+
old = "{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}"
658+
new = "{JUPYTERHUB_PUBLIC_URL}"
659+
660+
try:
661+
val = cfg["distributed"]["dashboard"]["link"]
662+
if not isinstance(val, str):
663+
# Pretty sure this is an error, but leave it as the user's
664+
# problem.
665+
self._logger.warning(
666+
"distributed.dashboard.link is not a string"
667+
)
668+
return False
669+
except KeyError:
670+
# We don't have the structure. This file is not our problem.
671+
self._logger.debug(
672+
f"{fl!s} does not contain `distributed.dashboard.link`"
673+
)
674+
return False
675+
if val.find(new) > -1:
676+
# The structure is there and is already correct.
677+
# Return True and don't update anything.
678+
return True
679+
if val.find(old) < 0:
680+
# The structure is there but doesn't have the old-style link.
681+
# Assume, again, that's intentional.
682+
self._logger.debug(f"{val} does not contain {old}")
683+
return False
684+
685+
# At this point, we have found distributed.dashboard.link.
686+
# It is a string, and it contains the old-style template so we want
687+
# to copy the original file to something without a yaml/yml suffix,
688+
# and replace the contents of the file with the old data but the
689+
# corrected link.
690+
try:
691+
# Make a backup.
692+
shutil.copy2(fl, bk)
693+
except Exception:
694+
self._logger.exception(f"Failed to back up {fl!s} to {bk!s}")
695+
return False
696+
newval = val.replace(old, new)
697+
if newval == val:
698+
self._logger.warning(
699+
f"Replacing '{old}' with '{new}' in '{val}' had no effect"
700+
)
701+
return False
702+
cfg["distributed"]["dashboard"]["link"] = newval
703+
self._logger.info(f"Replaced link in {fl!s}: {old}->{new}")
704+
try:
705+
fl.write_text(yaml.dump(cfg, default_flow_style=False))
706+
except Exception:
707+
self._logger.exception(f"Failed to write '{cfg}' to {fl!s}")
708+
return False
709+
return True
710+
711+
def _inject_new_proxy(self, tgt: Path) -> None:
712+
# Conventional for RSP.
713+
parent = tgt.parent
714+
try:
715+
parent.mkdir(parents=True, exist_ok=True)
716+
except FileExistsError:
717+
self._logger.exception(
718+
f"{parent!s} exists and is not a directory; aborting"
719+
)
720+
return
721+
newlink = "{JUPYTERHUB_PUBLIC_URL}proxy/{port}/status"
722+
goodlink = {"distributed": {"dashboard": {"link": newlink}}}
723+
if tgt.exists():
724+
try:
725+
obj = self._flense_dict(yaml.safe_load(tgt.read_text()))
726+
if obj is None:
727+
obj = {}
728+
# We'll turn it into an empty dict and get it in the
729+
# update. Why was there an empty dashboard.yaml? Weird.
730+
elif obj == goodlink:
731+
# This is the expected case. There's only one entry in
732+
# the target dashboard.yaml, and it's already correct.
733+
return
734+
else:
735+
self._logger.warning(
736+
f"{tgt!s} exists; contains '{obj}'"
737+
f" not just '{goodlink}'"
738+
)
739+
obj.update(goodlink)
740+
except Exception:
741+
self._logger.exception(f"Failed to load {tgt!s}")
742+
else:
743+
obj = goodlink
744+
try:
745+
tgt.write_text(yaml.dump(obj, default_flow_style=False))
746+
except Exception:
747+
self._logger.exception(f"Failed to write '{obj}' to {tgt!s}")
748+
749+
def _flense_dict(
750+
self, obj: dict[str, Any] | None
751+
) -> dict[str, Any] | None:
752+
"""Recursively walk a dict; any place a null value is found, it
753+
and its corresponding key are removed.
754+
"""
755+
if not obj:
756+
return None
757+
retval: dict[str, Any] = {}
758+
for key, val in obj.items():
759+
if val is None:
760+
continue
761+
if not isinstance(val, dict):
762+
retval[key] = val
763+
continue
764+
flensed = self._flense_dict(val)
765+
if flensed is None:
766+
continue
767+
retval[key] = flensed
768+
return retval if retval else None
769+
537770
def _copy_logging_profile(self) -> None:
538771
self._logger.debug("Copying logging profile if needed")
539772
user_profile = (

tests/startup_test.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pathlib import Path
99

1010
import pytest
11+
import yaml
1112

1213
import lsst.rsp
1314
from lsst.rsp.startup.services.labrunner import LabRunner
@@ -267,6 +268,73 @@ def test_copy_butler_credentials(monkeypatch: pytest.MonkeyPatch) -> None:
267268
assert cp["tertiary"]["aws_secret_access_key"] == "key03"
268269

269270

271+
@pytest.mark.usefixtures("_rsp_env")
272+
def test_dask_config() -> None:
273+
newlink = "{JUPYTERHUB_PUBLIC_URL}proxy/{port}/status"
274+
275+
# First, just see if we create the default proxy settings.
276+
lr = LabRunner()
277+
dask_dir = lr._home / ".config" / "dask"
278+
assert not dask_dir.exists()
279+
lr._setup_dask()
280+
assert dask_dir.exists()
281+
def_file = dask_dir / "dashboard.yaml"
282+
assert def_file.exists()
283+
obj = yaml.safe_load(def_file.read_text())
284+
assert obj["distributed"]["dashboard"]["link"] == newlink
285+
286+
def_file.unlink()
287+
288+
# Now test that we convert an old-style one to a user-domain config
289+
old_file = dask_dir / "lsst_dask.yml"
290+
assert not old_file.exists()
291+
292+
obj["distributed"]["dashboard"]["link"] = (
293+
"{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}proxy/{port}/status"
294+
)
295+
old_file.write_text(yaml.dump(obj, default_flow_style=False))
296+
297+
assert not def_file.exists()
298+
assert old_file.exists()
299+
300+
lr._setup_dask() # Should replace the text.
301+
obj = yaml.safe_load(old_file.read_text())
302+
assert obj["distributed"]["dashboard"]["link"] == newlink
303+
304+
old_file.unlink()
305+
assert not old_file.exists()
306+
307+
# Test that we remove empty dict keys
308+
nullobj = {"key1": {"key2": {"key3": None}}}
309+
assert lr._flense_dict(nullobj) is None
310+
311+
fl_file = dask_dir / "flense.yaml"
312+
assert not fl_file.exists()
313+
314+
fl_file.write_text(yaml.dump(nullobj, default_flow_style=False))
315+
assert fl_file.exists()
316+
317+
cm_file = dask_dir / "Comment.yaml"
318+
assert not cm_file.exists()
319+
cm_file.write_text("# Nothing but commentary\n")
320+
assert cm_file.exists()
321+
322+
assert not def_file.exists()
323+
324+
# This should create the defaults, and should remove the flensed
325+
# config and the only-comments file.
326+
lr._setup_dask()
327+
assert not fl_file.exists()
328+
assert not cm_file.exists()
329+
assert def_file.exists()
330+
331+
# Test that we created a backup of the null file and the commentary
332+
fl_bk = dask_dir.glob("flense.yaml.*")
333+
assert len(list(fl_bk)) == 1
334+
cm_bk = dask_dir.glob("Comment.yaml.*")
335+
assert len(list(cm_bk)) == 1
336+
337+
270338
@pytest.mark.usefixtures("_rsp_env")
271339
def test_copy_logging_profile(monkeypatch: pytest.MonkeyPatch) -> None:
272340
lr = LabRunner()

0 commit comments

Comments
 (0)