|
16 | 16 | from urllib.parse import parse_qsl, urlparse
|
17 | 17 |
|
18 | 18 | import structlog
|
| 19 | +import yaml |
19 | 20 |
|
20 | 21 | from .... import get_access_token, get_digest
|
21 | 22 | from ....utils import get_jupyterlab_config_dir, get_runtime_mounts_dir
|
@@ -468,6 +469,7 @@ def _try_emergency_cleanup(self) -> None:
|
468 | 469 | def _copy_files_to_user_homedir(self) -> None:
|
469 | 470 | self._logger.debug("Copying files to user home directory")
|
470 | 471 | self._copy_butler_credentials()
|
| 472 | + self._setup_dask() |
471 | 473 | self._copy_logging_profile()
|
472 | 474 | self._copy_dircolors()
|
473 | 475 | self._copy_etc_skel()
|
@@ -534,6 +536,237 @@ def _merge_pgpass(self) -> None:
|
534 | 536 | for connection in config:
|
535 | 537 | f.write(f"{connection}:{config[connection]}\n")
|
536 | 538 |
|
| 539 | + def _setup_dask(self) -> None: |
| 540 | + self._logger.debug("Setting up dask dashboard proxy information") |
| 541 | + cfgdir = self._home / ".config" / "dask" |
| 542 | + good_dashboard_config = False |
| 543 | + if cfgdir.is_dir(): |
| 544 | + good_dashboard_config = self._tidy_extant_config(cfgdir) |
| 545 | + # If we found and replaced the dashboard config, or if it was |
| 546 | + # already correct, we do not need to write a new file. |
| 547 | + # |
| 548 | + # If there is no config dir, there's nothing to tidy. |
| 549 | + if not good_dashboard_config: |
| 550 | + # We need to write a new file with the correct config. |
| 551 | + self._inject_new_proxy(cfgdir / "dashboard.yaml") |
| 552 | + |
| 553 | + def _tidy_extant_config(self, cfgdir: Path) -> bool: |
| 554 | + # |
| 555 | + # This is the controversial method. We have had (at least) four |
| 556 | + # regimes of dask usage in the RSP. |
| 557 | + # |
| 558 | + # 1) Back in the mists of time (2018-ish), dask was present, and |
| 559 | + # all configuration was left to the user. |
| 560 | + # 2) For a while in 2019-2021-ish, we had a pretty sophisticated |
| 561 | + # system that allowed users to spawn whole additional pods, and we used |
| 562 | + # this for a really cool demo with Gaia DR1 data. But then we moved to |
| 563 | + # the Nublado controller from KubeSpawner, and that no longer worked... |
| 564 | + # but we didn't do anything about the user config, so users had broken |
| 565 | + # config left over. |
| 566 | + # 3) from 2022-ish-to-2025 dask was not present. The broken config |
| 567 | + # thus didn't cause any harm. |
| 568 | + # 4) in 2025, we added lsdb to the RSP. lsdb relies on dask. Suddenly |
| 569 | + # the abandoned config could cause harm, and without config, the wrong |
| 570 | + # dashboard information is presented to the user, which makes the lsdb |
| 571 | + # tutorial for Rubin DP1 data needlessly confusing. |
| 572 | + # |
| 573 | + # This is an attempt to clean that mess up. |
| 574 | + # |
| 575 | + # First we check for any files that don't do anything. We know the |
| 576 | + # config will be YAML (dask config can also be JSON, but the RSP |
| 577 | + # machinery never wrote any such files, so we assume any JSON is |
| 578 | + # user-generated and not directly our problem), and those files will |
| 579 | + # be named with "yaml" or "yml" suffixes (both exist in extant user |
| 580 | + # config) per https://github.com/dask/dask/blob/main/dask/config.py . |
| 581 | + # |
| 582 | + # "Don't do anything" means that when deserialized to a Python object, |
| 583 | + # that object is None or empty, or it's a dictionary that contains only |
| 584 | + # empty objects as its leaves. We move these files aside, with a date |
| 585 | + # suffix so that dask will no longer try to load them. |
| 586 | + # |
| 587 | + # Second, assuming the file survived that process, we check |
| 588 | + # specifically for the dashboard link, and correct it from its old, |
| 589 | + # non-user-domain-aware form, to a form that will be correct whether or |
| 590 | + # not user domains are enabled. We save the original file with a date |
| 591 | + # suffix; again, dask will no longer try to load it. |
| 592 | + # |
| 593 | + # Other settings should stay the same; this may mean that the user has |
| 594 | + # settings for in-cluster kubernetes-driven workers, and those will |
| 595 | + # fail to spawn, but we haven't yet figured out how to safely remove |
| 596 | + # that configuration. |
| 597 | + # |
| 598 | + # If, after doing all of this, at least one file contains the correct |
| 599 | + # dashboard config, return True. Otherwise, return False. |
| 600 | + |
| 601 | + retval = False |
| 602 | + |
| 603 | + for suffix in ("yaml", "yml"): |
| 604 | + files = list(cfgdir.glob(f"*.{suffix}")) |
| 605 | + if files: |
| 606 | + for fl in files: |
| 607 | + today = ( |
| 608 | + datetime.datetime.now(tz=datetime.UTC) |
| 609 | + .date() |
| 610 | + .isoformat() |
| 611 | + ) |
| 612 | + bk = Path(f"{fl!s}.{today}") |
| 613 | + newcfg = self._clean_empty_config(fl, bk) |
| 614 | + if not newcfg: |
| 615 | + continue # next file |
| 616 | + retval = self._fix_dashboard(newcfg, fl, bk) |
| 617 | + return retval |
| 618 | + |
| 619 | + def _clean_empty_config(self, fl: Path, bk: Path) -> dict[str, Any] | None: |
| 620 | + # returns the deserialized yaml object if 1) it was deserializable |
| 621 | + # in the first place, and 2) it survived flensing. |
| 622 | + try: |
| 623 | + obj = yaml.safe_load(fl.read_text()) |
| 624 | + except Exception: |
| 625 | + self._logger.exception( |
| 626 | + f"Failed to deserialize {fl!s} as yaml; moving to {bk}" |
| 627 | + ) |
| 628 | + obj = None |
| 629 | + flensed = self._flense_dict(obj) if obj else None |
| 630 | + if not flensed: |
| 631 | + self._logger.warning( |
| 632 | + f"{fl} is empty after flensing; moving to {bk}" |
| 633 | + ) |
| 634 | + shutil.move(fl, bk) |
| 635 | + return None |
| 636 | + # It's legal YAML and it's not empty |
| 637 | + return flensed |
| 638 | + |
| 639 | + def _fix_dashboard(self, cfg: dict[str, Any], fl: Path, bk: Path) -> bool: |
| 640 | + # Look for "distributed.dashboard.link". |
| 641 | + # It may have an older, non-user-domain-aware link in it, |
| 642 | + # and if so, then we need to replace it with the newer, |
| 643 | + # user-domain-aware one. |
| 644 | + |
| 645 | + # Dask does the template-from-environment substitution so these are |
| 646 | + # just strings. The point is that "old" is not correct in a |
| 647 | + # user-domain-aware world, but "new" works in either case (and also |
| 648 | + # is something JupyterHub gives us for free, and does not rely on our |
| 649 | + # very-RSP-specific-and-going-away-with-service-discovery |
| 650 | + # EXTERNAL_INSTANCE_URL variable). |
| 651 | + |
| 652 | + # We return True if the deserialized contents of the file named by fl |
| 653 | + # (which will be passed to us as cfg) is a dashboard config with the |
| 654 | + # new template (whether initially or after correction) and False |
| 655 | + # otherwise. |
| 656 | + |
| 657 | + old = "{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}" |
| 658 | + new = "{JUPYTERHUB_PUBLIC_URL}" |
| 659 | + |
| 660 | + try: |
| 661 | + val = cfg["distributed"]["dashboard"]["link"] |
| 662 | + if not isinstance(val, str): |
| 663 | + # Pretty sure this is an error, but leave it as the user's |
| 664 | + # problem. |
| 665 | + self._logger.warning( |
| 666 | + "distributed.dashboard.link is not a string" |
| 667 | + ) |
| 668 | + return False |
| 669 | + except KeyError: |
| 670 | + # We don't have the structure. This file is not our problem. |
| 671 | + self._logger.debug( |
| 672 | + f"{fl!s} does not contain `distributed.dashboard.link`" |
| 673 | + ) |
| 674 | + return False |
| 675 | + if val.find(new) > -1: |
| 676 | + # The structure is there and is already correct. |
| 677 | + # Return True and don't update anything. |
| 678 | + return True |
| 679 | + if val.find(old) < 0: |
| 680 | + # The structure is there but doesn't have the old-style link. |
| 681 | + # Assume, again, that's intentional. |
| 682 | + self._logger.debug(f"{val} does not contain {old}") |
| 683 | + return False |
| 684 | + |
| 685 | + # At this point, we have found distributed.dashboard.link. |
| 686 | + # It is a string, and it contains the old-style template so we want |
| 687 | + # to copy the original file to something without a yaml/yml suffix, |
| 688 | + # and replace the contents of the file with the old data but the |
| 689 | + # corrected link. |
| 690 | + try: |
| 691 | + # Make a backup. |
| 692 | + shutil.copy2(fl, bk) |
| 693 | + except Exception: |
| 694 | + self._logger.exception(f"Failed to back up {fl!s} to {bk!s}") |
| 695 | + return False |
| 696 | + newval = val.replace(old, new) |
| 697 | + if newval == val: |
| 698 | + self._logger.warning( |
| 699 | + f"Replacing '{old}' with '{new}' in '{val}' had no effect" |
| 700 | + ) |
| 701 | + return False |
| 702 | + cfg["distributed"]["dashboard"]["link"] = newval |
| 703 | + self._logger.info(f"Replaced link in {fl!s}: {old}->{new}") |
| 704 | + try: |
| 705 | + fl.write_text(yaml.dump(cfg, default_flow_style=False)) |
| 706 | + except Exception: |
| 707 | + self._logger.exception(f"Failed to write '{cfg}' to {fl!s}") |
| 708 | + return False |
| 709 | + return True |
| 710 | + |
| 711 | + def _inject_new_proxy(self, tgt: Path) -> None: |
| 712 | + # Conventional for RSP. |
| 713 | + parent = tgt.parent |
| 714 | + try: |
| 715 | + parent.mkdir(parents=True, exist_ok=True) |
| 716 | + except FileExistsError: |
| 717 | + self._logger.exception( |
| 718 | + f"{parent!s} exists and is not a directory; aborting" |
| 719 | + ) |
| 720 | + return |
| 721 | + newlink = "{JUPYTERHUB_PUBLIC_URL}proxy/{port}/status" |
| 722 | + goodlink = {"distributed": {"dashboard": {"link": newlink}}} |
| 723 | + if tgt.exists(): |
| 724 | + try: |
| 725 | + obj = self._flense_dict(yaml.safe_load(tgt.read_text())) |
| 726 | + if obj is None: |
| 727 | + obj = {} |
| 728 | + # We'll turn it into an empty dict and get it in the |
| 729 | + # update. Why was there an empty dashboard.yaml? Weird. |
| 730 | + elif obj == goodlink: |
| 731 | + # This is the expected case. There's only one entry in |
| 732 | + # the target dashboard.yaml, and it's already correct. |
| 733 | + return |
| 734 | + else: |
| 735 | + self._logger.warning( |
| 736 | + f"{tgt!s} exists; contains '{obj}'" |
| 737 | + f" not just '{goodlink}'" |
| 738 | + ) |
| 739 | + obj.update(goodlink) |
| 740 | + except Exception: |
| 741 | + self._logger.exception(f"Failed to load {tgt!s}") |
| 742 | + else: |
| 743 | + obj = goodlink |
| 744 | + try: |
| 745 | + tgt.write_text(yaml.dump(obj, default_flow_style=False)) |
| 746 | + except Exception: |
| 747 | + self._logger.exception(f"Failed to write '{obj}' to {tgt!s}") |
| 748 | + |
| 749 | + def _flense_dict( |
| 750 | + self, obj: dict[str, Any] | None |
| 751 | + ) -> dict[str, Any] | None: |
| 752 | + """Recursively walk a dict; any place a null value is found, it |
| 753 | + and its corresponding key are removed. |
| 754 | + """ |
| 755 | + if not obj: |
| 756 | + return None |
| 757 | + retval: dict[str, Any] = {} |
| 758 | + for key, val in obj.items(): |
| 759 | + if val is None: |
| 760 | + continue |
| 761 | + if not isinstance(val, dict): |
| 762 | + retval[key] = val |
| 763 | + continue |
| 764 | + flensed = self._flense_dict(val) |
| 765 | + if flensed is None: |
| 766 | + continue |
| 767 | + retval[key] = flensed |
| 768 | + return retval if retval else None |
| 769 | + |
537 | 770 | def _copy_logging_profile(self) -> None:
|
538 | 771 | self._logger.debug("Copying logging profile if needed")
|
539 | 772 | user_profile = (
|
|
0 commit comments