@@ -539,124 +539,212 @@ def _merge_pgpass(self) -> None:
539
539
def _setup_dask (self ) -> None :
540
540
self ._logger .debug ("Setting up dask dashboard proxy information" )
541
541
cfgdir = self ._home / ".config" / "dask"
542
- replaced = False
542
+ good_dashboard_config = False
543
543
if cfgdir .is_dir ():
544
- replaced = self ._replace_outdated_proxy (cfgdir )
545
- if not replaced :
544
+ good_dashboard_config = self ._tidy_extant_config (cfgdir )
545
+ # If we found and replaced the dashboard config, or if it was
546
+ # already correct, we do not need to write a new file.
547
+ #
548
+ # If there is no config dir, there's nothing to tidy.
549
+ if not good_dashboard_config :
550
+ # We need to write a new file with the correct config.
546
551
self ._inject_new_proxy (cfgdir / "dashboard.yaml" )
547
552
548
- def _replace_outdated_proxy (self , cfgdir : Path ) -> bool :
553
+ def _tidy_extant_config (self , cfgdir : Path ) -> bool :
554
+ #
555
+ # This is the controversial method. We have had (at least) four
556
+ # regimes of dask usage in the RSP.
557
+ #
558
+ # 1) Back in the mists of time (2018-ish), dask was present, and
559
+ # all configuration was left to the user.
560
+ # 2) For a while in 2019-2021-ish, we had a pretty sophisticated
561
+ # system that allowed users to spawn whole additional pods, and we used
562
+ # this for a really cool demo with Gaia DR1 data. But then we moved to
563
+ # the Nublado controller from KubeSpawner, and that no longer worked...
564
+ # but we didn't do anything about the user config, so users had broken
565
+ # config left over.
566
+ # 3) from 2022-ish-to-2025 dask was not present. The broken config
567
+ # thus didn't cause any harm.
568
+ # 4) in 2025, we added lsdb to the RSP. lsdb relies on dask. Suddenly
569
+ # the abandoned config could cause harm, and without config, the wrong
570
+ # dashboard information is presented to the user, which makes the lsdb
571
+ # tutorial for Rubin DP1 data needlessly confusing.
572
+ #
573
+ # This is an attempt to clean that mess up.
574
+ #
575
+ # First we check for any files that don't do anything. We know the
576
+ # config will be YAML (dask config can also be JSON, but the RSP
577
+ # machinery never wrote any such files, so we assume any JSON is
578
+ # user-generated and not directly our problem), and those files will
579
+ # be named with "yaml" or "yml" suffixes (both exist in extant user
580
+ # config) per https://github.com/dask/dask/blob/main/dask/config.py .
581
+ #
582
+ # "Don't do anything" means that when deserialized to a Python object,
583
+ # that object is None or empty, or it's a dictionary that contains only
584
+ # empty objects as its leaves. We move these files aside, with a date
585
+ # suffix so that dask will no longer try to load them.
586
+ #
587
+ # Second, assuming the file survived that process, we check
588
+ # specifically for the dashboard link, and correct it from its old,
589
+ # non-user-domain-aware form, to a form that will be correct whether or
590
+ # not user domains are enabled. We save the original file with a date
591
+ # suffix; again, dask will no longer try to load it.
592
+ #
593
+ # Other settings should stay the same; this may mean that the user has
594
+ # settings for in-cluster kubernetes-driven workers, and those will
595
+ # fail to spawn, but we haven't yet figured out how to safely remove
596
+ # that configuration.
597
+ #
598
+ # If, after doing all of this, at least one file contains the correct
599
+ # dashboard config, return True. Otherwise, return False.
600
+
549
601
retval = False
602
+
550
603
for suffix in ("yaml" , "yml" ):
551
- try :
552
- files = cfgdir . glob ( f"*. { suffix } " )
604
+ files = cfgdir . glob ( f"*. { suffix } " )
605
+ if files :
553
606
for fl in files :
554
- obj = yaml .safe_load (fl .read_text ())
555
- flensed = self ._flense_dict (obj ) if obj else None
556
- if not flensed :
557
- self ._logger .warning (
558
- f"{ fl } is empty after flensing; removing"
559
- )
560
- fl .unlink ()
561
- continue
562
-
563
- # Look for "distributed.dashboard.link"
564
- # It may have an older, non-user-domain-aware link
565
- # in it, and if so, then we need to replace it
566
- # with the newer, user-domain-aware one.
567
- #
568
- # Dask does the template-from-environment substitution
569
- # so these are just strings. The point is that "old"
570
- # is not correct in a user-domain-aware world, but
571
- # "new" works in either case (and also is something
572
- # JupyterHub gives us for free, and does not rely on our
573
- # very-RSP-specific-and-going-away-with-service-discovery
574
- # EXTERNAL_INSTANCE_URL variable).
575
-
576
- old = "{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}"
577
- new = "{JUPYTERHUB_PUBLIC_URL}"
578
- try :
579
- val = flensed ["distributed" ]["dashboard" ]["link" ]
580
- if not isinstance (val , str ):
581
- self ._logger .warning (
582
- "distributed.dashboard.link is not a string"
583
- )
584
- continue
585
- except KeyError :
586
- # We don't have the structure
587
- self ._logger .debug (
588
- f"{ fl !s} does not contain"
589
- " distributed.dashboard.link"
590
- )
591
- continue
592
- if val .find (old ) < 0 :
593
- # The structure is there but doesn't have the
594
- # old-style link.
595
- self ._logger .debug (f"{ val } does not contain { old } " )
596
- continue
597
- newval = val .replace (old , new )
598
- flensed ["distributed" ]["dashboard" ]["link" ] = newval
599
- self ._logger .info (f"Replaced link in { fl !s} : { old } ->{ new } " )
600
- fl .write_text (yaml .dump (flensed , default_flow_style = False ))
601
- retval = True
602
- except Exception :
603
- if fl :
604
- self ._logger .exception (f"Failed to read/write { fl !s} " )
605
- else :
606
- self ._logger .exception (f"Failed to load yaml from { files } " )
607
+ today = (
608
+ datetime .datetime .now (tz = datetime .UTC )
609
+ .date ()
610
+ .isoformat ()
611
+ )
612
+ bk = Path (f"{ fl !s} .{ today } " )
613
+ newcfg = self ._clean_empty_config (fl , bk )
614
+ if not newcfg :
615
+ continue # next file
616
+ retval = self ._fix_dashboard (newcfg , fl , bk )
607
617
return retval
608
618
619
+ def _clean_empty_config (self , fl : Path , bk : Path ) -> dict [str , Any ] | None :
620
+ # returns the deserialized yaml object if 1) it was deserializable
621
+ # in the first place, and 2) it survived flensing.
622
+ try :
623
+ obj = yaml .safe_load (fl .read_text ())
624
+ except Exception :
625
+ self ._logger .exception (
626
+ f"Failed to deserialize { fl !s} as yaml; moving to { bk } "
627
+ )
628
+ obj = None
629
+ flensed = self ._flense_dict (obj ) if obj else None
630
+ if not flensed :
631
+ self ._logger .warning (
632
+ f"{ fl } is empty after flensing; moving to { bk } "
633
+ )
634
+ shutil .move (fl , bk )
635
+ return None
636
+ # It's legal YAML and it's not empty
637
+ return flensed
638
+
639
+ def _fix_dashboard (self , cfg : dict [str , Any ], fl : Path , bk : Path ) -> bool :
640
+ # Look for "distributed.dashboard.link".
641
+ # It may have an older, non-user-domain-aware link in it,
642
+ # and if so, then we need to replace it with the newer,
643
+ # user-domain-aware one.
644
+
645
+ # Dask does the template-from-environment substitution so these are
646
+ # just strings. The point is that "old" is not correct in a
647
+ # user-domain-aware world, but "new" works in either case (and also
648
+ # is something JupyterHub gives us for free, and does not rely on our
649
+ # very-RSP-specific-and-going-away-with-service-discovery
650
+ # EXTERNAL_INSTANCE_URL variable).
651
+
652
+ # We return True if the deserialized contents of the file named by fl
653
+ # (which will be passed to us as cfg) is a dashboard config with the
654
+ # new template (whether initially or after correction) and False
655
+ # otherwise.
656
+
657
+ old = "{EXTERNAL_INSTANCE_URL}{JUPYTERHUB_SERVICE_PREFIX}"
658
+ new = "{JUPYTERHUB_PUBLIC_URL}"
659
+
660
+ try :
661
+ val = cfg ["distributed" ]["dashboard" ]["link" ]
662
+ if not isinstance (val , str ):
663
+ # Pretty sure this is an error, but leave it as the user's
664
+ # problem.
665
+ self ._logger .warning (
666
+ "distributed.dashboard.link is not a string"
667
+ )
668
+ return False
669
+ except KeyError :
670
+ # We don't have the structure. This file is not our problem.
671
+ self ._logger .debug (
672
+ f"{ fl !s} does not contain `distributed.dashboard.link`"
673
+ )
674
+ return False
675
+ if val .find (new ) > - 1 :
676
+ # The structure is there and is already correct.
677
+ # Return True and don't update anything.
678
+ return True
679
+ if val .find (old ) < 0 :
680
+ # The structure is there but doesn't have the old-style link.
681
+ # Assume, again, that's intentional.
682
+ self ._logger .debug (f"{ val } does not contain { old } " )
683
+ return False
684
+
685
+ # At this point, we have found distributed.dashboard.link.
686
+ # It is a string, and it contains the old-style template so we want
687
+ # to copy the original file to something without a yaml/yml suffix,
688
+ # and replace the contents of the file with the old data but the
689
+ # corrected link.
690
+ try :
691
+ # Make a backup.
692
+ shutil .copy2 (fl , bk )
693
+ except Exception :
694
+ self ._logger .exception (f"Failed to back up { fl !s} to { bk !s} " )
695
+ return False
696
+ newval = val .replace (old , new )
697
+ if newval == val :
698
+ self ._logger .warning (
699
+ f"Replacing '{ old } ' with '{ new } ' in '{ val } ' had no effect"
700
+ )
701
+ return False
702
+ cfg ["distributed" ]["dashboard" ]["link" ] = newval
703
+ self ._logger .info (f"Replaced link in { fl !s} : { old } ->{ new } " )
704
+ try :
705
+ fl .write_text (yaml .dump (cfg , default_flow_style = False ))
706
+ except Exception :
707
+ self ._logger .exception (f"Failed to write '{ cfg } ' to { fl !s} " )
708
+ return False
709
+ return True
710
+
609
711
def _inject_new_proxy (self , tgt : Path ) -> None :
610
712
# Conventional for RSP.
611
713
parent = tgt .parent
612
714
try :
613
715
parent .mkdir (parents = True , exist_ok = True )
614
716
except FileExistsError :
615
717
self ._logger .exception (
616
- f"{ tgt !s} exists and is not a directory; aborting"
718
+ f"{ parent !s} exists and is not a directory; aborting"
617
719
)
618
720
return
619
721
newlink = "{JUPYTERHUB_PUBLIC_URL}proxy/{port}/status"
620
722
goodlink = {"distributed" : {"dashboard" : {"link" : newlink }}}
621
723
if tgt .exists ():
622
724
try :
623
- obj = yaml .safe_load (tgt .read_text ())
624
- newobj = self ._replace_proxy_link (obj , goodlink )
625
- if newobj :
626
- tgt .write_text (yaml .dump (newobj , default_flow_style = False ))
725
+ obj = self ._flense_dict (yaml .safe_load (tgt .read_text ()))
726
+ if obj is None :
727
+ obj = {}
728
+ # We'll turn it into an empty dict and get it in the
729
+ # update. Why was there an empty dashboard.yaml? Weird.
730
+ elif obj == goodlink :
731
+ # This is the expected case. There's only one entry in
732
+ # the target dashboard.yaml, and it's already correct.
733
+ return
627
734
else :
628
- self ._logger .warning (f"Removing { tgt !s} because empty" )
629
- tgt .unlink ()
735
+ self ._logger .warning (
736
+ f"{ tgt !s} exists; contains '{ obj } '"
737
+ f" not just '{ goodlink } '"
738
+ )
739
+ obj .update (goodlink )
630
740
except Exception :
631
741
self ._logger .exception (f"Failed to load { tgt !s} " )
742
+ else :
743
+ obj = goodlink
632
744
try :
633
- tgt .write_text (yaml .dump (goodlink , default_flow_style = False ))
634
- except Exception :
635
- self ._logger .exception (f"Failed to write { tgt !s} " )
636
-
637
- def _replace_proxy_link (
638
- self ,
639
- obj : dict [str , Any ] | None ,
640
- goodlink : dict [str , dict [str , dict [str , str ]]],
641
- ) -> dict [str , Any ]:
642
- flensed = self ._flense_dict (obj )
643
- if flensed is None :
644
- flensed = goodlink
645
- newlink = goodlink ["distributed" ]["dashboard" ]["link" ]
646
- try :
647
- current = flensed ["distributed" ]["dashboard" ]["link" ]
648
- if current == newlink :
649
- self ._logger .debug ("dask dashboard link is already correct" )
650
- return flensed # Nothing to do.
651
- self ._logger .warning (
652
- f"Current link is { current } ; replacing with { newlink } "
653
- )
654
- except KeyError :
655
- pass # Setting doesn't currently exist.
745
+ tgt .write_text (yaml .dump (obj , default_flow_style = False ))
656
746
except Exception :
657
- self ._logger .exception (f"Failed to parse { flensed } " )
658
- flensed .update (goodlink )
659
- return flensed
747
+ self ._logger .exception (f"Failed to write '{ obj } ' to { tgt !s} " )
660
748
661
749
def _flense_dict (
662
750
self , obj : dict [str , Any ] | None
0 commit comments