Skip to content

Commit 6592848

Browse files
authored
Merge pull request #89 from lsst-sqre/tickets/DM-51608
tickets/DM-51608: Clean up abnormal-start error handling a little.
2 parents 1ccd675 + 92f3090 commit 6592848

File tree

3 files changed

+134
-40
lines changed

3 files changed

+134
-40
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<!-- Delete the sections that don't apply -->
2+
3+
### Other changes
4+
5+
- Provide better advice when user Lab startup is abnormal.

src/lsst/rsp/startup/exceptions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,4 +162,5 @@ def from_os_error(cls, exc: OSError) -> Self:
162162
strerror = (
163163
exc.strerror or os.strerror(errnum) or f"Unknown error {errnum}"
164164
)
165-
return cls(errnum, strerror, exc.filename, exc.filename2)
165+
winerror = "" # Change if we ever need windows support
166+
return cls(errnum, strerror, exc.filename, winerror, exc.filename2)

src/lsst/rsp/startup/services/labrunner/labrunner.py

Lines changed: 127 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import sys
1212
import time
1313
from pathlib import Path
14+
from textwrap import dedent
1415
from typing import Any
1516
from urllib.parse import parse_qsl, urlparse
1617

@@ -176,7 +177,8 @@ def _check_user_scratch_subdir(self, path: Path) -> Path | None:
176177
# Given a path we will test that SCRATCH_PATH/user/path can be
177178
# created as a writable directory (or that it already exists
178179
# as a writable directory). If it can be (or is), we return the
179-
# whole path, and if not, we return None.
180+
# whole path, and if not, we return None. If we can set it,
181+
# we also point SCRATCH_DIR at the user's top-level scratch directory.
180182
#
181183
# This will only be readable by the user; they can chmod() it if
182184
# they want to share, but for TMPDIR and DAF_BUTLER_CACHE_DIRECTORY
@@ -196,11 +198,12 @@ def _check_user_scratch_subdir(self, path: Path) -> Path | None:
196198
self._logger.warning("Could not determine user from environment")
197199
return None
198200
schema = self._env.get("HOMEDIR_SCHEMA", "username")
199-
user_scratch_path = scratch_path / user / path
201+
user_scratch_dir = scratch_path / user
200202
# This is pretty ad-hoc, but USDF uses the first letter in the
201203
# username for both home and scratch
202204
if schema == "initialThenUsername":
203-
user_scratch_path = scratch_path / user[0] / user / path
205+
user_scratch_dir = scratch_path / user[0] / user
206+
user_scratch_path = user_scratch_dir / path
204207
try:
205208
user_scratch_path.mkdir(parents=True, exist_ok=True, mode=0o700)
206209
except OSError as exc:
@@ -212,6 +215,8 @@ def _check_user_scratch_subdir(self, path: Path) -> Path | None:
212215
self._logger.warning(f"Unable to write to {user_scratch_path!s}")
213216
return None
214217
self._logger.debug(f"Using user scratch path {user_scratch_path!s}")
218+
# Set user-specific top dir as SCRATCH_DIR
219+
self._env["SCRATCH_DIR"] = f"{user_scratch_dir!s}"
215220
return user_scratch_path
216221

217222
def _set_tmpdir_if_scratch_available(self) -> None:
@@ -241,16 +246,13 @@ def _set_butler_cache(self) -> None:
241246
dbcd = self._env.get(env_v, "")
242247
if dbcd:
243248
self._logger.debug(
244-
f"Not setting DAF_BUTLER_CACHE_DIRECTORY: already set to"
245-
f" {dbcd}"
249+
f"Not setting {env_v}: already set to" f" {dbcd}"
246250
)
247251
return
248252
temp_path = self._check_user_scratch_subdir(Path("butler_cache"))
249253
if temp_path:
250254
self._env[env_v] = str(temp_path)
251-
self._logger.debug(
252-
f"Set DAF_BUTLER_CACHE_DIRECTORY to {temp_path!s}"
253-
)
255+
self._logger.debug(f"Set {env_v} to {temp_path!s}")
254256
return
255257
# In any sane RSP environment, /tmp will not be shared (it will
256258
# be either tmpfs or on ephemeral storage, and in any case not
@@ -742,37 +744,30 @@ def _set_timeout_variables(self) -> list[str]:
742744
result.append(f"--{timeout_map[setting]}={val}")
743745
return result
744746

745-
def _make_abnormal_landing_page(self) -> None:
746-
# This is very ad-hoc. Revisit after DP1.
747-
# What we're doing is writing in an empty, ephemeral filesystem,
748-
# to drop a document explaining what's going on, and to tweak the
749-
# display settings such that markdown is displayed in its rendered
750-
# form.
751-
abnormal = bool(self._env.get("ABNORMAL_STARTUP", ""))
752-
if not abnormal:
747+
def _make_abnormal_startup_environment(self) -> None:
748+
# What we're doing is writing (we hope) someplace safe, be that
749+
# an empty, ephemeral filesystem (such as /tmp in any sanely-configured
750+
# K8s-based RSP) or in scratch space somewhere.
751+
#
752+
# Performance is irrelevant. As we explain to the user, they should
753+
# not be using this lab for anything other than immediate problem
754+
# amelioration.
755+
756+
# Try a sanity check and ensure that we are in fact in a broken state.
757+
if not self._broken:
753758
return
754-
user = self._env["USER"]
755-
home = self._env.get("NUBLADO_HOME", "") or self._env.get("HOME", "")
756-
if not home:
757-
home = f"/home/{user}" # We're just guessing at this point.
758-
txt = "# Abnormal startup\n"
759-
txt += "\nYour Lab container did not start normally.\n"
760-
txt += f"Error: `{self._env.get("ABNORMAL_STARTUP_MESSAGE","")}`\n"
761-
txt += "\nIf that looks like a file space error, try using the "
762-
txt += f"terminal to remove unneeded files in `{home}`. You can "
763-
txt += "use the `quota` command to check how much space is in use. "
764-
txt += "After that, shut down and restart the Lab.\n"
765-
txt += "\nOtherwise, please open an issue with your RSP site"
766-
txt += " administrator.\n"
759+
760+
txt = self._make_abnormal_landing_markdown()
767761
s_obj = {"defaultViewers": {"markdown": "Markdown Preview"}}
768762
s_txt = json.dumps(s_obj)
769763

770764
try:
771-
welcome = Path("/tmp/notebooks/tutorials/welcome.md")
765+
temphome = self._env.get("SCRATCH_DIR", "/tmp")
766+
welcome = Path(temphome) / "notebooks" / "tutorials" / "welcome.md"
772767
welcome.parent.mkdir(exist_ok=True, parents=True)
773768
welcome.write_text(txt)
774769
settings = (
775-
Path("/tmp")
770+
Path(temphome)
776771
/ ".jupyter"
777772
/ "lab"
778773
/ "user-settings"
@@ -783,21 +778,114 @@ def _make_abnormal_landing_page(self) -> None:
783778
settings.parent.mkdir(exist_ok=True, parents=True)
784779
settings.write_text(s_txt)
785780
except Exception:
786-
self._logger.exception("Writing abnormal startup files failed")
781+
self._logger.exception(
782+
"Writing files to report abnormal startup failed"
783+
)
784+
785+
def _make_abnormal_landing_markdown(self) -> str:
786+
user = self._env["USER"]
787+
home = self._env.get(
788+
"NUBLADO_HOME",
789+
self._env.get(
790+
"HOME",
791+
f"/home/{user}", # Guess, albeit a good one.
792+
),
793+
)
794+
795+
errmsg = self._env.get("ABNORMAL_STARTUP_MESSAGE", "<no message>")
796+
errcode = self._env.get("ABNORMAL_STARTUP_ERRORCODE", "EUNKNOWN")
797+
798+
self._logger.error(
799+
f"Abnormal startup: errorcode {errcode}; message {errmsg}"
800+
)
801+
802+
open_an_issue = dedent(
803+
f"""
804+
805+
Please open an issue with your RSP site administrator with the
806+
following information: `{errmsg}`
807+
"""
808+
)
809+
810+
# Start with generic error text. It's very simple markdown, with a
811+
# heading and literal text only.
812+
813+
txt = dedent("""
814+
# Abnormal startup
815+
816+
Your Lab container did not start normally.
817+
818+
Do not trust this lab for work you want to keep.
819+
820+
""")
821+
822+
# Now add error-specific advice.
823+
match errcode:
824+
case "EDQUOT":
825+
txt += dedent(
826+
f"""
827+
You have exceeded your quota. Try using the terminal to
828+
remove unneeded files in `{home}`. You can use the
829+
`quota` command to check your usage.
830+
831+
After that, shut down and restart the lab. If that does
832+
not result in a working lab:
833+
"""
834+
)
835+
case "ENOSPC":
836+
txt += dedent(
837+
f"""
838+
You have run out of filesystem space. Try using the
839+
terminal to remove unneeded files in `{home}`. Since the
840+
filesystem is full, this may not be something you can
841+
correct.
842+
843+
After you have trimmed whatever possible, shut down and
844+
restart the lab.
845+
846+
If that does not result in a working lab:
847+
"""
848+
)
849+
case "EROFS" | "EACCES":
850+
txt += dedent(
851+
"""
852+
You do not have permission to write. Ask your RSP
853+
administrator to check ownership and permissions on your
854+
directories.
855+
"""
856+
)
857+
case "EBADENV":
858+
txt += dedent(
859+
"""
860+
You are missing environment variables necessary for RSP
861+
operation.
862+
"""
863+
)
864+
case _:
865+
pass
866+
txt += dedent(open_an_issue)
867+
return txt
787868

788869
def _start(self) -> None:
789-
abnormal = bool(self._env.get("ABNORMAL_STARTUP", ""))
790870
log_level = "DEBUG" if self._debug else "INFO"
791871
notebook_dir = f"{self._home!s}"
792-
if abnormal:
872+
if self._broken:
793873
self._logger.warning(
794874
f"Abnormal startup: {self._env['ABNORMAL_STARTUP_MESSAGE']}"
795875
)
796-
self._make_abnormal_landing_page()
797-
self._logger.warning("Launching with homedir='/tmp'")
798-
self._env["HOME"] = "/tmp"
799-
os.environ["HOME"] = "/tmp"
800-
notebook_dir = "/tmp"
876+
self._make_abnormal_startup_environment()
877+
#
878+
# We will check to see if we got SCRATCH_DIR set before we broke,
879+
# and if so, use that, which would be a user-specific path on a
880+
# scratch filesystem. If we didn't, we just use "/tmp" and hope
881+
# for the best. Any reasonably-configured RSP running under K8s
882+
# will not have a shared "/tmp".
883+
#
884+
temphome = self._env.get("SCRATCH_DIR", "/tmp")
885+
self._logger.warning(f"Launching with homedir='{temphome}'")
886+
self._env["HOME"] = temphome
887+
os.environ["HOME"] = temphome
888+
notebook_dir = temphome
801889

802890
cmd = [
803891
"jupyterhub-singleuser",

0 commit comments

Comments
 (0)