Skip to content

Commit 7132bad

Browse files
committed
Fix lint
1 parent ba55e3c commit 7132bad

File tree

2 files changed

+93
-14
lines changed

2 files changed

+93
-14
lines changed

queries/polars/cloud_utils.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import base64
2+
import json
3+
import pathlib
4+
from uuid import UUID
5+
6+
import polars_cloud as pc
7+
8+
from settings import Settings
9+
10+
settings = Settings()
11+
12+
13+
def reuse_compute_context(filename: str, log_reuse: bool) -> pc.ComputeContext | None:
    """Try to reconnect to a previously persisted compute context.

    Parameters
    ----------
    filename
        Path to the JSON file holding ``workspace_id`` and ``compute_id``.
    log_reuse
        When True, print a message when an existing context is reused.

    Returns
    -------
    The running compute context, or ``None`` when it could not be reused.

    Raises
    ------
    ValueError
        If the JSON file lacks one of the required keys.
    """
    with pathlib.Path(filename).open("r", encoding="utf8") as r:
        context_args = json.load(r)

    required_keys = ["workspace_id", "compute_id"]
    # Validate explicitly: `assert` is stripped under `python -O`.
    for key in required_keys:
        if key not in context_args:
            msg = f"Key {key} not in {filename}"
            raise ValueError(msg)
    if log_reuse:
        print(f"Reusing existing compute context: {context_args['compute_id']}")
    context_args = {key: UUID(context_args[key]) for key in required_keys}
    try:
        ctx = pc.ComputeContext.connect(**context_args)
        ctx.start(wait=True)
        if ctx.get_status() != pc.ComputeContextStatus.RUNNING:
            # Treat a context that failed to reach RUNNING like any other
            # reuse failure so the caller can fall back to a fresh context.
            print(f"Cannot reuse existing compute context: status {ctx.get_status()}")
            return None
        return ctx
    except RuntimeError as e:
        print(f"Cannot reuse existing compute context: {e.args}")
        return None
31+
32+
33+
def get_compute_context_args() -> dict[str, str | int]:
    """Collect the explicitly configured compute-context settings.

    Settings left at ``None`` are omitted so that Polars Cloud falls back
    to its own defaults for them.
    """
    run = settings.run
    candidates = {
        "cpus": run.polars_cloud_cpus,
        "memory": run.polars_cloud_memory,
        "instance_type": run.polars_cloud_instance_type,
        "cluster_size": run.polars_cloud_cluster_size,
        "workspace": run.polars_cloud_workspace,
    }
    return {name: value for name, value in candidates.items() if value is not None}
45+
46+
47+
def get_compute_context_filename(context_args: dict[str, str | int]) -> str:
48+
hash = base64.b64encode(str(context_args).encode("utf-8")).decode()
49+
return f".polars-cloud-compute-context-{hash}.json"
50+
51+
52+
def get_compute_context(
    *,
    create_if_no_reuse: bool = True,
    log_create: bool = False,
    log_reuse: bool = False,
) -> pc.ComputeContext:
    """Return a running compute context, reusing a persisted one when possible.

    Parameters
    ----------
    create_if_no_reuse
        When False, raise instead of starting a new context.
    log_create
        When True, print a message when a new context is started.
    log_reuse
        When True, print a message when an existing context is reused.

    Raises
    ------
    RuntimeError
        If no context could be reused and ``create_if_no_reuse`` is False,
        or if a newly started context did not reach the RUNNING state.
    """
    context_args = get_compute_context_args()
    context_filename = get_compute_context_filename(context_args)
    if pathlib.Path(context_filename).is_file():
        ctx = reuse_compute_context(context_filename, log_reuse)
        if ctx is not None:
            return ctx

    # Start a new compute context.
    if not create_if_no_reuse:
        raise RuntimeError("Cannot reuse compute context")
    if log_create:
        print(f"Starting new compute context: {context_args}")
    ctx = pc.ComputeContext(**context_args)  # type: ignore[arg-type]
    ctx.start(wait=True)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if ctx.get_status() != pc.ComputeContextStatus.RUNNING:
        msg = f"Compute context failed to start: status {ctx.get_status()}"
        raise RuntimeError(msg)
    # NOTE(review): `_compute_id` is a private attribute of ComputeContext;
    # there appears to be no public accessor — confirm against polars_cloud.
    context_args = {
        "workspace_id": str(ctx.workspace.id),
        "compute_id": str(ctx._compute_id),
    }
    # Persist the connection details so later runs can reconnect.
    with pathlib.Path(context_filename).open("w", encoding="utf8") as w:
        json.dump(context_args, w)
    return ctx
72+
73+
74+
def stop_compute_context(ctx: pc.ComputeContext) -> None:
    """Stop the compute context and remove its persisted connection file."""
    ctx.stop(wait=True)
    cache_file = get_compute_context_filename(get_compute_context_args())
    pathlib.Path(cache_file).unlink(missing_ok=True)

queries/polars/utils.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
from typing import Literal
55

66
import polars as pl
7+
78
from queries.common_utils import (
89
check_query_result_pl,
9-
execute_all as common_execute_all,
1010
get_table_path,
1111
run_query_generic,
1212
)
13+
from queries.common_utils import (
14+
execute_all as common_execute_all,
15+
)
1316
from queries.polars.cloud_utils import get_compute_context, stop_compute_context
1417
from settings import Settings
1518

@@ -18,7 +21,7 @@
1821

1922
def execute_all() -> None:
2023
if not settings.run.polars_cloud:
21-
return execute_all("polars")
24+
return common_execute_all("polars")
2225

2326
# for polars cloud we have to create the compute context,
2427
# reuse it across the queries, and stop it in the end
@@ -32,18 +35,20 @@ def execute_all() -> None:
3235

3336
def _scan_ds(table_name: str) -> pl.LazyFrame:
    """Scan the given benchmark table as a LazyFrame per the configured I/O type.

    Raises
    ------
    ValueError
        If ``settings.run.io_type`` is not one of the supported values.
    """
    path = get_table_path(table_name)
    # pathlib.Path normalizes consecutive slashes ("s3://x" -> "s3:/x"),
    # unless Path.from_uri is used (Python >= 3.13) — restore the scheme.
    path_str = str(path)
    if path_str.startswith("s3:/") and not path_str.startswith("s3://"):
        path_str = f"s3://{path_str[4:]}"

    io_type = settings.run.io_type
    if io_type == "skip":
        # Eagerly read once, then hand back a lazy frame over in-memory data.
        return pl.read_parquet(path_str, rechunk=True).lazy()
    elif io_type == "parquet":
        return pl.scan_parquet(path_str)
    elif io_type == "feather":
        return pl.scan_ipc(path_str)
    elif io_type == "csv":
        return pl.scan_csv(path_str, try_parse_dates=True)
    else:
        msg = f"unsupported file type: {io_type!r}"
        raise ValueError(msg)
@@ -184,11 +189,7 @@ def run_query(query_number: int, lf: pl.LazyFrame) -> None:
184189
ctx = get_compute_context(create_if_no_reuse=False)
185190

186191
def query(): # type: ignore[no-untyped-def]
187-
result = lf.remote(context=ctx).distributed().collect()
188-
189-
if settings.run.show_results:
190-
print(result.plan())
191-
return result.lazy().collect()
192+
return lf.remote(context=ctx).distributed().collect()
192193
else:
193194
query = partial(
194195
lf.collect,

0 commit comments

Comments
 (0)