
Commit da801c1

Add public replay.json endpoint for shareable workflows
1 parent 971bc0b commit da801c1

File tree: backend/btrixcloud/crawlconfigs.py, backend/test/test_crawlconfigs.py

2 files changed: +120 -1 lines changed

backend/btrixcloud/crawlconfigs.py

Lines changed: 60 additions & 1 deletion
@@ -16,7 +16,7 @@
 import urllib.parse

 import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
 import pymongo

 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
     CrawlConfigOut,
     CrawlConfigTags,
     CrawlOut,
+    CrawlOutWithResources,
     UpdateCrawlConfig,
     Organization,
     User,
     PaginatedCrawlConfigOutResponse,
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
+    SUCCESSFUL_STATES,
     FAILED_STATES,
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
     SuccessResponse,
+    EmptyResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -825,6 +828,29 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:

         return None

+    async def get_last_successful_crawl_out(
+        self,
+        cid: UUID,
+        org: Organization,
+        request: Request,
+    ) -> Optional[CrawlOutWithResources]:
+        """Return the last successful crawl out with resources for this config, if any"""
+        match_query = {
+            "cid": cid,
+            "oid": org.id,
+            "finished": {"$ne": None},
+            "state": {"$in": SUCCESSFUL_STATES},
+        }
+        last_crawl = await self.crawls.find_one(
+            match_query, sort=[("finished", pymongo.DESCENDING)]
+        )
+        if last_crawl:
+            return await self.crawl_ops.get_crawl_out(
+                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
+            )
+
+        return None
+
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
         update_query: dict[str, object] = {}
@@ -1479,6 +1505,7 @@ def init_crawl_config_api(

     org_crawl_dep = org_ops.org_crawl_dep
     org_viewer_dep = org_ops.org_viewer_dep
+    org_public = org_ops.org_public

     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
@@ -1595,6 +1622,38 @@ async def get_all_crawler_proxies(

         return ops.get_crawler_proxies()

+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=CrawlOutWithResources,
+    )
+    async def get_crawl_config_latest_crawl_public_replay(
+        request: Request,
+        response: Response,
+        cid: UUID,
+        org: Organization = Depends(org_public),
+    ):
+        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return last_successful_crawl_out
+
+    @app.options(
+        "orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,
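
For context, a minimal sketch of how an anonymous client might fetch the new public endpoint once a workflow has been marked shareable. The host, org ID, and workflow ID below are placeholders, not values from this commit:

    import requests

    # Placeholder deployment and IDs, for illustration only; substitute a real
    # host, org UUID, and workflow (crawl config) UUID.
    api_prefix = "https://btrix.example.com/api"
    org_id = "<org-uuid>"
    cid = "<workflow-uuid>"

    # No Authorization header is needed: the route is public, but it returns
    # 404 ("crawl_config_not_found") unless the workflow is marked shareable.
    r = requests.get(f"{api_prefix}/orgs/{org_id}/crawlconfigs/{cid}/public/replay.json")
    r.raise_for_status()
    replay = r.json()
    print(replay["state"], [res["path"] for res in replay["resources"]])

The response body is the CrawlOutWithResources for the workflow's last successful crawl, so the WACZ paths listed under "resources" can be handed to a replay viewer.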

backend/test/test_crawlconfigs.py

Lines changed: 60 additions & 0 deletions
@@ -995,3 +995,63 @@ def test_delete_in_use_seed_file(
     )
     assert r.status_code == 200
     assert r.json()["id"] == seed_file_id
+
+
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Verify workflow is not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"] is False
+
+    # Verify public replay.json returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "crawl_config_not_found"
+
+    # Mark workflow as shareable
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=admin_auth_headers,
+        json={"shareable": True},
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["settings_changed"]
+    assert data["metadata_changed"] is False
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"]
+
+    # Verify public replay.json returns last successful crawl while shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == admin_crawl_id
+    assert data["oid"] == default_org_id
+    assert data["cid"] == _admin_crawl_cid
+    assert data["type"] == "crawl"
+    assert data["state"] == "complete"
+
+    resources = data["resources"]
+    assert resources
+    assert resources[0]["path"]
+
+    assert len(data["initialPages"]) == 4
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
+    )
+    assert data["downloadUrl"] is None
