Commit f4d0d94

Add and return public page search for workflow
1 parent da801c1 commit f4d0d94

File tree

5 files changed: 87 additions & 8 deletions

backend/btrixcloud/basecrawls.py
backend/btrixcloud/crawlconfigs.py
backend/btrixcloud/main.py
backend/btrixcloud/pages.py
backend/test/test_crawlconfigs.py

backend/btrixcloud/basecrawls.py

Lines changed: 12 additions & 3 deletions
@@ -162,6 +162,7 @@ async def get_crawl_out(
         type_: Optional[str] = None,
         skip_resources=False,
         headers: Optional[dict] = None,
+        cid: Optional[UUID] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -183,9 +184,17 @@ async def get_crawl_out(
         oid = res.get("oid")
         if oid:
             origin = get_origin(headers)
-            res["pagesQueryUrl"] = (
-                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
-            )
+            # If cid is passed, construct pagesSearch query for public
+            # shareable workflow
+            if cid:
+                res["pagesQueryUrl"] = (
+                    origin
+                    + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+                )
+            else:
+                res["pagesQueryUrl"] = (
+                    origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                )
 
         # this will now disable the downloadUrl in RWP
         res["downloadUrl"] = None

backend/btrixcloud/crawlconfigs.py

Lines changed: 3 additions & 2 deletions
@@ -832,9 +832,10 @@ async def get_last_successful_crawl_out(
         self,
         cid: UUID,
         org: Organization,
-        request: Request,
+        request: Optional[Request] = None,
     ) -> Optional[CrawlOutWithResources]:
         """Return the last successful crawl out with resources for this config, if any"""
+        headers = dict(request.headers) if request else None
         match_query = {
             "cid": cid,
             "oid": org.id,
@@ -846,7 +847,7 @@ async def get_last_successful_crawl_out(
         )
         if last_crawl:
             return await self.crawl_ops.get_crawl_out(
-                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
+                last_crawl["_id"], org, "crawl", headers=headers, cid=cid
             )
 
         return None

backend/btrixcloud/main.py

Lines changed: 1 addition & 0 deletions
@@ -266,6 +266,7 @@ def main() -> None:
         storage_ops,
         background_job_ops,
         coll_ops,
+        crawl_config_ops,
         current_active_user,
     )
 
backend/btrixcloud/pages.py

Lines changed: 50 additions & 1 deletion
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+    app,
+    mdb,
+    crawl_ops,
+    org_ops,
+    storage_ops,
+    background_job_ops,
+    coll_ops,
+    crawl_config_ops,
+    user_dep,
 ) -> PageOps:
     """init pages API"""
     # pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
         )
         return {"items": pages}
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+        tags=["pages", "crawlconfigs"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list_shareable_crawl_config(
+        cid: UUID,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages for last successful crawl of workflow"""
+        crawl_config = await crawl_config_ops.get_crawl_config(
+            cid, org.id, active_only=True
+        )
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+
+        pages, _ = await ops.list_pages(
+            crawl_ids=[last_successful_crawl_out.id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],

backend/test/test_crawlconfigs.py

Lines changed: 21 additions & 2 deletions
@@ -1013,6 +1013,12 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert r.status_code == 404
     assert r.json()["detail"] == "crawl_config_not_found"
 
+    # Verify public pagesSearch endpoint returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert r.status_code == 404
+
     # Mark workflow as shareable
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
@@ -1051,7 +1057,20 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert resources[0]["path"]
 
     assert len(data["initialPages"]) == 4
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
+
+    pages_query_url = data["pagesQueryUrl"]
+    assert pages_query_url.endswith(
+        f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
     )
     assert data["downloadUrl"] is None
+
+    # Verify pages search endpoint is accessible and works
+    r = requests.get(pages_query_url)
+    assert r.status_code == 200
+    data = r.json()
+    assert data["items"]
+    for page in data["items"]:
+        assert page["id"]
+        assert page["oid"] == default_org_id
+        assert page["crawl_id"] == admin_crawl_id
+        assert page["url"]
