Commit f4d0d94

Add and return public page search for workflow
1 parent da801c1 commit f4d0d94

File tree

5 files changed: 87 additions & 8 deletions

backend/btrixcloud/basecrawls.py
backend/btrixcloud/crawlconfigs.py
backend/btrixcloud/main.py
backend/btrixcloud/pages.py
backend/test/test_crawlconfigs.py

backend/btrixcloud/basecrawls.py

Lines changed: 12 additions & 3 deletions
@@ -162,6 +162,7 @@ async def get_crawl_out(
         type_: Optional[str] = None,
         skip_resources=False,
         headers: Optional[dict] = None,
+        cid: Optional[UUID] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -183,9 +184,17 @@ async def get_crawl_out(
         oid = res.get("oid")
         if oid:
             origin = get_origin(headers)
-            res["pagesQueryUrl"] = (
-                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
-            )
+            # If cid is passed, construct pagesSearch query for public
+            # shareable workflow
+            if cid:
+                res["pagesQueryUrl"] = (
+                    origin
+                    + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+                )
+            else:
+                res["pagesQueryUrl"] = (
+                    origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                )
 
         # this will now disable the downloadUrl in RWP
         res["downloadUrl"] = None

backend/btrixcloud/crawlconfigs.py

Lines changed: 3 additions & 2 deletions
@@ -832,9 +832,10 @@ async def get_last_successful_crawl_out(
         self,
         cid: UUID,
         org: Organization,
-        request: Request,
+        request: Optional[Request] = None,
     ) -> Optional[CrawlOutWithResources]:
         """Return the last successful crawl out with resources for this config, if any"""
+        headers = dict(request.headers) if request else None
         match_query = {
             "cid": cid,
             "oid": org.id,
@@ -846,7 +847,7 @@ async def get_last_successful_crawl_out(
         )
         if last_crawl:
             return await self.crawl_ops.get_crawl_out(
-                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
+                last_crawl["_id"], org, "crawl", headers=headers, cid=cid
             )
 
         return None

backend/btrixcloud/main.py

Lines changed: 1 addition & 0 deletions
@@ -266,6 +266,7 @@ def main() -> None:
         storage_ops,
         background_job_ops,
         coll_ops,
+        crawl_config_ops,
         current_active_user,
     )
 
backend/btrixcloud/pages.py

Lines changed: 50 additions & 1 deletion
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+    app,
+    mdb,
+    crawl_ops,
+    org_ops,
+    storage_ops,
+    background_job_ops,
+    coll_ops,
+    crawl_config_ops,
+    user_dep,
 ) -> PageOps:
     """init pages API"""
     # pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
         )
         return {"items": pages}
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+        tags=["pages", "crawlconfigs"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list_shareable_crawl_config(
+        cid: UUID,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages for last successful crawl of workflow"""
+        crawl_config = await crawl_config_ops.get_crawl_config(
+            cid, org.id, active_only=True
+        )
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+
+        pages, _ = await ops.list_pages(
+            crawl_ids=[last_successful_crawl_out.id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],

backend/test/test_crawlconfigs.py

Lines changed: 21 additions & 2 deletions
@@ -1013,6 +1013,12 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert r.status_code == 404
     assert r.json()["detail"] == "crawl_config_not_found"
 
+    # Verify public pagesSearch endpoint returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert r.status_code == 404
+
     # Mark workflow as shareable
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
@@ -1051,7 +1057,20 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert resources[0]["path"]
 
     assert len(data["initialPages"]) == 4
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
+
+    pages_query_url = data["pagesQueryUrl"]
+    assert pages_query_url.endswith(
+        f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
     )
     assert data["downloadUrl"] is None
+
+    # Verify pages search endpoint is accessible and works
+    r = requests.get(pages_query_url)
+    assert r.status_code == 200
+    data = r.json()
+    assert data["items"]
+    for page in data["items"]:
+        assert page["id"]
+        assert page["oid"] == default_org_id
+        assert page["crawl_id"] == admin_crawl_id
+        assert page["url"]
