
Commit da801c1

Add public replay.json endpoint for shareable workflows
1 parent 971bc0b commit da801c1

File tree: backend/btrixcloud/crawlconfigs.py, backend/test/test_crawlconfigs.py

2 files changed: +120 -1 lines changed

backend/btrixcloud/crawlconfigs.py

Lines changed: 60 additions & 1 deletion
@@ -16,7 +16,7 @@
 import urllib.parse

 import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
 import pymongo

 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
     CrawlConfigOut,
     CrawlConfigTags,
     CrawlOut,
+    CrawlOutWithResources,
     UpdateCrawlConfig,
     Organization,
     User,
     PaginatedCrawlConfigOutResponse,
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
+    SUCCESSFUL_STATES,
     FAILED_STATES,
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
     SuccessResponse,
+    EmptyResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -825,6 +828,29 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:

         return None

+    async def get_last_successful_crawl_out(
+        self,
+        cid: UUID,
+        org: Organization,
+        request: Request,
+    ) -> Optional[CrawlOutWithResources]:
+        """Return the last successful crawl out with resources for this config, if any"""
+        match_query = {
+            "cid": cid,
+            "oid": org.id,
+            "finished": {"$ne": None},
+            "state": {"$in": SUCCESSFUL_STATES},
+        }
+        last_crawl = await self.crawls.find_one(
+            match_query, sort=[("finished", pymongo.DESCENDING)]
+        )
+        if last_crawl:
+            return await self.crawl_ops.get_crawl_out(
+                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
+            )
+
+        return None
+
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
         update_query: dict[str, object] = {}
@@ -1479,6 +1505,7 @@ def init_crawl_config_api(

     org_crawl_dep = org_ops.org_crawl_dep
     org_viewer_dep = org_ops.org_viewer_dep
+    org_public = org_ops.org_public

     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
@@ -1595,6 +1622,38 @@ async def get_all_crawler_proxies(

         return ops.get_crawler_proxies()

+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=CrawlOutWithResources,
+    )
+    async def get_crawl_config_latest_crawl_public_replay(
+        request: Request,
+        response: Response,
+        cid: UUID,
+        org: Organization = Depends(org_public),
+    ):
+        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return last_successful_crawl_out
+
+    @app.options(
+        "orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,
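
For context, a minimal sketch of how an anonymous client might fetch the new public endpoint once a workflow has been marked shareable. The host, org ID, and workflow ID below are placeholders, not values from this commit:

    import requests

    # Placeholder deployment and IDs, for illustration only; substitute a real
    # host, org UUID, and workflow (crawl config) UUID.
    api_prefix = "https://btrix.example.com/api"
    org_id = "<org-uuid>"
    cid = "<workflow-uuid>"

    # No Authorization header is needed: the route is public, but it returns
    # 404 ("crawl_config_not_found") unless the workflow is marked shareable.
    r = requests.get(f"{api_prefix}/orgs/{org_id}/crawlconfigs/{cid}/public/replay.json")
    r.raise_for_status()
    replay = r.json()
    print(replay["state"], [res["path"] for res in replay["resources"]])

The response body is the CrawlOutWithResources for the workflow's last successful crawl, so the WACZ paths listed under "resources" can be handed to a replay viewer.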

backend/test/test_crawlconfigs.py

Lines changed: 60 additions & 0 deletions
@@ -995,3 +995,63 @@ def test_delete_in_use_seed_file(
     )
     assert r.status_code == 200
     assert r.json()["id"] == seed_file_id
+
+
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Verify workflow is not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"] is False
+
+    # Verify public replay.json returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "crawl_config_not_found"
+
+    # Mark workflow as shareable
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=admin_auth_headers,
+        json={"shareable": True},
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["settings_changed"]
+    assert data["metadata_changed"] is False
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"]
+
+    # Verify public replay.json returns last successful crawl while shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == admin_crawl_id
+    assert data["oid"] == default_org_id
+    assert data["cid"] == _admin_crawl_cid
+    assert data["type"] == "crawl"
+    assert data["state"] == "complete"
+
+    resources = data["resources"]
+    assert resources
+    assert resources[0]["path"]
+
+    assert len(data["initialPages"]) == 4
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
+    )
+    assert data["downloadUrl"] is None
