16 | 16 | import urllib.parse
17 | 17 |
18 | 18 | import aiohttp
19 |    | -from fastapi import APIRouter, Depends, HTTPException, Query
   | 19 | +from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
20 | 20 | import pymongo
21 | 21 |
22 | 22 | from .pagination import DEFAULT_PAGE_SIZE, paginated_format
27 | 27 |     CrawlConfigOut,
28 | 28 |     CrawlConfigTags,
29 | 29 |     CrawlOut,
   | 30 | +    CrawlOutWithResources,
30 | 31 |     UpdateCrawlConfig,
31 | 32 |     Organization,
32 | 33 |     User,
33 | 34 |     PaginatedCrawlConfigOutResponse,
34 | 35 |     PaginatedSeedResponse,
35 | 36 |     PaginatedConfigRevisionResponse,
   | 37 | +    SUCCESSFUL_STATES,
36 | 38 |     FAILED_STATES,
37 | 39 |     CrawlerChannel,
38 | 40 |     CrawlerChannels,
39 | 41 |     StartedResponse,
40 | 42 |     SuccessResponse,
   | 43 | +    EmptyResponse,
41 | 44 |     CrawlConfigAddedResponse,
42 | 45 |     CrawlConfigSearchValues,
43 | 46 |     CrawlConfigUpdateResponse,
@@ -825,6 +828,29 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
825 | 828 |
826 | 829 |         return None
827 | 830 |
    | 831 | +    async def get_last_successful_crawl_out(
    | 832 | +        self,
    | 833 | +        cid: UUID,
    | 834 | +        org: Organization,
    | 835 | +        request: Request,
    | 836 | +    ) -> Optional[CrawlOutWithResources]:
    | 837 | +        """Return the last successful crawl out with resources for this config, if any"""
    | 838 | +        match_query = {
    | 839 | +            "cid": cid,
    | 840 | +            "oid": org.id,
    | 841 | +            "finished": {"$ne": None},
    | 842 | +            "state": {"$in": SUCCESSFUL_STATES},
    | 843 | +        }
    | 844 | +        last_crawl = await self.crawls.find_one(
    | 845 | +            match_query, sort=[("finished", pymongo.DESCENDING)]
    | 846 | +        )
    | 847 | +        if last_crawl:
    | 848 | +            return await self.crawl_ops.get_crawl_out(
    | 849 | +                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
    | 850 | +            )
    | 851 | +
    | 852 | +        return None
    | 853 | +
828 | 854 |     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
829 | 855 |         """recompute stats by incrementing size counter and number of crawls"""
830 | 856 |         update_query: dict[str, object] = {}
@@ -1479,6 +1505,7 @@ def init_crawl_config_api(
1479 | 1505 |
1480 | 1506 |     org_crawl_dep = org_ops.org_crawl_dep
1481 | 1507 |     org_viewer_dep = org_ops.org_viewer_dep
     | 1508 | +    org_public = org_ops.org_public
1482 | 1509 |
1483 | 1510 |     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
1484 | 1511 |     async def get_crawl_configs(
@@ -1595,6 +1622,38 @@ async def get_all_crawler_proxies(
1595 | 1622 |
1596 | 1623 |         return ops.get_crawler_proxies()
1597 | 1624 |
     | 1625 | +    @app.get(
     | 1626 | +        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
     | 1627 | +        response_model=CrawlOutWithResources,
     | 1628 | +    )
     | 1629 | +    async def get_crawl_config_latest_crawl_public_replay(
     | 1630 | +        request: Request,
     | 1631 | +        response: Response,
     | 1632 | +        cid: UUID,
     | 1633 | +        org: Organization = Depends(org_public),
     | 1634 | +    ):
     | 1635 | +        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
     | 1636 | +        if not crawl_config.shareable:
     | 1637 | +            raise HTTPException(status_code=404, detail="crawl_config_not_found")
     | 1638 | +
     | 1639 | +        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
     | 1640 | +            cid, org, request
     | 1641 | +        )
     | 1642 | +
     | 1643 | +        response.headers["Access-Control-Allow-Origin"] = "*"
     | 1644 | +        response.headers["Access-Control-Allow-Headers"] = "*"
     | 1645 | +        return last_successful_crawl_out
     | 1646 | +
     | 1647 | +    @app.options(
     | 1648 | +        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
     | 1649 | +        response_model=EmptyResponse,
     | 1650 | +    )
     | 1651 | +    async def get_replay_preflight(response: Response):
     | 1652 | +        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
     | 1653 | +        response.headers["Access-Control-Allow-Origin"] = "*"
     | 1654 | +        response.headers["Access-Control-Allow-Headers"] = "*"
     | 1655 | +        return {}
     | 1656 | +
1598 | 1657 |     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
1599 | 1658 |     async def get_crawl_config_seeds(
1600 | 1659 |         cid: UUID,
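
Together, the GET handler's `Access-Control-Allow-Origin: *` header and the OPTIONS preflight route let a replay viewer embedded on another origin fetch the resource list anonymously. A hedged client-side sketch using aiohttp; the host, org id, workflow id, and `/api` prefix below are placeholders rather than values from this diff, and a config that isn't shareable comes back as 404:

```python
# Hypothetical client check of the new public endpoint; host, org id,
# and workflow id are placeholders, and the /api prefix assumes a
# typical deployment layout rather than anything shown in this diff.
import asyncio

import aiohttp


async def fetch_public_replay(base: str, oid: str, cid: str) -> dict:
    url = f"{base}/api/orgs/{oid}/crawlconfigs/{cid}/public/replay.json"
    async with aiohttp.ClientSession() as session:
        # No Authorization header: the route is reachable anonymously
        # as long as the crawl config is marked shareable.
        async with session.get(url) as resp:
            if resp.status == 404:
                raise RuntimeError("config not found or not shareable")
            resp.raise_for_status()
            return await resp.json()


if __name__ == "__main__":
    out = asyncio.run(fetch_public_replay("https://btrix.example", "oid", "cid"))
    print(out.get("resources"))
```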