Commit 8d32afa

Add nightly test for scheduled crawls
1 parent c0cf6e6 commit 8d32afa

1 file changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
import time

import pytest
import requests

from .conftest import API_PREFIX

# Every five minutes
SCHEDULE = "*/5 * * * *"


@pytest.fixture(scope="session")
def scheduled_config_id(admin_auth_headers, default_org_id):
    # Create a scheduled workflow without running it immediately
    crawl_data = {
        "runNow": False,
        "schedule": SCHEDULE,
        "name": "Scheduled crawl",
        "config": {
            "seeds": [{"url": "https://webrecorder.net"}],
            "scopeType": "page",
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    return data["id"]


def test_scheduled_crawl(admin_auth_headers, default_org_id, scheduled_config_id):
    # Ensure workflow exists with correct schedule, no crawls yet
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{scheduled_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["schedule"] == SCHEDULE

    assert data["crawlCount"] == 0
    assert data["crawlAttemptCount"] == 0
    assert data["crawlSuccessfulCount"] == 0

    assert data["lastCrawlId"] is None
    assert data["lastCrawlState"] is None

    # Wait until a crawl completes (up to 20 minutes: 120 polls at 10s intervals)
    attempts = 0
    max_attempts = 120

    while True:
        attempts += 1

        if attempts > max_attempts:
            break

        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{scheduled_config_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        data = r.json()

        last_crawl_id = data.get("lastCrawlId")
        last_crawl_state = data.get("lastCrawlState")

        # Stop polling once a crawl has reached a terminal state
        if last_crawl_id and last_crawl_state in ("complete", "failed"):
            break

        time.sleep(10)

    # Recheck workflow stats
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{scheduled_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data["schedule"] == SCHEDULE

    assert data["crawlCount"] >= 1
    assert data["crawlAttemptCount"] >= 1
    assert data["crawlSuccessfulCount"] >= 1

    assert data["lastCrawlId"]
    assert data["lastCrawlState"] == "complete"
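
For context on the SCHEDULE constant: "*/5 * * * *" is standard cron syntax for "every five minutes". A minimal sketch of expanding the expression into concrete run times, assuming the third-party croniter package (which is not a dependency of this test):

from datetime import datetime

from croniter import croniter  # assumed third-party package; not used by the test itself

# "*/5 * * * *" fires on every five-minute boundary
it = croniter("*/5 * * * *", datetime(2024, 1, 1, 12, 0))
print(it.get_next(datetime))  # 2024-01-01 12:05:00
print(it.get_next(datetime))  # 2024-01-01 12:10:00

With a five-minute schedule, the first scheduled run should land well inside the test's 20-minute polling window (120 attempts at 10-second intervals).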

0 commit comments
