+import time
+
+import pytest
+import requests
+
+from .conftest import API_PREFIX
+
+# Every five minutes
+SCHEDULE = "*/5 * * * *"
+
+
+@pytest.fixture(scope="session")
+def scheduled_config_id(admin_auth_headers, default_org_id):
+    # Create workflow with a cron schedule only (runNow is False, so no crawl starts immediately)
+    crawl_data = {
+        "runNow": False,
+        "schedule": SCHEDULE,
+        "name": "Scheduled crawl",
+        "config": {
+            "seeds": [{"url": "https://webrecorder.net"}],
+            "scopeType": "page",
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+    return data["id"]
+
+
+def test_scheduled_crawl(admin_auth_headers, default_org_id, scheduled_config_id):
+    # Ensure workflow exists with correct schedule, no crawls yet
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{scheduled_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["schedule"] == SCHEDULE
+
+    assert data["crawlCount"] == 0
+    assert data["crawlAttemptCount"] == 0
+    assert data["crawlSuccessfulCount"] == 0
+
+    assert data["lastCrawlId"] is None
+    assert data["lastCrawlState"] is None
+
+    # Wait until a crawl completes (up to 20 minutes)
+    attempts = 0
+    max_attempts = 120
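+    # 120 attempts with a 10-second sleep between polls = 20 minutes maximum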
+
+    while True:
+        attempts += 1
+
+        if attempts > max_attempts:
+            break
+
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{scheduled_config_id}",
+            headers=admin_auth_headers,
+        )
+        assert r.status_code == 200
+        data = r.json()
+
+        last_crawl_id = data.get("lastCrawlId")
+        last_crawl_state = data.get("lastCrawlState")
+
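+        # Stop polling once the last crawl reaches a terminal state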
+        if last_crawl_id and last_crawl_state in ("complete", "failed"):
+            break
+
+        time.sleep(10)
+
+    # Recheck workflow stats
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{scheduled_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["schedule"] == SCHEDULE
+
+    assert data["crawlCount"] >= 1
+    assert data["crawlAttemptCount"] >= 1
+    assert data["crawlSuccessfulCount"] >= 1
+
+    assert data["lastCrawlId"]
+    assert data["lastCrawlState"] == "complete"