
Commit 6cc6e88

Merge branch 'main' into email-microservice
2 parents c91ed0a + 5a4add8 commit 6cc6e88

32 files changed: 550 additions & 76 deletions

.github/workflows/weblate-reformat.yaml

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,8 @@
+# Formats and builds frontend UI translation files on pull requests
+# from https://github.com/weblate/browsertrix/tree/weblate-browsertrix-browsertrix
+#
+# Pull requests are automatically created by Hosted Weblate.
+# See https://docs.browsertrix.com/develop/localization/
 name: Weblate Reformat
 on:
   pull_request_target

README.md

Lines changed: 1 addition & 1 deletion
@@ -58,4 +58,4 @@ Translations are managed through Weblate, a web-based and open source translatio

 Browsertrix is made available under the [AGPLv3 License](https://github.com/webrecorder/browsertrix?tab=AGPL-3.0-1-ov-file#readme).

-Documentation is made available under the Creative Commons Attribution 4.0 International License.
+Documentation is made available under the Creative Commons Attribution 4.0 International License.

backend/btrixcloud/crawlconfigs.py

Lines changed: 12 additions & 8 deletions
@@ -852,6 +852,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStats"] = last_crawl.get("stats")
             update_query["lastCrawlStopping"] = False
             update_query["isCrawlRunning"] = False

@@ -866,6 +867,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlTime"] = None
             update_query["lastCrawlState"] = None
             update_query["lastCrawlSize"] = 0
+            update_query["lastCrawlStats"] = None
             update_query["lastRun"] = None
             update_query["isCrawlRunning"] = False

@@ -895,6 +897,7 @@ async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         crawlconfig.lastCrawlShouldPause = crawl.shouldPause
         crawlconfig.lastCrawlPausedAt = crawl.pausedAt
         crawlconfig.lastCrawlPausedExpiry = None
+        crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
         if crawl.pausedAt:
             crawlconfig.lastCrawlPausedExpiry = (
                 crawl.pausedAt + self.paused_expiry_delta
@@ -976,21 +979,21 @@ async def make_inactive_or_delete(

         # if no crawls have been run, actually delete
         if not crawlconfig.crawlAttemptCount:
-            if crawlconfig.config and crawlconfig.config.seedFileId:
-                try:
-                    await self.file_ops.delete_seed_file(
-                        crawlconfig.config.seedFileId, org
-                    )
-                except HTTPException:
-                    pass
-
             result = await self.crawl_configs.delete_one(
                 {"_id": crawlconfig.id, "oid": crawlconfig.oid}
             )

             if result.deleted_count != 1:
                 raise HTTPException(status_code=404, detail="failed_to_delete")

+            if crawlconfig and crawlconfig.config.seedFileId:
+                try:
+                    await self.file_ops.delete_seed_file(
+                        crawlconfig.config.seedFileId, org
+                    )
+                except HTTPException:
+                    pass
+
             status = "deleted"

         else:
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStats"] = last_crawl.get("stats")
         update_query["lastCrawlStopping"] = False
         update_query["isCrawlRunning"] = False


backend/btrixcloud/models.py

Lines changed: 12 additions & 9 deletions
@@ -243,6 +243,7 @@ class UserOrgInfoOut(BaseModel):
 TYPE_FAILED_STATES = Literal[
     "canceled",
     "failed",
+    "failed_not_logged_in",
     "skipped_storage_quota_reached",
     "skipped_time_quota_reached",
 ]
@@ -273,6 +274,15 @@ class UserOrgInfoOut(BaseModel):
 ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]


+# ============================================================================
+class CrawlStats(BaseModel):
+    """Crawl Stats for pages and size"""
+
+    found: int = 0
+    done: int = 0
+    size: int = 0
+
+
 # ============================================================================

 ### CRAWL CONFIGS ###
@@ -349,6 +359,7 @@ class RawCrawlConfig(BaseModel):

     useSitemap: Optional[bool] = False
     failOnFailedSeed: Optional[bool] = False
+    failOnContentCheck: Optional[bool] = False

     logging: Optional[str] = None
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
@@ -510,6 +521,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     lastCrawlShouldPause: Optional[bool] = False
     lastCrawlPausedAt: Optional[datetime] = None
     lastCrawlPausedExpiry: Optional[datetime] = None
+    lastCrawlStats: Optional[CrawlStats] = None
     profileName: Optional[str] = None

     createdByName: Optional[str] = None
@@ -772,15 +784,6 @@ class CrawlFileOut(BaseModel):
     expireAt: Optional[str] = None


-# ============================================================================
-class CrawlStats(BaseModel):
-    """Crawl Stats for pages and size"""
-
-    found: int = 0
-    done: int = 0
-    size: int = 0
-
-
 # ============================================================================
 class CoreCrawlable(BaseModel):
     # pylint: disable=too-few-public-methods

backend/btrixcloud/operator/crawls.py

Lines changed: 17 additions & 6 deletions
@@ -4,7 +4,7 @@
 import os
 import math
 from pprint import pprint
-from typing import Optional, Any, Sequence
+from typing import Optional, Any, Sequence, Literal
 from datetime import datetime, timedelta
 from uuid import UUID

@@ -827,15 +827,26 @@ async def fail_crawl(
         crawl: CrawlSpec,
         status: CrawlStatus,
         pods: dict,
-        stats: Optional[CrawlStats] = None,
+        stats: CrawlStats,
+        redis: Redis,
     ) -> bool:
         """Mark crawl as failed, log crawl state and print crawl logs, if possible"""
         prev_state = status.state

-        if not await self.mark_finished(crawl, status, "failed", stats=stats):
+        failed_state: Literal["failed", "failed_not_logged_in"] = "failed"
+
+        fail_reason = await redis.get(f"{crawl.id}:failReason")
+
+        if fail_reason == "not_logged_in":
+            failed_state = "failed_not_logged_in"
+
+        if not await self.mark_finished(crawl, status, failed_state, stats=stats):
             return False

-        if not self.log_failed_crawl_lines or prev_state == "failed":
+        if not self.log_failed_crawl_lines or prev_state in (
+            "failed",
+            "failed_not_logged_in",
+        ):
             return True

         pod_names = list(pods.keys())
@@ -1579,7 +1590,7 @@ async def update_crawl_state(
             # check if one-page crawls actually succeeded
             # if only one page found, and no files, assume failed
             if status.pagesFound == 1 and not status.filesAdded:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)
                 return status

             state: TYPE_NON_RUNNING_STATES
@@ -1602,7 +1613,7 @@ async def update_crawl_state(
             if status.stopping and not status.pagesDone:
                 await self.mark_finished(crawl, status, "canceled", stats)
             else:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)

         # check for other statuses, default to "running"
         else:

backend/test/test_crawlconfigs.py

Lines changed: 16 additions & 0 deletions
@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
         assert workflow["lastRun"]
         assert workflow["lastCrawlSize"] > 0

+        stats = workflow["lastCrawlStats"]
+        assert stats["found"] > 0
+        assert stats["done"] > 0
+        assert stats["size"] > 0
+
         if last_crawl_id == admin_crawl_id:
             global _admin_crawl_cid
             _admin_crawl_cid = workflow["id"]
@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert data["lastRun"]
     assert data["lastCrawlSize"] > 0

+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+

 def test_incremental_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     last_crawl_started = data["lastCrawlStartTime"]
     last_crawl_finished = data["lastCrawlTime"]
     last_run = data["lastRun"]
+    last_stats = data["lastCrawlStats"]

     # Run new crawl in this workflow
     r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] > last_crawl_started
     assert data["lastCrawlTime"] > last_crawl_finished
     assert data["lastRun"] > last_run
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0

     # Delete new crawl
     r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] == last_crawl_started
     assert data["lastCrawlTime"] == last_crawl_finished
     assert data["lastRun"] == last_run
+    assert data["lastCrawlStats"] == last_stats


 def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):

backend/test_nightly/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -316,7 +316,7 @@ def error_crawl_id(admin_auth_headers, default_org_id):
             headers=admin_auth_headers,
         )
         data = r.json()
-        if data["state"] == "complete":
+        if data["state"] in ("failed", "complete"):
             return crawl_id
         time.sleep(5)

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+import time
+
+import pytest
+import requests
+
+from .conftest import API_PREFIX
+
+config_id = None
+
+
+@pytest.fixture(scope="session")
+def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
+    # Start crawl
+    crawl_data = {
+        "runNow": True,
+        "name": "Fail Crawl Not Logged In",
+        "config": {
+            "seeds": [{"url": "https://x.com/webrecorder_io"}],
+            "scopeType": "page",
+            "limit": 1,
+            "failOnContentCheck": True,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global config_id
+    config_id = data["id"]
+
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "running":
+            # Give crawl time to start properly
+            time.sleep(30)
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def failed_crawl_finished(
+    admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
+):
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in ("complete", "failed", "failed_not_logged_in"):
+            # Give some time for WACZ files to be stored
+            time.sleep(30)
+            break
+        time.sleep(5)
+
+
+def test_fail_crawl_not_logged_in(
+    admin_auth_headers,
+    default_org_id,
+    fail_not_logged_in_crawl_id,
+    failed_crawl_finished,
+):
+    # Ensure crawl has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["state"] == "failed_not_logged_in"
+
+    # Ensure workflow lastCrawlState has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["lastCrawlState"] == "failed_not_logged_in"

chart/templates/ingress.yaml

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,7 @@ metadata:
     {{- if .Values.ingress.useOldClassAnnotation }}
     kubernetes.io/ingress.class: {{ .Values.ingress_class | default "nginx" }}
    {{- end }}
+    {{- if eq ( .Values.ingress_class | default "nginx" ) "nginx" }}
     nginx.ingress.kubernetes.io/proxy-body-size: "0"
     nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
     # for larger uploads to not timeout
@@ -22,6 +23,10 @@ metadata:
     {{- else }}
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
     {{- end }}
+    {{- end }}
+    {{- range $key, $value := .Values.ingress.annotations }}
+    {{ $key }}: {{ $value | quote }}
+    {{- end }}

 spec:
   {{- if not .Values.ingress.useOldClassAnnotation }}
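
The range block added above merges arbitrary key/value pairs from .Values.ingress.annotations into the Ingress metadata. As a minimal sketch of how a deployment might use this via a values override (the annotation key and value shown are illustrative assumptions, not part of this commit):

ingress:
  annotations:
    # example custom annotation rendered into the Ingress by the new range block
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"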

chart/test/test.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ crawler_channels:
     image: "docker.io/webrecorder/browsertrix-crawler:latest"

   - id: test
-    image: "docker.io/webrecorder/browsertrix-crawler:1.7.0-beta.0"
+    image: "docker.io/webrecorder/browsertrix-crawler:latest"

 mongo_auth:
   # specify either username + password (for local mongo)
