
Commit 6cc6e88

Merge branch 'main' into email-microservice
2 parents c91ed0a + 5a4add8 commit 6cc6e88

32 files changed: 550 additions & 76 deletions

.github/workflows/weblate-reformat.yaml

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,8 @@
+# Formats and builds frontend UI translation files on pull requests
+# from https://github.com/weblate/browsertrix/tree/weblate-browsertrix-browsertrix
+#
+# Pull requests are automatically created by Hosted Weblate.
+# See https://docs.browsertrix.com/develop/localization/
 name: Weblate Reformat
 on:
   pull_request_target

README.md

Lines changed: 1 addition & 1 deletion
@@ -58,4 +58,4 @@ Translations are managed through Weblate, a web-based and open source translatio

 Browsertrix is made available under the [AGPLv3 License](https://github.com/webrecorder/browsertrix?tab=AGPL-3.0-1-ov-file#readme).

-Documentation is made available under the Creative Commons Attribution 4.0 International License.
+Documentation is made available under the Creative Commons Attribution 4.0 International License.

backend/btrixcloud/crawlconfigs.py

Lines changed: 12 additions & 8 deletions
@@ -852,6 +852,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStats"] = last_crawl.get("stats")
             update_query["lastCrawlStopping"] = False
             update_query["isCrawlRunning"] = False

@@ -866,6 +867,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlTime"] = None
             update_query["lastCrawlState"] = None
             update_query["lastCrawlSize"] = 0
+            update_query["lastCrawlStats"] = None
             update_query["lastRun"] = None
             update_query["isCrawlRunning"] = False

@@ -895,6 +897,7 @@ async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         crawlconfig.lastCrawlShouldPause = crawl.shouldPause
         crawlconfig.lastCrawlPausedAt = crawl.pausedAt
         crawlconfig.lastCrawlPausedExpiry = None
+        crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
         if crawl.pausedAt:
             crawlconfig.lastCrawlPausedExpiry = (
                 crawl.pausedAt + self.paused_expiry_delta
@@ -976,21 +979,21 @@ async def make_inactive_or_delete(

         # if no crawls have been run, actually delete
         if not crawlconfig.crawlAttemptCount:
-            if crawlconfig.config and crawlconfig.config.seedFileId:
-                try:
-                    await self.file_ops.delete_seed_file(
-                        crawlconfig.config.seedFileId, org
-                    )
-                except HTTPException:
-                    pass
-
             result = await self.crawl_configs.delete_one(
                 {"_id": crawlconfig.id, "oid": crawlconfig.oid}
             )

             if result.deleted_count != 1:
                 raise HTTPException(status_code=404, detail="failed_to_delete")

+            if crawlconfig and crawlconfig.config.seedFileId:
+                try:
+                    await self.file_ops.delete_seed_file(
+                        crawlconfig.config.seedFileId, org
+                    )
+                except HTTPException:
+                    pass
+
             status = "deleted"

         else:
@@ -1420,6 +1423,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStats"] = last_crawl.get("stats")
         update_query["lastCrawlStopping"] = False
         update_query["isCrawlRunning"] = False


backend/btrixcloud/models.py

Lines changed: 12 additions & 9 deletions
@@ -243,6 +243,7 @@ class UserOrgInfoOut(BaseModel):
 TYPE_FAILED_STATES = Literal[
     "canceled",
     "failed",
+    "failed_not_logged_in",
     "skipped_storage_quota_reached",
     "skipped_time_quota_reached",
 ]
@@ -273,6 +274,15 @@ class UserOrgInfoOut(BaseModel):
 ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]


+# ============================================================================
+class CrawlStats(BaseModel):
+    """Crawl Stats for pages and size"""
+
+    found: int = 0
+    done: int = 0
+    size: int = 0
+
+
 # ============================================================================

 ### CRAWL CONFIGS ###
@@ -349,6 +359,7 @@ class RawCrawlConfig(BaseModel):

     useSitemap: Optional[bool] = False
     failOnFailedSeed: Optional[bool] = False
+    failOnContentCheck: Optional[bool] = False

     logging: Optional[str] = None
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
@@ -510,6 +521,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     lastCrawlShouldPause: Optional[bool] = False
     lastCrawlPausedAt: Optional[datetime] = None
     lastCrawlPausedExpiry: Optional[datetime] = None
+    lastCrawlStats: Optional[CrawlStats] = None
     profileName: Optional[str] = None

     createdByName: Optional[str] = None
@@ -772,15 +784,6 @@ class CrawlFileOut(BaseModel):
     expireAt: Optional[str] = None


-# ============================================================================
-class CrawlStats(BaseModel):
-    """Crawl Stats for pages and size"""
-
-    found: int = 0
-    done: int = 0
-    size: int = 0
-
-
 # ============================================================================
 class CoreCrawlable(BaseModel):
     # pylint: disable=too-few-public-methods

backend/btrixcloud/operator/crawls.py

Lines changed: 17 additions & 6 deletions
@@ -4,7 +4,7 @@
 import os
 import math
 from pprint import pprint
-from typing import Optional, Any, Sequence
+from typing import Optional, Any, Sequence, Literal
 from datetime import datetime, timedelta
 from uuid import UUID

@@ -827,15 +827,26 @@ async def fail_crawl(
         crawl: CrawlSpec,
         status: CrawlStatus,
         pods: dict,
-        stats: Optional[CrawlStats] = None,
+        stats: CrawlStats,
+        redis: Redis,
     ) -> bool:
         """Mark crawl as failed, log crawl state and print crawl logs, if possible"""
         prev_state = status.state

-        if not await self.mark_finished(crawl, status, "failed", stats=stats):
+        failed_state: Literal["failed", "failed_not_logged_in"] = "failed"
+
+        fail_reason = await redis.get(f"{crawl.id}:failReason")
+
+        if fail_reason == "not_logged_in":
+            failed_state = "failed_not_logged_in"
+
+        if not await self.mark_finished(crawl, status, failed_state, stats=stats):
             return False

-        if not self.log_failed_crawl_lines or prev_state == "failed":
+        if not self.log_failed_crawl_lines or prev_state in (
+            "failed",
+            "failed_not_logged_in",
+        ):
             return True

         pod_names = list(pods.keys())
@@ -1579,7 +1590,7 @@ async def update_crawl_state(
             # check if one-page crawls actually succeeded
             # if only one page found, and no files, assume failed
             if status.pagesFound == 1 and not status.filesAdded:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)
                 return status

             state: TYPE_NON_RUNNING_STATES
@@ -1602,7 +1613,7 @@ async def update_crawl_state(
             if status.stopping and not status.pagesDone:
                 await self.mark_finished(crawl, status, "canceled", stats)
             else:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)

         # check for other statuses, default to "running"
         else:

backend/test/test_crawlconfigs.py

Lines changed: 16 additions & 0 deletions
@@ -520,6 +520,11 @@ def test_workflow_total_size_and_last_crawl_stats(
         assert workflow["lastRun"]
         assert workflow["lastCrawlSize"] > 0

+        stats = workflow["lastCrawlStats"]
+        assert stats["found"] > 0
+        assert stats["done"] > 0
+        assert stats["size"] > 0
+
         if last_crawl_id == admin_crawl_id:
             global _admin_crawl_cid
             _admin_crawl_cid = workflow["id"]
@@ -545,6 +550,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert data["lastRun"]
     assert data["lastCrawlSize"] > 0

+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+

 def test_incremental_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
@@ -564,6 +574,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     last_crawl_started = data["lastCrawlStartTime"]
     last_crawl_finished = data["lastCrawlTime"]
     last_run = data["lastRun"]
+    last_stats = data["lastCrawlStats"]

     # Run new crawl in this workflow
     r = requests.post(
@@ -602,6 +613,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] > last_crawl_started
     assert data["lastCrawlTime"] > last_crawl_finished
     assert data["lastRun"] > last_run
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0

     # Delete new crawl
     r = requests.post(
@@ -628,6 +643,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] == last_crawl_started
     assert data["lastCrawlTime"] == last_crawl_finished
     assert data["lastRun"] == last_run
+    assert data["lastCrawlStats"] == last_stats


 def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):

backend/test_nightly/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -316,7 +316,7 @@ def error_crawl_id(admin_auth_headers, default_org_id):
             headers=admin_auth_headers,
         )
         data = r.json()
-        if data["state"] == "complete":
+        if data["state"] in ("failed", "complete"):
             return crawl_id
         time.sleep(5)

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+import time
+
+import pytest
+import requests
+
+from .conftest import API_PREFIX
+
+config_id = None
+
+
+@pytest.fixture(scope="session")
+def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
+    # Start crawl
+    crawl_data = {
+        "runNow": True,
+        "name": "Fail Crawl Not Logged In",
+        "config": {
+            "seeds": [{"url": "https://x.com/webrecorder_io"}],
+            "scopeType": "page",
+            "limit": 1,
+            "failOnContentCheck": True,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global config_id
+    config_id = data["id"]
+
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "running":
+            # Give crawl time to start properly
+            time.sleep(30)
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def failed_crawl_finished(
+    admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
+):
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in ("complete", "failed", "failed_not_logged_in"):
+            # Give some time for WACZ files to be stored
+            time.sleep(30)
+            break
+        time.sleep(5)
+
+
+def test_fail_crawl_not_logged_in(
+    admin_auth_headers,
+    default_org_id,
+    fail_not_logged_in_crawl_id,
+    failed_crawl_finished,
+):
+    # Ensure crawl has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["state"] == "failed_not_logged_in"
+
+    # Ensure workflow lastCrawlState has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["lastCrawlState"] == "failed_not_logged_in"

chart/templates/ingress.yaml

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,7 @@ metadata:
     {{- if .Values.ingress.useOldClassAnnotation }}
     kubernetes.io/ingress.class: {{ .Values.ingress_class | default "nginx" }}
    {{- end }}
+    {{- if eq ( .Values.ingress_class | default "nginx" ) "nginx" }}
     nginx.ingress.kubernetes.io/proxy-body-size: "0"
     nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
     # for larger uploads to not timeout
@@ -22,6 +23,10 @@ metadata:
     {{- else }}
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
     {{- end }}
+    {{- end }}
+    {{- range $key, $value := .Values.ingress.annotations }}
+    {{ $key }}: {{ $value | quote }}
+    {{- end }}

 spec:
   {{- if not .Values.ingress.useOldClassAnnotation }}
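
The range block added above merges arbitrary key/value pairs from .Values.ingress.annotations into the Ingress metadata. As a minimal sketch of how a deployment might use this via a values override (the annotation key and value shown are illustrative assumptions, not part of this commit):

ingress:
  annotations:
    # example custom annotation rendered into the Ingress by the new range block
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"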

chart/test/test.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ crawler_channels:
     image: "docker.io/webrecorder/browsertrix-crawler:latest"

   - id: test
-    image: "docker.io/webrecorder/browsertrix-crawler:1.7.0-beta.0"
+    image: "docker.io/webrecorder/browsertrix-crawler:latest"

 mongo_auth:
   # specify either username + password (for local mongo)
