diff --git a/backend/btrixcloud/crawl_logs.py b/backend/btrixcloud/crawl_logs.py new file mode 100644 index 0000000000..ac738f7dea --- /dev/null +++ b/backend/btrixcloud/crawl_logs.py @@ -0,0 +1,181 @@ +"""crawl logs""" + +from typing import TYPE_CHECKING, Any, Optional, Dict, Tuple, List + +import json +from uuid import UUID, uuid4 + +from fastapi import HTTPException +import pymongo + +from .models import CrawlLogLine, Organization +from .pagination import DEFAULT_PAGE_SIZE + +if TYPE_CHECKING: + from .orgs import OrgOps +else: + OrgOps = object + + +# ============================================================================ +class CrawlLogOps: + """crawl log management""" + + org_ops: OrgOps + + # pylint: disable=too-many-locals, too-many-arguments, invalid-name + + def __init__(self, mdb, org_ops): + self.logs = mdb["crawl_logs"] + self.org_ops = org_ops + + async def init_index(self): + """init index for crawl logs""" + await self.logs.create_index( + [ + ("crawlId", pymongo.HASHED), + ("oid", pymongo.ASCENDING), + ("qaRunId", pymongo.ASCENDING), + ("timestamp", pymongo.ASCENDING), + ] + ) + await self.logs.create_index( + [ + ("crawlId", pymongo.HASHED), + ("oid", pymongo.ASCENDING), + ("qaRunId", pymongo.ASCENDING), + ("logLevel", pymongo.ASCENDING), + ] + ) + await self.logs.create_index( + [ + ("crawlId", pymongo.HASHED), + ("oid", pymongo.ASCENDING), + ("qaRunId", pymongo.ASCENDING), + ("context", pymongo.ASCENDING), + ] + ) + await self.logs.create_index( + [ + ("crawlId", pymongo.HASHED), + ("oid", pymongo.ASCENDING), + ("qaRunId", pymongo.ASCENDING), + ("message", pymongo.ASCENDING), + ] + ) + + async def add_log_line( + self, + crawl_id: str, + oid: UUID, + log_line: str, + qa_run_id: Optional[str] = None, + ) -> bool: + """add crawl log line to database""" + try: + log_dict = json.loads(log_line) + + # Ensure details are a dictionary + # If they are a list, convert to a dict + details = None + log_dict_details = log_dict.get("details") + if log_dict_details: + if isinstance(log_dict_details, dict): + details = log_dict_details + else: + details = {"items": log_dict_details} + + log_to_add = CrawlLogLine( + id=uuid4(), + crawlId=crawl_id, + oid=oid, + qaRunId=qa_run_id, + timestamp=log_dict["timestamp"], + logLevel=log_dict["logLevel"], + context=log_dict["context"], + message=log_dict["message"], + details=details, + ) + res = await self.logs.insert_one(log_to_add.to_dict()) + return res is not None + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error adding log line for crawl {crawl_id} to database: {err}", + flush=True, + ) + return False + + async def get_crawl_logs( + self, + org: Organization, + crawl_id: str, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: str = "timestamp", + sort_direction: int = -1, + contexts: Optional[List[str]] = None, + log_levels: Optional[List[str]] = None, + qa_run_id: Optional[str] = None, + ) -> Tuple[list[CrawlLogLine], int]: + """list all logs for particular crawl""" + # pylint: disable=too-many-locals, duplicate-code + + # Zero-index page for query + page = page - 1 + skip = page_size * page + + match_query: Dict[str, Any] = { + "oid": org.id, + "crawlId": crawl_id, + "qaRunId": qa_run_id, + } + + if contexts: + match_query["context"] = {"$in": contexts} + + if log_levels: + match_query["logLevel"] = {"$in": log_levels} + + aggregate: List[Dict[str, Any]] = [{"$match": match_query}] + + if sort_by: + if sort_by not in ( + "timestamp", + "logLevel", + "context", + "message", + ): + 
raise HTTPException(status_code=400, detail="invalid_sort_by") + if sort_direction not in (1, -1): + raise HTTPException(status_code=400, detail="invalid_sort_direction") + + aggregate.extend([{"$sort": {sort_by: sort_direction}}]) + + aggregate.extend( + [ + { + "$facet": { + "items": [ + {"$skip": skip}, + {"$limit": page_size}, + ], + "total": [{"$count": "count"}], + } + }, + ] + ) + + cursor = self.logs.aggregate(aggregate) + results = await cursor.to_list(length=1) + result = results[0] + items = result["items"] + + try: + total = int(result["total"][0]["count"]) + except (IndexError, ValueError): + total = 0 + + log_lines = [CrawlLogLine.from_dict(res) for res in items] + + return log_lines, total diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index d0a2830961..42e2aed622 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -10,7 +10,7 @@ from datetime import datetime from uuid import UUID -from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator +from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator, Tuple from fastapi import Depends, HTTPException, Request from fastapi.responses import StreamingResponse @@ -22,7 +22,6 @@ from .utils import ( dt_now, date_to_str, - parse_jsonl_log_messages, stream_dict_list_as_csv, validate_regexes, scale_from_browser_windows, @@ -30,6 +29,7 @@ ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager +from .crawl_logs import CrawlLogOps from .models import ( UpdateCrawl, DeleteCrawlList, @@ -66,6 +66,7 @@ CrawlScaleResponse, CrawlQueueResponse, MatchCrawlQueueResponse, + CrawlLogLine, ) @@ -80,9 +81,10 @@ class CrawlOps(BaseCrawlOps): crawl_manager: CrawlManager - def __init__(self, crawl_manager: CrawlManager, *args): + def __init__(self, crawl_manager: CrawlManager, log_ops: CrawlLogOps, *args): super().__init__(*args) self.crawl_manager = crawl_manager + self.log_ops = log_ops self.crawl_configs.set_crawl_ops(self) self.colls.set_crawl_ops(self) self.event_webhook_ops.set_crawl_ops(self) @@ -669,31 +671,6 @@ async def is_upload(self, crawl_id: str): return False return res.get("type") == "upload" - async def add_crawl_error( - self, - crawl_id: str, - is_qa: bool, - error: str, - ) -> bool: - """add crawl error from redis to mongodb errors field""" - prefix = "" if not is_qa else "qa." 
- - res = await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$push": {f"{prefix}errors": error}} - ) - return res is not None - - async def add_crawl_behavior_log( - self, - crawl_id: str, - log_line: str, - ) -> bool: - """add crawl behavior log from redis to mongodb behaviorLogs field""" - res = await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$push": {"behaviorLogs": log_line}} - ) - return res is not None - async def add_crawl_file( self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int ) -> bool: @@ -1141,6 +1118,31 @@ async def get_qa_run_aggregate_stats( textMatch=text_results, ) + async def get_crawl_logs( + self, + org: Organization, + crawl_id: str, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: str = "timestamp", + sort_direction: int = 1, + contexts: Optional[List[str]] = None, + log_levels: Optional[List[str]] = None, + qa_run_id: Optional[str] = None, + ) -> Tuple[list[CrawlLogLine], int]: + """get crawl logs""" + return await self.log_ops.get_crawl_logs( + org, + crawl_id, + page_size=page_size, + page=page, + sort_by=sort_by, + sort_direction=sort_direction, + contexts=contexts, + log_levels=log_levels, + qa_run_id=qa_run_id, + ) + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): @@ -1162,11 +1164,13 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, too-many-statements -def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args): +def init_crawls_api( + crawl_manager: CrawlManager, crawl_log_ops: CrawlLogOps, app, user_dep, *args +): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code - ops = CrawlOps(crawl_manager, *args) + ops = CrawlOps(crawl_manager, crawl_log_ops, *args) org_viewer_dep = ops.orgs.org_viewer_dep org_crawl_dep = ops.orgs.org_crawl_dep @@ -1694,15 +1698,20 @@ async def get_crawl_errors( pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, org: Organization = Depends(org_viewer_dep), + sortBy: str = "timestamp", + sortDirection: int = 1, ): - crawl = await ops.get_crawl(crawl_id, org) - - skip = (page - 1) * pageSize - upper_bound = skip + pageSize - - errors = crawl.errors[skip:upper_bound] if crawl.errors else [] - parsed_errors = parse_jsonl_log_messages(errors) - return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize) + log_lines, total = await ops.get_crawl_logs( + org, + crawl_id, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + log_levels=["error", "fatal"], + qa_run_id=None, + ) + return paginated_format(log_lines, total, page, pageSize) @app.get( "/orgs/{oid}/crawls/{crawl_id}/behaviorLogs", @@ -1714,18 +1723,19 @@ async def get_crawl_behavior_logs( pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, org: Organization = Depends(org_viewer_dep), + sortBy: str = "timestamp", + sortDirection: int = 1, ): - crawl = await ops.get_crawl(crawl_id, org) - - skip = (page - 1) * pageSize - upper_bound = skip + pageSize - - behavior_logs = ( - crawl.behaviorLogs[skip:upper_bound] if crawl.behaviorLogs else [] - ) - parsed_logs = parse_jsonl_log_messages(behavior_logs) - return paginated_format( - parsed_logs, len(crawl.behaviorLogs or []), page, pageSize + log_lines, total = await ops.get_crawl_logs( + org, + crawl_id, + 
page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + contexts=["behavior", "behaviorScript", "behaviorScriptCustom"], + qa_run_id=None, ) + return paginated_format(log_lines, total, page, pageSize) return ops diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 0d9b366273..7008d99972 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -20,6 +20,7 @@ from .users import UserManager from .orgs import OrgOps from .crawlconfigs import CrawlConfigOps + from .crawl_logs import CrawlLogOps from .crawls import CrawlOps from .colls import CollectionOps from .invites import InviteOps @@ -30,10 +31,10 @@ else: UserManager = OrgOps = CrawlConfigOps = CrawlOps = CollectionOps = InviteOps = ( StorageOps - ) = PageOps = BackgroundJobOps = FileUploadOps = object + ) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = object -CURR_DB_VERSION = "0049" +CURR_DB_VERSION = "0050" # ============================================================================ @@ -100,6 +101,7 @@ async def update_and_prepare_db( page_ops: PageOps, background_job_ops: BackgroundJobOps, file_ops: FileUploadOps, + crawl_log_ops: CrawlLogOps, ) -> None: """Prepare database for application. @@ -112,7 +114,14 @@ async def update_and_prepare_db( await ping_db(mdb) print("Database setup started", flush=True) if await run_db_migrations( - mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops, file_ops + mdb, + user_manager, + page_ops, + org_ops, + background_job_ops, + coll_ops, + file_ops, + crawl_log_ops, ): await drop_indexes(mdb) @@ -126,6 +135,7 @@ async def update_and_prepare_db( page_ops, storage_ops, file_ops, + crawl_log_ops, ) await user_manager.create_super_user() await org_ops.create_default_org() @@ -136,7 +146,14 @@ async def update_and_prepare_db( # ============================================================================ # pylint: disable=too-many-locals, too-many-arguments async def run_db_migrations( - mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops, file_ops + mdb, + user_manager, + page_ops, + org_ops, + background_job_ops, + coll_ops, + file_ops, + crawl_log_ops, ): """Run database migrations.""" @@ -176,6 +193,7 @@ async def run_db_migrations( background_job_ops=background_job_ops, coll_ops=coll_ops, file_ops=file_ops, + crawl_log_ops=crawl_log_ops, ) if await migration.run(): migrations_run = True @@ -213,9 +231,9 @@ async def drop_indexes(mdb): print("Dropping database indexes", flush=True) collection_names = await mdb.list_collection_names() for collection in collection_names: - # Don't drop pages automatically, as these are large + # Don't drop pages or logs automatically, as these are large # indices and slow to recreate - if collection == "pages": + if collection in ("pages", "crawl_logs"): continue try: @@ -238,6 +256,7 @@ async def create_indexes( page_ops, storage_ops, file_ops, + crawl_log_ops, ): """Create database indexes.""" print("Creating database indexes", flush=True) @@ -250,6 +269,7 @@ async def create_indexes( await page_ops.init_index() await storage_ops.init_index() await file_ops.init_index() + await crawl_log_ops.init_index() # ============================================================================ diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 6ab720a65f..1fc54568da 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -29,6 +29,7 @@ from .storages import init_storages_api from .uploads import init_uploads_api from 
.crawlconfigs import init_crawl_config_api
+from .crawl_logs import CrawlLogOps
 from .colls import init_collections_api
 from .crawls import init_crawls_api
 from .basecrawls import init_base_crawls_api
@@ -195,6 +196,8 @@ def main() -> None:
 
     crawl_manager = CrawlManager()
 
+    crawl_log_ops = CrawlLogOps(mdb, org_ops)
+
     storage_ops = init_storages_api(
         org_ops, crawl_manager, app, mdb, current_active_user
     )
@@ -254,7 +257,7 @@ def main() -> None:
 
     base_crawl_ops = init_base_crawls_api(*base_crawl_init)
 
-    crawls = init_crawls_api(crawl_manager, *base_crawl_init)
+    crawls = init_crawls_api(crawl_manager, crawl_log_ops, *base_crawl_init)
 
     upload_ops = init_uploads_api(*base_crawl_init)
 
diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py
index a2ec503892..269fa99400 100644
--- a/backend/btrixcloud/main_bg.py
+++ b/backend/btrixcloud/main_bg.py
@@ -47,6 +47,7 @@ async def main():
         file_ops,
         _,
         _,
+        _,
     ) = init_ops()
 
     # Run job (generic)
diff --git a/backend/btrixcloud/main_migrations.py b/backend/btrixcloud/main_migrations.py
index 8e1a3e3c10..3771b9b376 100644
--- a/backend/btrixcloud/main_migrations.py
+++ b/backend/btrixcloud/main_migrations.py
@@ -36,6 +36,7 @@ async def main() -> int:
         user_manager,
         invite_ops,
         file_ops,
+        crawl_log_ops,
         _,
         mdb,
     ) = init_ops()
@@ -52,6 +53,7 @@ async def main() -> int:
         page_ops,
         background_job_ops,
         file_ops,
+        crawl_log_ops,
     )
 
     return 0
diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py
index 6afb3b7330..3c0453b321 100644
--- a/backend/btrixcloud/main_op.py
+++ b/backend/btrixcloud/main_op.py
@@ -41,6 +41,7 @@ def main():
         _,
         _,
         _,
+        crawl_log_ops,
         _,
         _,
     ) = init_ops()
@@ -55,6 +56,7 @@ def main():
         event_webhook_ops,
         background_job_ops,
         page_ops,
+        crawl_log_ops,
     )
 
diff --git a/backend/btrixcloud/migrations/migration_0050_crawl_logs.py b/backend/btrixcloud/migrations/migration_0050_crawl_logs.py
new file mode 100644
index 0000000000..6361b289b1
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0050_crawl_logs.py
@@ -0,0 +1,123 @@
+"""
+Migration 0050 - Move crawl logs to separate mongo collection
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0050"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.crawl_log_ops = kwargs.get("crawl_log_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
Move crawl logs to separate mongo collection.""" + # pylint: disable=duplicate-code, line-too-long, too-many-locals + if self.crawl_log_ops is None: + print("Unable to move logs, missing ops", flush=True) + return + + crawls_mdb = self.mdb["crawls"] + + # Migrate error and behavior logs + match_query = { + "type": "crawl", + "$or": [{"errors": {"$ne": None}}, {"behaviorLogs": {"$ne": None}}], + } + + async for crawl_dict in crawls_mdb.find(match_query): + crawl_id = crawl_dict["_id"] + error_logs = crawl_dict.get("errors", []) + behavior_logs = crawl_dict.get("behaviorLogs", []) + + try: + while error_logs: + error_log = error_logs.pop(0) + await self.crawl_log_ops.add_log_line( + crawl_id=crawl_id, + oid=crawl_dict["oid"], + log_line=error_log, + qa_run_id=None, + ) + + while behavior_logs: + behavior_log = behavior_logs.pop(0) + await self.crawl_log_ops.add_log_line( + crawl_id=crawl_id, + oid=crawl_dict["oid"], + log_line=behavior_log, + qa_run_id=None, + ) + + await crawls_mdb.find_one_and_update( + {"_id": crawl_id}, + { + "$set": { + "errors": None, + "behaviorLogs": None, + } + }, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error moving logs for crawl {crawl_id}: {err}", + flush=True, + ) + + # Migrate qaFinished logs + qa_query = { + "type": "crawl", + "qaFinished": {"$nin": [None, {}]}, + } + + async for crawl_with_qa in crawls_mdb.find(qa_query): + crawl_id = crawl_with_qa["_id"] + qa_finished = crawl_with_qa.get("qaFinished") + if not qa_finished: + continue + for qa_run_id in qa_finished: + qa_error_logs = qa_finished[qa_run_id].get("errors", []) + qa_behavior_logs = qa_finished[qa_run_id].get("behaviorLogs", []) + + try: + while qa_error_logs: + qa_error_log = qa_error_logs.pop(0) + await self.crawl_log_ops.add_log_line( + crawl_id=crawl_id, + oid=crawl_with_qa["oid"], + log_line=qa_error_log, + qa_run_id=qa_run_id, + ) + + while qa_behavior_logs: + qa_behavior_log = qa_behavior_logs.pop(0) + await self.crawl_log_ops.add_log_line( + crawl_id=crawl_id, + oid=crawl_with_qa["oid"], + log_line=qa_behavior_log, + qa_run_id=qa_run_id, + ) + + await crawls_mdb.find_one_and_update( + {"_id": crawl_id}, + { + "$set": { + f"qaFinished.{qa_run_id}.errors": None, + f"qaFinished.{qa_run_id}.behaviorLogs": None, + } + }, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error moving logs for crawl {crawl_id} QA run {qa_run_id}: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 7996a67a97..bf4766bef6 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -812,8 +812,9 @@ class CoreCrawlable(BaseModel): fileSize: int = 0 fileCount: int = 0 - errors: Optional[List[str]] = [] - behaviorLogs: Optional[List[str]] = [] + # Retained for backward compatibility + errors: Optional[List[str]] = Field(default=[], deprecated=True) + behaviorLogs: Optional[List[str]] = Field(default=[], deprecated=True) # ============================================================================ @@ -884,9 +885,6 @@ class CrawlOut(BaseMongoModel): tags: Optional[List[str]] = [] - errors: Optional[List[str]] = [] - behaviorLogs: Optional[List[str]] = [] - collectionIds: Optional[List[UUID]] = [] crawlExecSeconds: int = 0 @@ -929,6 +927,10 @@ class CrawlOut(BaseMongoModel): # pages will have this explicitly set to 2 version: Optional[int] = 1 + # Retained for backward compatibility + errors: Optional[List[str]] = Field(default=[], deprecated=True) + behaviorLogs: 
Optional[List[str]] = Field(default=[], deprecated=True) + # ============================================================================ class UpdateCrawl(BaseModel): @@ -1088,17 +1090,6 @@ class CrawlScaleResponse(BaseModel): browserWindows: int -# ============================================================================ -class CrawlLogMessage(BaseModel): - """Crawl log message""" - - timestamp: str - logLevel: str - context: str - message: str - details: Any - - # ============================================================================ ### UPLOADED CRAWLS ### @@ -1149,6 +1140,34 @@ def prepare_filename(self, filename): return name + "-" + randstr.decode("utf-8") + ext +# ============================================================================ + +### LOGS ### + + +# ============================================================================ +class CrawlLogLine(BaseMongoModel): + """Model for crawler log lines""" + + id: UUID + + crawlId: str + oid: UUID + + qaRunId: Optional[str] = None + + timestamp: datetime + logLevel: str + context: str + message: str + details: Optional[Dict[str, Any]] = None + + @property + def is_qa(self) -> bool: + """return true if log line is from qa run""" + return bool(self.qaRunId) + + # ============================================================================ ### USER-UPLOADED FILES ### @@ -3034,7 +3053,7 @@ class PaginatedWebhookNotificationResponse(PaginatedResponse): class PaginatedCrawlLogResponse(PaginatedResponse): """Response model for crawl logs""" - items: List[CrawlLogMessage] + items: List[CrawlLogLine] # ============================================================================ diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index 75b33dad03..6640e87ff3 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from btrixcloud.crawlconfigs import CrawlConfigOps from btrixcloud.crawls import CrawlOps + from btrixcloud.crawl_logs import CrawlLogOps from btrixcloud.orgs import OrgOps from btrixcloud.colls import CollectionOps from btrixcloud.storages import StorageOps @@ -22,7 +23,7 @@ from btrixcloud.pages import PageOps from redis.asyncio.client import Redis else: - CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = object + CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = CrawlLogOps = object StorageOps = EventWebhookOps = UserManager = BackgroundJobOps = PageOps = object @@ -154,6 +155,7 @@ class BaseOperator: event_webhook_ops: EventWebhookOps page_ops: PageOps user_ops: UserManager + crawl_log_ops: CrawlLogOps def __init__( self, @@ -166,6 +168,7 @@ def __init__( event_webhook_ops, background_job_ops, page_ops, + crawl_log_ops, ): self.k8s = k8s self.crawl_config_ops = crawl_config_ops @@ -177,6 +180,7 @@ def __init__( self.event_webhook_ops = event_webhook_ops self.page_ops = page_ops self.user_ops = crawl_config_ops.user_manager + self.crawl_log_ops = crawl_log_ops # to avoid background tasks being garbage collected # see: https://stackoverflow.com/a/74059981 diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 32d47a7f53..fe6a6941d3 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1018,15 +1018,21 @@ async def sync_crawl_state( crawl_error = await redis.rpop(f"{crawl.id}:{self.errors_key}") while crawl_error: - await self.crawl_ops.add_crawl_error( - crawl.db_crawl_id, 
crawl.is_qa, crawl_error + await self.crawl_log_ops.add_log_line( + crawl.db_crawl_id, + crawl.oid, + log_line=crawl_error, + qa_run_id=qa_run_id, ) crawl_error = await redis.rpop(f"{crawl.id}:{self.errors_key}") behavior_log = await redis.rpop(f"{crawl.id}:{self.behavior_logs_key}") while behavior_log: - await self.crawl_ops.add_crawl_behavior_log( - crawl.db_crawl_id, behavior_log + await self.crawl_log_ops.add_log_line( + crawl.db_crawl_id, + crawl.oid, + log_line=behavior_log, + qa_run_id=qa_run_id, ) behavior_log = await redis.rpop(f"{crawl.id}:{self.behavior_logs_key}") diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index 6fdc03a15e..d821a7d909 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -12,6 +12,7 @@ from .colls import CollectionOps from .crawls import CrawlOps from .crawlconfigs import CrawlConfigOps +from .crawl_logs import CrawlLogOps from .file_uploads import FileUploadOps from .invites import InviteOps from .orgs import OrgOps @@ -39,6 +40,7 @@ def init_ops() -> Tuple[ UserManager, InviteOps, FileUploadOps, + CrawlLogOps, AsyncIOMotorClient, AsyncIOMotorDatabase, ]: @@ -57,6 +59,8 @@ def init_ops() -> Tuple[ crawl_manager = CrawlManager() + crawl_log_ops = CrawlLogOps(mdb, org_ops) + storage_ops = StorageOps(org_ops, crawl_manager, mdb) file_ops = FileUploadOps(mdb, org_ops, storage_ops) @@ -96,7 +100,7 @@ def init_ops() -> Tuple[ base_crawl_ops = BaseCrawlOps(*base_crawl_init) - crawl_ops = CrawlOps(crawl_manager, *base_crawl_init) + crawl_ops = CrawlOps(crawl_manager, crawl_log_ops, *base_crawl_init) upload_ops = UploadOps(*base_crawl_init) @@ -137,6 +141,7 @@ def init_ops() -> Tuple[ user_manager, invite_ops, file_ops, + crawl_log_ops, dbclient, mdb, ) diff --git a/backend/test_nightly/test_crawl_behavior_logs.py b/backend/test_nightly/test_crawl_behavior_logs.py new file mode 100644 index 0000000000..71aeef4551 --- /dev/null +++ b/backend/test_nightly/test_crawl_behavior_logs.py @@ -0,0 +1,64 @@ +import time + +import requests +import pytest + +from .conftest import API_PREFIX + + +@pytest.fixture(scope="session") +def behavior_log_crawl_id(admin_auth_headers, default_org_id): + crawl_data = { + "runNow": True, + "name": "Crawl with behavior logs", + "config": { + "seeds": [ + {"url": "https://x.com/webrecorder_io"}, + ], + "scopeType": "page", + "limit": 1, + }, + } + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=admin_auth_headers, + json=crawl_data, + ) + data = r.json() + + crawl_id = data["run_now_job"] + + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in ("failed", "complete"): + return crawl_id + time.sleep(5) + + +@pytest.mark.timeout(1200) +def test_get_crawl_behavior_logs( + admin_auth_headers, default_org_id, behavior_log_crawl_id +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{behavior_log_crawl_id}/behaviorLogs", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] > 0 + assert data["items"] + + for item in data["items"]: + assert item["id"] + assert item["crawlId"] == behavior_log_crawl_id + assert item["oid"] == default_org_id + assert item["qaRunId"] is None + assert item["timestamp"] + assert item["logLevel"] + assert item["context"] in ("behavior", "behaviorScript", "behaviorScriptCustom") + assert item["message"] + assert item.get("details") or item.get("details") is 
None
diff --git a/backend/test_nightly/test_crawl_errors.py b/backend/test_nightly/test_crawl_errors.py
index 837f1ac052..a54d50e2a5 100644
--- a/backend/test_nightly/test_crawl_errors.py
+++ b/backend/test_nightly/test_crawl_errors.py
@@ -14,3 +14,14 @@ def test_get_crawl_errors(admin_auth_headers, default_org_id, error_crawl_id):
     data = r.json()
     assert data["total"] > 0
     assert data["items"]
+
+    for item in data["items"]:
+        assert item["id"]
+        assert item["crawlId"] == error_crawl_id
+        assert item["oid"] == default_org_id
+        assert item["qaRunId"] is None
+        assert item["timestamp"]
+        assert item["logLevel"] in ("error", "fatal")
+        assert item["context"]
+        assert item["message"]
+        assert item.get("details") or item.get("details") is None
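
A minimal client-side sketch of the reworked log endpoints, in the style of the nightly tests above. It assumes only what the diff adds: the page, pageSize, sortBy, and sortDirection query parameters on /errors and /behaviorLogs, and the paginated CrawlLogLine response shape. fetch_crawl_log_page is a hypothetical helper (not part of this change), and the API_PREFIX value is a placeholder; the nightly tests import it from conftest.

import requests

# Placeholder deployment URL and auth; the nightly tests get API_PREFIX and
# admin_auth_headers from conftest fixtures instead.
API_PREFIX = "http://localhost:30870/api"


def fetch_crawl_log_page(
    auth_headers, org_id, crawl_id, endpoint="behaviorLogs", page=1, page_size=50
):
    """Fetch one page of crawl log lines from /errors or /behaviorLogs."""
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/{endpoint}",
        headers=auth_headers,
        params={
            "page": page,
            "pageSize": page_size,
            "sortBy": "timestamp",
            "sortDirection": -1,  # newest first
        },
    )
    r.raise_for_status()
    data = r.json()
    # Each item is a serialized CrawlLogLine:
    # id, crawlId, oid, qaRunId, timestamp, logLevel, context, message, details
    return data["items"], data["total"]

Per the validation in CrawlLogOps.get_crawl_logs, sortBy accepts timestamp, logLevel, context, or message, and sortDirection must be 1 or -1; other values return a 400 with invalid_sort_by or invalid_sort_direction.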