Skip to content

perf: Phase 3 functional gates validated - all tests passing #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Changelog

All notable changes to QuarryCore will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- feat(extraction): Introduce ExtractorManager for cascading content extraction with quality gating
- Configurable extractor cascade order
- Quality threshold filtering (default 0.6)
- Domain-specific extractor overrides
- Performance metrics tracking
- Resilient error handling with automatic fallback

### Changed
- Pipeline now uses ExtractorManager instead of basic HTML parsing
- Quality assessment integrated into extraction phase for early filtering

### Configuration
- New `extraction` config section with `cascade_order`, `quality_threshold`, and `domain_overrides`
1 change: 1 addition & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
This file serves as the main entry point for production deployments,
providing health checks, graceful shutdown, and proper error handling.
"""

from __future__ import annotations

import asyncio
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ disable_error_code = [

[tool.coverage.run]
branch = true
parallel = true
source = ["quarrycore"]
omit = [
"*/tests/*",
Expand Down Expand Up @@ -383,6 +384,7 @@ addopts = [
"--verbose",
"-ra",
"--cov=quarrycore",
"--cov-branch",
"--cov-report=term-missing",
"--cov-report=html:htmlcov",
"--cov-report=xml",
Expand Down
2 changes: 1 addition & 1 deletion scripts/phase2_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def check_test_shortcuts() -> bool:

for i, test_file in enumerate(test_files):
if i % 10 == 0:
print_with_timestamp(f" 📄 Processing file {i+1}/{len(test_files)}: {test_file.name}")
print_with_timestamp(f" 📄 Processing file {i + 1}/{len(test_files)}: {test_file.name}")

try:
content = test_file.read_text()
Expand Down
90 changes: 90 additions & 0 deletions scripts/smoke_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
Smoke test for QuarryCore - processes 20 real URLs to verify system health.
"""

import argparse
import asyncio
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from quarrycore.container import DependencyContainer # noqa: E402
from quarrycore.pipeline import Pipeline # noqa: E402


async def run_smoke_test(count: int = 20) -> dict:
    """Run a smoke test of the pipeline against up to *count* real URLs.

    Args:
        count: Number of URLs to attempt. Values larger than the built-in
            URL list (20 entries) are capped to the list size.

    Returns:
        A result dict: ``{"status": "PASSED"|"FAILED", ...}`` with
        ``processed``/``failed``/``total`` counts on a completed run, or an
        ``error`` message if the pipeline raised.
    """
    # A mix of real, generally reachable URLs for end-to-end verification.
    test_urls = [
        "https://example.com",
        "https://www.python.org",
        "https://docs.python.org/3/",
        "https://pypi.org",
        "https://github.com",
        "https://stackoverflow.com",
        "https://www.wikipedia.org",
        "https://news.ycombinator.com",
        "https://www.reddit.com",
        "https://www.bbc.com",
        "https://www.nytimes.com",
        "https://www.theguardian.com",
        "https://www.reuters.com",
        "https://www.bloomberg.com",
        "https://www.techcrunch.com",
        "https://www.wired.com",
        "https://www.nature.com",
        "https://www.sciencedirect.com",
        "https://arxiv.org",
        "https://www.jstor.org",
    ][:count]

    # The slice may yield fewer URLs than requested; use the real number so
    # the 90% success threshold refers to what we actually attempt.
    count = len(test_urls)

    container = DependencyContainer()
    pipeline = Pipeline(container)

    print(f"Starting smoke test with {count} URLs...")

    try:
        result = await pipeline.run(test_urls)

        # Check results
        processed = result.get("processed_count", 0)
        failed = result.get("failed_count", 0)
        total = processed + failed

        # Guard against ZeroDivisionError when the pipeline reports nothing.
        success_rate = (processed / total * 100) if total else 0.0

        print("\nResults:")
        print(f"  Total URLs: {total}")
        print(f"  Processed: {processed}")
        print(f"  Failed: {failed}")
        print(f"  Success rate: {success_rate:.1f}%")

        # Require non-None content for at least 90% of the URLs (e.g. 18/20).
        success_threshold = int(count * 0.9)
        if processed >= success_threshold:
            print(f"\n✅ PASSED: {processed}/{count} URLs processed successfully (>= {success_threshold} required)")
            return {"status": "PASSED", "processed": processed, "failed": failed, "total": total}
        else:
            print(f"\n❌ FAILED: Only {processed}/{count} URLs processed successfully (>= {success_threshold} required)")
            return {"status": "FAILED", "processed": processed, "failed": failed, "total": total}

    except Exception as e:
        # Any uncaught pipeline error is itself a smoke-test failure.
        print(f"\n❌ FAILED: Uncaught exception: {e}")
        return {"status": "FAILED", "error": str(e)}


def main():
    """CLI entry point: parse arguments, run the smoke test, set exit code."""
    parser = argparse.ArgumentParser(description="QuarryCore smoke test")
    parser.add_argument("--count", type=int, default=20, help="Number of URLs to test")
    options = parser.parse_args()

    outcome = asyncio.run(run_smoke_test(options.count))

    # Exit non-zero on failure so CI and shell callers can detect it.
    sys.exit(1 if outcome["status"] == "FAILED" else 0)


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions scripts/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
Simple test runner to show pytest progress in real-time
"""

import subprocess
import sys
from pathlib import Path
Expand Down
37 changes: 36 additions & 1 deletion src/quarrycore/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from __future__ import annotations

import logging
import os
import threading
from pathlib import Path
from typing import Any, ClassVar, List, Literal, Optional, cast
from typing import Any, ClassVar, Dict, List, Literal, Optional, cast

import yaml
from pydantic import BaseModel, Field, ValidationError, field_validator
Expand Down Expand Up @@ -328,6 +329,39 @@ class DomainConfig(BaseModel):
technical: bool = True


class ExtractionSettings(BaseModel):
    """Configuration for the content-extraction cascade.

    Controls which extractors run, in what order, the minimum quality score
    required to accept a result, and per-domain ordering overrides.
    """

    # Extractors are tried in order until one produces acceptable content.
    cascade_order: List[str] = Field(
        default=["trafilatura", "readability", "soup_fallback"], description="Order of extractors to try in cascade"
    )
    # Default is overridable via the EXTRACTION_QUALITY_THRESHOLD env var.
    # validate_default=True is required: pydantic v2 skips validation of
    # default values otherwise, so a malformed or out-of-range env value
    # would bypass both the ge/le bounds and the validator below.
    quality_threshold: float = Field(
        default_factory=lambda: float(os.getenv("EXTRACTION_QUALITY_THRESHOLD", "0.6")),
        ge=0.0,
        le=1.0,
        validate_default=True,
        description="Minimum quality score to accept extracted content",
    )
    # Maps a domain name to a custom cascade order for that domain.
    domain_overrides: Dict[str, List[str]] = Field(
        default_factory=dict, description="Domain-specific extractor ordering overrides"
    )

    @field_validator("cascade_order")
    @classmethod
    def validate_cascade_order(cls, v: List[str]) -> List[str]:
        """Ensure cascade order is not empty."""
        if not v:
            raise ValueError("cascade_order must contain at least one extractor")
        return v

    @field_validator("quality_threshold")
    @classmethod
    def validate_threshold(cls, v: float) -> float:
        """Ensure quality threshold is in valid range.

        Largely redundant with the Field ge/le bounds for assigned values,
        but kept so the error message stays stable for existing callers.
        """
        if not 0.0 <= v <= 1.0:
            raise ValueError("quality_threshold must be between 0.0 and 1.0")
        return v


class WebUIConfig(BaseModel):
"""Configuration for the real-time web UI."""

Expand Down Expand Up @@ -380,6 +414,7 @@ class Config(BaseSettings):
domains: DomainConfig = Field(default_factory=DomainConfig)
monitoring: MonitoringConfig = Field(default_factory=MonitoringConfig)
debug: DebugConfig = Field(default_factory=DebugConfig)
extraction: ExtractionSettings = Field(default_factory=ExtractionSettings)

model_config = SettingsConfigDict(env_prefix="QUARRY_", env_nested_delimiter="__", case_sensitive=False)

Expand Down
38 changes: 36 additions & 2 deletions src/quarrycore/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
if TYPE_CHECKING:
from quarrycore.crawler.http_client import HttpClient
from quarrycore.dataset import DatasetConstructor
from quarrycore.extractor import ExtractorManager
from quarrycore.observability import ObservabilityManager
from quarrycore.quality import QualityAssessor
from quarrycore.recovery.dead_letter_service import SQLiteDeadLetterService
Expand Down Expand Up @@ -67,7 +68,13 @@ def __init__(self, container: DependencyContainer) -> None:
def on_modified(self, event: FileSystemEvent) -> None:
    """Filesystem-watch callback: schedule a config reload on YAML changes.

    Only reacts to file (non-directory) events whose path ends in .yaml/.yml.
    The reload is scheduled as a task on the running event loop; if no loop
    is available, the change is logged at debug level and skipped.
    """
    if not event.is_directory and str(event.src_path).endswith((".yaml", ".yml")):
        self.logger.info("Configuration file changed, reloading", path=event.src_path)
        try:
            # NOTE(review): get_running_loop() returns the loop of the
            # *calling* thread. If the watcher library invokes this callback
            # on its own worker thread (typical for watchdog observers),
            # this will raise RuntimeError and the reload is skipped —
            # confirm whether the main loop should be captured instead.
            loop = asyncio.get_running_loop()
            if not loop.is_closed():
                loop.create_task(self.container.reload_config())
        except RuntimeError:
            # No event loop running - skip reload
            self.logger.debug("Config change detected but no event loop available for reload")


class DependencyContainer:
Expand Down Expand Up @@ -132,6 +139,7 @@ async def _create_instances(self) -> None:
# Import modules only when needed to avoid circular imports
from quarrycore.crawler.http_client import HttpClient
from quarrycore.dataset import DatasetConstructor
from quarrycore.extractor import ExtractorManager
from quarrycore.observability import ObservabilityManager
from quarrycore.quality import QualityAssessor
from quarrycore.recovery.dead_letter_service import SQLiteDeadLetterService
Expand All @@ -150,6 +158,10 @@ async def _create_instances(self) -> None:
# Add other modules as they become available
}

# ExtractorManager needs quality assessor, so create it after
quality_assessor = await self._instances["quality"].get()
self._instances["extractor_manager"] = LazyInstance(ExtractorManager, quality_assessor, self.config.extraction)

async def reload_config(self) -> None:
"""Hot-reload configuration and reinitialize affected modules."""
old_config = self.config
Expand Down Expand Up @@ -191,6 +203,11 @@ async def get_dead_letter(self) -> SQLiteDeadLetterService:
async with self._instances_lock:
return await self._instances["dead_letter"].get() # type: ignore

async def get_extractor_manager(self) -> ExtractorManager:
"""Get the extractor manager instance."""
async with self._instances_lock:
return await self._instances["extractor_manager"].get() # type: ignore

@asynccontextmanager
async def lifecycle(self) -> AsyncIterator[DependencyContainer]:
"""Context manager for proper lifecycle management."""
Expand Down Expand Up @@ -243,8 +260,25 @@ async def _setup_signal_handlers(self) -> None:
"""Set up signal handlers for graceful shutdown."""

def signal_handler(signum: int, frame: Any) -> None:
    """Handle SIGTERM/SIGINT by scheduling (or running) graceful shutdown.

    Closure over ``self`` from the enclosing method; ``frame`` is unused
    but required by the ``signal.signal`` handler signature.
    """
    # Skip if already shutting down
    if not self.is_running:
        return

    self.logger.info(f"Received signal {signum}, initiating shutdown")
    try:
        # Preferred path: a loop is running in this thread, so shutdown can
        # be scheduled without blocking the signal handler.
        loop = asyncio.get_running_loop()
        # Only create task if loop is still running
        if not loop.is_closed():
            loop.create_task(self.shutdown())
    except RuntimeError:
        # Signal arrived on a non-async thread or no loop
        # Only attempt sync shutdown if still running
        if self.is_running:
            try:
                # Fallback: run shutdown synchronously on a fresh loop.
                asyncio.run(self.shutdown())
            except RuntimeError:
                # Already in an event loop or other issue
                self.logger.debug("Could not perform async shutdown from signal handler")

# Register the handler for both termination and interrupt signals.
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
Expand Down
4 changes: 1 addition & 3 deletions src/quarrycore/crawler/rate_limiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,4 @@ def adapt_to_hardware(self, cpu_cores: int, available_memory_gb: float) -> None:
self.default_rps = min(new_default_rps, 20.0) # Cap at 20 RPS
self.max_rps = min(new_max_rps, 50.0) # Cap at 50 RPS

logger.info(
f"Adapted rate limiter to hardware: " f"default={self.default_rps:.2f} RPS, max={self.max_rps:.2f} RPS"
)
logger.info(f"Adapted rate limiter to hardware: default={self.default_rps:.2f} RPS, max={self.max_rps:.2f} RPS")
12 changes: 12 additions & 0 deletions src/quarrycore/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
from .content_processors import CodeProcessor, ImageProcessor, LinkProcessor, TableProcessor, TextProcessor
from .domain_extractors import EcommerceExtractor, LegalExtractor, MedicalExtractor, TechnicalExtractor
from .language_detector import LanguageDetector
from .manager import ExtractorManager
from .models import ExtractResult
from .protocols import Extractor
from .readability_extractor import ReadabilityExtractor
from .soup_extractor import SoupFallbackExtractor
from .trafilatura_extractor import TrafilaturaExtractor

__all__ = [
"CascadeExtractor",
Expand All @@ -35,4 +41,10 @@
"TechnicalExtractor",
"LanguageDetector",
"ConfidenceScorer",
"ExtractResult",
"Extractor",
"TrafilaturaExtractor",
"ReadabilityExtractor",
"SoupFallbackExtractor",
"ExtractorManager",
]
Loading
Loading