Skip to content

Commit 4bf1caf

Browse files
authored
Merge pull request #3 from shua-ie/dev
perf: Phase 3 functional gates validated - all tests passing
2 parents 2e06690 + 321ec5d commit 4bf1caf

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

45 files changed

+5681
-228
lines changed

CHANGELOG.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Changelog
2+
3+
All notable changes to QuarryCore will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## [Unreleased]
9+
10+
### Added
11+
- feat(extraction): Introduce ExtractorManager for cascading content extraction with quality gating
12+
- Configurable extractor cascade order
13+
- Quality threshold filtering (default 0.6)
14+
- Domain-specific extractor overrides
15+
- Performance metrics tracking
16+
- Resilient error handling with automatic fallback
17+
18+
### Changed
19+
- Pipeline now uses ExtractorManager instead of basic HTML parsing
20+
- Quality assessment integrated into extraction phase for early filtering
21+
22+
### Configuration
23+
- New `extraction` config section with `cascade_order`, `quality_threshold`, and `domain_overrides`

main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
This file serves as the main entry point for production deployments,
66
providing health checks, graceful shutdown, and proper error handling.
77
"""
8+
89
from __future__ import annotations
910

1011
import asyncio

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ disable_error_code = [
334334

335335
[tool.coverage.run]
336336
branch = true
337+
parallel = true
337338
source = ["quarrycore"]
338339
omit = [
339340
"*/tests/*",
@@ -383,6 +384,7 @@ addopts = [
383384
"--verbose",
384385
"-ra",
385386
"--cov=quarrycore",
387+
"--cov-branch",
386388
"--cov-report=term-missing",
387389
"--cov-report=html:htmlcov",
388390
"--cov-report=xml",

scripts/phase2_verify.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ def check_test_shortcuts() -> bool:
290290

291291
for i, test_file in enumerate(test_files):
292292
if i % 10 == 0:
293-
print_with_timestamp(f" 📄 Processing file {i+1}/{len(test_files)}: {test_file.name}")
293+
print_with_timestamp(f" 📄 Processing file {i + 1}/{len(test_files)}: {test_file.name}")
294294

295295
try:
296296
content = test_file.read_text()

scripts/smoke_run.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Smoke test for QuarryCore - processes 20 real URLs to verify system health.
4+
"""
5+
6+
import argparse
7+
import asyncio
8+
import sys
9+
from pathlib import Path
10+
11+
# Add src to path
12+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
13+
14+
from quarrycore.container import DependencyContainer # noqa: E402
15+
from quarrycore.pipeline import Pipeline # noqa: E402
16+
17+
18+
async def run_smoke_test(count: int = 20) -> dict:
    """Run a smoke test against up to *count* real URLs.

    Args:
        count: Number of URLs to attempt (capped at the 20 built-in URLs).

    Returns:
        A summary dict with a ``"status"`` key (``"PASSED"``/``"FAILED"``)
        plus ``processed``/``failed``/``total`` counts, or an ``"error"``
        message when the pipeline raised an uncaught exception.
    """
    # Use a mix of real URLs for testing; slicing caps the list at *count*.
    test_urls = [
        "https://example.com",
        "https://www.python.org",
        "https://docs.python.org/3/",
        "https://pypi.org",
        "https://github.com",
        "https://stackoverflow.com",
        "https://www.wikipedia.org",
        "https://news.ycombinator.com",
        "https://www.reddit.com",
        "https://www.bbc.com",
        "https://www.nytimes.com",
        "https://www.theguardian.com",
        "https://www.reuters.com",
        "https://www.bloomberg.com",
        "https://www.techcrunch.com",
        "https://www.wired.com",
        "https://www.nature.com",
        "https://www.sciencedirect.com",
        "https://arxiv.org",
        "https://www.jstor.org",
    ][:count]

    # Base all bookkeeping on the URLs we actually have, so a count larger
    # than the built-in list (or <= 0) cannot produce an impossible threshold.
    actual_count = len(test_urls)

    container = DependencyContainer()
    pipeline = Pipeline(container)

    print(f"Starting smoke test with {actual_count} URLs...")

    try:
        result = await pipeline.run(test_urls)

        # Check results
        processed = result.get("processed_count", 0)
        failed = result.get("failed_count", 0)
        total = processed + failed

        # Guard against division by zero when nothing was attempted.
        success_rate = (processed / total * 100) if total else 0.0

        print("\nResults:")
        print(f"  Total URLs: {total}")
        print(f"  Processed: {processed}")
        print(f"  Failed: {failed}")
        print(f"  Success rate: {success_rate:.1f}%")

        # Require non-None content for at least 90% of attempted URLs
        # (18/20 with the default count).
        success_threshold = int(actual_count * 0.9)
        if processed >= success_threshold:
            print(
                f"\n✅ PASSED: {processed}/{actual_count} URLs processed successfully"
                f" (>= {success_threshold} required)"
            )
            return {"status": "PASSED", "processed": processed, "failed": failed, "total": total}
        else:
            print(
                f"\n❌ FAILED: Only {processed}/{actual_count} URLs processed successfully"
                f" (>= {success_threshold} required)"
            )
            return {"status": "FAILED", "processed": processed, "failed": failed, "total": total}

    except Exception as e:
        # Top-level boundary for the smoke test: report and fail, never crash.
        print(f"\n❌ FAILED: Uncaught exception: {e}")
        return {"status": "FAILED", "error": str(e)}
75+
76+
77+
def main():
    """CLI entry point: parse --count, run the smoke test, and exit 1 on failure."""
    arg_parser = argparse.ArgumentParser(description="QuarryCore smoke test")
    arg_parser.add_argument("--count", type=int, default=20, help="Number of URLs to test")
    cli_args = arg_parser.parse_args()

    summary = asyncio.run(run_smoke_test(cli_args.count))

    # Exit code mirrors the test outcome so CI can gate on it.
    sys.exit(1 if summary["status"] == "FAILED" else 0)


if __name__ == "__main__":
    main()

scripts/test_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""
33
Simple test runner to show pytest progress in real-time
44
"""
5+
56
import subprocess
67
import sys
78
from pathlib import Path

src/quarrycore/config/config.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
from __future__ import annotations
66

77
import logging
8+
import os
89
import threading
910
from pathlib import Path
10-
from typing import Any, ClassVar, List, Literal, Optional, cast
11+
from typing import Any, ClassVar, Dict, List, Literal, Optional, cast
1112

1213
import yaml
1314
from pydantic import BaseModel, Field, ValidationError, field_validator
@@ -328,6 +329,39 @@ class DomainConfig(BaseModel):
328329
technical: bool = True
329330

330331

332+
class ExtractionSettings(BaseModel):
    """Configuration for the content extraction cascade.

    Extractors listed in ``cascade_order`` are tried in sequence; content is
    accepted once its quality score reaches ``quality_threshold``.
    """

    cascade_order: List[str] = Field(
        default=["trafilatura", "readability", "soup_fallback"],
        description="Order of extractors to try in cascade",
    )
    quality_threshold: float = Field(
        # Allow an environment override of the default.  validate_default=True
        # is required: pydantic v2 skips validation of defaults, so without it
        # an out-of-range EXTRACTION_QUALITY_THRESHOLD would silently bypass
        # both the ge/le constraints and the field validator below.
        default_factory=lambda: float(os.getenv("EXTRACTION_QUALITY_THRESHOLD", "0.6")),
        validate_default=True,
        ge=0.0,
        le=1.0,
        description="Minimum quality score to accept extracted content",
    )
    domain_overrides: Dict[str, List[str]] = Field(
        default_factory=dict, description="Domain-specific extractor ordering overrides"
    )

    @field_validator("cascade_order")
    @classmethod
    def validate_cascade_order(cls, v: List[str]) -> List[str]:
        """Ensure cascade order is not empty."""
        if not v:
            raise ValueError("cascade_order must contain at least one extractor")
        return v

    @field_validator("quality_threshold")
    @classmethod
    def validate_threshold(cls, v: float) -> float:
        """Ensure quality threshold is in valid range."""
        if not 0.0 <= v <= 1.0:
            raise ValueError("quality_threshold must be between 0.0 and 1.0")
        return v
363+
364+
331365
class WebUIConfig(BaseModel):
332366
"""Configuration for the real-time web UI."""
333367

@@ -380,6 +414,7 @@ class Config(BaseSettings):
380414
domains: DomainConfig = Field(default_factory=DomainConfig)
381415
monitoring: MonitoringConfig = Field(default_factory=MonitoringConfig)
382416
debug: DebugConfig = Field(default_factory=DebugConfig)
417+
extraction: ExtractionSettings = Field(default_factory=ExtractionSettings)
383418

384419
model_config = SettingsConfigDict(env_prefix="QUARRY_", env_nested_delimiter="__", case_sensitive=False)
385420

src/quarrycore/container.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
if TYPE_CHECKING:
2121
from quarrycore.crawler.http_client import HttpClient
2222
from quarrycore.dataset import DatasetConstructor
23+
from quarrycore.extractor import ExtractorManager
2324
from quarrycore.observability import ObservabilityManager
2425
from quarrycore.quality import QualityAssessor
2526
from quarrycore.recovery.dead_letter_service import SQLiteDeadLetterService
@@ -67,7 +68,13 @@ def __init__(self, container: DependencyContainer) -> None:
6768
def on_modified(self, event: FileSystemEvent) -> None:
6869
if not event.is_directory and str(event.src_path).endswith((".yaml", ".yml")):
6970
self.logger.info("Configuration file changed, reloading", path=event.src_path)
70-
asyncio.create_task(self.container.reload_config())
71+
try:
72+
loop = asyncio.get_running_loop()
73+
if not loop.is_closed():
74+
loop.create_task(self.container.reload_config())
75+
except RuntimeError:
76+
# No event loop running - skip reload
77+
self.logger.debug("Config change detected but no event loop available for reload")
7178

7279

7380
class DependencyContainer:
@@ -132,6 +139,7 @@ async def _create_instances(self) -> None:
132139
# Import modules only when needed to avoid circular imports
133140
from quarrycore.crawler.http_client import HttpClient
134141
from quarrycore.dataset import DatasetConstructor
142+
from quarrycore.extractor import ExtractorManager
135143
from quarrycore.observability import ObservabilityManager
136144
from quarrycore.quality import QualityAssessor
137145
from quarrycore.recovery.dead_letter_service import SQLiteDeadLetterService
@@ -150,6 +158,10 @@ async def _create_instances(self) -> None:
150158
# Add other modules as they become available
151159
}
152160

161+
# ExtractorManager needs quality assessor, so create it after
162+
quality_assessor = await self._instances["quality"].get()
163+
self._instances["extractor_manager"] = LazyInstance(ExtractorManager, quality_assessor, self.config.extraction)
164+
153165
async def reload_config(self) -> None:
154166
"""Hot-reload configuration and reinitialize affected modules."""
155167
old_config = self.config
@@ -191,6 +203,11 @@ async def get_dead_letter(self) -> SQLiteDeadLetterService:
191203
async with self._instances_lock:
192204
return await self._instances["dead_letter"].get() # type: ignore
193205

206+
async def get_extractor_manager(self) -> ExtractorManager:
207+
"""Get the extractor manager instance."""
208+
async with self._instances_lock:
209+
return await self._instances["extractor_manager"].get() # type: ignore
210+
194211
@asynccontextmanager
195212
async def lifecycle(self) -> AsyncIterator[DependencyContainer]:
196213
"""Context manager for proper lifecycle management."""
@@ -243,8 +260,25 @@ async def _setup_signal_handlers(self) -> None:
243260
"""Set up signal handlers for graceful shutdown."""
244261

245262
def signal_handler(signum: int, frame: Any) -> None:
263+
# Skip if already shutting down
264+
if not self.is_running:
265+
return
266+
246267
self.logger.info(f"Received signal {signum}, initiating shutdown")
247-
asyncio.create_task(self.shutdown())
268+
try:
269+
loop = asyncio.get_running_loop()
270+
# Only create task if loop is still running
271+
if not loop.is_closed():
272+
loop.create_task(self.shutdown())
273+
except RuntimeError:
274+
# Signal arrived on a non-async thread or no loop
275+
# Only attempt sync shutdown if still running
276+
if self.is_running:
277+
try:
278+
asyncio.run(self.shutdown())
279+
except RuntimeError:
280+
# Already in an event loop or other issue
281+
self.logger.debug("Could not perform async shutdown from signal handler")
248282

249283
signal.signal(signal.SIGTERM, signal_handler)
250284
signal.signal(signal.SIGINT, signal_handler)

src/quarrycore/crawler/rate_limiter.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,4 @@ def adapt_to_hardware(self, cpu_cores: int, available_memory_gb: float) -> None:
355355
self.default_rps = min(new_default_rps, 20.0) # Cap at 20 RPS
356356
self.max_rps = min(new_max_rps, 50.0) # Cap at 50 RPS
357357

358-
logger.info(
359-
f"Adapted rate limiter to hardware: " f"default={self.default_rps:.2f} RPS, max={self.max_rps:.2f} RPS"
360-
)
358+
logger.info(f"Adapted rate limiter to hardware: default={self.default_rps:.2f} RPS, max={self.max_rps:.2f} RPS")

src/quarrycore/extractor/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@
2121
from .content_processors import CodeProcessor, ImageProcessor, LinkProcessor, TableProcessor, TextProcessor
2222
from .domain_extractors import EcommerceExtractor, LegalExtractor, MedicalExtractor, TechnicalExtractor
2323
from .language_detector import LanguageDetector
24+
from .manager import ExtractorManager
25+
from .models import ExtractResult
26+
from .protocols import Extractor
27+
from .readability_extractor import ReadabilityExtractor
28+
from .soup_extractor import SoupFallbackExtractor
29+
from .trafilatura_extractor import TrafilaturaExtractor
2430

2531
__all__ = [
2632
"CascadeExtractor",
@@ -35,4 +41,10 @@
3541
"TechnicalExtractor",
3642
"LanguageDetector",
3743
"ConfidenceScorer",
44+
"ExtractResult",
45+
"Extractor",
46+
"TrafilaturaExtractor",
47+
"ReadabilityExtractor",
48+
"SoupFallbackExtractor",
49+
"ExtractorManager",
3850
]

0 commit comments

Comments
 (0)