Feat/quality assessment gpu #4


Merged: 9 commits, Jul 21, 2025
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -11,7 +11,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
python-version: ["3.11", "3.12"]
python-version: ["3.11"]

runs-on: ${{ matrix.os }}

90 changes: 90 additions & 0 deletions docs/gpu_quality_assessment.md
@@ -0,0 +1,90 @@
# GPU-Accelerated Quality Assessment

QuarryCore now supports GPU acceleration for quality assessment, providing significant performance improvements when CUDA-capable hardware is available.

## Features

- **Automatic GPU Detection**: Automatically detects and uses CUDA when available
- **Transparent Fallback**: Seamlessly falls back to CPU when CUDA is not available
- **Configurable Device Selection**: Choose between `auto`, `cpu`, or `cuda` modes
- **Performance Monitoring**: Built-in metrics for tracking scorer latency and errors

## Configuration

Configure the device in your `config.yaml`:

```yaml
quality:
device: auto # Options: "auto", "cpu", "cuda"
min_content_length: 50
max_content_length: 50000
```

Or use environment variables:

```bash
export QUARRY_QUALITY_DEVICE=cuda
```

## Device Options

- **`auto`** (default): Automatically selects CUDA if available, otherwise CPU
- **`cpu`**: Forces CPU execution even if CUDA is available
- **`cuda`**: Uses CUDA if available, falls back to CPU with a warning
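
The selection rules above can be sketched as follows. This is an illustrative helper, not QuarryCore's actual implementation; `resolve_device` is a hypothetical name, and the `torch` import is guarded so the sketch also runs on CPU-only installs:

```python
import warnings


def resolve_device(preference: str = "auto") -> str:
    """Map a configured device preference to a concrete device string.

    Mirrors the rules above: "cpu" is always honored, "cuda" falls back
    to CPU with a warning, and "auto" picks CUDA only when available.
    """
    try:
        import torch  # optional dependency; may be absent or CPU-only
        cuda_available = torch.cuda.is_available()
    except ImportError:
        cuda_available = False

    if preference == "cpu":
        return "cpu"
    if preference == "cuda":
        if not cuda_available:
            warnings.warn("CUDA requested but not available; falling back to CPU")
            return "cpu"
        return "cuda"
    # "auto": prefer CUDA when present, otherwise CPU
    return "cuda" if cuda_available else "cpu"
```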

## Performance Metrics

The GPU-accelerated scorer tracks performance metrics:

- `quarrycore_quality_scorer_latency_seconds`: Latency histogram by scorer type
- `quarrycore_quality_scorer_errors_total`: Error counter by scorer type
- `quarrycore_quality_reject_total`: Documents rejected due to low quality

## Requirements

### CPU Mode
- No additional requirements
- Works on all systems

### GPU Mode
- NVIDIA GPU with CUDA support
- CUDA toolkit installed
- PyTorch with CUDA support
- Sufficient GPU memory (typically < 1GB for small models)

## Performance Expectations

When CUDA is available:
- **Median latency**: ≤ 25ms for 1KB English text
- **Memory usage**: < 300MB RAM delta after model initialization
- **GPU memory**: < 1GB for sentence-transformers/all-MiniLM-L6-v2
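
To check the median-latency target on your own hardware, a small harness like the following can be used (illustrative only; `median_latency_ms` is not part of QuarryCore, and the warm-up calls are there to absorb one-time costs such as model loading or CUDA initialization):

```python
import statistics
import time
from typing import Callable


def median_latency_ms(fn: Callable[[], object], runs: int = 50, warmup: int = 5) -> float:
    """Median wall-clock latency of `fn` in milliseconds, after warm-up calls."""
    for _ in range(warmup):  # absorb one-time initialization costs
        fn()
    samples = []
    for _ in range(runs):
        start = time.perf_counter()
        fn()
        samples.append((time.perf_counter() - start) * 1000.0)
    return statistics.median(samples)
```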

## Testing

### Unit Tests (No GPU Required)
```bash
export QUARRY_QUALITY_DEVICE=cpu
pytest tests/unit/quality/test_transformer_gpu.py -v
```

### Performance Tests (GPU Required)
```bash
export QUARRY_QUALITY_DEVICE=cuda
pytest tests/performance/test_quality_gpu_perf.py -v -m requires_cuda
```

## Troubleshooting

### CUDA Not Detected
- Verify CUDA installation: `nvidia-smi`
- Check PyTorch CUDA support: `python -c "import torch; print(torch.cuda.is_available())"`

### Out of Memory Errors
- Reduce batch size in configuration
- Use a smaller model
- Clear GPU cache periodically
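
"Clear GPU cache periodically" maps to PyTorch's `torch.cuda.empty_cache()`. A guarded helper (hypothetical name; safe to call on machines without torch or without a GPU) might look like:

```python
def maybe_clear_gpu_cache() -> bool:
    """Release PyTorch's cached CUDA memory, if torch and a GPU are present.

    Returns True only when a cache clear was actually performed.
    """
    try:
        import torch
    except ImportError:
        return False
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # hands cached blocks back to the CUDA driver
        return True
    return False
```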

### Performance Issues
- Ensure GPU is not being used by other processes
- Check GPU utilization with `nvidia-smi`
- Consider using CPU mode for small workloads where GPU overhead isn't justified
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -1,13 +1,13 @@
[build-system]
requires = ["setuptools>=65.0", "wheel"]
build-backend = "setuptools.build_meta"
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "quarrycore"
version = "0.1.0"
description = "Production-grade AI training data miner with adaptive hardware optimization"
dynamic = ["version"]
description = "A production-grade asynchronous web content extraction and processing framework"
readme = "README.md"
license = {text = "MIT"}
license = "MIT"
authors = [
{name = "Your Name", email = "josh.mcd31@gmail.com"}
]
@@ -32,7 +32,7 @@ classifiers = [
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Software Development :: Libraries :: Python Modules",
]
requires-python = ">=3.11,<3.13"
requires-python = ">=3.11,<3.12"
dependencies = [
# Core async and HTTP (following workflow specs)
"httpx[http2]>=0.25.0",
4 changes: 2 additions & 2 deletions requirements.txt
@@ -97,9 +97,9 @@ multiprocess==0.70.16
murmurhash==1.0.13
mypy==1.16.1
mypy_extensions==1.1.0
networkx==3.5
networkx==3.3
nodeenv==1.9.1
numpy==2.3.1
numpy==1.24.4
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
43 changes: 43 additions & 0 deletions requirements_py310.txt
@@ -0,0 +1,43 @@
# Core dependencies compatible with Python 3.10
aiofiles==24.1.0
aiosqlite==0.20.0
httpx==0.29.0
structlog==24.5.0
pydantic==2.10.7
pydantic-settings==2.7.1
prometheus-client==0.22.0
PyYAML==6.0.2
lxml==5.3.0
selectolax==0.3.30
trafilatura==2.0.0
beautifulsoup4==4.12.3
ftfy==6.4.0
fasttext-wheel==0.9.2
sentence-transformers==3.0.0
psutil==6.1.1
tenacity==9.0.0
uvloop==0.21.0
async-lru==2.0.4
markdownify==0.14.1
humanize==4.11.0
polars==1.20.0
rich==14.0.0
python-dotenv==1.1.1
pytest==8.3.6
pytest-asyncio==0.24.0
pytest-cov==6.0.0
pytest-mock==3.14.0
pytest-benchmark==6.0.0
hypothesis==6.130.2
black==24.10.0
ruff==0.12.0
mypy==1.16.1
isort==5.13.2
flake8==7.1.1
pre-commit==4.0.1
# ML dependencies
torch==2.2.2
numpy==1.24.4
scipy==1.10.1
scikit-learn==1.3.2
networkx==3.1
40 changes: 22 additions & 18 deletions src/quarrycore/cli.py
@@ -184,7 +184,7 @@ async def run_pipeline() -> None:
)
)
else:
print(json.dumps(health, indent=2))
logger.info("Health check", health=health)
return

if console and Panel:
@@ -216,14 +216,16 @@ async def run_standard_pipeline(
"""Run pipeline with progress bars."""
if not Progress or not console:
# Fallback to simple text output
print(f"Processing {len(url_list)} URLs...")
logger.info("Processing URLs", count=len(url_list))
result = await pipeline.run(
urls=url_list,
batch_size=batch_size,
checkpoint_interval=checkpoint_interval,
resume_from=resume_from,
)
print(f"Completed: {result.get('processed_count', 0)} processed, {result.get('failed_count', 0)} failed")
logger.info(
"Pipeline completed", processed=result.get("processed_count", 0), failed=result.get("failed_count", 0)
)
return

progress = Progress(
@@ -311,11 +313,13 @@ def create_status_table() -> Any:

while not pipeline_task.done():
if pipeline.state:
print(f"Status: {pipeline.state.stage.value}, Processed: {pipeline.state.processed_count}")
logger.info(
"Pipeline status", stage=pipeline.state.stage.value, processed=pipeline.state.processed_count
)
await asyncio.sleep(1)

await pipeline_task
print("Pipeline completed successfully!")
logger.info("Pipeline completed successfully")
return

with Live(create_status_table(), refresh_per_second=2, console=console) as live:
@@ -404,9 +408,9 @@ async def run_analysis() -> None:
else:
formatted_result = json.dumps(analysis_result, indent=2)
if console:
console.print(formatted_result)
logger.info("Analysis result", result=formatted_result)
else:
print(formatted_result)
logger.info("Analysis result", result=formatted_result)

if output:
Path(output).write_text(formatted_result)
@@ -462,28 +466,28 @@ async def validate() -> None:

console.print(table)
else:
print("Configuration Status:")
logger.info("Configuration Status")
for key, value in health.items():
status = "✅ OK" if value else "❌ Error"
print(f"{key}: {status}")
logger.info("Config item", key=key, status=status)

if all(health.values()):
if console:
console.print("[green]✅ Configuration is valid![/green]")
else:
print("✅ Configuration is valid!")
logger.info("Configuration is valid")
else:
if console:
console.print("[red]❌ Configuration has issues![/red]")
else:
print("❌ Configuration has issues!")
logger.error("Configuration has issues")
sys.exit(1)

except Exception as e:
if console:
console.print(f"[red]❌ Configuration validation failed: {e}[/red]")
else:
print(f"❌ Configuration validation failed: {e}")
logger.error("Configuration validation failed", error=str(e))
sys.exit(1)

asyncio.run(validate())
@@ -510,15 +514,15 @@ async def check_health() -> None:
for key, value in health_status.items():
metric_value = 1 if value else 0
if console:
console.print(f"quarrycore_health_{key} {metric_value}")
logger.info("Health metric", metric=f"quarrycore_health_{key}", value=metric_value)
else:
print(f"quarrycore_health_{key} {metric_value}")
logger.info("Health metric", metric=f"quarrycore_health_{key}", value=metric_value)
else:
output = json.dumps(health_status, indent=2)
if console:
console.print(output)
logger.info("Health output", output=output)
else:
print(output)
logger.info("Health output", output=output)

# Exit with non-zero if unhealthy
if not all(health_status.values()):
@@ -527,9 +531,9 @@
except Exception as e:
error_output = json.dumps({"error": str(e)})
if console:
console.print(error_output)
logger.error("Health check error", output=error_output)
else:
print(error_output)
logger.error("Health check error", output=error_output)
sys.exit(1)

asyncio.run(check_health())
4 changes: 4 additions & 0 deletions src/quarrycore/config/config.py
@@ -231,6 +231,10 @@ class QualityConfig(BaseModel):

min_content_length: int = Field(default=50, description="Minimum word count to perform quality assessment.")
max_content_length: int = Field(default=50000, description="Maximum word count to perform quality assessment.")
min_score: float = Field(default=0.6, ge=0, le=1, description="Minimum quality score to accept content.")
device: Literal["auto", "cpu", "cuda"] = Field(
default="auto", description="Device to use for neural scoring (auto, cpu, or cuda)"
)
default: DomainQualityConfig = Field(default_factory=DomainQualityConfig)
domains: dict[DomainType, DomainQualityConfig] = Field(
default_factory=dict, description="Domain-specific quality thresholds."
9 changes: 6 additions & 3 deletions src/quarrycore/dataset/analytics.py
@@ -8,11 +8,14 @@
from typing import Any, Dict, List, Tuple

import numpy as np
import structlog
from transformers import AutoTokenizer

from quarrycore.config.config import DatasetConfig
from quarrycore.protocols import ContentMetadata, QualityScore

logger = structlog.get_logger(__name__)


class Analytics:
"""Computes and presents analytics on a final dataset."""
@@ -101,6 +104,6 @@ def pretty_print_report(self, report: Dict[str, Any]) -> None:
"""Prints the analytics report in a readable format."""
import json

print("\n--- Dataset Analytics Report ---")
print(json.dumps(report, indent=2))
print("------------------------------\n")
logger.info("Dataset analytics report", report=json.dumps(report, indent=2))