Skip to content

Commit 99aae2f

Browse files
authored
feat: docker support (#8)
1 parent ff5e2e9 commit 99aae2f

File tree

5 files changed

+431
-1
lines changed

5 files changed

+431
-1
lines changed

.dockerignore

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
# Paths excluded from the Docker build context.
#
# Unlike .gitignore, every pattern here is matched relative to the root of the
# build context: a bare `*.log` only matches at the top level and `*/dist/`
# only one directory down. A leading `**/` applies the rule at any depth
# (including the root), which is what a multi-package repository needs.

# Dependencies
**/node_modules/

# Build artifacts
**/dist/
.output/

# Development files
.git/
.github/
**/*.log
**/*.tmp
**/.DS_Store

# Test files
test/
coverage/
**/*.test.*
**/*.spec.*

# Documentation
README.md
CHANGELOG.md
docs/

# IDE files
.vscode/
.idea/
**/*.swp
**/*.swo

# Environment files (keep the example template, drop everything else so
# local secrets never reach the build context)
**/.env*
!**/.env.example

# Cache directories
.cache/
.temp/
.tmp/

.github/workflows/release-docker.yml

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
---
# Release pipeline: builds the mdream image and pushes it to Docker Hub and
# the GitHub Container Registry (GHCR).
name: Release Docker Images

on:
  # Runs when a GitHub release is published.
  release:
    types:
      - published
  # Runs when a tag beginning with "v" is pushed.
  push:
    tags:
      - "v*"
  # Manual run; the caller supplies the tag to build and publish.
  workflow_dispatch:
    inputs:
      tag:
        description: Tag to build and publish
        required: true
        default: latest

# Values shared with the job's steps through the `env` context.
env:
  REGISTRY_DOCKERHUB: docker.io
  REGISTRY_GHCR: ghcr.io
  IMAGE_NAME: mdream

jobs:
  publish-docker:
    runs-on: ubuntu-latest
    permissions:
      # Read access to the repository contents.
      contents: read
      # Write access to packages; the GHCR login in this job authenticates
      # with GITHUB_TOKEN.
      packages: write

    steps:
# Fetch the repository contents; the build step uses `.` as its context.
- name: Checkout repository
  uses: actions/checkout@v4

# Set up a Buildx builder ahead of the multi-platform build step.
- name: Set up Docker Buildx
  uses: docker/setup-buildx-action@v3

# No `registry` input is passed, so login-action uses its default registry
# (Docker Hub). Credentials come from repository secrets.
- name: Log in to Docker Hub
  uses: docker/login-action@v3
  with:
    password: ${{ secrets.DOCKERHUB_TOKEN }}
    username: ${{ secrets.DOCKERHUB_USERNAME }}

# GHCR login authenticates as the triggering actor with the job's GITHUB_TOKEN.
- name: Log in to GitHub Container Registry
  uses: docker/login-action@v3
  with:
    registry: ${{ env.REGISTRY_GHCR }}
    password: ${{ secrets.GITHUB_TOKEN }}
    username: ${{ github.actor }}
47+
# Derive the image version from whichever event started the run and expose it
# as `steps.version.outputs.version`.
#
# Event data reaches the script through `env` instead of being interpolated
# into the script text, so a crafted tag name or dispatch input cannot inject
# shell commands.
- name: Extract version from tag
  id: version
  env:
    EVENT_NAME: ${{ github.event_name }}
    INPUT_TAG: ${{ github.event.inputs.tag }}
    RELEASE_TAG: ${{ github.event.release.tag_name }}
  run: |
    if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
      # Manual run: use the tag supplied by the caller.
      VERSION="$INPUT_TAG"
    elif [[ "$EVENT_NAME" == "release" ]]; then
      VERSION="$RELEASE_TAG"
    elif [[ "$GITHUB_REF" == refs/tags/* ]]; then
      # Strip only the ref prefix here so tags without a leading "v" do not
      # keep the whole "refs/tags/..." string.
      VERSION="${GITHUB_REF#refs/tags/}"
    else
      VERSION=latest
    fi

    # Drop one optional leading "v" for every source: the metadata step adds
    # its own "v" before semver parsing, so "v1.2.3" must become "1.2.3".
    VERSION="${VERSION#v}"

    echo "version=$VERSION" >> "$GITHUB_OUTPUT"
    echo "Building version: $VERSION"
63+
# Compute the tag list and image labels that the build step consumes through
# `steps.meta.outputs.tags` and `steps.meta.outputs.labels`.
- id: meta
  name: Extract metadata
  uses: docker/metadata-action@v5
  with:
    # One image name per line: Docker Hub first, then GHCR.
    images: |
      ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}
      ${{ env.REGISTRY_GHCR }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}
    # Static labels plus the version computed by the `version` step.
    labels: |
      org.opencontainers.image.title=mdream
      org.opencontainers.image.description=Ultra-performant HTML to Markdown converter optimized for LLMs with Playwright Chrome
      org.opencontainers.image.vendor=${{ github.repository_owner }}
      org.opencontainers.image.version=${{ steps.version.outputs.version }}
    # `latest` and the raw version are always requested; the three semver
    # rules receive the version with a "v" prefix re-attached.
    tags: |
      type=raw,value=latest
      type=raw,value=${{ steps.version.outputs.version }}
      type=semver,pattern={{version}},value=v${{ steps.version.outputs.version }}
      type=semver,pattern={{major}}.{{minor}},value=v${{ steps.version.outputs.version }}
      type=semver,pattern={{major}},value=v${{ steps.version.outputs.version }}
82+
# Build the image for both platforms and push every tag computed by the
# metadata step.
- name: Build and push Docker image
  uses: docker/build-push-action@v5
  with:
    context: .
    # The computed version is passed to the build as the VERSION build arg.
    build-args: |
      VERSION=${{ steps.version.outputs.version }}
    platforms: linux/amd64,linux/arm64
    # Layer cache is read from and written to the GitHub Actions cache backend.
    cache-from: type=gha
    cache-to: type=gha,mode=max
    tags: ${{ steps.meta.outputs.tags }}
    labels: ${{ steps.meta.outputs.labels }}
    push: true
95+
# Smoke-test the images that were just pushed by running them from both
# registries. Image references are resolved once in `env` so the script body
# contains no workflow expressions.
- name: Test published images
  env:
    DOCKERHUB_IMAGE: ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}
    GHCR_IMAGE: ${{ env.REGISTRY_GHCR }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}
  run: |
    echo "🧪 Testing published Docker crawl images..."

    # Wait for images to be available
    sleep 30

    # Test Docker Hub image with help (default ENTRYPOINT behavior)
    echo "Testing Docker Hub image help..."
    timeout 60 docker run --rm "$DOCKERHUB_IMAGE" --help > /dev/null
    echo "✅ Docker Hub image help works!"

    # Test GitHub Container Registry image
    echo "Testing GHCR image help..."
    timeout 60 docker run --rm "$GHCR_IMAGE" --help > /dev/null
    echo "✅ GHCR image works!"

    # Test version command (direct to ENTRYPOINT)
    echo "Testing version command..."
    timeout 60 docker run --rm "$DOCKERHUB_IMAGE" --version > /dev/null
    echo "✅ Version command works!"

    # Test that container runs without arguments (should show help)
    echo "Testing container without arguments..."
    OUTPUT=$(timeout 60 docker run --rm "$DOCKERHUB_IMAGE" 2>&1 || true)
    if [[ "$OUTPUT" == *"@mdream/crawl"* ]]; then
      echo "✅ Container shows help when run without arguments!"
    else
      echo "❌ Container should show help when run without arguments"
      echo "Output was: $OUTPUT"
      exit 1
    fi

    # Test with a URL argument to ensure ENTRYPOINT works correctly.
    # The crawl itself may fail or hit the 60s timeout (exit 124); that is
    # tolerated. Exit codes 125/126/127 mean the container could not be
    # started or its command could not be invoked/found, which is a real
    # ENTRYPOINT failure and must fail the step.
    echo "Testing with URL argument (should show error for demo URL)..."
    STATUS=0
    OUTPUT=$(timeout 60 docker run --rm "$DOCKERHUB_IMAGE" --url https://httpbin.org/delay/30 --max-pages 1 2>&1) || STATUS=$?
    case "$STATUS" in
      125|126|127)
        echo "❌ Container could not be started with arguments (exit code $STATUS)"
        echo "Output was: $OUTPUT"
        exit 1
        ;;
    esac
    echo "✅ Container accepts arguments correctly via ENTRYPOINT!"
133+
# Append an image-size table and a layer listing to the job summary.
# `docker inspect` / `docker history` read the runner's local image store, so
# this relies on both images having been pulled earlier in the job (the test
# step runs each of them).
- name: Report image sizes
  env:
    DOCKERHUB_IMAGE: ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}
    GHCR_IMAGE: ${{ env.REGISTRY_GHCR }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}
  run: |
    {
      echo "## 📦 Image Size Report"
      echo ""
    } >> "$GITHUB_STEP_SUMMARY"

    # Get and display image sizes (bytes converted to IEC units, e.g. MiB)
    dockerhub_size=$(docker inspect "$DOCKERHUB_IMAGE" --format='{{.Size}}' | numfmt --to=iec-i --suffix=B)
    ghcr_size=$(docker inspect "$GHCR_IMAGE" --format='{{.Size}}' | numfmt --to=iec-i --suffix=B)

    {
      echo "| Registry | Image | Size |"
      echo "|----------|-------|------|"
      echo "| Docker Hub | \`$DOCKERHUB_IMAGE\` | **$dockerhub_size** |"
      echo "| GHCR | \`$GHCR_IMAGE\` | **$ghcr_size** |"

      # Get layer information (first 20 lines of the history table)
      echo ""
      echo "### Layer Analysis"
      echo "\`\`\`"
      docker history "$DOCKERHUB_IMAGE" --no-trunc --format "table {{.CreatedBy}}\t{{.Size}}" | head -20
      echo "\`\`\`"
    } >> "$GITHUB_STEP_SUMMARY"
154+
# Push the short description and the contents of DOCKER.md to the Docker Hub
# repository page, using the same credentials as the Docker Hub login.
- name: Update Docker Hub description
  uses: peter-evans/dockerhub-description@v3
  with:
    repository: ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}
    username: ${{ secrets.DOCKERHUB_USERNAME }}
    password: ${{ secrets.DOCKERHUB_TOKEN }}
    readme-filepath: ./DOCKER.md
    # Quoted because the value starts with "@", a reserved YAML indicator.
    short-description: '@mdream/crawl with Playwright Chrome for website crawling and llms.txt generation'
163+
# Write a Markdown summary of the published images to the job summary page.
# Repository names and the version are resolved once in `env`; all output is
# appended to $GITHUB_STEP_SUMMARY through a single grouped redirect.
- name: Create release summary
  env:
    DOCKERHUB_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}
    GHCR_REPO: ${{ env.REGISTRY_GHCR }}/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}
    VERSION: ${{ steps.version.outputs.version }}
  run: |
    {
      echo "## 🐳 Docker Crawl Images Published"
      echo ""
      echo "**@mdream/crawl with Playwright Chrome pre-installed**"
      echo ""
      echo "### Available Images"
      echo "- \`${DOCKERHUB_REPO}:${VERSION}\`"
      echo "- \`${DOCKERHUB_REPO}:latest\`"
      echo "- \`${GHCR_REPO}:${VERSION}\`"
      echo "- \`${GHCR_REPO}:latest\`"
      echo ""
      echo "### Quick Start"
      echo "\`\`\`bash"
      echo "# Pull the crawl image"
      echo "docker pull ${DOCKERHUB_REPO}:${VERSION}"
      echo ""
      echo "# Crawl a website"
      # Inside double quotes "\\" prints ONE backslash, which is the shell
      # line-continuation the rendered snippet needs ("\\\\" printed two and
      # made the copied command invalid).
      echo "docker run --rm -v \$(pwd)/output:/app/output \\"
      echo "  ${DOCKERHUB_REPO}:${VERSION} \\"
      echo "  --url https://example.com --output /app/output"
      echo "\`\`\`"
      echo ""
      echo "### Available Commands"
      echo "- \`mdream-crawl\` - Main crawling command"
      echo "- \`crawl\` - Short alias"
      echo ""
      echo "### Platforms"
      echo "- linux/amd64"
      echo "- linux/arm64"
    } >> "$GITHUB_STEP_SUMMARY"
194+
# Runs only after `publish-docker` succeeds and prints where the images live.
# The job needs no token permissions, hence `permissions: {}`.
notify-success:
  needs: publish-docker
  runs-on: ubuntu-latest
  if: success()
  permissions: {}
  steps:
    - name: Notify release success
      env:
        # Prefer the manually dispatched tag, then the release tag, then the
        # ref name, so a manual run reports the tag that was built rather
        # than the branch it was started from.
        RELEASED_REF: ${{ github.event.inputs.tag || github.event.release.tag_name || github.ref_name }}
        DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
        REPOSITORY_OWNER: ${{ github.repository_owner }}
      # Values are read from the environment instead of being interpolated
      # into the script text, so a crafted tag name cannot inject commands.
      run: |
        echo "🎉 Docker images for $RELEASED_REF published successfully!"
        echo "Images are available at:"
        echo "- Docker Hub: $DOCKERHUB_USERNAME/mdream"
        echo "- GHCR: ghcr.io/$REPOSITORY_OWNER/mdream"

DOCKER.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Docker Usage
2+
3+
This Docker image provides `@mdream/crawl` with Playwright Chrome pre-installed for website crawling and llms.txt generation in containerized environments.
4+
5+
## Quick Start
6+
7+
```bash
8+
# Basic crawling
9+
docker run harlanzw/mdream:latest https://example.com
10+
11+
# Interactive mode
12+
docker run -it harlanzw/mdream:latest
13+
14+
# Show help
15+
docker run harlanzw/mdream:latest --help
16+
```
17+
18+
## Available Images
19+
20+
- **Docker Hub**: `harlanzw/mdream:latest`, `harlanzw/mdream:0.8.5`
21+
- **GitHub Container Registry**: `ghcr.io/harlan-zw/mdream:latest`
22+
- **Multi-platform**: Supports `linux/amd64` and `linux/arm64`
23+
24+
## Basic Usage
25+
26+
```bash
27+
# Crawl a website with depth limit
28+
docker run harlanzw/mdream:latest https://example.com --depth 2
29+
30+
# Crawl with exclusions and limits
31+
docker run harlanzw/mdream:latest https://large-site.com \
32+
--exclude "*/admin/*" --exclude "*/api/*" --max-pages 50
33+
34+
# Crawl using Playwright for JavaScript sites
35+
docker run harlanzw/mdream:latest https://spa-site.com --driver playwright
36+
```
37+
38+
## Persistent Output
39+
40+
To save crawled content to your local machine:
41+
42+
```bash
43+
# Mount output directory
44+
docker run -v $(pwd)/output:/app/output harlanzw/mdream:latest \
45+
https://example.com --output /app/output
46+
```
47+
48+
## Building Locally
49+
50+
```bash
51+
docker build -t mdream-local .
52+
docker run mdream-local https://example.com
53+
```
54+
55+
## How It Works
56+
57+
The Docker container is configured with `ENTRYPOINT` to act directly as the `mdream-crawl` command:
58+
- All arguments passed to `docker run` are forwarded to `mdream-crawl`
59+
- No need to specify command names - just pass your crawl options
60+
- Clean, intuitive interface that feels like using the CLI directly
61+
62+
## Environment Variables
63+
64+
- `PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1` - Already set (browsers pre-installed)
65+
- `PLAYWRIGHT_BROWSERS_PATH=/ms-playwright` - Browser location
66+
- `DISPLAY=:99` - Virtual display for headless browsing
67+
68+
69+
## Output Files
70+
71+
The crawler generates these artifacts in your output directory:
72+
- `llms.txt` - Consolidated text file optimized for LLM consumption
73+
- `llms-full.txt` - Extended format with comprehensive metadata
74+
- `md/` - Individual Markdown files for each crawled page
75+
76+
## Base Image
77+
78+
Uses `apify/actor-node-playwright-chrome:20` which includes:
79+
- Node.js 20 with pnpm
80+
- Playwright with Chrome browser pre-installed
81+
- XVFB for headless browsing support
82+
- Optimized for web crawling and automation

0 commit comments

Comments
 (0)