name: Crawl

on:
  schedule:
    # 5:30pm Pacific on M/W/F
    - cron: '30 0 * * 2,4,6'
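    # Cron is evaluated in UTC: 00:30 UTC Tue/Thu/Sat is 5:30pm PDT on
    # Mon/Wed/Fri. During standard time (PST) this fires at 4:30pm Pacific.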
  workflow_dispatch:
    inputs:
      dry_run:
        description: 'Dry run: report planned work, but do not actually upload or change anything'
        type: boolean
        default: true
      test_seeds:
        description: 'Use test seeds'
        type: boolean
        default: true
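
# Only one run per concurrency group at a time; with cancel-in-progress off,
# a newly triggered run queues behind the one in progress instead of
# canceling it.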
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
  cancel-in-progress: false

env:
  BROWSERTRIX_IMAGE: 'webrecorder/browsertrix-crawler:1.7.0'
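  # Used below to build the Internet Archive item identifier and title.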
  COLLECTION_NAME_BASE: 'edgi-active-urls'
  COLLECTION_TITLE_BASE: 'EDGI Web Monitoring Crawl'

jobs:
  seeds:
    runs-on: ubuntu-latest
    outputs:
      timestamp: ${{ steps.timestamper.outputs.timestamp }}
      seed_files: ${{ steps.seeds.outputs.files }}
    steps:
      - id: timestamper
        run: |
          timestamp="$(date +'%Y%m%d%H%M%S')"
          echo "timestamp: '${timestamp}'"
          echo "timestamp=${timestamp}" >> "$GITHUB_OUTPUT"
      - uses: actions/checkout@v5
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install Python dependencies
        run: uv sync --all-extras
      - name: Generate seed lists
        id: seeds
        env:
          WEB_MONITORING_DB_EMAIL: '${{ secrets.WEB_MONITORING_DB_EMAIL }}'
          WEB_MONITORING_DB_PASSWORD: '${{ secrets.WEB_MONITORING_DB_PASSWORD }}'
        run: |
          if [[ '${{ inputs.test_seeds }}' == 'true' ]]; then
            mkdir -p seeds
            cp test.crawl.yaml seeds/test.seeds.yaml
            echo '["test"]' > filenames.json
          else
            uv run edgi-wm-crawler multi-seeds \
              --size 250 \
              --single-group-size 500 \
              --workers 1 \
              --output seeds \
              > filenames.json
          fi
          echo "files=$(cat filenames.json)" >> "$GITHUB_OUTPUT"
      - name: Upload seed lists as artifact
        uses: actions/upload-artifact@v4
        with:
          name: seeds
          path: seeds
          retention-days: 7

  crawl:
    needs:
      - seeds
    runs-on: ubuntu-latest
    strategy:
      # One crawl might go bad, but that should not break other crawls! This
      # happens most with uploading to IA, not the actual crawling. ¯\_(ツ)_/¯
      fail-fast: false
      matrix:
        seed_name: ${{ fromJSON(needs.seeds.outputs.seed_files) }}
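    # SAVE_RESULTS is true only for scheduled runs and manual runs with
    # dry_run unchecked; it gates every step that persists data.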
    env:
      TIMESTAMP: ${{ needs.seeds.outputs.timestamp }}
      COLLECTION: 'edgi-${{ needs.seeds.outputs.timestamp }}-${{ matrix.seed_name }}'
      SAVE_RESULTS: ${{ (github.event_name == 'schedule' || inputs.dry_run == false) && 'true' || 'false' }}
    steps:
      - uses: actions/checkout@v5
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install Python dependencies
        run: uv sync --all-extras
      - name: Download seed lists
        uses: actions/download-artifact@v5
        with:
          name: seeds
          path: seeds
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.CRAWLER_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.CRAWLER_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_DEFAULT_REGION }}
      - name: Get Browsertrix-Crawler
        run: |
          docker pull '${{ env.BROWSERTRIX_IMAGE }}'
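      # crawl.sh lives in this repo; it is assumed here to wrap `docker run`
      # of $BROWSERTRIX_IMAGE with the given seed list and collection name,
      # writing results under crawls/collections/<collection>.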
      - name: Crawl
        id: crawl
        run: |
          mkdir -p crawls/collections
          ./crawl.sh './seeds/${{ matrix.seed_name }}.seeds.yaml' "${COLLECTION}"
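      # Mirrors the whole collection directory (WARCs, logs, crawl config) to
      # s3://<bucket>/<collection>/.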
      - name: Upload to S3
        if: ${{ success() && env.SAVE_RESULTS == 'true' }}
        run: |
          aws s3 cp --recursive \
            'crawls/collections/${{ env.COLLECTION }}' \
            's3://${{ secrets.CRAWLER_S3_BUCKET }}/${{ env.COLLECTION }}'
      - name: Upload to workflow artifacts
        # Still want to save artifacts if crawl fails (so we can investigate)
        if: ${{ success() || steps.crawl.outcome == 'failure' }}
        uses: actions/upload-artifact@v4
        with:
          name: 'crawl-${{ matrix.seed_name }}'
          path: 'crawls/collections/${{ env.COLLECTION }}'
          retention-days: 7
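      # The import runs for every successful crawl; when SAVE_RESULTS is false
      # it passes --dry-run so it only reports planned work without writing
      # anything to the database.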
      - name: Import to web-monitoring-db
        if: ${{ success() }}
        env:
          WEB_MONITORING_DB_EMAIL: '${{ secrets.WEB_MONITORING_DB_EMAIL }}'
          WEB_MONITORING_DB_PASSWORD: '${{ secrets.WEB_MONITORING_DB_PASSWORD }}'
          WEB_MONITORING_DB_S3_BUCKET: '${{ secrets.WEB_MONITORING_DB_S3_BUCKET }}'
          DRY_RUN_OPTION: ${{ env.SAVE_RESULTS == 'false' && '--dry-run' || '' }}
        run: |
          crawl_path="crawls/collections/${COLLECTION}"
          uv run wm-warc-import \
            --archive-s3 "${WEB_MONITORING_DB_S3_BUCKET}" \
            --seeds "${crawl_path}/crawls/config.${COLLECTION}.yaml" \
            ${DRY_RUN_OPTION} \
            ${crawl_path}/archive/*.warc.gz
      - name: Derive Internet Archive options
        if: ${{ success() && env.SAVE_RESULTS == 'true' }}
        id: ia-options
        run: |
          iso_date="$(
            echo "${TIMESTAMP}" \
              | sed -E 's/^([0-9]{4})([0-9]{2})([0-9]{2}).*/\1-\2-\3/'
          )"
          title="${COLLECTION_TITLE_BASE} ${iso_date}"
          echo "identifier=${COLLECTION_NAME_BASE}--${TIMESTAMP}" >> "$GITHUB_OUTPUT"
          echo "iso_date=${iso_date}" >> "$GITHUB_OUTPUT"
          echo "title=${title}" >> "$GITHUB_OUTPUT"
      # The IA CLI does not currently support env vars (it used to), so manually
      # write a config file.
      # See: https://github.com/jjjake/internetarchive/issues/640
      - name: Configure ia CLI
        if: ${{ success() && env.SAVE_RESULTS == 'true' }}
        run: |
          config_dir=~/.config/internetarchive
          mkdir -p "$config_dir"
          (
            echo '[s3]'
            echo 'access = ${{ secrets.IAS3_ACCESS_KEY }}'
            echo 'secret = ${{ secrets.IAS3_SECRET_KEY }}'
          ) > "$config_dir/ia.ini"
      - name: Upload to Internet Archive
        if: ${{ success() && env.SAVE_RESULTS == 'true' }}
        run: |
          crawl_path="crawls/collections/${COLLECTION}"
          uv run ia upload \
            --retries 50 \
            --metadata="mediatype:web" \
            --metadata="title:${{ steps.ia-options.outputs.title }}" \
            --metadata="date:${{ steps.ia-options.outputs.iso_date }}" \
            '${{ steps.ia-options.outputs.identifier }}' \
            ${crawl_path}/archive/* \
            ${crawl_path}/logs/*