# Skip to content

# Scheduled ETL: Extract Drudge sample #949

# Scheduled ETL: Extract Drudge sample

# Scheduled ETL: Extract Drudge sample #949

# Workflow that extracts a sample of Drudge hyperlinks on a schedule.
name: "Scheduled ETL: Extract Drudge sample"

on:
  # Daily at 00:30; GitHub evaluates cron schedules in UTC.
  schedule:
    - cron: '30 0 * * *'
  # Also allow the workflow to be started manually.
  workflow_dispatch:
# Runs share one concurrency group; starting a new run cancels any run in
# this group that is still in progress.
concurrency:
  group: extract-drudge
  cancel-in-progress: true
jobs:
  # Single job: build the Drudge hyperlink sample CSV and upload it to
  # archive.org.
  download:
    name: Download Drudge hyperlinks
    runs-on: ubuntu-latest
    steps:
      - id: checkout
        name: Checkout
        uses: actions/checkout@v4

      # Repository-local action; presumably installs the tooling that
      # `pipenv run` needs below (its definition is not visible here).
      - id: install
        name: Install
        uses: ./.github/actions/install

      # The key contains the run id, so it is different on every run;
      # restore-keys falls back to an earlier cache whose key starts with
      # `extract-drudge`.
      - id: cache
        name: Update cache
        uses: actions/cache@v4
        with:
          path: ~/.cache
          key: extract-drudge-${{ github.run_id }}
          restore-keys: |
            extract-drudge

      # Writes drudge-hyperlinks-sample.csv into the working directory.
      # The folded scalar (>-) joins these lines into one command line.
      - id: download-drudge-hyperlinks
        name: Download Drudge hyperlink lists
        run: >-
          pipenv run newshomepages-extract hyperlinks
          --site=drudge
          --days=90
          --output-path=./drudge-hyperlinks-sample.csv
        shell: bash

      # Uploads the CSV produced by the previous step; credentials come
      # from repository secrets.
      - id: upload-to-internet-archive
        name: Upload files to archive.org
        uses: palewire/internet-archive-upload@v1
        with:
          access-key: ${{ secrets.IA_ACCESS_KEY }}
          secret-key: ${{ secrets.IA_SECRET_KEY }}
          identifier: news-homepages-extracts
          files: drudge-hyperlinks-sample.csv