Skip to content

acquire-transfermarkt-scraper #132

acquire-transfermarkt-scraper

acquire-transfermarkt-scraper #132

# Acquires raw transfermarkt-scraper data for one season, either on a
# schedule or on demand.
name: acquire-transfermarkt-scraper

on:
  # Scheduled run: 04:00 on Tuesdays and Fridays (GitHub evaluates cron in UTC).
  schedule:
    - cron: '0 4 * * TUE,FRI'
  # Manual run, with an optional season override.
  workflow_dispatch:
    inputs:
      season:
        description: Season to acquire data for
        required: false
        default: "2024"

env:
  # The manual input is only honoured for workflow_dispatch events; any other
  # trigger (i.e. the schedule) falls back to '2024'.
  SEASON: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.season || '2024' }}
  # Directory the per-asset files are read from and written to. The season
  # expression is repeated here rather than referencing SEASON.
  DATA_DIR: data/raw/transfermarkt-scraper/${{ github.event_name == 'workflow_dispatch' && github.event.inputs.season || '2024' }}
# Job graph:
#   acquire-clubs -> acquire-players -> acquire-appearances
#   acquire-games -> acquire-game-lineups
#   dvc-push runs once all five acquire jobs have finished.
#
# Every job runs inside the same container image and uses a login shell
# (`bash -l`) for its `run` steps -- presumably so the image's profile scripts
# are sourced; confirm against the image.
#
# Files are handed between jobs as workflow artifacts. upload-artifact and
# download-artifact must stay on the same major version in every job:
# artifacts produced by v4 cannot be fetched by v3 and vice versa, and v3 is
# deprecated and no longer accepted on github.com.
jobs:
  # Acquires the `clubs` asset and publishes it as the `clubs` artifact.
  acquire-clubs:
    runs-on: ubuntu-latest
    container:
      image: dcaribou/transfermarkt-datasets:linux-amd64-master
    defaults:
      run:
        shell: bash -l {0}
    steps:
      - uses: actions/checkout@v4
      - name: run acquire
        run: |
          make \
            acquire_local \
            ARGS="--asset clubs --seasons $SEASON"
      - uses: actions/upload-artifact@v4
        with:
          name: clubs
          path: ${{ env.DATA_DIR }}/clubs.json.gz

  # Acquires the `players` asset. The `clubs` artifact is placed in DATA_DIR
  # before the acquire step runs.
  acquire-players:
    runs-on: ubuntu-latest
    container:
      image: dcaribou/transfermarkt-datasets:linux-amd64-master
    defaults:
      run:
        shell: bash -l {0}
    needs: acquire-clubs
    steps:
      - uses: actions/checkout@v4
      - uses: actions/download-artifact@v4
        with:
          name: clubs
          path: ${{ env.DATA_DIR }}
      - name: run acquire
        run: |
          make \
            acquire_local \
            ARGS="--asset players --seasons $SEASON"
      - uses: actions/upload-artifact@v4
        with:
          name: players
          path: ${{ env.DATA_DIR }}/players.json.gz

  # Acquires the `games` asset and publishes it as the `games` artifact.
  acquire-games:
    runs-on: ubuntu-latest
    container:
      image: dcaribou/transfermarkt-datasets:linux-amd64-master
    defaults:
      run:
        shell: bash -l {0}
    steps:
      - uses: actions/checkout@v4
      - name: run acquire
        run: |
          make \
            acquire_local \
            ARGS="--asset games --seasons $SEASON"
      - uses: actions/upload-artifact@v4
        with:
          name: games
          path: ${{ env.DATA_DIR }}/games.json.gz

  # Acquires the `game_lineups` asset. The `games` artifact is placed in
  # DATA_DIR before the acquire step runs.
  acquire-game-lineups:
    runs-on: ubuntu-latest
    container:
      image: dcaribou/transfermarkt-datasets:linux-amd64-master
    defaults:
      run:
        shell: bash -l {0}
    needs: acquire-games
    steps:
      - uses: actions/checkout@v4
      - uses: actions/download-artifact@v4
        with:
          name: games
          path: ${{ env.DATA_DIR }}
      - name: run acquire
        run: |
          make \
            acquire_local \
            ARGS="--asset game_lineups --seasons $SEASON"
      - uses: actions/upload-artifact@v4
        with:
          name: game_lineups
          path: ${{ env.DATA_DIR }}/game_lineups.json.gz

  # Acquires the `appearances` asset. The `players` artifact is placed in
  # DATA_DIR before the acquire step runs.
  acquire-appearances:
    runs-on: ubuntu-latest
    container:
      image: dcaribou/transfermarkt-datasets:linux-amd64-master
    defaults:
      run:
        shell: bash -l {0}
    needs: acquire-players
    steps:
      - uses: actions/checkout@v4
      - uses: actions/download-artifact@v4
        with:
          name: players
          path: ${{ env.DATA_DIR }}
      - name: run acquire
        run: |
          make \
            acquire_local \
            ARGS="--asset appearances --seasons $SEASON"
      - uses: actions/upload-artifact@v4
        with:
          name: appearances
          path: ${{ env.DATA_DIR }}/appearances.json.gz

  # Collects all five artifacts into DATA_DIR on top of the pulled DVC data,
  # pushes the result to the `s3` DVC remote and commits the updated .dvc
  # pointer file to the repository.
  dvc-push:
    runs-on: ubuntu-latest
    container:
      image: dcaribou/transfermarkt-datasets:linux-amd64-master
    defaults:
      run:
        shell: bash -l {0}
    needs:
      - acquire-clubs
      - acquire-players
      - acquire-games
      - acquire-appearances
      - acquire-game-lineups
    steps:
      - uses: actions/checkout@v4
        with:
          # A different (personal access) GitHub token needs to be set up here
          # so that the 'add-and-commit' step below triggers the 'on-push'
          # workflow.
          # See https://github.community/t/push-from-action-does-not-trigger-subsequent-action/16854
          token: ${{ secrets.PA_GITHUB_TOKEN }}
      - name: pull data
        run: |
          dvc pull
      - uses: actions/download-artifact@v4
        with:
          name: clubs
          path: ${{ env.DATA_DIR }}
      - uses: actions/download-artifact@v4
        with:
          name: players
          path: ${{ env.DATA_DIR }}
      - uses: actions/download-artifact@v4
        with:
          name: games
          path: ${{ env.DATA_DIR }}
      - uses: actions/download-artifact@v4
        with:
          name: appearances
          path: ${{ env.DATA_DIR }}
      - uses: actions/download-artifact@v4
        with:
          name: game_lineups
          path: ${{ env.DATA_DIR }}
      - name: dvc commit and push
        # The custom `bash -l {0}` shell does not carry the `-e` flag, so
        # without `set -e` this step would report the exit status of its last
        # command only and a failed `dvc commit` / `dvc push` would go
        # unnoticed -- the next step would then commit a .dvc pointer to data
        # that never reached the remote. The commands are also kept on
        # separate lines because `set -e` ignores failures on the left-hand
        # side of an `&&` list.
        run: |
          set -eo pipefail
          dvc commit -f
          dvc push --remote s3
          git config --global --add safe.directory '*'
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      - uses: EndBug/add-and-commit@v9
        with:
          add: 'data/raw/transfermarkt-scraper.dvc'
          message: '🤖 updated `transfermarkt-scraper` raw data'
          default_author: github_actions
          pull: '--no-rebase'