Cache ETL data in GitHub Backend PR Action

This commit is contained in:
Chris Alfano 2025-01-10 11:39:13 -05:00 committed by Carlos Felix
parent 6f3432d48a
commit 5f6ed5d90f
2 changed files with 30 additions and 11 deletions

View file

@@ -1,5 +1,10 @@
name: Deploy Backend Main
on: workflow_dispatch
on:
push:
branches: [main]
paths:
- "data/**"
- ".github/workflows/deploy_backend_main.yml"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
@@ -45,15 +50,23 @@ jobs:
aws-access-key-id: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Cleanup Data
run: |
poetry run python3 -m data_pipeline.application data-cleanup
- name: Cache Census Data
id: cache-census
uses: actions/cache@v4
with:
path: data/data-pipeline/data_pipeline/data/census
key: data-census
- name: Install GDAL/ogr2ogr
if: steps.cache-census.outputs.cache-hit != 'true'
run: |
sudo apt-get update
sudo apt-get -y install gdal-bin
ogrinfo --version
- name: Cleanup Data
run: |
poetry run python3 -m data_pipeline.application data-cleanup
- name: Get Census Data
if: steps.cache-census.outputs.cache-hit != 'true'
run: |
poetry run python3 -m data_pipeline.application census-data-download
- name: Run ETL

View file

@@ -6,7 +6,6 @@ concurrency:
cancel-in-progress: true
env:
python-version: '3.10'
CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }}
J40_VERSION_LABEL_STRING: ${{ vars.SCORE_VERSION }}
jobs:
# JOB to run change detection
@@ -51,7 +50,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
key: cejst-poetry-env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
- name: Install poetry
uses: snok/install-poetry@v1
- name: Install dependencies
@@ -64,7 +63,7 @@ jobs:
- name: Run static code analysis
run: poetry run pylint data_pipeline/
- name: Check library safety
run: poetry run safety check --ignore 51457 --ignore 44715 --ignore 70612
run: poetry run safety check --ignore 51457 --ignore 44715 --ignore 70612 --ignore 74439
- name: Run unit tests
run: |
poetry run pytest data_pipeline/
@@ -91,7 +90,7 @@ jobs:
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
key: cejst-poetry-env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_backend_main.yml') }}
- name: Install poetry
uses: snok/install-poetry@v1
- name: Print Poetry settings
@@ -104,13 +103,20 @@ jobs:
sudo apt-get update
sudo apt-get -y install gdal-bin
ogrinfo --version
- name: Cleanup Data
run: |
poetry run python3 -m data_pipeline.application data-cleanup
- name: Load cached ETL data
id: cached-etl-data
uses: actions/cache@v4
with:
path: |
data/data-pipeline/data_pipeline/data/census
data/data-pipeline/data_pipeline/data/dataset
key: cejst-dataset-env-${{ runner.os }}-${{ env.python-version }}-${{ hashFiles('data/data-pipeline/data_pipeline/etl/**/*') }}-${{ hashFiles('data/data-pipeline/data_pipeline/utils.py') }}
- name: Get Census Data
if: steps.cached-etl-data.outputs.cache-hit != 'true'
run: |
poetry run python3 -m data_pipeline.application census-data-download
- name: Run ETL
if: steps.cached-etl-data.outputs.cache-hit != 'true'
run: |
poetry run python3 -m data_pipeline.application etl-run
poetry run python3 -m data_pipeline.application etl-run --dataset tribal