S3 Parallel Upload and Deletions (#1410)

* installation step

* trigger action

* installing to home dir

* dry-run

* pyenv

* py 2.8

* trying s4cmd

* removing pyenv

* poetry s4cmd

* num-threads

* public read

* poetry cache

* s4cmd all around

* poetry cache

* poetry cache

* install poetry packages

* poetry echo

* let's do this

* s4cmd install on run

* s4cmd

* add aws back

* add aws back

* testing census api key and poetry caching

* census api key

* census api

* census api key #3

* 250

* poetry update

* poetry change

* check census api key

* force flag

* update score gen and tilefy; remove cached fips

* small gdal update

* invalidation

* missing cache ids
This commit is contained in:
Jorge Escobar 2022-03-17 23:19:23 -04:00 committed by GitHub
commit 7b05ee9c76
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 307 additions and 197 deletions

View file

@@ -6,7 +6,8 @@ on:
description: This will rebuild the data sources and regenerate the score, are you sure you want to proceed? (Y/n)
default: n
required: true
env:
BE_CDN_ID: E1324VDMNCO97N
jobs:
deploy_data:
runs-on: ubuntu-latest
@@ -25,15 +26,23 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Setup Poetry
uses: Gr1N/setup-poetry@v7
- name: Print poetry version
run: poetry --version
- name: Load cached Poetry installation
uses: actions/cache@v2
id: cached-poetry-dependencies
with:
path: ~/.cache/pypoetry/virtualenvs
key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/combine-tilefy.yml') }}
- name: Install poetry
uses: snok/install-poetry@v1
- name: Print Poetry settings
run: poetry show -v
- name: Install dependencies
run: poetry install
run: poetry add s4cmd && poetry install
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- name: Install GDAL/ogr2ogr
run: |
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get -y install gdal-bin
ogrinfo --version
- name: Set timezone for tippecanoe
@@ -65,7 +74,15 @@ jobs:
aws-region: us-east-1
- name: Deploy to Geoplatform AWS
run: |
aws s3 rm s3://justice40-data/data-pipeline/data/score/tiles --recursive
aws s3 cp ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline/data/score/tiles --recursive --acl public-read
aws s3 sync ./data_pipeline/data/score/geojson/ s3://justice40-data/data-pipeline/data/score/geojson --acl public-read --delete
aws s3 sync ./data_pipeline/data/score/shapefile/ s3://justice40-data/data-pipeline/data/score/shapefile --acl public-read --delete
poetry run s4cmd del s3://justice40-data/data-pipeline/data/score/tiles --recursive --num-threads=250
poetry run s4cmd put ./data_pipeline/data/score/geojson/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/geojson --recursive --force --API-ACL=public-read --num-threads=250
poetry run s4cmd put ./data_pipeline/data/score/shapefile/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/shapefile --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/tiles --recursive --force --API-ACL=public-read --num-threads=250
- name: Invalidate cache on AWS CDNs
uses: chetan/invalidate-cloudfront-action@master
env:
DISTRIBUTION: ${{env.BE_CDN_ID}}
PATHS: "/*"
AWS_REGION: "us-east-1"
AWS_ACCESS_KEY_ID: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}

View file

@@ -7,6 +7,7 @@ on:
env:
PR_NUMBER: ${{github.event.pull_request.number}}
SHA_NUMBER: ${{github.event.pull_request.head.sha}}
CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }}
jobs:
generate-score-tiles:
runs-on: ubuntu-latest
@@ -25,12 +26,19 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Setup Poetry
uses: Gr1N/setup-poetry@v7
- name: Print poetry version
run: poetry --version
- name: Load cached Poetry installation
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: ~/.cache/pypoetry/virtualenvs
key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_be_staging.yml') }}
- name: Install poetry
uses: snok/install-poetry@v1
- name: Print Poetry settings
run: poetry show -v
- name: Install dependencies
run: poetry install
run: poetry add s4cmd && poetry install
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1
with:
@@ -45,8 +53,8 @@ jobs:
poetry run python3 data_pipeline/application.py generate-score-post -s aws
- name: Deploy Score to Geoplatform AWS
run: |
aws s3 cp ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --acl public-read
aws s3 cp ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --acl public-read
poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
- name: Update PR with deployed Score URLs
uses: mshick/add-pr-comment@v1
with:
@@ -62,6 +70,7 @@ jobs:
- name: Install GDAL/ogr2ogr
run: |
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get -y install gdal-bin
ogrinfo --version
- name: Set timezone for tippecanoe
@@ -89,11 +98,9 @@ jobs:
poetry run python3 data_pipeline/application.py generate-map-tiles
- name: Deploy Map to Geoplatform AWS
run: |
aws s3 cp ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --acl public-read
aws s3 cp ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --acl public-read
aws s3 cp ./data_pipeline/data/score/geojson/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/geojson --recursive --acl public-read
aws s3 cp ./data_pipeline/data/score/shapefile/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/shapefile --recursive --acl public-read
aws s3 cp ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/tiles --recursive --acl public-read
poetry run s4cmd put ./data_pipeline/data/score/geojson/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/geojson --recursive --force --API-ACL=public-read --num-threads=250
poetry run s4cmd put ./data_pipeline/data/score/shapefile/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/shapefile --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/tiles --recursive --force --API-ACL=public-read --num-threads=250
- name: Update PR with deployed Map URL
uses: mshick/add-pr-comment@v1
with:

View file

@@ -1,5 +1,5 @@
name: Generate Score
on:
on:
workflow_dispatch:
inputs:
confirm-action:
@@ -25,12 +25,19 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Setup Poetry
uses: Gr1N/setup-poetry@v7
- name: Print poetry version
run: poetry --version
- name: Load cached Poetry installation
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: ~/.cache/pypoetry/virtualenvs
key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/generate-score.yml') }}
- name: Install poetry
uses: snok/install-poetry@v1
- name: Print Poetry settings
run: poetry show -v
- name: Install dependencies
run: poetry install
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1
with:
@@ -39,14 +46,14 @@ jobs:
aws-region: us-east-1
- name: Generate Score
run: |
poetry run python3 data_pipeline/application.py score-full-run
poetry run python3 data_pipeline/application.py score-full-run
- name: Upload Score to AWS
run: |
aws s3 sync ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline/data/score/csv --acl public-read --delete
- name: Generate Score Post
run: |
poetry run python3 data_pipeline/application.py generate-score-post -s aws
poetry run python3 data_pipeline/application.py generate-score-post -s aws
- name: Upload Score Post to AWS
run: |
aws s3 sync ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline/data/score/csv --acl public-read --delete
aws s3 sync ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline/data/score/downloadable --acl public-read --delete
aws s3 sync ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline/data/score/downloadable --acl public-read --delete