From dd723b6c192d161dd684a43ee35e6d345b191693 Mon Sep 17 00:00:00 2001 From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Date: Mon, 21 Mar 2022 18:55:15 -0400 Subject: [PATCH] PyPi Packaging of Data Pipeline (#1464) * PyPi Packaging of Data Pipeline * package rename * adding python version * trigger data checks * print env vars * python version 2 * trigger data check * python version 3 * update caching for other GHAs --- .github/workflows/combine-tilefy.yml | 2 +- .github/workflows/data-checks.yml | 4 +++- .github/workflows/deploy_be_staging.yml | 2 +- .github/workflows/generate-score.yml | 2 +- data/data-pipeline/data_pipeline/etl/base.py | 4 ++-- data/data-pipeline/pyproject.toml | 17 ++++++++++++++--- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/.github/workflows/combine-tilefy.yml b/.github/workflows/combine-tilefy.yml index 09fea6c8..c2a8f584 100644 --- a/.github/workflows/combine-tilefy.yml +++ b/.github/workflows/combine-tilefy.yml @@ -31,7 +31,7 @@ jobs: id: cached-poetry-dependencies with: path: ~/.cache/pypoetry/virtualenvs - key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/combine-tilefy.yml') }} + key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/combine-tilefy.yml') }} - name: Install poetry uses: snok/install-poetry@v1 - name: Print Poetry settings diff --git a/.github/workflows/data-checks.yml b/.github/workflows/data-checks.yml index f7a2e0ed..55a5ad30 100644 --- a/.github/workflows/data-checks.yml +++ b/.github/workflows/data-checks.yml @@ -23,12 +23,14 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Print variables to help debug + uses: hmarr/debug-action@v2 - name: Load cached Poetry installation id: cached-poetry-dependencies uses: actions/cache@v2 with: path: ~/.cache/pypoetry/virtualenvs - key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/data-checks.yml') }} + key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/data-checks.yml') }} - name: Install poetry uses: snok/install-poetry@v1 - name: Print Poetry settings diff --git a/.github/workflows/deploy_be_staging.yml b/.github/workflows/deploy_be_staging.yml index f5bfc4f5..4965a626 100644 --- a/.github/workflows/deploy_be_staging.yml +++ b/.github/workflows/deploy_be_staging.yml @@ -31,7 +31,7 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pypoetry/virtualenvs - key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_be_staging.yml') }} + key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_be_staging.yml') }} - name: Install poetry uses: snok/install-poetry@v1 - name: Print Poetry settings diff --git a/.github/workflows/generate-score.yml b/.github/workflows/generate-score.yml index 14a62f29..07a2322b 100644 --- a/.github/workflows/generate-score.yml +++ b/.github/workflows/generate-score.yml @@ -30,7 +30,7 @@ jobs: uses: actions/cache@v2 with: path: ~/.cache/pypoetry/virtualenvs - key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/generate-score.yml') }} + key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/generate-score.yml') }} - name: Install poetry uses: snok/install-poetry@v1 - name: Print Poetry settings diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 87c737da..6a89861a 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -61,7 +61,7 @@ class ExtractTransformLoad: # in the output file based on this geography level. GEO_LEVEL: ValidGeoLevel = None - # COLUMNS_TO_KEEP to used to identify which columns to keep in the output df. + # COLUMNS_TO_KEEP is used to identify which columns to keep in the output df. COLUMNS_TO_KEEP: typing.List[str] = None # Thirteen digits in a census block group ID. @@ -70,7 +70,7 @@ class ExtractTransformLoad: # be from CBGs at different time periods. EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000 - # Eleven digits in a census tract ID. + # There should be Eleven digits in a census tract ID. EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH: int = 11 # TODO: investigate. Census says there are only 74,134 tracts in the United States, # Puerto Rico, and island areas. This might be from tracts at different time diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml index 95a21369..4513b341 100644 --- a/data/data-pipeline/pyproject.toml +++ b/data/data-pipeline/pyproject.toml @@ -1,8 +1,19 @@ [tool.poetry] -authors = ["Your Name "] -description = "ETL and Generation of Justice 40 Score" -name = "data-pipeline" +name = "justice40-data-pipeline" version = "0.1.0" +description = "ETL, Score and Map Generation of Justice 40 Tool" +authors = ["Justice40 Engineering "] +keywords = ["justice40", "environmental_justice", "python", "etl"] +readme = "README.md" +license = "MIT" +homepage = "https://github.com/usds/justice40-tool/tree/main/data/data-pipeline" +repository = "https://github.com/usds/justice40-tool" +include = [ + "LICENSE", +] +packages = [ + {include = "data_pipeline"} +] [tool.poetry.dependencies] CensusData = "^1.13"