From dd723b6c192d161dd684a43ee35e6d345b191693 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Mon, 21 Mar 2022 18:55:15 -0400
Subject: [PATCH] PyPI Packaging of Data Pipeline (#1464)

* PyPI Packaging of Data Pipeline

* package rename

* adding python version

* trigger data checks

* print env vars

* python version 2

* trigger data check

* python version 3

* update caching for other GHAs
---
 .github/workflows/combine-tilefy.yml         |  2 +-
 .github/workflows/data-checks.yml            |  4 +++-
 .github/workflows/deploy_be_staging.yml      |  2 +-
 .github/workflows/generate-score.yml         |  2 +-
 data/data-pipeline/data_pipeline/etl/base.py |  4 ++--
 data/data-pipeline/pyproject.toml            | 17 ++++++++++++++---
 6 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/combine-tilefy.yml b/.github/workflows/combine-tilefy.yml
index 09fea6c8..c2a8f584 100644
--- a/.github/workflows/combine-tilefy.yml
+++ b/.github/workflows/combine-tilefy.yml
@@ -31,7 +31,7 @@ jobs:
         id: cached-poetry-dependencies
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/combine-tilefy.yml') }}
+          key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/combine-tilefy.yml') }}
       - name: Install poetry
         uses: snok/install-poetry@v1
       - name: Print Poetry settings
diff --git a/.github/workflows/data-checks.yml b/.github/workflows/data-checks.yml
index f7a2e0ed..55a5ad30 100644
--- a/.github/workflows/data-checks.yml
+++ b/.github/workflows/data-checks.yml
@@ -23,12 +23,14 @@ jobs:
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Print variables to help debug
+        uses: hmarr/debug-action@v2
       - name: Load cached Poetry installation
         id: cached-poetry-dependencies
         uses: actions/cache@v2
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/data-checks.yml') }}
+          key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/data-checks.yml') }}
       - name: Install poetry
         uses: snok/install-poetry@v1
       - name: Print Poetry settings
diff --git a/.github/workflows/deploy_be_staging.yml b/.github/workflows/deploy_be_staging.yml
index f5bfc4f5..4965a626 100644
--- a/.github/workflows/deploy_be_staging.yml
+++ b/.github/workflows/deploy_be_staging.yml
@@ -31,7 +31,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_be_staging.yml') }}
+          key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/deploy_be_staging.yml') }}
       - name: Install poetry
         uses: snok/install-poetry@v1
       - name: Print Poetry settings
diff --git a/.github/workflows/generate-score.yml b/.github/workflows/generate-score.yml
index 14a62f29..07a2322b 100644
--- a/.github/workflows/generate-score.yml
+++ b/.github/workflows/generate-score.yml
@@ -30,7 +30,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: env-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/generate-score.yml') }}
+          key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/generate-score.yml') }}
       - name: Install poetry
         uses: snok/install-poetry@v1
       - name: Print Poetry settings
diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
index 87c737da..6a89861a 100644
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -61,7 +61,7 @@ class ExtractTransformLoad:
     #  in the output file based on this geography level.
     GEO_LEVEL: ValidGeoLevel = None
 
-    # COLUMNS_TO_KEEP to used to identify which columns to keep in the output df.
+    # COLUMNS_TO_KEEP is used to identify which columns to keep in the output df.
     COLUMNS_TO_KEEP: typing.List[str] = None
 
     # Thirteen digits in a census block group ID.
@@ -70,7 +70,7 @@ class ExtractTransformLoad:
     #  be from CBGs at different time periods.
     EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
 
-    # Eleven digits in a census tract ID.
+    # There should be eleven digits in a census tract ID.
     EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH: int = 11
     # TODO: investigate. Census says there are only 74,134 tracts in the United States,
     #  Puerto Rico, and island areas. This might be from tracts at different time
diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml
index 95a21369..4513b341 100644
--- a/data/data-pipeline/pyproject.toml
+++ b/data/data-pipeline/pyproject.toml
@@ -1,8 +1,19 @@
 [tool.poetry]
-authors = ["Your Name <you@example.com>"]
-description = "ETL and Generation of Justice 40 Score"
-name = "data-pipeline"
+name = "justice40-data-pipeline"
 version = "0.1.0"
+description = "ETL, Score and Map Generation of Justice 40 Tool"
+authors = ["Justice40 Engineering <j40-engineering@lists.usds.gov>"]
+keywords = ["justice40", "environmental_justice", "python", "etl"]
+readme = "README.md"
+license = "MIT"
+homepage = "https://github.com/usds/justice40-tool/tree/main/data/data-pipeline"
+repository = "https://github.com/usds/justice40-tool"
+include = [
+    "LICENSE",
+]
+packages = [
+  {include = "data_pipeline"}
+]
 
 [tool.poetry.dependencies]
 CensusData = "^1.13"