From 9a9d5fdf7f21d4b6ee383ab6dc4403727d27d77b Mon Sep 17 00:00:00 2001
From: Nat Hillard <72811320+NatHillardUSDS@users.noreply.github.com>
Date: Mon, 9 Aug 2021 10:39:59 -0400
Subject: [PATCH] Backend change for Zipfile pt. 2 (#469)

* Fixes #303 : adding downloadable zip archive logic
* linter recommendations
* Pushes data directory to AWS. We'll want to move to use AWS for this ASAP, but this works for now
* updating pattern
---
 .github/workflows/deploy_data.yml             |  60 +++++++++++
 .../data/score/downloadable/__init__.py       |   0
 .../data/score/tiles/__init__.py              |   0
 .../data_pipeline/etl/score/etl_score_post.py | 102 ++++++++++++++++--
 data/data-pipeline/data_pipeline/utils.py     |  38 ++++++-
 data/data-pipeline/poetry.lock                |  29 ++++-
 data/data-pipeline/pyproject.toml             |   2 +
 data/data-pipeline/requirements.txt           |   2 +
 8 files changed, 223 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/deploy_data.yml
 create mode 100644 data/data-pipeline/data_pipeline/data/score/downloadable/__init__.py
 create mode 100644 data/data-pipeline/data_pipeline/data/score/tiles/__init__.py

diff --git a/.github/workflows/deploy_data.yml b/.github/workflows/deploy_data.yml
new file mode 100644
index 00000000..2ce4050c
--- /dev/null
+++ b/.github/workflows/deploy_data.yml
@@ -0,0 +1,60 @@
+name: Deploy Data
+on:
+  push:
+    paths:
+      - 'data/data-pipeline/**' # '**' also matches files in nested directories; '*' stops at '/'
+  pull_request:
+    paths:
+      - 'data/data-pipeline/**' # same filter as push, so nested changes trigger the job too
+
+jobs:
+  deploy_data:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: data/data-pipeline
+    strategy:
+      matrix:
+        python-version: [3.9]
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v2
+      - name: Print variables to help debug
+        uses: hmarr/debug-action@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Setup Poetry
+        uses: Gr1N/setup-poetry@v7
+      - name: Print poetry version
+        run: poetry --version
+      - name: Install dependencies
+        run: poetry install
+      - name: Install GDAL/ogr2ogr
+        run: |
+          sudo apt-add-repository ppa:ubuntugis/ubuntugis-unstable
+          sudo apt-get update
+          sudo apt-get install gdal-bin libgdal-dev
+          pip install GDAL==3.2.3
+      - name: Run Scripts
+        run: |
+          poetry run download_census
+          poetry run etl_and_score
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-east-1
+      - name: Deploy to Geoplatform AWS
+        run: |
+          aws s3 sync ./data_pipeline/data/ s3://justice40-data/data-pipeline/data --delete
+      - name: Update PR with Comment about deployment
+        uses: mshick/add-pr-comment@v1
+        with:
+          message: |
+            Data Synced! Find it here: s3://justice40-data/data-pipeline/data/
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          repo-token-user-login: 'github-actions[bot]' # The user.login for temporary GitHub tokens
+          allow-repeats: false # This is the default
\ No newline at end of file
diff --git a/data/data-pipeline/data_pipeline/data/score/downloadable/__init__.py b/data/data-pipeline/data_pipeline/data/score/downloadable/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/data/score/tiles/__init__.py b/data/data-pipeline/data_pipeline/data/score/tiles/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
index f7b6bff9..04561ad3 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -1,7 +1,19 @@
-import pandas as pd
+import json
+import zipfile
+from pathlib import Path
 
+import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.utils import get_module_logger
+from data_pipeline.utils import get_module_logger, get_zip_info
+
+## zlib is not available on all systems
+try:
+    import zlib  # noqa # pylint: disable=unused-import
+
+    compression = zipfile.ZIP_DEFLATED
+except (ImportError, AttributeError):
+    compression = zipfile.ZIP_STORED
+
 
 logger = get_module_logger(__name__)
 
@@ -18,11 +30,14 @@ class PostScoreETL(ExtractTransformLoad):
         self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"]
         self.CENSUS_USA_CSV = self.DATA_PATH / "census" / "csv" / "us.csv"
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
+        self.DOWNLOADABLE_INFO_PATH = self.DATA_PATH / "score" / "downloadable"
 
         self.STATE_CSV = self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv"
 
         self.FULL_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv"
-        self.TILR_SCORE_CSV = self.SCORE_CSV_PATH / "tile" / "usa.csv"
+        self.FULL_SCORE_CSV_PLUS_COUNTIES = (
+            self.SCORE_CSV_PATH / "full" / "usa_counties.csv"
+        )
 
         self.TILES_SCORE_COLUMNS = [
             "GEOID10",
@@ -35,6 +50,46 @@ class PostScoreETL(ExtractTransformLoad):
         self.TILES_SCORE_CSV_PATH = self.SCORE_CSV_PATH / "tiles"
         self.TILES_SCORE_CSV = self.TILES_SCORE_CSV_PATH / "usa.csv"
 
+        # These are the basic indicator columns included in the downloadable score files
+        self.DOWNLOADABLE_SCORE_INDICATORS_BASIC = [
+            "Percent individuals age 25 or over with less than high school degree",
+            "Linguistic isolation (percent)",
+            "Poverty (Less than 200% of federal poverty line)",
+            "Unemployed civilians (percent)",
+            "Housing burden (percent)",
+            "Respiratory hazard index",
+            "Diesel particulate matter",
+            "Particulate matter (PM2.5)",
+            "Traffic proximity and volume",
+            "Proximity to RMP sites",
+            "Wastewater discharge",
+            "Percent pre-1960s housing (lead paint indicator)",
+            "Total population",
+        ]
+
+        # For every indicator above, we want to include percentile and min-max normalized variants also
+        self.DOWNLOADABLE_SCORE_INDICATORS_FULL = list(
+            pd.core.common.flatten(
+                [
+                    [p, f"{p} (percentile)", f"{p} (min-max normalized)"]
+                    for p in self.DOWNLOADABLE_SCORE_INDICATORS_BASIC
+                ]
+            )
+        )
+
+        # Finally we augment with the GEOID10, county, and state
+        self.DOWNLOADABLE_SCORE_COLUMNS = [
+            "GEOID10",
+            "County Name",
+            "State Name",
+            *self.DOWNLOADABLE_SCORE_INDICATORS_FULL,
+        ]
+        self.DOWNLOADABLE_SCORE_CSV = self.DOWNLOADABLE_INFO_PATH / "usa.csv"
+        self.DOWNLOADABLE_SCORE_EXCEL = self.DOWNLOADABLE_INFO_PATH / "usa.xlsx"
+        self.DOWNLOADABLE_SCORE_ZIP = (
+            self.DOWNLOADABLE_INFO_PATH / "Screening Tool Data.zip"
+        )
+
         self.counties_df: pd.DataFrame
         self.states_df: pd.DataFrame
         self.score_df: pd.DataFrame
@@ -43,7 +98,8 @@ class PostScoreETL(ExtractTransformLoad):
 
     def extract(self) -> None:
         super().extract(
-            self.CENSUS_COUNTIES_ZIP_URL, self.TMP_PATH,
+            self.CENSUS_COUNTIES_ZIP_URL,
+            self.TMP_PATH,
         )
 
         logger.info("Reading Counties CSV")
@@ -67,7 +123,8 @@ class PostScoreETL(ExtractTransformLoad):
         # rename some of the columns to prepare for merge
         self.counties_df = self.counties_df[["USPS", "GEOID", "NAME"]]
         self.counties_df.rename(
-            columns={"USPS": "State Abbreviation", "NAME": "County Name"}, inplace=True,
+            columns={"USPS": "State Abbreviation", "NAME": "County Name"},
+            inplace=True,
         )
 
         # remove unnecessary columns
@@ -122,14 +179,45 @@ class PostScoreETL(ExtractTransformLoad):
         # set the score to the new df
         self.score_county_state_merged = removed_df
 
-    def load(self) -> None:
+    def _save_full_csv(self):
         logger.info("Saving Full Score CSV with County Information")
         self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
-        self.score_county_state_merged.to_csv(self.FULL_SCORE_CSV, index=False)
+        self.score_county_state_merged.to_csv(
+            self.FULL_SCORE_CSV_PLUS_COUNTIES, index=False
+        )
 
+    def _save_tile_csv(self):
         logger.info("Saving Tile Score CSV")
         # TODO: check which are the columns we'll use
         # Related to: https://github.com/usds/justice40-tool/issues/302
         score_tiles = self.score_county_state_merged[self.TILES_SCORE_COLUMNS]
         self.TILES_SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
         score_tiles.to_csv(self.TILES_SCORE_CSV, index=False)
+
+    def _save_downloadable_zip(self):
+        logger.info("Saving Downloadable CSV")
+        logger.info(list(self.score_county_state_merged.columns))
+        logger.info(self.DOWNLOADABLE_SCORE_COLUMNS)
+        downloadable_tiles = self.score_county_state_merged[
+            self.DOWNLOADABLE_SCORE_COLUMNS
+        ]
+        self.DOWNLOADABLE_INFO_PATH.mkdir(parents=True, exist_ok=True)
+
+        logger.info("Writing downloadable csv")
+        downloadable_tiles.to_csv(self.DOWNLOADABLE_SCORE_CSV, index=False)
+
+        logger.info("Writing downloadable excel")
+        downloadable_tiles.to_excel(self.DOWNLOADABLE_SCORE_EXCEL, index=False)
+
+        logger.info("Compressing files")
+        files_to_compress = [self.DOWNLOADABLE_SCORE_CSV, self.DOWNLOADABLE_SCORE_EXCEL]
+        with zipfile.ZipFile(self.DOWNLOADABLE_SCORE_ZIP, "w") as zf:
+            for f in files_to_compress:
+                zf.write(f, arcname=Path(f).name, compress_type=compression)
+        zip_info = get_zip_info(self.DOWNLOADABLE_SCORE_ZIP)
+        logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))
+
+    def load(self) -> None:
+        self._save_full_csv()
+        self._save_tile_csv()
+        self._save_downloadable_zip()
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index 3dcb04be..4263a1a5 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -1,5 +1,7 @@
+import datetime
 import logging
 import os
+import sys
 import shutil
 import zipfile
 from pathlib import Path
@@ -119,8 +121,13 @@ def unzip_file_from_url(
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
     logger.info(f"Downloading {file_url}")
-    download = requests.get(file_url, verify=verify)
-    file_contents = download.content
+    response = requests.get(file_url, verify=verify)
+    if response.status_code == 200:
+        file_contents = response.content
+    else:
+        sys.exit(
+            f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
+        )
 
     zip_file_path = download_path / "downloaded.zip"
     zip_file = open(zip_file_path, "wb")
@@ -152,6 +159,7 @@ def score_folder_cleanup() -> None:
     logger.info("Initializing all score data")
     remove_all_from_dir(data_path / "score" / "csv")
     remove_all_from_dir(data_path / "score" / "geojson")
+    remove_all_from_dir(data_path / "score" / "downloadable")
 
 
 def temp_folder_cleanup() -> None:
@@ -1176,3 +1184,29 @@ def get_excel_column_name(index: int) -> str:
     ]
 
     return excel_column_names[index]
+
+
+def get_zip_info(archive_path: Path) -> list:
+    """
+    Returns information about every member of a provided archive
+
+    Args:
+        archive_path (pathlib.Path): Path of the archive to be inspected
+
+    Returns:
+        a list of dicts (one per archived file) holding human-readable metadata
+
+    """
+    info_list = []
+    with zipfile.ZipFile(archive_path) as zf:  # ensure the archive handle is closed
+        for info in zf.infolist():
+            info_dict = {}
+            info_dict["Filename"] = info.filename
+            info_dict["Comment"] = info.comment.decode("utf8")
+            info_dict["Modified"] = datetime.datetime(*info.date_time).isoformat()
+            info_dict["System"] = f"{info.create_system} (0 = Windows, 3 = Unix)"
+            info_dict["ZIP version"] = info.create_version
+            info_dict["Compressed"] = f"{info.compress_size} bytes"
+            info_dict["Uncompressed"] = f"{info.file_size} bytes"
+            info_list.append(info_dict)
+    return info_list
diff --git a/data/data-pipeline/poetry.lock b/data/data-pipeline/poetry.lock
index eafd0657..1c7b5bbd 100644
--- a/data/data-pipeline/poetry.lock
+++ b/data/data-pipeline/poetry.lock
@@ -310,6 +310,14 @@ category = "main"
 optional = false
 python-versions = ">=2.7"
 
+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
 [[package]]
 name = "filelock"
 version = "3.0.12"
@@ -935,6 +943,17 @@ category = "main"
 optional = false
 python-versions = ">=3.7"
 
+[[package]]
+name = "openpyxl"
+version = "3.0.7"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+category = "dev"
+optional = false
+python-versions = ">=3.6,"
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "packaging"
 version = "21.0"
@@ -1529,7 +1548,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7.1"
-content-hash = "6fcf0825ce80c30181c920385d4e9b5e79ac6930b9a59526a916703795977f76"
+content-hash = "7380a36633c41b57d351df2facdf3a5fd05dfc9f0dc4f629d5f3dfec61181c6b"
 
 [metadata.files]
 appdirs = [
@@ -1751,6 +1770,10 @@ entrypoints = [
     {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"},
     {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"},
 ]
+et-xmlfile = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
 filelock = [
     {file = "filelock-3.0.12-py3-none-any.whl", hash = "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"},
     {file = "filelock-3.0.12.tar.gz", hash = "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59"},
@@ -2068,6 +2091,10 @@ numpy = [
     {file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"},
     {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"},
 ]
+openpyxl = [
+    {file = "openpyxl-3.0.7-py2.py3-none-any.whl", hash = "sha256:46af4eaf201a89b610fcca177eed957635f88770a5462fb6aae4a2a52b0ff516"},
+    {file = "openpyxl-3.0.7.tar.gz", hash = "sha256:6456a3b472e1ef0facb1129f3c6ef00713cebf62e736cd7a75bcc3247432f251"},
+]
 packaging = [
     {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"},
     {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"},
diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml
index 958b3c5a..1f6fdbf2 100644
--- a/data/data-pipeline/pyproject.toml
+++ b/data/data-pipeline/pyproject.toml
@@ -23,6 +23,7 @@ black = {version = "^21.6b0", allow-prereleases = true}
 flake8 = "^3.9.2"
 liccheck = "^0.6.2"
 mypy = "^0.910"
+openpyxl = "^3.0.7"
 pylint = "^2.9.6"
 pytest = "^6.2.4"
 safety = "^1.10.3"
@@ -100,6 +101,7 @@ authorized_licenses = [
 ]
 
 [tool.poetry.scripts]
+cleanup_census = 'data_pipeline.application:census_cleanup'
 cleanup_data = 'data_pipeline.application:data_cleanup'
 download_census = 'data_pipeline.application:census_data_download'
 etl = 'data_pipeline.application:etl_run'
diff --git a/data/data-pipeline/requirements.txt b/data/data-pipeline/requirements.txt
index df88696e..f6d93c43 100644
--- a/data/data-pipeline/requirements.txt
+++ b/data/data-pipeline/requirements.txt
@@ -25,6 +25,7 @@ distlib==0.3.2; python_version >= "2.7" and python_full_version < "3.0.0" or pyt
 dparse==0.5.1; python_version >= "3.5"
 dynaconf==3.1.4
 entrypoints==0.3; python_version >= "3.7"
+et-xmlfile==1.1.0; python_version >= "3.6"
 filelock==3.0.12; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
 fiona==1.8.20; python_version >= "3.6"
 flake8==3.9.2; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
@@ -67,6 +68,7 @@ nbformat==5.1.3; python_full_version >= "3.6.1" and python_version >= "3.7"
 nest-asyncio==1.5.1; python_full_version >= "3.6.1" and python_version >= "3.7"
 notebook==6.4.0; python_version >= "3.6"
 numpy==1.21.1; python_version >= "3.7"
+openpyxl==3.0.7; python_version >= "3.6"
 packaging==21.0; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7"
 pandas==1.3.1; python_full_version >= "3.7.1"
 pandocfilters==1.4.3; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7"