# Deploy Data: runs the data pipeline and syncs its output directory to S3
# whenever anything under data/data-pipeline changes.
name: Deploy Data
on:
  push:
    paths:
      # `*` does not match `/`, so 'data/data-pipeline/*' only matched files
      # sitting directly in that directory. `**` also matches nested files
      # such as data/data-pipeline/data_pipeline/utils.py.
      - 'data/data-pipeline/**'
  pull_request:
    paths:
      - 'data/data-pipeline/**'

jobs:
  deploy_data:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: data/data-pipeline
    strategy:
      matrix:
        # Quoted so YAML keeps the version as a string (an unquoted 3.10
        # would be parsed as the float 3.1).
        python-version: ['3.9']
    steps:
      - name: Checkout source
        uses: actions/checkout@v2
      - name: Print variables to help debug
        uses: hmarr/debug-action@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup Poetry
        uses: Gr1N/setup-poetry@v7
      - name: Print poetry version
        run: poetry --version
      - name: Install dependencies
        run: poetry install
      - name: Install GDAL/ogr2ogr
        # -y: the runner is non-interactive, so nothing can answer the
        # confirmation prompts of add-apt-repository / apt-get.
        # NOTE(review): `pip install` here runs outside `poetry run`, so GDAL
        # lands in the runner's Python rather than the Poetry environment.
        # Confirm that is intended.
        run: |
          sudo apt-add-repository -y ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install -y gdal-bin libgdal-dev
          pip install GDAL==3.2.3
      - name: Run Scripts
        run: |
          poetry run download_census
          poetry run etl_and_score
      # NOTE(review): this job also runs on pull_request. Repository secrets
      # are not exposed to workflows triggered from forks, and `--delete`
      # below removes remote objects that are missing locally. Confirm that
      # deploying from every PR is intended.
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1
      - name: Deploy to Geoplatform AWS
        run: |
          aws s3 sync ./data_pipeline/data/ s3://justice40-data/data-pipeline/data --delete
      - name: Update PR with Comment about deployment
        uses: mshick/add-pr-comment@v1
        with:
          message: |
            Data Synced! Find it here: s3://justice40-data/data-pipeline/data/
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          repo-token-user-login: 'github-actions[bot]' # The user.login for temporary GitHub tokens
          allow-repeats: false # This is the default
get_zip_info + +## zlib is not available on all systems +try: + import zlib # noqa # pylint: disable=unused-import + + compression = zipfile.ZIP_DEFLATED +except (ImportError, AttributeError): + compression = zipfile.ZIP_STORED + logger = get_module_logger(__name__) @@ -18,11 +30,14 @@ class PostScoreETL(ExtractTransformLoad): self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"] self.CENSUS_USA_CSV = self.DATA_PATH / "census" / "csv" / "us.csv" self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" + self.DOWNLOADABLE_INFO_PATH = self.DATA_PATH / "score" / "downloadable" self.STATE_CSV = self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv" self.FULL_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv" - self.TILR_SCORE_CSV = self.SCORE_CSV_PATH / "tile" / "usa.csv" + self.FULL_SCORE_CSV_PLUS_COUNTIES = ( + self.SCORE_CSV_PATH / "full" / "usa_counties.csv" + ) self.TILES_SCORE_COLUMNS = [ "GEOID10", @@ -35,6 +50,46 @@ class PostScoreETL(ExtractTransformLoad): self.TILES_SCORE_CSV_PATH = self.SCORE_CSV_PATH / "tiles" self.TILES_SCORE_CSV = self.TILES_SCORE_CSV_PATH / "usa.csv" + # These are the + self.DOWNLOADABLE_SCORE_INDICATORS_BASIC = [ + "Percent individuals age 25 or over with less than high school degree", + "Linguistic isolation (percent)", + "Poverty (Less than 200% of federal poverty line)", + "Unemployed civilians (percent)", + "Housing burden (percent)", + "Respiratory hazard index", + "Diesel particulate matter", + "Particulate matter (PM2.5)", + "Traffic proximity and volume", + "Proximity to RMP sites", + "Wastewater discharge", + "Percent pre-1960s housing (lead paint indicator)", + "Total population", + ] + + # For every indicator above, we want to include percentile and min-max normalized variants also + self.DOWNLOADABLE_SCORE_INDICATORS_FULL = list( + pd.core.common.flatten( + [ + [p, f"{p} (percentile)", f"{p} (min-max normalized)"] + for p in self.DOWNLOADABLE_SCORE_INDICATORS_BASIC + ] + ) + ) + + # Finally we augment with the GEOID10, 
def _save_full_csv(self):
    """Write the county/state-merged score table to the full-score CSV.

    PostScoreETL helper. Dumps ``self.score_county_state_merged`` (without
    the index) to ``self.FULL_SCORE_CSV_PLUS_COUNTIES``.
    """
    logger.info("Saving Full Score CSV with County Information")
    # The target file lives in the "full" sub-directory of SCORE_CSV_PATH and
    # to_csv does not create missing directories, so create the file's actual
    # parent (this also creates SCORE_CSV_PATH itself).
    self.FULL_SCORE_CSV_PLUS_COUNTIES.parent.mkdir(parents=True, exist_ok=True)
    self.score_county_state_merged.to_csv(
        self.FULL_SCORE_CSV_PLUS_COUNTIES, index=False
    )


def _save_tile_csv(self):
    """Write the tile subset of the merged score table to the tiles CSV.

    PostScoreETL helper. Keeps only ``self.TILES_SCORE_COLUMNS`` and writes
    them (without the index) to ``self.TILES_SCORE_CSV``.
    """
    logger.info("Saving Tile Score CSV")
    # TODO: check which are the columns we'll use
    # Related to: https://github.com/usds/justice40-tool/issues/302
    score_tiles = self.score_county_state_merged[self.TILES_SCORE_COLUMNS]
    self.TILES_SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
    score_tiles.to_csv(self.TILES_SCORE_CSV, index=False)


def _save_downloadable_zip(self):
    """Write the downloadable CSV and Excel files and bundle them in a zip.

    PostScoreETL helper. Selects ``self.DOWNLOADABLE_SCORE_COLUMNS`` from the
    merged score table, writes the CSV and Excel variants, zips both files
    under their bare file names, and logs the archive's metadata as JSON.
    """
    logger.info("Saving Downloadable CSV")
    # Log both column lists: a column missing from the merged frame makes the
    # selection below raise, and these two lines show the mismatch.
    logger.info(list(self.score_county_state_merged.columns))
    logger.info(self.DOWNLOADABLE_SCORE_COLUMNS)
    downloadable_tiles = self.score_county_state_merged[
        self.DOWNLOADABLE_SCORE_COLUMNS
    ]
    self.DOWNLOADABLE_INFO_PATH.mkdir(parents=True, exist_ok=True)

    logger.info("Writing downloadable csv")
    downloadable_tiles.to_csv(self.DOWNLOADABLE_SCORE_CSV, index=False)

    logger.info("Writing downloadable excel")
    downloadable_tiles.to_excel(self.DOWNLOADABLE_SCORE_EXCEL, index=False)

    logger.info("Compressing files")
    files_to_compress = [
        self.DOWNLOADABLE_SCORE_CSV,
        self.DOWNLOADABLE_SCORE_EXCEL,
    ]
    with zipfile.ZipFile(self.DOWNLOADABLE_SCORE_ZIP, "w") as zf:
        for f in files_to_compress:
            # arcname keeps only the file name so the archive has no
            # directory structure; `compression` is the module-level choice
            # between ZIP_DEFLATED and ZIP_STORED.
            zf.write(f, arcname=Path(f).name, compress_type=compression)
    zip_info = get_zip_info(self.DOWNLOADABLE_SCORE_ZIP)
    logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))


def load(self) -> None:
    """Write every output artifact: full CSV, tile CSV, downloadable zip."""
    self._save_full_csv()
    self._save_tile_csv()
    self._save_downloadable_zip()
def get_zip_info(archive_path: Path) -> list:
    """
    Returns information about a provided archive

    Args:
        archive_path (pathlib.Path): Path of the archive to be inspected

    Returns:
        a list with one dict per archive member, in archive order. Each dict
        has the keys "Filename", "Comment", "Modified" (ISO-8601 string),
        "System", "ZIP version", "Compressed" and "Uncompressed". The list is
        empty for an archive without members.

    """
    # Open the archive in a context manager so its file handle is released
    # as soon as the metadata has been read, even if a member's comment
    # fails to decode. The original left the ZipFile open.
    with zipfile.ZipFile(archive_path) as zf:
        return [
            {
                "Filename": info.filename,
                "Comment": info.comment.decode("utf8"),
                "Modified": datetime.datetime(*info.date_time).isoformat(),
                "System": f"{info.create_system} (0 = Windows, 3 = Unix)",
                "ZIP version": info.create_version,
                "Compressed": f"{info.compress_size} bytes",
                "Uncompressed": f"{info.file_size} bytes",
            }
            for info in zf.infolist()
        ]
to read/write Excel 2010 xlsx/xlsm files" +category = "dev" +optional = false +python-versions = ">=3.6," + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "packaging" version = "21.0" @@ -1529,7 +1548,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytes [metadata] lock-version = "1.1" python-versions = "^3.7.1" -content-hash = "6fcf0825ce80c30181c920385d4e9b5e79ac6930b9a59526a916703795977f76" +content-hash = "7380a36633c41b57d351df2facdf3a5fd05dfc9f0dc4f629d5f3dfec61181c6b" [metadata.files] appdirs = [ @@ -1751,6 +1770,10 @@ entrypoints = [ {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"}, {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"}, ] +et-xmlfile = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] filelock = [ {file = "filelock-3.0.12-py3-none-any.whl", hash = "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"}, {file = "filelock-3.0.12.tar.gz", hash = "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59"}, @@ -2068,6 +2091,10 @@ numpy = [ {file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"}, {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"}, ] +openpyxl = [ + {file = "openpyxl-3.0.7-py2.py3-none-any.whl", hash = "sha256:46af4eaf201a89b610fcca177eed957635f88770a5462fb6aae4a2a52b0ff516"}, + {file = "openpyxl-3.0.7.tar.gz", hash = "sha256:6456a3b472e1ef0facb1129f3c6ef00713cebf62e736cd7a75bcc3247432f251"}, +] packaging = [ {file = 
"packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml index 958b3c5a..1f6fdbf2 100644 --- a/data/data-pipeline/pyproject.toml +++ b/data/data-pipeline/pyproject.toml @@ -23,6 +23,7 @@ black = {version = "^21.6b0", allow-prereleases = true} flake8 = "^3.9.2" liccheck = "^0.6.2" mypy = "^0.910" +openpyxl = "^3.0.7" pylint = "^2.9.6" pytest = "^6.2.4" safety = "^1.10.3" @@ -100,6 +101,7 @@ authorized_licenses = [ ] [tool.poetry.scripts] +cleanup_census = 'data_pipeline.application:census_cleanup' cleanup_data = 'data_pipeline.application:data_cleanup' download_census = 'data_pipeline.application:census_data_download' etl = 'data_pipeline.application:etl_run' diff --git a/data/data-pipeline/requirements.txt b/data/data-pipeline/requirements.txt index df88696e..f6d93c43 100644 --- a/data/data-pipeline/requirements.txt +++ b/data/data-pipeline/requirements.txt @@ -25,6 +25,7 @@ distlib==0.3.2; python_version >= "2.7" and python_full_version < "3.0.0" or pyt dparse==0.5.1; python_version >= "3.5" dynaconf==3.1.4 entrypoints==0.3; python_version >= "3.7" +et-xmlfile==1.1.0; python_version >= "3.6" filelock==3.0.12; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" fiona==1.8.20; python_version >= "3.6" flake8==3.9.2; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0") @@ -67,6 +68,7 @@ nbformat==5.1.3; python_full_version >= "3.6.1" and python_version >= "3.7" nest-asyncio==1.5.1; python_full_version >= "3.6.1" and python_version >= "3.7" notebook==6.4.0; python_version >= "3.6" numpy==1.21.1; python_version >= "3.7" +openpyxl==3.0.7; python_version >= "3.6" packaging==21.0; python_version >= "3.7" and python_full_version < 
"3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.7" pandas==1.3.1; python_full_version >= "3.7.1" pandocfilters==1.4.3; python_version >= "3.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.7"