From 4d7465c833340cf8121cc96b6129b743adc9255a Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Thu, 5 Aug 2021 12:55:21 -0400
Subject: [PATCH] Hotfix for fips zip download location + added full-score-run command (#465)

* Hotfix for S3 locations of data sources

* updated README

* lint failures

Co-authored-by: Nat Hillard
---
 data/data-pipeline/README.md                  |  4 +-
 data/data-pipeline/application.py             | 34 ++++++------
 .../data/census/csv/fips_states_2010.csv      | 53 -------------------
 .../etl/sources/calenviroscreen/etl.py        |  5 +-
 .../etl/sources/census/etl_utils.py           |  2 +-
 data/data-pipeline/settings.toml              |  2 +-
 6 files changed, 26 insertions(+), 74 deletions(-)
 delete mode 100644 data/data-pipeline/data/census/csv/fips_states_2010.csv

diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md
index ca8333f5..55f7b0b1 100644
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@@ -123,9 +123,9 @@ Once completed, run `docker-compose up` and then open a new tab or terminal wind
 Here's a list of commands:
 
 - Get help: `docker exec j40_data_pipeline_1 python3 application.py --help"`
-- Clean up the census data directories: `docker exec j40_data_pipeline_1 python3 application.py census-cleanup"`
-- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 application.py data-cleanup"`
 - Generate census data: `docker exec j40_data_pipeline_1 python3 application.py census-data-download"`
+- Run all ETL and Generate score: `docker exec j40_data_pipeline_1 python3 application.py score-full-run`
+- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 application.py data-cleanup"`
 - Run all ETL processes: `docker exec j40_data_pipeline_1 python3 application.py etl-run"`
 - Generate Score: `docker exec j40_data_pipeline_1 python3 application.py score-run"`
 - Generate Score with Geojson and high and low versions: `docker exec j40_data_pipeline_1 python3 application.py geo-score`
diff --git a/data/data-pipeline/application.py b/data/data-pipeline/application.py
index 426fd9db..7808d8f5 100644
--- a/data/data-pipeline/application.py
+++ b/data/data-pipeline/application.py
@@ -22,21 +22,6 @@ def cli():
     pass
 
 
-@cli.command(
-    help="Clean up all census data folders",
-)
-def census_cleanup():
-    """CLI command to clean up the census data folder"""
-
-    data_path = settings.APP_ROOT / "data"
-
-    # census directories
-    logger.info("Initializing all census data")
-    census_reset(data_path)
-
-    logger.info("Cleaned up all census data files")
-
-
 @cli.command(
     help="Clean up all data folders",
 )
@@ -57,8 +42,12 @@ def census_data_download():
     """CLI command to download all census shape files from the Census FTP
     and extract the geojson to generate national and by state Census Block Group CSVs"""
 
-    logger.info("Downloading census data")
     data_path = settings.APP_ROOT / "data"
+
+    logger.info("Initializing all census data")
+    census_reset(data_path)
+
+    logger.info("Downloading census data")
     download_census_csvs(data_path)
 
     logger.info("Completed downloading census data")
@@ -90,6 +79,19 @@ def score_run():
     score_generate()
 
 
+@cli.command(
+    help="Run ETL + Score Generation",
+)
+def score_full_run():
+    """CLI command to run ETL and generate the score in one command"""
+
+    data_folder_cleanup()
+    score_folder_cleanup()
+    temp_folder_cleanup()
+    etl_runner()
+    score_generate()
+
+
 @cli.command(
     help="Generate Geojson files with scores baked in",
 )
diff --git a/data/data-pipeline/data/census/csv/fips_states_2010.csv b/data/data-pipeline/data/census/csv/fips_states_2010.csv
deleted file mode 100644
index 006a2dac..00000000
--- a/data/data-pipeline/data/census/csv/fips_states_2010.csv
+++ /dev/null
@@ -1,53 +0,0 @@
-fips,state_name,state_abbreviation,region,division
-01,Alabama,AL,South,East South Central
-02,Alaska,AK,West,Pacific
-04,Arizona,AZ,West,Mountain
-05,Arkansas,AR,South,West South Central
-06,California,CA,West,Pacific
-08,Colorado,CO,West,Mountain
-09,Connecticut,CT,Northeast,New England
-10,Delaware,DE,South,South Atlantic
-11,District of Columbia,DC,South,South Atlantic
-12,Florida,FL,South,South Atlantic
-13,Georgia,GA,South,South Atlantic
-15,Hawaii,HI,West,Pacific
-16,Idaho,ID,West,Mountain
-17,Illinois,IL,Midwest,East North Central
-18,Indiana,IN,Midwest,East North Central
-19,Iowa,IA,Midwest,West North Central
-20,Kansas,KS,Midwest,West North Central
-21,Kentucky,KY,South,East South Central
-22,Louisiana,LA,South,West South Central
-23,Maine,ME,Northeast,New England
-24,Maryland,MD,South,South Atlantic
-25,Massachusetts,MA,Northeast,New England
-26,Michigan,MI,Midwest,East North Central
-27,Minnesota,MN,Midwest,West North Central
-28,Mississippi,MS,South,East South Central
-29,Missouri,MO,Midwest,West North Central
-30,Montana,MT,West,Mountain
-31,Nebraska,NE,Midwest,West North Central
-32,Nevada,NV,West,Mountain
-33,New Hampshire,NH,Northeast,New England
-34,New Jersey,NJ,Northeast,Middle Atlantic
-35,New Mexico,NM,West,Mountain
-36,New York,NY,Northeast,Middle Atlantic
-37,North Carolina,NC,South,South Atlantic
-38,North Dakota,ND,Midwest,West North Central
-39,Ohio,OH,Midwest,East North Central
-40,Oklahoma,OK,South,West South Central
-41,Oregon,OR,West,Pacific
-42,Pennsylvania,PA,Northeast,Middle Atlantic
-44,Rhode Island,RI,Northeast,New England
-45,South Carolina,SC,South,South Atlantic
-46,South Dakota,SD,Midwest,West North Central
-47,Tennessee,TN,South,East South Central
-48,Texas,TX,South,West South Central
-49,Utah,UT,West,Mountain
-50,Vermont,VT,Northeast,New England
-51,Virginia,VA,South,South Atlantic
-53,Washington,WA,West,Pacific
-54,West Virginia,WV,South,South Atlantic
-55,Wisconsin,WI,Midwest,East North Central
-56,Wyoming,WY,West,Mountain
-72,Puerto Rico,PR,Puerto Rico,Puerto Rico
diff --git a/data/data-pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
index db3ab31b..1d2c409b 100644
--- a/data/data-pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
@@ -2,13 +2,16 @@ import pandas as pd
 
 from etl.base import ExtractTransformLoad
 from utils import get_module_logger
+from config import settings
 
 logger = get_module_logger(__name__)
 
 
 class CalEnviroScreenETL(ExtractTransformLoad):
     def __init__(self):
-        self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
+        self.CALENVIROSCREEN_FTP_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip"
+        )
         self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
         self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
diff --git a/data/data-pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/etl/sources/census/etl_utils.py
index 625cd923..6b521db6 100644
--- a/data/data-pipeline/etl/sources/census/etl_utils.py
+++ b/data/data-pipeline/etl/sources/census/etl_utils.py
@@ -38,7 +38,7 @@ def get_state_fips_codes(data_path: Path) -> list:
     if not os.path.isfile(fips_csv_path):
         logger.info("Downloading fips from S3 repository")
         unzip_file_from_url(
-            settings.AWS_JUSTICE40_DATA_URL + "/Census/fips_states_2010.zip",
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
            data_path / "tmp",
             data_path / "census" / "csv",
         )
diff --git a/data/data-pipeline/settings.toml b/data/data-pipeline/settings.toml
index f6d733a4..26d7746b 100644
--- a/data/data-pipeline/settings.toml
+++ b/data/data-pipeline/settings.toml
@@ -1,5 +1,5 @@
 [default]
-AWS_JUSTICE40_DATA_URL = "https://justice40-data.s3.amazonaws.com"
+AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
 
 [development]