From 4d7465c833340cf8121cc96b6129b743adc9255a Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Thu, 5 Aug 2021 12:55:21 -0400
Subject: [PATCH] Hotfix for fips zip download location + added full-score-run command (#465)

* Hotfix for S3 locations of data sources

* updated README

* lint failures

Co-authored-by: Nat Hillard
---
 data/data-pipeline/README.md                  |  4 +-
 data/data-pipeline/application.py             | 34 ++++++------
 .../data/census/csv/fips_states_2010.csv      | 53 -------------------
 .../etl/sources/calenviroscreen/etl.py        |  5 +-
 .../etl/sources/census/etl_utils.py           |  2 +-
 data/data-pipeline/settings.toml              |  2 +-
 6 files changed, 26 insertions(+), 74 deletions(-)
 delete mode 100644 data/data-pipeline/data/census/csv/fips_states_2010.csv

diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md
index ca8333f5..55f7b0b1 100644
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@@ -123,9 +123,9 @@ Once completed, run `docker-compose up` and then open a new tab or terminal wind
 Here's a list of commands:
 
 - Get help: `docker exec j40_data_pipeline_1 python3 application.py --help"`
-- Clean up the census data directories: `docker exec j40_data_pipeline_1 python3 application.py census-cleanup"`
-- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 application.py data-cleanup"`
 - Generate census data: `docker exec j40_data_pipeline_1 python3 application.py census-data-download"`
+- Run all ETL and Generate score: `docker exec j40_data_pipeline_1 python3 application.py score-full-run`
+- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 application.py data-cleanup"`
 - Run all ETL processes: `docker exec j40_data_pipeline_1 python3 application.py etl-run"`
 - Generate Score: `docker exec j40_data_pipeline_1 python3 application.py score-run"`
 - Generate Score with Geojson and high and low versions: `docker exec j40_data_pipeline_1 python3 application.py geo-score`
diff --git a/data/data-pipeline/application.py b/data/data-pipeline/application.py
index 426fd9db..7808d8f5 100644
--- a/data/data-pipeline/application.py
+++ b/data/data-pipeline/application.py
@@ -22,21 +22,6 @@ def cli():
     pass
 
 
-@cli.command(
-    help="Clean up all census data folders",
-)
-def census_cleanup():
-    """CLI command to clean up the census data folder"""
-
-    data_path = settings.APP_ROOT / "data"
-
-    # census directories
-    logger.info("Initializing all census data")
-    census_reset(data_path)
-
-    logger.info("Cleaned up all census data files")
-
-
 @cli.command(
     help="Clean up all data folders",
 )
@@ -57,8 +42,12 @@ def census_data_download():
     """CLI command to download all census shape files from the Census FTP
     and extract the geojson to generate national and by state Census Block Group CSVs"""
 
-    logger.info("Downloading census data")
     data_path = settings.APP_ROOT / "data"
+
+    logger.info("Initializing all census data")
+    census_reset(data_path)
+
+    logger.info("Downloading census data")
     download_census_csvs(data_path)
 
     logger.info("Completed downloading census data")
@@ -90,6 +79,19 @@ def score_run():
     score_generate()
 
 
+@cli.command(
+    help="Run ETL + Score Generation",
+)
+def score_full_run():
+    """CLI command to run ETL and generate the score in one command"""
+
+    data_folder_cleanup()
+    score_folder_cleanup()
+    temp_folder_cleanup()
+    etl_runner()
+    score_generate()
+
+
 @cli.command(
     help="Generate Geojson files with scores baked in",
 )
diff --git a/data/data-pipeline/data/census/csv/fips_states_2010.csv b/data/data-pipeline/data/census/csv/fips_states_2010.csv
deleted file mode 100644
index 006a2dac..00000000
--- a/data/data-pipeline/data/census/csv/fips_states_2010.csv
+++ /dev/null
@@ -1,53 +0,0 @@
-fips,state_name,state_abbreviation,region,division
-01,Alabama,AL,South,East South Central
-02,Alaska,AK,West,Pacific
-04,Arizona,AZ,West,Mountain
-05,Arkansas,AR,South,West South Central
-06,California,CA,West,Pacific
-08,Colorado,CO,West,Mountain
-09,Connecticut,CT,Northeast,New England
-10,Delaware,DE,South,South Atlantic
-11,District of Columbia,DC,South,South Atlantic
-12,Florida,FL,South,South Atlantic
-13,Georgia,GA,South,South Atlantic
-15,Hawaii,HI,West,Pacific
-16,Idaho,ID,West,Mountain
-17,Illinois,IL,Midwest,East North Central
-18,Indiana,IN,Midwest,East North Central
-19,Iowa,IA,Midwest,West North Central
-20,Kansas,KS,Midwest,West North Central
-21,Kentucky,KY,South,East South Central
-22,Louisiana,LA,South,West South Central
-23,Maine,ME,Northeast,New England
-24,Maryland,MD,South,South Atlantic
-25,Massachusetts,MA,Northeast,New England
-26,Michigan,MI,Midwest,East North Central
-27,Minnesota,MN,Midwest,West North Central
-28,Mississippi,MS,South,East South Central
-29,Missouri,MO,Midwest,West North Central
-30,Montana,MT,West,Mountain
-31,Nebraska,NE,Midwest,West North Central
-32,Nevada,NV,West,Mountain
-33,New Hampshire,NH,Northeast,New England
-34,New Jersey,NJ,Northeast,Middle Atlantic
-35,New Mexico,NM,West,Mountain
-36,New York,NY,Northeast,Middle Atlantic
-37,North Carolina,NC,South,South Atlantic
-38,North Dakota,ND,Midwest,West North Central
-39,Ohio,OH,Midwest,East North Central
-40,Oklahoma,OK,South,West South Central
-41,Oregon,OR,West,Pacific
-42,Pennsylvania,PA,Northeast,Middle Atlantic
-44,Rhode Island,RI,Northeast,New England
-45,South Carolina,SC,South,South Atlantic
-46,South Dakota,SD,Midwest,West North Central
-47,Tennessee,TN,South,East South Central
-48,Texas,TX,South,West South Central
-49,Utah,UT,West,Mountain
-50,Vermont,VT,Northeast,New England
-51,Virginia,VA,South,South Atlantic
-53,Washington,WA,West,Pacific
-54,West Virginia,WV,South,South Atlantic
-55,Wisconsin,WI,Midwest,East North Central
-56,Wyoming,WY,West,Mountain
-72,Puerto Rico,PR,Puerto Rico,Puerto Rico
diff --git a/data/data-pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
index db3ab31b..1d2c409b 100644
--- a/data/data-pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
@@ -2,13 +2,16 @@ import pandas as pd
 
 from etl.base import ExtractTransformLoad
 from utils import get_module_logger
+from config import settings
 
 logger = get_module_logger(__name__)
 
 
 class CalEnviroScreenETL(ExtractTransformLoad):
     def __init__(self):
-        self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
+        self.CALENVIROSCREEN_FTP_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip"
+        )
         self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
         self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
diff --git a/data/data-pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/etl/sources/census/etl_utils.py
index 625cd923..6b521db6 100644
--- a/data/data-pipeline/etl/sources/census/etl_utils.py
+++ b/data/data-pipeline/etl/sources/census/etl_utils.py
@@ -38,7 +38,7 @@ def get_state_fips_codes(data_path: Path) -> list:
     if not os.path.isfile(fips_csv_path):
         logger.info("Downloading fips from S3 repository")
         unzip_file_from_url(
-            settings.AWS_JUSTICE40_DATA_URL + "/Census/fips_states_2010.zip",
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
            data_path / "tmp",
             data_path / "census" / "csv",
         )
diff --git a/data/data-pipeline/settings.toml b/data/data-pipeline/settings.toml
index f6d733a4..26d7746b 100644
--- a/data/data-pipeline/settings.toml
+++ b/data/data-pipeline/settings.toml
@@ -1,5 +1,5 @@
 [default]
-AWS_JUSTICE40_DATA_URL = "https://justice40-data.s3.amazonaws.com"
+AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
 
 [development]