diff --git a/.gitignore b/.gitignore index ffb1f2ae..655ef216 100644 --- a/.gitignore +++ b/.gitignore @@ -130,15 +130,15 @@ dmypy.json cython_debug/ # Ignore dynaconf secret files -score/.secrets.* +*/data-pipeline/.secrets.* # ignore data -score/data -score/data/census -score/data/tiles -score/data/tmp -score/data/dataset -score/data/score +*/data-pipeline/data +*/data-pipeline/data/census +*/data-pipeline/data/tiles +*/data-pipeline/data/tmp +*/data-pipeline/data/dataset +*/data-pipeline/data/score # node node_modules diff --git a/score/Dockerfile b/data/data-pipeline/Dockerfile similarity index 97% rename from score/Dockerfile rename to data/data-pipeline/Dockerfile index df2b2dfc..70885f99 100644 --- a/score/Dockerfile +++ b/data/data-pipeline/Dockerfile @@ -25,7 +25,7 @@ RUN add-apt-repository ppa:ubuntugis/ppa RUN apt-get -y install gdal-bin # Prepare python packages -WORKDIR /score +WORKDIR /data-pipeline RUN pip3 install --upgrade pip setuptools wheel COPY . . diff --git a/score/README.md b/data/data-pipeline/README.md similarity index 82% rename from score/README.md rename to data/data-pipeline/README.md index 53f6808d..4920f2d8 100644 --- a/score/README.md +++ b/data/data-pipeline/README.md @@ -93,30 +93,30 @@ We use Docker to install the necessary libraries in a container that can be run To build the docker container the first time, make sure you're in the root directory of the repository and run `docker-compose build`. 
-After that, to run commands type the following: +Once completed, run `docker-compose up` and then open a new tab or terminal window, and then run any command for the application using this format: +`docker exec j40_data_pipeline_1 python3 application.py [command]` -- Get help: `docker run --rm -it j40_score /bin/sh -c "python3 application.py --help"` -- Clean up the census data directories: `docker run --rm -it j40_score /bin/sh -c "python3 application.py census-cleanup"` -- Clean up the data directories: `docker run --rm -it j40_score /bin/sh -c "python3 application.py data-cleanup"` -- Generate census data: `docker run --rm -it j40_score /bin/sh -c "python3 application.py census-data-download"` -- Run all ETL processes: `docker run --rm -it j40_score /bin/sh -c "python3 application.py etl-run"` -- Generate Score: `docker run --rm -it j40_score /bin/sh -c "python3 application.py score-run"` +Here's a list of commands: -## Log visualization - -If you want to visualize logs while running a command, the following temporary workaround can be used: - -- Run `docker-compose up` on the root of the repo -- Open a new tab on your terminal -- Then run any command for the application using this format: `docker exec j40_score_1 python3 application.py [command]` +- Get help: `docker exec j40_data_pipeline_1 python3 application.py --help` +- Clean up the census data directories: `docker exec j40_data_pipeline_1 python3 application.py census-cleanup` +- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 application.py data-cleanup` +- Generate census data: `docker exec j40_data_pipeline_1 python3 application.py census-data-download` +- Run all ETL processes: `docker exec j40_data_pipeline_1 python3 application.py etl-run` +- Generate Score: `docker exec j40_data_pipeline_1 python3 application.py score-run` ## Local development -You can run the Python code locally without Docker to develop, using Poetry. 
However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. Also to generate tiles for a local map, you will need [Mapbox tippeanoe](https://github.com/mapbox/tippecanoe) +You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. Also to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS. -Note: If you are using Windows, please follow [these instructions](https://stackoverflow.com/questions/56958421/pip-install-geopandas-on-windows) to install Geopandas locally. If you want to install TippeCanoe, [follow these instrcutions](https://github.com/GISupportICRC/ArcGIS2Mapbox#installing-tippecanoe-on-windows). +### Windows Users +- If you want to download Census data or run tile generation, please install TippeCanoe [following these instructions](https://github.com/GISupportICRC/ArcGIS2Mapbox#installing-tippecanoe-on-windows). +- If you want to generate tiles, you need some pre-requisites for Geopandas as specified in the Poetry requirements. Please follow [these instructions](https://stackoverflow.com/questions/56958421/pip-install-geopandas-on-windows) to install the Geopandas dependency locally. + +### Setting up Poetry - Start a terminal +- Change to this directory (`/data/data-pipeline`) - Make sure you have Python 3.9 installed: `python -V` or `python3 -V` - We use [Poetry](https://python-poetry.org/) for managing dependencies and building the application. Please follow the instructions on their site to download. 
- Install Poetry requirements with `poetry install` @@ -125,7 +125,7 @@ Note: If you are using Windows, please follow [these instructions](https://stack - Make sure you have Docker running in your machine - Start a terminal -- Change to this directory (i.e. `cd score`) +- Change to this directory (i.e. `cd data/data-pipeline`) - If you want to clear out all data and tiles from all directories, you can run: `poetry run python application.py data-cleanup`. - Then run `poetry run python application.py census-data-download` Note: Census files are not kept in the repository and the download directories are ignored by Git @@ -137,18 +137,18 @@ Note: If you are using Windows, please follow [these instructions](https://stack ### Serve the map locally - Start a terminal -- Change to this directory (i.e. `cd score`) +- Change to this directory (i.e. `cd data/data-pipeline`) - Run: `docker run --rm -it -v ${PWD}/data/tiles:/data -p 8080:80 maptiler/tileserver-gl` ### Running Jupyter notebooks - Start a terminal -- Change to this directory (i.e. `cd score`) +- Change to this directory (i.e. `cd data/data-pipeline`) - Run `poetry run jupyter notebook`. Your browser should open with a Jupyter Notebook tab ### Activating variable-enabled Markdown for Jupyter notebooks -- Change to this directory (i.e. `cd score`) +- Change to this directory (i.e. 
`cd data/data-pipeline`) - Activate a Poetry Shell (see above) - Run `jupyter contrib nbextension install --user` - Run `jupyter nbextension enable python-markdown/main` diff --git a/data-roadmap/data_roadmap/__init__.py b/data/data-pipeline/__init__.py similarity index 100% rename from data-roadmap/data_roadmap/__init__.py rename to data/data-pipeline/__init__.py diff --git a/score/application.py b/data/data-pipeline/application.py similarity index 100% rename from score/application.py rename to data/data-pipeline/application.py diff --git a/score/config.py b/data/data-pipeline/config.py similarity index 100% rename from score/config.py rename to data/data-pipeline/config.py diff --git a/data-roadmap/data_roadmap/data_set_descriptions/__init__.py b/data/data-pipeline/data/census/__init__.py similarity index 100% rename from data-roadmap/data_roadmap/data_set_descriptions/__init__.py rename to data/data-pipeline/data/census/__init__.py diff --git a/score/__init__.py b/data/data-pipeline/data/census/csv/__init__.py similarity index 100% rename from score/__init__.py rename to data/data-pipeline/data/census/csv/__init__.py diff --git a/score/data/census/csv/fips_states_2010.csv b/data/data-pipeline/data/census/csv/fips_states_2010.csv similarity index 97% rename from score/data/census/csv/fips_states_2010.csv rename to data/data-pipeline/data/census/csv/fips_states_2010.csv index cf2e1550..006a2dac 100644 --- a/score/data/census/csv/fips_states_2010.csv +++ b/data/data-pipeline/data/census/csv/fips_states_2010.csv @@ -1,53 +1,53 @@ -fips,state_name,state_abbreviation,region,division -01,Alabama,AL,South,East South Central -02,Alaska,AK,West,Pacific -04,Arizona,AZ,West,Mountain -05,Arkansas,AR,South,West South Central -06,California,CA,West,Pacific -08,Colorado,CO,West,Mountain -09,Connecticut,CT,Northeast,New England -10,Delaware,DE,South,South Atlantic -11,District of Columbia,DC,South,South Atlantic -12,Florida,FL,South,South Atlantic -13,Georgia,GA,South,South 
Atlantic -15,Hawaii,HI,West,Pacific -16,Idaho,ID,West,Mountain -17,Illinois,IL,Midwest,East North Central -18,Indiana,IN,Midwest,East North Central -19,Iowa,IA,Midwest,West North Central -20,Kansas,KS,Midwest,West North Central -21,Kentucky,KY,South,East South Central -22,Louisiana,LA,South,West South Central -23,Maine,ME,Northeast,New England -24,Maryland,MD,South,South Atlantic -25,Massachusetts,MA,Northeast,New England -26,Michigan,MI,Midwest,East North Central -27,Minnesota,MN,Midwest,West North Central -28,Mississippi,MS,South,East South Central -29,Missouri,MO,Midwest,West North Central -30,Montana,MT,West,Mountain -31,Nebraska,NE,Midwest,West North Central -32,Nevada,NV,West,Mountain -33,New Hampshire,NH,Northeast,New England -34,New Jersey,NJ,Northeast,Middle Atlantic -35,New Mexico,NM,West,Mountain -36,New York,NY,Northeast,Middle Atlantic -37,North Carolina,NC,South,South Atlantic -38,North Dakota,ND,Midwest,West North Central -39,Ohio,OH,Midwest,East North Central -40,Oklahoma,OK,South,West South Central -41,Oregon,OR,West,Pacific -42,Pennsylvania,PA,Northeast,Middle Atlantic -44,Rhode Island,RI,Northeast,New England -45,South Carolina,SC,South,South Atlantic -46,South Dakota,SD,Midwest,West North Central -47,Tennessee,TN,South,East South Central -48,Texas,TX,South,West South Central -49,Utah,UT,West,Mountain -50,Vermont,VT,Northeast,New England -51,Virginia,VA,South,South Atlantic -53,Washington,WA,West,Pacific -54,West Virginia,WV,South,South Atlantic -55,Wisconsin,WI,Midwest,East North Central -56,Wyoming,WY,West,Mountain -72,Puerto Rico,PR,Puerto Rico,Puerto Rico +fips,state_name,state_abbreviation,region,division +01,Alabama,AL,South,East South Central +02,Alaska,AK,West,Pacific +04,Arizona,AZ,West,Mountain +05,Arkansas,AR,South,West South Central +06,California,CA,West,Pacific +08,Colorado,CO,West,Mountain +09,Connecticut,CT,Northeast,New England +10,Delaware,DE,South,South Atlantic +11,District of Columbia,DC,South,South Atlantic 
+12,Florida,FL,South,South Atlantic +13,Georgia,GA,South,South Atlantic +15,Hawaii,HI,West,Pacific +16,Idaho,ID,West,Mountain +17,Illinois,IL,Midwest,East North Central +18,Indiana,IN,Midwest,East North Central +19,Iowa,IA,Midwest,West North Central +20,Kansas,KS,Midwest,West North Central +21,Kentucky,KY,South,East South Central +22,Louisiana,LA,South,West South Central +23,Maine,ME,Northeast,New England +24,Maryland,MD,South,South Atlantic +25,Massachusetts,MA,Northeast,New England +26,Michigan,MI,Midwest,East North Central +27,Minnesota,MN,Midwest,West North Central +28,Mississippi,MS,South,East South Central +29,Missouri,MO,Midwest,West North Central +30,Montana,MT,West,Mountain +31,Nebraska,NE,Midwest,West North Central +32,Nevada,NV,West,Mountain +33,New Hampshire,NH,Northeast,New England +34,New Jersey,NJ,Northeast,Middle Atlantic +35,New Mexico,NM,West,Mountain +36,New York,NY,Northeast,Middle Atlantic +37,North Carolina,NC,South,South Atlantic +38,North Dakota,ND,Midwest,West North Central +39,Ohio,OH,Midwest,East North Central +40,Oklahoma,OK,South,West South Central +41,Oregon,OR,West,Pacific +42,Pennsylvania,PA,Northeast,Middle Atlantic +44,Rhode Island,RI,Northeast,New England +45,South Carolina,SC,South,South Atlantic +46,South Dakota,SD,Midwest,West North Central +47,Tennessee,TN,South,East South Central +48,Texas,TX,South,West South Central +49,Utah,UT,West,Mountain +50,Vermont,VT,Northeast,New England +51,Virginia,VA,South,South Atlantic +53,Washington,WA,West,Pacific +54,West Virginia,WV,South,South Atlantic +55,Wisconsin,WI,Midwest,East North Central +56,Wyoming,WY,West,Mountain +72,Puerto Rico,PR,Puerto Rico,Puerto Rico diff --git a/score/data/census/__init__.py b/data/data-pipeline/data/census/geojson/__init__.py similarity index 100% rename from score/data/census/__init__.py rename to data/data-pipeline/data/census/geojson/__init__.py diff --git a/score/data/census/csv/__init__.py b/data/data-pipeline/data/census/shp/__init__.py similarity 
index 100% rename from score/data/census/csv/__init__.py rename to data/data-pipeline/data/census/shp/__init__.py diff --git a/score/data/census/geojson/__init__.py b/data/data-pipeline/data/dataset/__init__.py similarity index 100% rename from score/data/census/geojson/__init__.py rename to data/data-pipeline/data/dataset/__init__.py diff --git a/score/data/census/shp/__init__.py b/data/data-pipeline/data/score/csv/__init__.py similarity index 100% rename from score/data/census/shp/__init__.py rename to data/data-pipeline/data/score/csv/__init__.py diff --git a/score/data/dataset/__init__.py b/data/data-pipeline/data/score/geojson/__init__.py similarity index 100% rename from score/data/dataset/__init__.py rename to data/data-pipeline/data/score/geojson/__init__.py diff --git a/score/data/score/csv/__init__.py b/data/data-pipeline/data/tiles/__init__.py similarity index 100% rename from score/data/score/csv/__init__.py rename to data/data-pipeline/data/tiles/__init__.py diff --git a/score/data/score/geojson/__init__.py b/data/data-pipeline/data/tmp/__init__.py similarity index 100% rename from score/data/score/geojson/__init__.py rename to data/data-pipeline/data/tmp/__init__.py diff --git a/score/data/tiles/__init__.py b/data/data-pipeline/etl/__init__.py similarity index 100% rename from score/data/tiles/__init__.py rename to data/data-pipeline/etl/__init__.py diff --git a/score/etl/base.py b/data/data-pipeline/etl/base.py similarity index 100% rename from score/etl/base.py rename to data/data-pipeline/etl/base.py diff --git a/score/etl/runner.py b/data/data-pipeline/etl/runner.py similarity index 100% rename from score/etl/runner.py rename to data/data-pipeline/etl/runner.py diff --git a/score/data/tmp/__init__.py b/data/data-pipeline/etl/score/__init__.py similarity index 100% rename from score/data/tmp/__init__.py rename to data/data-pipeline/etl/score/__init__.py diff --git a/score/etl/score/etl_score.py b/data/data-pipeline/etl/score/etl_score.py similarity 
index 100% rename from score/etl/score/etl_score.py rename to data/data-pipeline/etl/score/etl_score.py diff --git a/score/etl/score/etl_score_post.py b/data/data-pipeline/etl/score/etl_score_post.py similarity index 90% rename from score/etl/score/etl_score_post.py rename to data/data-pipeline/etl/score/etl_score_post.py index 17da0790..41c837a4 100644 --- a/score/etl/score/etl_score_post.py +++ b/data/data-pipeline/etl/score/etl_score_post.py @@ -20,8 +20,8 @@ class PostScoreETL(ExtractTransformLoad): self.STATE_CSV = ( self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv" ) - self.SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv" - self.COUNTY_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa-county.csv" + self.FULL_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv" + self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tile" / "usa.csv" self.TILES_SCORE_COLUMNS = [ "GEOID10", @@ -59,7 +59,9 @@ class PostScoreETL(ExtractTransformLoad): self.states_df = pd.read_csv( self.STATE_CSV, dtype={"fips": "string", "state_code": "string"} ) - self.score_df = pd.read_csv(self.SCORE_CSV, dtype={"GEOID10": "string"}) + self.score_df = pd.read_csv( + self.FULL_SCORE_CSV, dtype={"GEOID10": "string"} + ) def transform(self) -> None: logger.info(f"Transforming data sources for Score + County CSV") @@ -98,11 +100,9 @@ class PostScoreETL(ExtractTransformLoad): del self.score_county_state_merged["GEOID_OTHER"] def load(self) -> None: - logger.info(f"Saving Score + County CSV") + logger.info(f"Saving Full Score CSV with County Information") self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True) - # self.score_county_state_merged.to_csv( - # self.COUNTY_SCORE_CSV, index=False - # ) + self.score_county_state_merged.to_csv(self.FULL_SCORE_CSV, index=False) logger.info(f"Saving Tile Score CSV") # TODO: check which are the columns we'll use diff --git a/score/etl/__init__.py b/data/data-pipeline/etl/sources/__init__.py similarity index 100% rename from score/etl/__init__.py rename to 
data/data-pipeline/etl/sources/__init__.py diff --git a/score/etl/sources/calenviroscreen/README.md b/data/data-pipeline/etl/sources/calenviroscreen/README.md similarity index 100% rename from score/etl/sources/calenviroscreen/README.md rename to data/data-pipeline/etl/sources/calenviroscreen/README.md diff --git a/score/etl/score/__init__.py b/data/data-pipeline/etl/sources/calenviroscreen/__init__.py similarity index 100% rename from score/etl/score/__init__.py rename to data/data-pipeline/etl/sources/calenviroscreen/__init__.py diff --git a/score/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/etl/sources/calenviroscreen/etl.py similarity index 100% rename from score/etl/sources/calenviroscreen/etl.py rename to data/data-pipeline/etl/sources/calenviroscreen/etl.py diff --git a/score/etl/sources/census/README.md b/data/data-pipeline/etl/sources/census/README.md similarity index 100% rename from score/etl/sources/census/README.md rename to data/data-pipeline/etl/sources/census/README.md diff --git a/score/etl/sources/__init__.py b/data/data-pipeline/etl/sources/census/__init__.py similarity index 100% rename from score/etl/sources/__init__.py rename to data/data-pipeline/etl/sources/census/__init__.py diff --git a/score/etl/sources/census/etl.py b/data/data-pipeline/etl/sources/census/etl.py similarity index 100% rename from score/etl/sources/census/etl.py rename to data/data-pipeline/etl/sources/census/etl.py diff --git a/score/etl/sources/census/etl_utils.py b/data/data-pipeline/etl/sources/census/etl_utils.py similarity index 84% rename from score/etl/sources/census/etl_utils.py rename to data/data-pipeline/etl/sources/census/etl_utils.py index 352c850e..dba41945 100644 --- a/score/etl/sources/census/etl_utils.py +++ b/data/data-pipeline/etl/sources/census/etl_utils.py @@ -3,7 +3,14 @@ import csv import os from config import settings -from utils import remove_files_from_dir, remove_all_dirs_from_dir, unzip_file_from_url +from utils import ( + 
remove_files_from_dir, + remove_all_dirs_from_dir, + unzip_file_from_url, + get_module_logger, +) + +logger = get_module_logger(__name__) def reset_data_directories(data_path: Path) -> None: @@ -27,6 +34,7 @@ def get_state_fips_codes(data_path: Path) -> list: # check if file exists if not os.path.isfile(fips_csv_path): + logger.info(f"Downloading fips from S3 repository") unzip_file_from_url( settings.AWS_JUSTICE40_DATA_URL + "/Census/fips_states_2010.zip", data_path / "tmp", diff --git a/score/etl/sources/census_acs/README.md b/data/data-pipeline/etl/sources/census_acs/README.md similarity index 100% rename from score/etl/sources/census_acs/README.md rename to data/data-pipeline/etl/sources/census_acs/README.md diff --git a/score/etl/sources/calenviroscreen/__init__.py b/data/data-pipeline/etl/sources/census_acs/__init__.py similarity index 100% rename from score/etl/sources/calenviroscreen/__init__.py rename to data/data-pipeline/etl/sources/census_acs/__init__.py diff --git a/score/etl/sources/census_acs/etl.py b/data/data-pipeline/etl/sources/census_acs/etl.py similarity index 100% rename from score/etl/sources/census_acs/etl.py rename to data/data-pipeline/etl/sources/census_acs/etl.py diff --git a/score/etl/sources/ejscreen/README.md b/data/data-pipeline/etl/sources/ejscreen/README.md similarity index 100% rename from score/etl/sources/ejscreen/README.md rename to data/data-pipeline/etl/sources/ejscreen/README.md diff --git a/score/etl/sources/census/__init__.py b/data/data-pipeline/etl/sources/ejscreen/__init__.py similarity index 100% rename from score/etl/sources/census/__init__.py rename to data/data-pipeline/etl/sources/ejscreen/__init__.py diff --git a/score/etl/sources/ejscreen/etl.py b/data/data-pipeline/etl/sources/ejscreen/etl.py similarity index 98% rename from score/etl/sources/ejscreen/etl.py rename to data/data-pipeline/etl/sources/ejscreen/etl.py index 5fbffb24..8d98b040 100644 --- a/score/etl/sources/ejscreen/etl.py +++ 
b/data/data-pipeline/etl/sources/ejscreen/etl.py @@ -9,7 +9,7 @@ logger = get_module_logger(__name__) class EJScreenETL(ExtractTransformLoad): def __init__(self): self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip" - self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctile.csv" + self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv" self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019" self.df: pd.DataFrame diff --git a/score/etl/sources/housing_and_transportation/README.md b/data/data-pipeline/etl/sources/housing_and_transportation/README.md similarity index 100% rename from score/etl/sources/housing_and_transportation/README.md rename to data/data-pipeline/etl/sources/housing_and_transportation/README.md diff --git a/score/etl/sources/census_acs/__init__.py b/data/data-pipeline/etl/sources/housing_and_transportation/__init__.py similarity index 100% rename from score/etl/sources/census_acs/__init__.py rename to data/data-pipeline/etl/sources/housing_and_transportation/__init__.py diff --git a/score/etl/sources/housing_and_transportation/etl.py b/data/data-pipeline/etl/sources/housing_and_transportation/etl.py similarity index 83% rename from score/etl/sources/housing_and_transportation/etl.py rename to data/data-pipeline/etl/sources/housing_and_transportation/etl.py index cbe56cfa..348de679 100644 --- a/score/etl/sources/housing_and_transportation/etl.py +++ b/data/data-pipeline/etl/sources/housing_and_transportation/etl.py @@ -25,12 +25,19 @@ class HousingTransportationETL(ExtractTransformLoad): logger.info( f"Downloading housing data for state/territory with FIPS code {fips}" ) + + # Puerto Rico has no data, so skip + if fips == "72": + continue + unzip_file_from_url( f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir ) # New file name: - tmp_csv_file_path = zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv" + tmp_csv_file_path = ( + zip_file_dir / 
f"htaindex_data_blkgrps_{fips}.csv" + ) tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path) dfs.append(tmp_df) @@ -44,9 +51,9 @@ class HousingTransportationETL(ExtractTransformLoad): # Rename and reformat block group ID self.df.rename(columns={"blkgrp": self.GEOID_FIELD_NAME}, inplace=True) - self.df[self.GEOID_FIELD_NAME] = self.df[self.GEOID_FIELD_NAME].str.replace( - '"', "" - ) + self.df[self.GEOID_FIELD_NAME] = self.df[ + self.GEOID_FIELD_NAME + ].str.replace('"', "") def load(self) -> None: logger.info(f"Saving Housing and Transportation Data") diff --git a/score/etl/sources/hud_housing/README.md b/data/data-pipeline/etl/sources/hud_housing/README.md similarity index 100% rename from score/etl/sources/hud_housing/README.md rename to data/data-pipeline/etl/sources/hud_housing/README.md diff --git a/score/etl/sources/ejscreen/__init__.py b/data/data-pipeline/etl/sources/hud_housing/__init__.py similarity index 100% rename from score/etl/sources/ejscreen/__init__.py rename to data/data-pipeline/etl/sources/hud_housing/__init__.py diff --git a/score/etl/sources/hud_housing/etl.py b/data/data-pipeline/etl/sources/hud_housing/etl.py similarity index 94% rename from score/etl/sources/hud_housing/etl.py rename to data/data-pipeline/etl/sources/hud_housing/etl.py index 3dd3449a..7651460a 100644 --- a/score/etl/sources/hud_housing/etl.py +++ b/data/data-pipeline/etl/sources/hud_housing/etl.py @@ -11,16 +11,16 @@ class HudHousingETL(ExtractTransformLoad): def __init__(self): self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "hud_housing" self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT" - self.HOUSING_FTP_URL = ( - "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip" - ) + self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip" self.HOUSING_ZIP_FILE_DIR = self.TMP_PATH / "hud_housing" # We measure households earning less than 80% of HUD Area Median Family Income by county # and paying greater than 30% of their 
income to housing costs. self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)" self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME = "HOUSING_BURDEN_NUMERATOR" - self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = "HOUSING_BURDEN_DENOMINATOR" + self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = ( + "HOUSING_BURDEN_DENOMINATOR" + ) # Note: some variable definitions. # HUD-adjusted median family income (HAMFI). @@ -47,10 +47,15 @@ class HudHousingETL(ExtractTransformLoad): / "140" / "Table8.csv" ) - self.df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path) + self.df = pd.read_csv( + filepath_or_buffer=tmp_csv_file_path, + encoding="latin-1", + ) # Rename and reformat block group ID - self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True) + self.df.rename( + columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True + ) # The CHAS data has census tract ids such as `14000US01001020100` # Whereas the rest of our data uses, for the same tract, `01001020100`. @@ -160,7 +165,9 @@ class HudHousingETL(ExtractTransformLoad): # TODO: add small sample size checks self.df[self.HOUSING_BURDEN_FIELD_NAME] = self.df[ self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME - ].astype(float) / self.df[self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype( + ].astype(float) / self.df[ + self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME + ].astype( float ) diff --git a/score/etl/sources/hud_recap/README.md b/data/data-pipeline/etl/sources/hud_recap/README.md similarity index 100% rename from score/etl/sources/hud_recap/README.md rename to data/data-pipeline/etl/sources/hud_recap/README.md diff --git a/score/etl/sources/housing_and_transportation/__init__.py b/data/data-pipeline/etl/sources/hud_recap/__init__.py similarity index 100% rename from score/etl/sources/housing_and_transportation/__init__.py rename to data/data-pipeline/etl/sources/hud_recap/__init__.py diff --git a/score/etl/sources/hud_recap/etl.py b/data/data-pipeline/etl/sources/hud_recap/etl.py similarity index 100% rename from 
score/etl/sources/hud_recap/etl.py rename to data/data-pipeline/etl/sources/hud_recap/etl.py diff --git a/score/ipython/county_lookup.ipynb b/data/data-pipeline/ipython/county_lookup.ipynb similarity index 100% rename from score/ipython/county_lookup.ipynb rename to data/data-pipeline/ipython/county_lookup.ipynb diff --git a/score/ipython/scoring_comparison.ipynb b/data/data-pipeline/ipython/scoring_comparison.ipynb similarity index 100% rename from score/ipython/scoring_comparison.ipynb rename to data/data-pipeline/ipython/scoring_comparison.ipynb diff --git a/score/poetry.lock b/data/data-pipeline/poetry.lock similarity index 100% rename from score/poetry.lock rename to data/data-pipeline/poetry.lock diff --git a/score/pyproject.toml b/data/data-pipeline/pyproject.toml similarity index 100% rename from score/pyproject.toml rename to data/data-pipeline/pyproject.toml index 2f1a0588..63a3cf53 100644 --- a/score/pyproject.toml +++ b/data/data-pipeline/pyproject.toml @@ -6,16 +6,16 @@ authors = ["Your Name "] [tool.poetry.dependencies] python = "^3.7.1" +CensusData = "^1.13" +click = "^8.0.1" +dynaconf = "^3.1.4" ipython = "^7.24.1" jupyter = "^1.0.0" jupyter-contrib-nbextensions = "^0.5.1" numpy = "^1.21.0" pandas = "^1.2.5" requests = "^2.25.1" -click = "^8.0.1" -dynaconf = "^3.1.4" types-requests = "^2.25.0" -CensusData = "^1.13" [tool.poetry.dev-dependencies] mypy = "^0.910" diff --git a/score/requirements.txt b/data/data-pipeline/requirements.txt similarity index 100% rename from score/requirements.txt rename to data/data-pipeline/requirements.txt diff --git a/score/settings.toml b/data/data-pipeline/settings.toml similarity index 100% rename from score/settings.toml rename to data/data-pipeline/settings.toml diff --git a/score/etl/sources/hud_housing/__init__.py b/data/data-pipeline/tile/__init__.py similarity index 100% rename from score/etl/sources/hud_housing/__init__.py rename to data/data-pipeline/tile/__init__.py diff --git a/score/tile/generate.py 
b/data/data-pipeline/tile/generate.py similarity index 100% rename from score/tile/generate.py rename to data/data-pipeline/tile/generate.py diff --git a/score/utils.py b/data/data-pipeline/utils.py similarity index 100% rename from score/utils.py rename to data/data-pipeline/utils.py diff --git a/data-roadmap/README.md b/data/data-roadmap/README.md similarity index 100% rename from data-roadmap/README.md rename to data/data-roadmap/README.md diff --git a/score/etl/sources/hud_recap/__init__.py b/data/data-roadmap/__init__.py similarity index 100% rename from score/etl/sources/hud_recap/__init__.py rename to data/data-roadmap/__init__.py diff --git a/data-roadmap/data_roadmap/data_set_description_field_descriptions.yaml b/data/data-roadmap/data_set_description_field_descriptions.yaml similarity index 100% rename from data-roadmap/data_roadmap/data_set_description_field_descriptions.yaml rename to data/data-roadmap/data_set_description_field_descriptions.yaml diff --git a/data-roadmap/data_roadmap/data_set_description_schema.yaml b/data/data-roadmap/data_set_description_schema.yaml similarity index 100% rename from data-roadmap/data_roadmap/data_set_description_schema.yaml rename to data/data-roadmap/data_set_description_schema.yaml diff --git a/data-roadmap/data_roadmap/data_set_description_template.yaml b/data/data-roadmap/data_set_description_template.yaml similarity index 100% rename from data-roadmap/data_roadmap/data_set_description_template.yaml rename to data/data-roadmap/data_set_description_template.yaml diff --git a/data-roadmap/data_roadmap/data_set_descriptions/PM25.yaml b/data/data-roadmap/data_set_descriptions/PM25.yaml similarity index 100% rename from data-roadmap/data_roadmap/data_set_descriptions/PM25.yaml rename to data/data-roadmap/data_set_descriptions/PM25.yaml diff --git a/score/tile/__init__.py b/data/data-roadmap/data_set_descriptions/__init__.py similarity index 100% rename from score/tile/__init__.py rename to 
data/data-roadmap/data_set_descriptions/__init__.py diff --git a/data-roadmap/requirements.txt b/data/data-roadmap/requirements.txt similarity index 100% rename from data-roadmap/requirements.txt rename to data/data-roadmap/requirements.txt diff --git a/data-roadmap/setup.py b/data/data-roadmap/setup.py similarity index 100% rename from data-roadmap/setup.py rename to data/data-roadmap/setup.py diff --git a/data-roadmap/data_roadmap/utils/utils_data_set_description_schema.py b/data/data-roadmap/utils/utils_data_set_description_schema.py similarity index 100% rename from data-roadmap/data_roadmap/utils/utils_data_set_description_schema.py rename to data/data-roadmap/utils/utils_data_set_description_schema.py diff --git a/data-roadmap/data_roadmap/utils/utils_data_set_description_schema_test.py b/data/data-roadmap/utils/utils_data_set_description_schema_test.py similarity index 100% rename from data-roadmap/data_roadmap/utils/utils_data_set_description_schema_test.py rename to data/data-roadmap/utils/utils_data_set_description_schema_test.py diff --git a/docker-compose.yml b/docker-compose.yml index 7e7a2f90..3fcd4ee7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,13 +1,13 @@ version: "3.4" services: score: - image: j40_score - container_name: j40_score_1 - build: score + image: j40_data_pipeline + container_name: j40_data_pipeline_1 + build: data/data-pipeline ports: - 8888:8888 volumes: - - ./score:/score + - ./data/data-pipeline:/data-pipeline stdin_open: true tty: true environment: