From 5bd63c083b8127db0c816c6561dfa1dd5815d510 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Tue, 14 Sep 2021 14:15:34 -0400
Subject: [PATCH] Run all Census, ETL, Score, Combine and Tilefy in one command (#662)

* Run all Census, ETL, Score, Combine and Tilefy in one command

* docker cmd

* some docker improvements

* feedback updates

* lint
---
 data/data-pipeline/Dockerfile             |  2 +
 data/data-pipeline/README.md              | 26 +++----
 .../data_pipeline/application.py          | 70 +++++++++++++++++++
 data/data-pipeline/data_pipeline/utils.py | 14 ++++
 4 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/data/data-pipeline/Dockerfile b/data/data-pipeline/Dockerfile
index f74ccb26..f77cebeb 100644
--- a/data/data-pipeline/Dockerfile
+++ b/data/data-pipeline/Dockerfile
@@ -32,3 +32,5 @@ COPY . .
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
 RUN pip3 install .
+
+CMD python3 -m data_pipeline.application data-full-run --check
diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md
index 2724c660..5a0e7fba 100644
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@@ -96,14 +96,14 @@ TODO add mermaid diagram

 #### Step 1: Run the script to download census data or download from the Justice40 S3 URL

 1. Call the `census-data-download` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
-   - With Docker: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application census-data-download`
+   - With Docker: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application census-data-download`
    - With Poetry: `poetry run download_census` (Install GDAL as described [below](#local-development))
 2. If you have a high-speed internet connection and don't want to generate the census data or install `GDAL` locally, you can download a zip version of the Census file [here](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Then unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census/` folder.

 #### Step 2: Run the ETL script for each data source

 1. Call the `etl-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
-   - With Docker: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application etl-run`
+   - With Docker: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application etl-run`
    - With Poetry: `poetry run etl`
 2. This command will execute the corresponding ETL script for each data source in `data_pipeline/etl/sources/`. For example, `data_pipeline/etl/sources/ejscreen/etl.py` is the ETL script for EJSCREEN data.
 3. Each ETL script will extract the data from its original source, then format the data into `.csv` files that get stored in the relevant folder in `data_pipeline/data/dataset/`. For example, HUD Housing data is stored in `data_pipeline/data/dataset/hud_housing/usa.csv`
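As an illustration of the extract-transform-load pattern that Step 2 describes, here is a minimal sketch only: the real per-source scripts in `data_pipeline/etl/sources/` differ in detail, and `SOURCE_URL` plus the `example_source` output folder below are hypothetical stand-ins, not names from this repository.

```python
# Minimal ETL sketch. SOURCE_URL and the example_source dataset folder are
# hypothetical stand-ins for a real upstream source and its output location.
from pathlib import Path

import pandas as pd

SOURCE_URL = "https://example.com/source_data.csv"  # hypothetical source
OUTPUT_PATH = Path("data_pipeline/data/dataset/example_source/usa.csv")


def run_etl() -> None:
    df = pd.read_csv(SOURCE_URL)  # extract: pull the raw source data
    df.columns = [col.strip().lower() for col in df.columns]  # transform: tidy headers
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)  # load: write the CSV the score step consumes


if __name__ == "__main__":
    run_etl()
```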
@@ -114,7 +114,7 @@ _For example: `poetry run etl -d ejscreen` would only run the ETL process for EJ
 #### Step 3: Calculate the Justice40 score experiments

 1. Call the `score-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
-   - With Docker: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application score-run`
+   - With Docker: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application score-run`
    - With Poetry: `poetry run score`
 1. The `score-run` command will execute the `etl/score/etl.py` script which loads the data from each of the source files added to the `data/dataset/` directory by the ETL scripts in Step 2.
 1. These data sets are merged into a single dataframe using their Census Block Group GEOID as a common key, and the data in each of the columns is standardized in two ways:
@@ -161,20 +161,20 @@ To build the docker container the first time, make sure you're in the root direc

 Once completed, run `docker-compose up`. Docker will spin up 3 containers: the client container, the static server container and the data container. Once all data is generated, you can view the application in a browser by navigating to `http://localhost:8000`.

-If you want to run specific data tasks, you can open a new terminal tab or terminal window while `docker-compose up` is running, and then execute any command for the application using this format:
+If you want to run specific data tasks, you can open a terminal window, navigate to the root folder for this repository, and then execute any command for the application using this format:

-`docker exec j40_data_pipeline_1 python3 -m data_pipeline.application [command]`
+`docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application [command]`

 Here's a list of commands:

-- Get help: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application --help`
-- Generate census data: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application census-data-download`
-- Run all ETL and Generate score: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application score-full-run`
-- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application data-cleanup`
-- Run all ETL processes: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application etl-run`
-- Generate Score: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application score-run`
-- Combine Score with Geojson and generate high and low zoom map tile sets: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application geo-score`
-- Generate Map Tiles: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application generate-map-tiles`
+- Get help: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application --help`
+- Generate census data: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application census-data-download`
+- Run all ETL and Generate score: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application score-full-run`
+- Clean up the data directories: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application data-cleanup`
+- Run all ETL processes: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application etl-run`
+- Generate Score: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application score-run`
+- Combine Score with Geojson and generate high and low zoom map tile sets: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application geo-score`
+- Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`

 ## Local development
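Every `[command]` in the list above is dispatched to a `click` subcommand registered on the `cli` group in `data_pipeline/application.py` (the file changed below). A stripped-down sketch of that wiring, for orientation only — the real module registers many more commands:

```python
import sys

import click


@click.group()
def cli():
    """Defines the `python3 -m data_pipeline.application [command]` entry point."""


@cli.command(help="Generate census data")
def census_data_download():
    # The real command downloads and prepares census data here.
    sys.exit()


if __name__ == "__main__":
    cli()
```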
diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py
index f84040fa..8105120e 100644
--- a/data/data-pipeline/data_pipeline/application.py
+++ b/data/data-pipeline/data_pipeline/application.py
@@ -1,3 +1,5 @@
+from subprocess import call
+import sys
 import click

 from data_pipeline.config import settings
@@ -11,6 +13,7 @@ from data_pipeline.utils import (
     get_module_logger,
     score_folder_cleanup,
     temp_folder_cleanup,
+    check_first_run,
 )

 logger = get_module_logger(__name__)
@@ -34,17 +37,22 @@ def census_cleanup():
     census_reset(data_path)

     logger.info("Cleaned up all census data files")
+    sys.exit()


 @cli.command(help="Clean up all data folders")
 def data_cleanup():
     """CLI command to clean up all the data folders"""

+    data_path = settings.APP_ROOT / "data"
+
+    census_reset(data_path)
     data_folder_cleanup()
     score_folder_cleanup()
     temp_folder_cleanup()

     logger.info("Cleaned up all data folders")
+    sys.exit()


 @cli.command(
@@ -63,6 +71,7 @@ def census_data_download():
     etl_runner("census")

     logger.info("Completed downloading census data")
+    sys.exit()


 @cli.command(
@@ -80,6 +89,7 @@ def etl_run(dataset: str):
     """

     etl_runner(dataset)
+    sys.exit()


 @cli.command(
@@ -88,7 +98,9 @@ def score_run():
     """CLI command to generate the score"""

+    score_folder_cleanup()
     score_generate()
+    sys.exit()


 @cli.command(
@@ -102,6 +114,7 @@ def score_full_run():
     temp_folder_cleanup()
     etl_runner()
     score_generate()
+    sys.exit()


 @cli.command(help="Generate Geojson files with scores baked in")
@@ -109,6 +122,7 @@ def geo_score():
     """CLI command to combine the score with the census GeoJSON"""

     score_geo()
+    sys.exit()


 @cli.command(
@@ -119,6 +133,62 @@ def generate_map_tiles():
     data_path = settings.APP_ROOT / "data"

     generate_tiles(data_path)
+    sys.exit()
+
+
+@cli.command(
+    help="Data Full Run (Census download, ETLs, score, combine and tile generation)",
+)
+@click.option(
+    "-c",
+    "--check",
+    is_flag=True,
+    help="Check if the data run has been run before, and don't run it if so.",
+)
+def data_full_run(check):
+    """CLI command to run ETL, score, JSON combine and generate tiles in one command
+
+    Args:
+        check (bool): Run the full data run only if the first run semaphore file is not set (optional)
+
+    Returns:
+        None
+    """
+    data_path = settings.APP_ROOT / "data"
+
+    # check if the data full run has been run before
+    if check and not check_first_run():
+        logger.info("*** The data full run has already been executed")
+        sys.exit()
+
+    # census directories
+    logger.info("*** Initializing all data folders")
+    census_reset(data_path)
+    data_folder_cleanup()
+    score_folder_cleanup()
+    temp_folder_cleanup()
+
+    logger.info("*** Downloading census data")
+    etl_runner("census")
+
+    logger.info("*** Running all ETLs")
+    etl_runner()
+
+    logger.info("*** Generating Score")
+    score_generate()
+
+    logger.info("*** Combining Score with Census Geojson")
+    score_geo()
+
+    logger.info("*** Generating Map Tiles")
+    generate_tiles(data_path)
+
+    # create the semaphore file so a later --check run knows to skip the pipeline
+    file = "first_run.txt"
+    cmd = f"touch {data_path}/{file}"
+    call(cmd, shell=True)
+
+    logger.info("*** Map data ready")
+    sys.exit()


 if __name__ == "__main__":
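One note on `data_full_run`'s semaphore write: shelling out to `touch` assumes a POSIX environment, which holds inside the Docker image this patch targets. A portable equivalent, shown here only as a sketch of the alternative (the `data_path` below stands in for `settings.APP_ROOT / "data"`), would be:

```python
from pathlib import Path

# Stand-in for settings.APP_ROOT / "data" from the application module.
data_path = Path("data_pipeline/data")
data_path.mkdir(parents=True, exist_ok=True)

# Same effect as `touch {data_path}/first_run.txt`, but with no shell
# dependency, so it also works on Windows hosts.
(data_path / "first_run.txt").touch(exist_ok=True)
```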
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index d0dd81a6..d6a61ead 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -187,6 +187,7 @@ def score_folder_cleanup() -> None:
     remove_all_from_dir(data_path / "score" / "csv")
     remove_all_from_dir(data_path / "score" / "geojson")
     remove_all_from_dir(data_path / "score" / "downloadable")
+    remove_all_from_dir(data_path / "score" / "tiles")


 def temp_folder_cleanup() -> None:
@@ -198,6 +199,19 @@ def temp_folder_cleanup() -> None:
     remove_all_from_dir(data_path / "tmp")


+def check_first_run() -> bool:
+    """Returns True if the first-run semaphore file has not been created yet
+    (i.e., the full data run has never completed), False otherwise"""
+
+    data_path = settings.APP_ROOT / "data"
+    file = "first_run.txt"
+
+    if not os.path.isfile(data_path / file):
+        return True
+
+    return False
+
+
 def get_excel_column_name(index: int) -> str:
     """Map a numeric index to the appropriate column in Excel. E.g., column #95 is "CR".
     Only works for the first 1000 columns.
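For context on the `get_excel_column_name` helper above: the index-to-column mapping is bijective base-26, and a general implementation — a sketch, not necessarily the repository's version, which per its docstring only covers the first 1000 columns — can be written as:

```python
import string


def excel_column_name(index: int) -> str:
    """Zero-indexed bijective base-26: 0 -> "A", 25 -> "Z", 26 -> "AA", 95 -> "CR"."""
    letters = string.ascii_uppercase
    name = ""
    number = index + 1  # shift to the 1-indexed form bijective base-26 expects
    while number > 0:
        number, remainder = divmod(number - 1, 26)
        name = letters[remainder] + name
    return name


assert excel_column_name(95) == "CR"  # matches the docstring's example
```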