From 5bd63c083b8127db0c816c6561dfa1dd5815d510 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Tue, 14 Sep 2021 14:15:34 -0400
Subject: [PATCH] Run all Census, ETL, Score, Combine and Tilefy in one command (#662)

* Run all Census, ETL, Score, Combine and Tilefy in one command

* docker cmd

* some docker improvements

* feedback updates

* lint
---
 data/data-pipeline/Dockerfile             |  2 +
 data/data-pipeline/README.md              | 26 +++----
 .../data_pipeline/application.py          | 70 +++++++++++++++++++
 data/data-pipeline/data_pipeline/utils.py | 14 ++++
 4 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/data/data-pipeline/Dockerfile b/data/data-pipeline/Dockerfile
index f74ccb26..f77cebeb 100644
--- a/data/data-pipeline/Dockerfile
+++ b/data/data-pipeline/Dockerfile
@@ -32,3 +32,5 @@ COPY . .
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
 RUN pip3 install .
+
+CMD python3 -m data_pipeline.application data-full-run --check
diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md
index 2724c660..5a0e7fba 100644
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@@ -96,14 +96,14 @@ TODO add mermaid diagram

 #### Step 1: Run the script to download census data or download from the Justice40 S3 URL

 1. Call the `census-data-download` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
-   - With Docker: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application census-data-download`
+   - With Docker: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application census-data-download`
    - With Poetry: `poetry run download_census` (Install GDAL as described [below](#local-development))
 2. If you have a high-speed internet connection and don't want to generate the census data or install `GDAL` locally, you can download a zip version of the Census file [here](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Then unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census/` folder.

 #### Step 2: Run the ETL script for each data source

 1. Call the `etl-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
-   - With Docker: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application etl-run`
+   - With Docker: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application etl-run`
    - With Poetry: `poetry run etl`
 2. This command will execute the corresponding ETL script for each data source in `data_pipeline/etl/sources/`. For example, `data_pipeline/etl/sources/ejscreen/etl.py` is the ETL script for EJSCREEN data.
 3. Each ETL script will extract the data from its original source, then format the data into `.csv` files that get stored in the relevant folder in `data_pipeline/data/dataset/`. For example, HUD Housing data is stored in `data_pipeline/data/dataset/hud_housing/usa.csv`
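As an illustration of the extract-transform-load pattern that Step 2 describes, here is a minimal sketch only: the real per-source scripts in `data_pipeline/etl/sources/` differ in detail, and `SOURCE_URL` plus the `example_source` output folder below are hypothetical stand-ins, not names from this repository.

```python
# Minimal ETL sketch. SOURCE_URL and the example_source dataset folder are
# hypothetical stand-ins for a real upstream source and its output location.
from pathlib import Path

import pandas as pd

SOURCE_URL = "https://example.com/source_data.csv"  # hypothetical source
OUTPUT_PATH = Path("data_pipeline/data/dataset/example_source/usa.csv")


def run_etl() -> None:
    df = pd.read_csv(SOURCE_URL)  # extract: pull the raw source data
    df.columns = [col.strip().lower() for col in df.columns]  # transform: tidy headers
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)  # load: write the CSV the score step consumes


if __name__ == "__main__":
    run_etl()
```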
@@ -114,7 +114,7 @@ _For example: `poetry run etl -d ejscreen` would only run the ETL process for EJ
 #### Step 3: Calculate the Justice40 score experiments

 1. Call the `score-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
-   - With Docker: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application score-run`
+   - With Docker: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application score-run`
    - With Poetry: `poetry run score`
 1. The `score-run` command will execute the `etl/score/etl.py` script which loads the data from each of the source files added to the `data/dataset/` directory by the ETL scripts in Step 2.
 1. These data sets are merged into a single dataframe using their Census Block Group GEOID as a common key, and the data in each of the columns is standardized in two ways:
@@ -161,20 +161,20 @@ To build the docker container the first time, make sure you're in the root direc

 Once completed, run `docker-compose up`. Docker will spin up 3 containers: the client container, the static server container and the data container. Once all data is generated, you can view the application in a browser by navigating to `http://localhost:8000`.

-If you want to run specific data tasks, you can open a new terminal tab or terminal window while `docker-compose up` is running, and then execute any command for the application using this format:
+If you want to run specific data tasks, you can open a terminal window, navigate to the root folder for this repository, and then execute any command for the application using this format:

-`docker exec j40_data_pipeline_1 python3 -m data_pipeline.application [command]`
+`docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application [command]`

 Here's a list of commands:

-- Get help: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application --help`
-- Generate census data: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application census-data-download`
-- Run all ETL and Generate score: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application score-full-run`
-- Clean up the data directories: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application data-cleanup`
-- Run all ETL processes: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application etl-run`
-- Generate Score: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application score-run`
-- Combine Score with Geojson and generate high and low zoom map tile sets: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application geo-score`
-- Generate Map Tiles: `docker exec j40_data_pipeline_1 python3 -m data_pipeline.application generate-map-tiles`
+- Get help: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application --help`
+- Generate census data: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application census-data-download`
+- Run all ETL and Generate score: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application score-full-run`
+- Clean up the data directories: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application data-cleanup`
+- Run all ETL processes: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application etl-run`
+- Generate Score: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application score-run`
+- Combine Score with Geojson and generate high and low zoom map tile sets: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application geo-score`
+- Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`

 ## Local development
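Every `[command]` in the list above is dispatched to a `click` subcommand registered on the `cli` group in `data_pipeline/application.py` (the file changed below). A stripped-down sketch of that wiring, for orientation only — the real module registers many more commands:

```python
import sys

import click


@click.group()
def cli():
    """Defines the `python3 -m data_pipeline.application [command]` entry point."""


@cli.command(help="Generate census data")
def census_data_download():
    # The real command downloads and prepares census data here.
    sys.exit()


if __name__ == "__main__":
    cli()
```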
diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py
index f84040fa..8105120e 100644
--- a/data/data-pipeline/data_pipeline/application.py
+++ b/data/data-pipeline/data_pipeline/application.py
@@ -1,3 +1,5 @@
+from subprocess import call
+import sys
 import click

 from data_pipeline.config import settings
@@ -11,6 +13,7 @@ from data_pipeline.utils import (
     get_module_logger,
     score_folder_cleanup,
     temp_folder_cleanup,
+    check_first_run,
 )

 logger = get_module_logger(__name__)
@@ -34,17 +37,22 @@ def census_cleanup():
     census_reset(data_path)

     logger.info("Cleaned up all census data files")
+    sys.exit()


 @cli.command(help="Clean up all data folders")
 def data_cleanup():
     """CLI command to clean up all the data folders"""

+    data_path = settings.APP_ROOT / "data"
+
+    census_reset(data_path)
     data_folder_cleanup()
     score_folder_cleanup()
     temp_folder_cleanup()

     logger.info("Cleaned up all data folders")
+    sys.exit()


 @cli.command(
@@ -63,6 +71,7 @@ def census_data_download():
     etl_runner("census")

     logger.info("Completed downloading census data")
+    sys.exit()


 @cli.command(
@@ -80,6 +89,7 @@ def etl_run(dataset: str):
     """

     etl_runner(dataset)
+    sys.exit()


 @cli.command(
@@ -88,7 +98,9 @@ def score_run():
     """CLI command to generate the score"""

+    score_folder_cleanup()
     score_generate()
+    sys.exit()


 @cli.command(
@@ -102,6 +114,7 @@ def score_full_run():
     temp_folder_cleanup()
     etl_runner()
     score_generate()
+    sys.exit()


 @cli.command(help="Generate Geojson files with scores baked in")
@@ -109,6 +122,7 @@ def geo_score():
     """CLI command to combine the score with the census GeoJSON"""

     score_geo()
+    sys.exit()


 @cli.command(
@@ -119,6 +133,62 @@ def generate_map_tiles():
     data_path = settings.APP_ROOT / "data"

     generate_tiles(data_path)
+    sys.exit()
+
+
+@cli.command(
+    help="Data Full Run (Census download, ETLs, score, combine and tile generation)",
+)
+@click.option(
+    "-c",
+    "--check",
+    is_flag=True,
+    help="Check if the data run has been run before, and don't run it if so.",
+)
+def data_full_run(check):
+    """CLI command to run ETL, score, JSON combine and generate tiles in one command
+
+    Args:
+        check (bool): Run the full data run only if the first run semaphore file is not set (optional)
+
+    Returns:
+        None
+    """
+    data_path = settings.APP_ROOT / "data"
+
+    # check if the data full run has been run before
+    if check and not check_first_run():
+        logger.info("*** The data full run has already been executed")
+        sys.exit()
+
+    # census directories
+    logger.info("*** Initializing all data folders")
+    census_reset(data_path)
+    data_folder_cleanup()
+    score_folder_cleanup()
+    temp_folder_cleanup()
+
+    logger.info("*** Downloading census data")
+    etl_runner("census")
+
+    logger.info("*** Running all ETLs")
+    etl_runner()
+
+    logger.info("*** Generating Score")
+    score_generate()
+
+    logger.info("*** Combining Score with Census Geojson")
+    score_geo()
+
+    logger.info("*** Generating Map Tiles")
+    generate_tiles(data_path)
+
+    # create the semaphore file so a later --check run knows to skip the pipeline
+    file = "first_run.txt"
+    cmd = f"touch {data_path}/{file}"
+    call(cmd, shell=True)
+
+    logger.info("*** Map data ready")
+    sys.exit()


 if __name__ == "__main__":
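One note on `data_full_run`'s semaphore write: shelling out to `touch` assumes a POSIX environment, which holds inside the Docker image this patch targets. A portable equivalent, shown here only as a sketch of the alternative (the `data_path` below stands in for `settings.APP_ROOT / "data"`), would be:

```python
from pathlib import Path

# Stand-in for settings.APP_ROOT / "data" from the application module.
data_path = Path("data_pipeline/data")
data_path.mkdir(parents=True, exist_ok=True)

# Same effect as `touch {data_path}/first_run.txt`, but with no shell
# dependency, so it also works on Windows hosts.
(data_path / "first_run.txt").touch(exist_ok=True)
```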
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index d0dd81a6..d6a61ead 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -187,6 +187,7 @@ def score_folder_cleanup() -> None:
     remove_all_from_dir(data_path / "score" / "csv")
     remove_all_from_dir(data_path / "score" / "geojson")
     remove_all_from_dir(data_path / "score" / "downloadable")
+    remove_all_from_dir(data_path / "score" / "tiles")


 def temp_folder_cleanup() -> None:
@@ -198,6 +199,19 @@ def temp_folder_cleanup() -> None:
     remove_all_from_dir(data_path / "tmp")


+def check_first_run() -> bool:
+    """Returns True if the first-run semaphore file has not been created yet
+    (i.e., the full data run has never completed), False otherwise"""
+
+    data_path = settings.APP_ROOT / "data"
+    file = "first_run.txt"
+
+    if not os.path.isfile(data_path / file):
+        return True
+
+    return False
+
+
 def get_excel_column_name(index: int) -> str:
     """Map a numeric index to the appropriate column in Excel. E.g., column #95 is "CR".
     Only works for the first 1000 columns.
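For context on the `get_excel_column_name` helper above: the index-to-column mapping is bijective base-26, and a general implementation — a sketch, not necessarily the repository's version, which per its docstring only covers the first 1000 columns — can be written as:

```python
import string


def excel_column_name(index: int) -> str:
    """Zero-indexed bijective base-26: 0 -> "A", 25 -> "Z", 26 -> "AA", 95 -> "CR"."""
    letters = string.ascii_uppercase
    name = ""
    number = index + 1  # shift to the 1-indexed form bijective base-26 expects
    while number > 0:
        number, remainder = divmod(number - 1, 26)
        name = letters[remainder] + name
    return name


assert excel_column_name(95) == "CR"  # matches the docstring's example
```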