Combine + Tilefy (#806)

* init

* score-post

* added score csv s3 download; remove poetry cmds from readme

* working census tile fetch

* PR review

* GitHub Actions work
commit 1b17af84c8
Jorge Escobar 2021-11-01 18:05:05 -04:00 committed by GitHub
13 changed files with 560 additions and 371 deletions


@@ -87,17 +87,20 @@ def score_generate() -> None:
     score_post()


-def score_post() -> None:
+def score_post(data_source: str = "local") -> None:
     """Posts the score files to the local directory

     Args:
-        None
+        data_source (str): Source for the census data (optional)
+                           Options:
+                           - local (default): fetch census data from the local data directory
+                           - aws: fetch census from AWS S3 J40 data repository

     Returns:
         None
     """
     # Post Score Processing
-    score_post = PostScoreETL()
+    score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
@@ -108,10 +111,10 @@ def score_geo(data_source: str = "local") -> None:
     """Generates the geojson files with score data baked in

     Args:
-        census_data_source (str): Source for the census data (optional)
-                                  Options:
-                                  - local (default): fetch census data from the local data directory
-                                  - aws: fetch census from AWS S3 J40 data repository
+        data_source (str): Source for the census data (optional)
+                           Options:
+                           - local (default): fetch census data from the local data directory
+                           - aws: fetch census from AWS S3 J40 data repository

     Returns:
         None
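For context (not part of the diff): the new data_source parameter threads from these entry points down to the ETL classes. A minimal sketch of calling them directly from Python, where the runner module path is an assumption:

    from data_pipeline.etl.runner import score_post, score_geo  # module path assumed

    score_post()                   # default "local": expects score CSVs in the local data directory
    score_post(data_source="aws")  # fetches the tile score CSV from the J40 S3 bucket first
    score_geo(data_source="aws")   # same switch for the geojson generation step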


@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import (
     check_census_data_source,
 )
+from data_pipeline.etl.score.etl_utils import check_score_data_source
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -17,6 +18,7 @@ class GeoScoreETL(ExtractTransformLoad):
     """

     def __init__(self, data_source: str = None):
+        self.DATA_SOURCE = data_source
         self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
         self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
         self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
@@ -46,6 +48,12 @@ class GeoScoreETL(ExtractTransformLoad):
             census_data_source=self.DATA_SOURCE,
         )

+        # check score data
+        check_score_data_source(
+            score_csv_data_path=self.SCORE_CSV_PATH,
+            score_data_source=self.DATA_SOURCE,
+        )
+
         logger.info("Reading US GeoJSON (~6 minutes)")
         self.geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
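With both checks in place, GeoScoreETL.extract() validates its census and score inputs before the slow GeoJSON read. A minimal usage sketch, not part of this diff; the module path is assumed:

    from data_pipeline.etl.score.etl_score_geo import GeoScoreETL  # module path assumed

    etl = GeoScoreETL(data_source="aws")  # "aws" pulls missing census/score inputs from S3
    etl.extract()    # runs both data-source checks, then reads the US GeoJSON
    etl.transform()
    etl.load()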


@@ -3,6 +3,9 @@ import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger, zip_files
+from data_pipeline.etl.sources.census.etl_utils import (
+    check_census_data_source,
+)
 from . import constants

 logger = get_module_logger(__name__)
@@ -14,7 +17,8 @@ class PostScoreETL(ExtractTransformLoad):
     datasets.
     """

-    def __init__(self):
+    def __init__(self, data_source: str = None):
+        self.DATA_SOURCE = data_source
         self.input_counties_df: pd.DataFrame
         self.input_states_df: pd.DataFrame
         self.input_score_df: pd.DataFrame
@@ -66,6 +70,13 @@ class PostScoreETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Starting Extraction")

+        # check census data
+        check_census_data_source(
+            census_data_path=self.DATA_PATH / "census",
+            census_data_source=self.DATA_SOURCE,
+        )
+
         super().extract(
             constants.CENSUS_COUNTIES_ZIP_URL,
             constants.TMP_PATH,
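PostScoreETL follows the same pattern: the constructor stores the source, and extract() verifies the census data before downloading the counties file. A usage sketch along the same lines (module path assumed):

    from data_pipeline.etl.score.etl_score_post import PostScoreETL  # module path assumed

    post = PostScoreETL(data_source="local")  # exits gracefully if local census data is missing
    post.extract()
    post.transform()
    post.load()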


@@ -0,0 +1,50 @@
+import os
+import sys
+from pathlib import Path
+
+from data_pipeline.config import settings
+from data_pipeline.utils import (
+    download_file_from_url,
+    get_module_logger,
+)
+
+logger = get_module_logger(__name__)
+
+
+def check_score_data_source(
+    score_csv_data_path: Path,
+    score_data_source: str,
+) -> None:
+    """Checks if score data is present, and exits gracefully if it doesn't exist. It will download it from S3
+       if score_data_source is set to "aws"
+
+    Args:
+        score_csv_data_path (Path): Path for local score CSV data
+        score_data_source (str): Source for the score data
+                                 Options:
+                                 - local: fetch score data from the local data directory
+                                 - aws: fetch score data from the AWS S3 J40 data repository
+
+    Returns:
+        None
+    """
+    TILE_SCORE_CSV_S3_URL = (
+        settings.AWS_JUSTICE40_DATAPIPELINE_URL
+        + "/data/score/csv/tiles/usa.csv"
+    )
+    TILE_SCORE_CSV = score_csv_data_path / "tiles" / "usa.csv"
+
+    # download from S3 if score_data_source is "aws"
+    if score_data_source == "aws":
+        logger.info("Fetching Score Tile data from AWS S3")
+        download_file_from_url(
+            file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
+        )
+    else:
+        # check if score data is found locally
+        if not os.path.isfile(TILE_SCORE_CSV):
+            logger.info(
+                "No local score tiles data found. Please use '-d aws' to fetch from AWS"
+            )
+            sys.exit()
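The helper is normally invoked from the ETL classes above, but it can also be exercised directly. A hypothetical stand-alone call; the concrete path below is illustrative only:

    from pathlib import Path

    from data_pipeline.etl.score.etl_utils import check_score_data_source

    # With "aws" the tile CSV is downloaded to <score_csv_data_path>/tiles/usa.csv;
    # with "local" the function exits if that file is absent.
    check_score_data_source(
        score_csv_data_path=Path("data_pipeline") / "data" / "score" / "csv",  # example path
        score_data_source="aws",
    )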


@@ -107,13 +107,13 @@ def check_census_data_source(
        # check if census data is found locally
        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
            logger.info(
-                "No local census data found. Please use '-cds aws` to fetch from AWS"
+                "No local census data found. Please use '-d aws' to fetch from AWS"
            )
            sys.exit()


 def zip_census_data():
-    logger.info("Compressing and uploading census files to AWS S3")
+    logger.info("Compressing census files to data/tmp folder")

     CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
     TMP_PATH = settings.APP_ROOT / "data" / "tmp"
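The reworded log message reflects that zip_census_data now only compresses the census files into data/tmp, leaving the S3 upload to a separate step. The repo's zip_files helper is not shown in this diff; a rough standard-library equivalent of the compression step might look like this (a sketch, not the project's actual implementation):

    import shutil
    from pathlib import Path

    def compress_census_data(census_path: Path, tmp_path: Path) -> None:
        # Writes <tmp_path>/census.zip containing everything under census_path.
        tmp_path.mkdir(parents=True, exist_ok=True)
        shutil.make_archive(str(tmp_path / "census"), "zip", root_dir=str(census_path))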