Data sources from S3 (#769)

* Started 535

* Data sources from S3

* lint

* remove breakpoints

* PR comments

* lint

* census data completed

* lint

* renaming data source
Jorge Escobar 2021-10-13 16:00:33 -04:00 committed by GitHub
commit 3b04356fb3
10 changed files with 317 additions and 67 deletions

View file

@@ -104,18 +104,21 @@ def score_post() -> None:
score_post.cleanup()
-def score_geo() -> None:
+def score_geo(data_source: str = "local") -> None:
"""Generates the geojson files with score data baked in
Args:
None
census_data_source (str): Source for the census data (optional)
Options:
- local (default): fetch census data from the local data directory
- aws: fetch census from AWS S3 J40 data repository
Returns:
None
"""
# Score Geo
-    score_geo = GeoScoreETL()
+    score_geo = GeoScoreETL(data_source=data_source)
score_geo.extract()
score_geo.transform()
score_geo.load()
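The new data_source argument is presumably surfaced through the '-cds' CLI flag referenced in check_census_data_source's error message further down. A minimal sketch of how that wiring might look, assuming a click-based CLI; the command name, option names, and import path are assumptions, not part of this commit:

# Hypothetical CLI wiring for score_geo's new data_source argument.
# Assumes a click-based CLI and that score_geo is importable from
# data_pipeline.etl.runner; both are assumptions, not shown in this diff.
import click

from data_pipeline.etl.runner import score_geo


@click.command()
@click.option(
    "-cds",
    "--census-data-source",
    default="local",
    type=click.Choice(["local", "aws"]),
    help="Source for census data: the local data directory or the AWS S3 J40 repository.",
)
def geo_score(census_data_source: str) -> None:
    """Generates the geojson files with score data baked in."""
    score_geo(data_source=census_data_source)


if __name__ == "__main__":
    geo_score()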

View file

@@ -315,6 +315,7 @@ class ScoreETL(ExtractTransformLoad):
def extract(self) -> None:
logger.info("Loading data sets from disk.")
# EJSCreen csv Load
ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2019" / "usa.csv"
self.ejscreen_df = pd.read_csv(

View file

@@ -1,9 +1,11 @@
import math
import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.sources.census.etl_utils import (
+    check_census_data_source,
+)
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@@ -14,7 +16,7 @@ class GeoScoreETL(ExtractTransformLoad):
A class used to generate per state and national GeoJson files with the score baked in
"""
-    def __init__(self):
+    def __init__(self, data_source: str = None):
self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
@@ -22,6 +24,7 @@ class GeoScoreETL(ExtractTransformLoad):
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
+        self.DATA_SOURCE = data_source
self.CENSUS_USA_GEOJSON = (
self.DATA_PATH / "census" / "geojson" / "us.json"
)
@@ -37,6 +40,12 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
+        # check census data
+        check_census_data_source(
+            census_data_path=self.DATA_PATH / "census",
+            census_data_source=self.DATA_SOURCE,
+        )
logger.info("Reading US GeoJSON (~6 minutes)")
self.geojson_usa_df = gpd.read_file(
self.CENSUS_USA_GEOJSON,

View file

@@ -1,22 +1,10 @@
-import json
-import zipfile
from pathlib import Path
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.utils import get_module_logger, get_zip_info
+from data_pipeline.utils import get_module_logger, zip_files
from . import constants
-## zlib is not available on all systems
-try:
-    import zlib  # noqa # pylint: disable=unused-import
-    compression = zipfile.ZIP_DEFLATED
-except (ImportError, AttributeError):
-    compression = zipfile.ZIP_STORED
logger = get_module_logger(__name__)
@@ -268,11 +256,7 @@ class PostScoreETL(ExtractTransformLoad):
logger.info("Compressing files")
files_to_compress = [csv_path, excel_path, pdf_path]
-        with zipfile.ZipFile(zip_path, "w") as zf:
-            for f in files_to_compress:
-                zf.write(f, arcname=Path(f).name, compress_type=compression)
-
-        zip_info = get_zip_info(zip_path)
-        logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))
+        zip_files(zip_path, files_to_compress)
def load(self) -> None:
self._load_score_csv(
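The inline zipfile logic above moves into a shared zip_files helper in data_pipeline.utils, whose body is not part of this diff. A minimal sketch of what it might look like, assuming it keeps the removed behavior (ZIP_DEFLATED with a ZIP_STORED fallback where zlib is unavailable):

# Sketch of the zip_files helper; its actual body is not shown in this
# diff, so this assumes it mirrors the inline logic removed above.
import logging
import zipfile
from pathlib import Path

logger = logging.getLogger(__name__)

# zlib is not available on all systems
try:
    import zlib  # noqa # pylint: disable=unused-import

    compression = zipfile.ZIP_DEFLATED
except (ImportError, AttributeError):
    compression = zipfile.ZIP_STORED


def zip_files(zip_file_path: Path, files_to_compress: list) -> None:
    """Compresses the given files into a single zip archive."""
    with zipfile.ZipFile(zip_file_path, "w") as zf:
        for f in files_to_compress:
            zf.write(f, arcname=Path(f).name, compress_type=compression)
    logger.info("Compressed %s files into %s", len(files_to_compress), zip_file_path)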

View file

@@ -1,5 +1,6 @@
import csv
import os
import sys
from pathlib import Path
import pandas as pd
@@ -9,12 +10,14 @@ from data_pipeline.utils import (
remove_all_dirs_from_dir,
remove_files_from_dir,
unzip_file_from_url,
+    zip_directory,
)
logger = get_module_logger(__name__)
def reset_data_directories(data_path: Path) -> None:
"""Empties all census folders"""
census_data_path = data_path / "census"
# csv
@@ -31,6 +34,7 @@ def reset_data_directories(data_path: Path) -> None:
def get_state_fips_codes(data_path: Path) -> list:
"""Returns a list with state data"""
fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
# check if file exists
@@ -69,3 +73,50 @@ def get_state_information(data_path: Path) -> pd.DataFrame:
df["fips"] = df["fips"].astype(str).apply(lambda x: x.zfill(2))
return df
+def check_census_data_source(
+    census_data_path: Path, census_data_source: str
+) -> None:
+    """Checks if census data is present, and exits gracefully if it is missing.
+    Downloads the data from S3 if census_data_source is set to "aws".
+
+    Args:
+        census_data_path (Path): Path for the census data
+        census_data_source (str): Source for the census data
+            Options:
+            - local: fetch census data from the local data directory
+            - aws: fetch census data from the AWS S3 J40 data repository
+
+    Returns:
+        None
+    """
+    CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip"
+    DATA_PATH = settings.APP_ROOT / "data"
+
+    # download from S3 if census_data_source is "aws"
+    if census_data_source == "aws":
+        logger.info("Fetching Census data from AWS S3")
+        unzip_file_from_url(
+            CENSUS_DATA_S3_URL,
+            DATA_PATH / "tmp",
+            DATA_PATH,
+        )
+    else:
+        # check if census data is found locally
+        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
+            logger.info(
+                "No local census data found. Please use '-cds aws' to fetch from AWS"
+            )
+            sys.exit()
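unzip_file_from_url is a pre-existing helper imported from data_pipeline.utils; its body is not shown in this diff. A minimal sketch, under the assumption that it downloads the archive into the download path and extracts it into the target path:

# Sketch of the pre-existing unzip_file_from_url helper (assumed behavior:
# download the zip into download_path, then extract it into unzipped_file_path).
import shutil
import urllib.request
import uuid
import zipfile
from pathlib import Path


def unzip_file_from_url(
    file_url: str, download_path: Path, unzipped_file_path: Path
) -> None:
    """Downloads a zip file and extracts its contents."""
    download_path.mkdir(parents=True, exist_ok=True)
    zip_file_path = download_path / f"{uuid.uuid4()}.zip"
    with urllib.request.urlopen(file_url) as response, open(
        zip_file_path, "wb"
    ) as out_file:
        shutil.copyfileobj(response, out_file)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(unzipped_file_path)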
+def zip_census_data():
+    logger.info("Compressing and uploading census files to AWS S3")
+
+    CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
+    TMP_PATH = settings.APP_ROOT / "data" / "tmp"
+
+    # zip folder
+    zip_directory(CENSUS_DATA_PATH, TMP_PATH)
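zip_directory, imported above, is likewise defined in data_pipeline.utils and not shown in this diff. A minimal sketch, assuming it archives the directory's contents into a zip file placed in the target directory:

# Sketch of the zip_directory helper (assumed behavior: archive everything
# under directory_to_archive into a zip placed in zip_file_directory).
import zipfile
from pathlib import Path


def zip_directory(directory_to_archive: Path, zip_file_directory: Path) -> None:
    """Archives a directory's contents into <zip_file_directory>/<dirname>.zip."""
    zip_file_directory.mkdir(parents=True, exist_ok=True)
    zip_file_path = zip_file_directory / f"{directory_to_archive.name}.zip"
    with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in directory_to_archive.rglob("*"):
            if path.is_file():
                zf.write(path, arcname=path.relative_to(directory_to_archive))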