Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ready

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations (a sketch follows the commit metadata below)
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and a case-sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merge comparator changes from main into this branch for the sake of the PR

* Add a note on using the cache flag (-u) during the pipeline
Travis Newby authored on 2023-03-03 12:26:24 -06:00; committed by GitHub
commit 6f39033dde
52 changed files with 1787 additions and 686 deletions
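
For context before the diff: the commit message above describes a downloader and per-ETL data sources. A minimal sketch of what that abstraction could look like follows. The names DataSource and ZIPDataSource come from the diff below; the dataclass layout, the fetch() hook, and the download/unzip code here are illustrative assumptions, not the project's actual implementation in data_pipeline.etl.datasource.

from dataclasses import dataclass
from pathlib import Path
from urllib.request import urlretrieve
import zipfile


@dataclass
class DataSource:
    """A remote file an ETL depends on, and where to place it locally."""

    source: str        # URL of the remote file
    destination: Path  # local directory to download into

    def fetch(self) -> None:
        # Default behavior: download the file into the destination directory.
        self.destination.mkdir(parents=True, exist_ok=True)
        urlretrieve(self.source, self.destination / Path(self.source).name)


@dataclass
class ZIPDataSource(DataSource):
    """A zipped data source that is unpacked after download."""

    def fetch(self) -> None:
        self.destination.mkdir(parents=True, exist_ok=True)
        archive = self.destination / Path(self.source).name
        urlretrieve(self.source, archive)
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(self.destination)

Because each ETL only declares its sources, all download logic lives in one place, which is what makes pre-downloading (caching) them possible.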


@@ -4,6 +4,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MARYLAND_EJSCREEN_URL = (
+
+        # fetch
+        self.maryland_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
         )

-        self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
+        # input
+        self.shape_files_source = self.get_sources_path() / "mdejscreen"
+
+        # output
         self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"

         self.COLUMNS_TO_KEEP = [
@@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.dfs_list: pd.DataFrame
+
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.maryland_ejscreen_url,
+                destination=self.get_sources_path(),
+            )
+        ]

-    def extract(self) -> None:
-        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
         super().extract(
-            self.MARYLAND_EJSCREEN_URL,
-            self.get_tmp_path(),
-        )
+            use_cached_data_sources
+        )  # download and extract data sources

-    def transform(self) -> None:
-        list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
+        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
+        list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))

-        # Ignore counties becauses this is not the level of measurement
+        # Ignore counties because this is not the level of measurement
         # that is consistent with our current scoring and ranking methodology.
-        dfs_list = [
+        self.dfs_list = [
             gpd.read_file(f)
             for f in list_of_files
             if not f.endswith("CountiesEJScore.shp")
         ]

+    def transform(self) -> None:
         # Set the Census tract as the index and drop the geometry column
         # that produces the census tract boundaries.
         # The latter is because Geopandas raises an exception if there
         # are duplicate geometry columns.
         # Moreover, since the unit of measurement is at the tract level
         # we can consistently merge this with other datasets
-        dfs_list = [
+        self.dfs_list = [
             df.set_index("Census_Tra").drop("geometry", axis=1)
-            for df in dfs_list
+            for df in self.dfs_list
         ]
         # pylint: disable=unsubscriptable-object
-        self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
+        self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))

         # Reset index so that we no longer have the tract as our index
         self.df = self.df.reset_index()
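
The super().extract(use_cached_data_sources) call above implies the base class now drives all downloading. One plausible reading of that base extract(), assuming it simply iterates get_data_sources() and skips fetching when a cached copy is already on disk (the real logic in data_pipeline.etl.base may differ):

class ExtractTransformLoad:
    """Sketch only: the real base class carries much more state and behavior."""

    def get_data_sources(self):
        # Each concrete ETL overrides this to declare its sources.
        return []

    def extract(self, use_cached_data_sources: bool = False) -> None:
        for data_source in self.get_data_sources():
            if use_cached_data_sources and data_source.destination.exists():
                continue  # reuse the pre-downloaded copy instead of re-fetching
            data_source.fetch()

Under this reading, the -u flag noted in the commit message maps to use_cached_data_sources=True, so a call such as MarylandEJScreenETL().extract(use_cached_data_sources=True) reuses sources that were pre-downloaded earlier instead of pulling them again.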