Analysis by region (#385)

* Adding regional comparisons * Small ETL fixes
2025-08-08 05:04:18 -07:00 · 2021-07-26 08:02:25 -07:00 · 2021-07-26 08:02:25 -07:00 · 67b39475f7
commit 67b39475f7
parent 81290ce672
7 changed files with 440 additions and 158 deletions
--- a/data/data-pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class CalEnviroScreenETL(ExtractTransformLoad):
    def __init__(self):
-        self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip"
+        self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
        self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
        self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"

--- a/data/data-pipeline/etl/sources/census/etl_utils.py
+++ b/data/data-pipeline/etl/sources/census/etl_utils.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 import csv
+import pandas as pd
 import os
 from config import settings

@@ -53,3 +54,18 @@ def get_state_fips_codes(data_path: Path) -> list:
                fips = row[0].strip()
                fips_state_list.append(fips)
    return fips_state_list
+
+
+def get_state_information(data_path: Path) -> pd.DataFrame:
+    """Load the full state FIPS CSV found under ``data_path`` as a dataframe.
+
+    Keeps every column of the file, which is useful for the state regional information.
+    """
+    fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
+
+    df = pd.read_csv(fips_csv_path)
+
+    # Cast FIPS codes to str and left-pad with 0s to two characters (e.g. "1" -> "01")
+    df["fips"] = df["fips"].astype(str).apply(lambda x: x.zfill(2))
+
+    return df
--- a/data/data-pipeline/etl/sources/housing_and_transportation/etl.py
+++ b/data/data-pipeline/etl/sources/housing_and_transportation/etl.py
@@ -44,8 +44,6 @@ class HousingTransportationETL(ExtractTransformLoad):

        self.df = pd.concat(dfs)

-        self.df.head()
-
    def transform(self) -> None:
        logger.info(f"Transforming Housing and Transportation Data")

--- a/data/data-pipeline/etl/sources/hud_recap/etl.py
+++ b/data/data-pipeline/etl/sources/hud_recap/etl.py
@@ -33,7 +33,7 @@ class HudRecapETL(ExtractTransformLoad):
        logger.info(f"Transforming HUD Recap Data")

        # Load the HUD RECAP CSV
-        self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"Census Tract": "string"})
+        self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})

        self.df.rename(
            columns={