Generate Geo-aware scores for all zoom levels (#391)

* generate Geo-aware scores for all zoom levels
* usa high progress
* testing dissolve
* checkpoint
* changing type
* removing breakpoint
* validation notebooks
* quick update
* score validation
* fixes for county merge
* code completed
2025-08-08 18:44:18 -07:00 · 2021-07-28 16:07:28 -04:00 · 2021-07-28 16:07:28 -04:00 · b404fdcc43
commit b404fdcc43
parent 446c8d1f68
14 changed files with 3023 additions and 270 deletions
--- a/data/data-pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
@ -9,12 +9,16 @@ logger = get_module_logger(__name__)
 class CalEnviroScreenETL(ExtractTransformLoad):
    def __init__(self):
        self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
-        self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
+        self.CALENVIROSCREEN_CSV = (
+            self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
+        )
        self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"

        # Defining some variable names
        self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
-        self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
+        self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
+            "calenviroscreen_percentile"
+        )
        self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = (
            "calenviroscreen_priority_community"
        )
--- a/data/data-pipeline/etl/sources/census/etl.py
+++ b/data/data-pipeline/etl/sources/census/etl.py
@ -2,6 +2,7 @@ import csv
 import os
 import json
 from pathlib import Path
+import geopandas as gpd

 from .etl_utils import get_state_fips_codes
 from utils import unzip_file_from_url, get_module_logger
@ -11,7 +12,7 @@ logger = get_module_logger(__name__)

 def download_census_csvs(data_path: Path) -> None:
    """Download all census shape files from the Census FTP and extract the geojson
-    to generate national and by state Census Block Group CSVs
+    to generate national and by state Census Block Group CSVs and GeoJSONs

    Args:
        data_path (pathlib.Path): Name of the directory where the files and directories will
@ -108,4 +109,17 @@ def download_census_csvs(data_path: Path) -> None:
                ]
            )

+    ## create national geojson
+    logger.info(f"Generating national geojson file")
+    usa_df = gpd.GeoDataFrame()
+
+    for file_name in geojson_dir_path.rglob("*.json"):
+        logger.info(f"Ingesting {file_name}")
+        state_gdf = gpd.read_file(file_name)
+        usa_df = usa_df.append(state_gdf)
+
+    usa_df = usa_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
+    logger.info(f"Writing national geojson file")
+    usa_df.to_file(geojson_dir_path / "us.json", driver="GeoJSON")
+
    logger.info("Census block groups downloading complete")
--- a/data/data-pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/etl/sources/census_acs/etl.py
@ -106,3 +106,8 @@ class CensusACSETL(ExtractTransformLoad):
        self.df[columns_to_include].to_csv(
            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
        )
+
+    def validate(self) -> None:
+        logger.info(f"Validating Census ACS Data")
+
+        pass