Issue 1900: Tribal overlap with Census tracts (#1903)

* working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want the overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Re-add CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov>
2025-09-30 23:43:17 -07:00 · 2022-09-20 14:53:12 -04:00 · 2022-09-20 14:53:12 -04:00 · aca226165c
commit aca226165c
parent 876655d2b2
19 changed files with 1921 additions and 36 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/geo_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/geo_utils.py
@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Optional
 from functools import lru_cache
 import geopandas as gpd
+from data_pipeline.etl.sources.tribal.etl import TribalETL
 from data_pipeline.utils import get_module_logger
 from .census.etl import CensusETL

@ -18,21 +19,44 @@ def get_tract_geojson(
    GEOJSON_PATH = _tract_data_path
    if GEOJSON_PATH is None:
        GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH
-        if not GEOJSON_PATH.exists():
-            logger.debug("Census data has not been computed, running")
-            census_etl = CensusETL()
-            census_etl.extract()
-            census_etl.transform()
-            census_etl.load()
-        else:
-            logger.debug("Loading existing tract geojson")
-    tract_data = gpd.read_file(GEOJSON_PATH, include_fields=["GEOID10"])
-    tract_data.rename(columns={"GEOID10": "GEOID10_TRACT"}, inplace=True)
+    if not GEOJSON_PATH.exists():
+        logger.debug("Census data has not been computed, running")
+        census_etl = CensusETL()
+        census_etl.extract()
+        census_etl.transform()
+        census_etl.load()
+    tract_data = gpd.read_file(
+        GEOJSON_PATH,
+        include_fields=["GEOID10"],
+    )
+    tract_data = tract_data.rename(
+        columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
+    )
    return tract_data


+@lru_cache()
+def get_tribal_geojson(
+    _tribal_data_path: Optional[Path] = None,
+) -> gpd.GeoDataFrame:
+    logger.info("Loading Tribal geometry data from Tribal ETL")
+    GEOJSON_PATH = _tribal_data_path
+    if GEOJSON_PATH is None:
+        GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH
+    if not GEOJSON_PATH.exists():
+        logger.debug("Tribal data has not been computed, running")
+        tribal_etl = TribalETL()
+        tribal_etl.extract()
+        tribal_etl.transform()
+        tribal_etl.load()
+    tribal_data = gpd.read_file(
+        GEOJSON_PATH,
+    )
+    return tribal_data
+
+
 def add_tracts_for_geometries(
-    df: gpd.GeoDataFrame, _tract_data_path: Optional[Path] = None
+    df: gpd.GeoDataFrame, tract_data: Optional[gpd.GeoDataFrame] = None
 ) -> gpd.GeoDataFrame:
    """Adds tract-geoids to dataframe df that contains spatial geometries

@ -40,8 +64,8 @@ def add_tracts_for_geometries(

    Args:
        df (GeoDataFrame): a geopandas GeoDataFrame with a point geometry column
-        _tract_data_path (Path): an override to directly pass a GEOJSON file of
-                              tracts->Geometries, to simplify testing.
+        tract_data (GeoDataFrame): optional override to directly pass a
+            geodataframe of the tract boundaries. Also helps simplify testing.

    Returns:
        GeoDataFrame: the above dataframe, with an additional GEOID10_TRACT column that
@ -49,7 +73,12 @@ def add_tracts_for_geometries(
                      spatial analysis
    """
    logger.debug("Appending tract data to dataframe")
-    tract_data = get_tract_geojson(_tract_data_path)
+
+    if tract_data is None:
+        tract_data = get_tract_geojson()
+    else:
+        logger.debug("Using existing tract data.")
+
    assert (
        tract_data.crs == df.crs
    ), f"Dataframe must be projected to {tract_data.crs}"