Add donut hole calculation to score (#1828)

Adds adjacency index to the pipeline. Requires thorough QA
2025-10-02 16:33:17 -07:00 · 2022-08-18 12:04:46 -04:00 · 2022-08-18 12:04:46 -04:00 · 6e41e0d9f0
commit 6e41e0d9f0
parent 88dc2e5a8e
17 changed files with 969 additions and 8 deletions
--- a/data/data-pipeline/data_pipeline/score/utils.py
+++ b/data/data-pipeline/data_pipeline/score/utils.py
@ -0,0 +1,56 @@
+"""Utilities to help generate the score."""
+import pandas as pd
+import geopandas as gpd
+import data_pipeline.score.field_names as field_names
+
+# XXX: @jorge I am torn between the coupling that importing from
+# etl.sources introduces and keeping the code DRY. Thoughts?
+from data_pipeline.etl.sources.geo_utils import get_tract_geojson
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+def calculate_tract_adjacency_scores(
+    df: pd.DataFrame, score_column: str
+) -> pd.DataFrame:
+    """Calculate the mean score of each tract in df based on its neighbors
+
+    Args:
+        df (pandas.DataFrame): A dataframe with at least the following columns:
+          * field_names.GEOID_TRACT_FIELD
+          * score_column
+
+        score_column (str): The name of the column that contains the scores
+                            to average
+    Returns:
+        df (pandas.DataFrame): A dataframe with two columns:
+          * field_names.GEOID_TRACT_FIELD
+          * {score_column}_ADJACENT_MEAN, which is the average of score_column for
+            each tract that touches the tract identified
+            in field_names.GEOID_TRACT_FIELD
+    """
+    ORIGINAL_TRACT = "ORIGINAL_TRACT"
+    logger.debug("Calculating tract adjacency scores")
+    tract_data = get_tract_geojson()
+    df: gpd.GeoDataFrame = tract_data.merge(
+        df, on=field_names.GEOID_TRACT_FIELD
+    )
+    df = df.rename(columns={field_names.GEOID_TRACT_FIELD: ORIGINAL_TRACT})
+
+    logger.debug("Perfoming spatial join to find all adjacent tracts")
+    adjacent_tracts: gpd.GeoDataFrame = df.sjoin(
+        tract_data, predicate="touches"
+    )
+
+    logger.debug("Calculating means based on adjacency")
+    return (
+        adjacent_tracts.groupby(field_names.GEOID_TRACT_FIELD)[[score_column]]
+        .mean()
+        .reset_index()
+        .rename(
+            columns={
+                score_column: f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}",
+            }
+        )
+    )