Add donut hole calculation to score (#1828)

Adds an adjacency index to the pipeline. Requires thorough QA.
Matt Bowen 2022-08-18 12:04:46 -04:00 committed by GitHub
commit 6e41e0d9f0
17 changed files with 969 additions and 8 deletions

data_pipeline/score/field_names.py

@@ -1,6 +1,8 @@
# Suffixes
PERCENTILE_FIELD_SUFFIX = " (percentile)"
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
# Geographic field names
GEOID_TRACT_FIELD = "GEOID10_TRACT"
@@ -691,6 +693,9 @@ CATEGORY_COUNT = "Total categories exceeded"
FPL_200_SERIES = "Is low income?"
FPL_200_SERIES_IMPUTED_AND_ADJUSTED = "Is low income (imputed and adjusted)?"
FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS = (
"Meets the less stringent low income criterion for the adjacency index?"
)
FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
"Is low income and has a low percent of higher ed students?"
)
@@ -715,5 +720,10 @@ HISTORIC_REDLINING_SCORE_EXCEEDED_LOW_INCOME_FIELD = (
"Tract-level redlining score meets or exceeds 3.25 and is low income"
)
ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD = (
"Is the tract surrounded by disadvantaged communities?"
)
# End of names for individual factors being exceeded
####
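
For reference, these suffixes are appended to score column names at runtime. A minimal sketch of the resulting names, assuming for illustration that SCORE_N_COMMUNITIES holds "Definition N (communities)":

SCORE_N_COMMUNITIES = "Definition N (communities)"  # assumed value, for illustration only
ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"

# Column holding the mean score of a tract's neighbors:
SCORE_N_COMMUNITIES + ADJACENCY_INDEX_SUFFIX
# -> "Definition N (communities) (average of neighbors)"

# Column holding the combined boolean that includes donut holes:
SCORE_N_COMMUNITIES + ADJACENT_MEAN_SUFFIX
# -> "Definition N (communities) (including adjacency index)"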

data_pipeline/score/score_narwhal.py

@@ -6,6 +6,7 @@ from data_pipeline.score.score import Score
import data_pipeline.score.field_names as field_names
from data_pipeline.utils import get_module_logger
import data_pipeline.etl.score.constants as constants
from data_pipeline.score.utils import calculate_tract_adjacency_scores
logger = get_module_logger(__name__)
@@ -20,6 +21,12 @@ class ScoreNarwhal(Score):
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
# We define a donut hole DAC as a tract that is entirely surrounded by
# DACs (i.e., a mean neighbor score of 1) and at or above the median for
# low income, as a starting point. As we ground-truth, these thresholds
# might change.
self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50
self.SCORE_THRESHOLD_DONUT: float = 1.00
super().__init__(df)
def _combine_island_areas_with_states_and_set_thresholds(
@@ -907,6 +914,54 @@ class ScoreNarwhal(Score):
| workforce_combined_criteria_for_island_areas
)
def _mark_donut_hole_tracts(self) -> pd.DataFrame:
"""Mark tracts that do not qualify on their own, but are surrounded by those that do
A donut hole is a tract surrounded by tracts that are marked for inclusion
by the scoring system AND meet a less stringent low-income threshhold.
We calculate "donut holes" after the initial score generation
"""
logger.info("Marking donut hole tracts")
# This is the boolean we pass to the front end for the donut-hole-specific
# low income criterion
self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS] = (
self.df[
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.LOW_INCOME_THRESHOLD_DONUT
)
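# Attach each tract's mean neighbor score, computed by
# calculate_tract_adjacency_scores in data_pipeline/score/utils.py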
self.df = self.df.merge(
calculate_tract_adjacency_scores(
self.df, field_names.SCORE_N_COMMUNITIES
),
how="left",
on=field_names.GEOID_TRACT_FIELD,
)
# This is the boolean we pass to the front end for map coloring
self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD] = (
self.df[
(
field_names.SCORE_N_COMMUNITIES
+ field_names.ADJACENCY_INDEX_SUFFIX
)
]
>= self.SCORE_THRESHOLD_DONUT
)
# This is the "final list" of Score Narwhal communities: True if the
# tract is a donut hole community OR the tract is a DAC on its own
self.df[
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
] = self.df[field_names.SCORE_N_COMMUNITIES] | (
self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
& self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
)
def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score Narhwal")
@@ -946,5 +1001,6 @@ class ScoreNarwhal(Score):
field_names.SCORE_N_COMMUNITIES
+ field_names.PERCENTILE_FIELD_SUFFIX
] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
self._mark_donut_hole_tracts()
return self.df
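
To make the donut-hole criteria concrete, here is a toy run of the same logic on fabricated data (shortened column names; 0.50 and 1.00 are the thresholds defined in __init__):

import pandas as pd

# Tract "B" is not a DAC on its own, but all of its neighbors are
# (mean neighbor score = 1.0) and it clears the relaxed low-income
# percentile threshold, so it is marked as a donut hole community.
toy = pd.DataFrame(
    {
        "tract": ["A", "B", "C"],
        "is DAC": [True, False, True],
        "low income percentile": [0.30, 0.55, 0.80],
        "mean neighbor score": [0.50, 1.00, 0.50],
    }
)
toy["low income (donut)"] = toy["low income percentile"] >= 0.50
toy["surrounded by DACs"] = toy["mean neighbor score"] >= 1.00
toy["final community"] = toy["is DAC"] | (
    toy["low income (donut)"] & toy["surrounded by DACs"]
)
print(toy[["tract", "final community"]])  # all three tracts are included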

data_pipeline/score/utils.py

@@ -0,0 +1,56 @@
"""Utilities to help generate the score."""
import pandas as pd
import geopandas as gpd
import data_pipeline.score.field_names as field_names
# XXX: @jorge I am torn between the coupling introduced by importing from
# etl.sources and keeping the code DRY. Thoughts?
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
def calculate_tract_adjacency_scores(
df: pd.DataFrame, score_column: str
) -> pd.DataFrame:
"""Calculate the mean score of each tract in df based on its neighbors
Args:
df (pandas.DataFrame): A dataframe with at least the following columns:
* field_names.GEOID_TRACT_FIELD
* score_column
score_column (str): The name of the column that contains the scores
to average
Returns:
df (pandas.DataFrame): A dataframe with two columns:
* field_names.GEOID_TRACT_FIELD
* {score_column}{field_names.ADJACENCY_INDEX_SUFFIX}, which is the
average of score_column across every tract that touches the tract
identified in field_names.GEOID_TRACT_FIELD
"""
ORIGINAL_TRACT = "ORIGINAL_TRACT"
logger.debug("Calculating tract adjacency scores")
tract_data = get_tract_geojson()
df: gpd.GeoDataFrame = tract_data.merge(
df, on=field_names.GEOID_TRACT_FIELD
)
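# Rename the tract ID so it does not collide with tract_data's GEOID column
# in the spatial join; after the join, GEOID10_TRACT identifies the tract
# whose neighbors' scores are being averaged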
df = df.rename(columns={field_names.GEOID_TRACT_FIELD: ORIGINAL_TRACT})
logger.debug("Perfoming spatial join to find all adjacent tracts")
adjacent_tracts: gpd.GeoDataFrame = df.sjoin(
tract_data, predicate="touches"
)
logger.debug("Calculating means based on adjacency")
return (
adjacent_tracts.groupby(field_names.GEOID_TRACT_FIELD)[[score_column]]
.mean()
.reset_index()
.rename(
columns={
score_column: f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}",
}
)
)
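
For intuition, a self-contained sketch of the same spatial pattern on toy geometry (unit squares standing in for tract polygons; the real function pulls its polygons from get_tract_geojson()):

import geopandas as gpd
from shapely.geometry import box

# Three unit squares in a row: A touches B, and B touches C.
tracts = gpd.GeoDataFrame(
    {"GEOID10_TRACT": ["A", "B", "C"], "score": [1.0, 0.0, 1.0]},
    geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1), box(2, 0, 3, 1)],
)
geoms = tracts[["GEOID10_TRACT", "geometry"]]
scored = tracts.rename(columns={"GEOID10_TRACT": "ORIGINAL_TRACT"})
adjacent = scored.sjoin(geoms, predicate="touches")
print(adjacent.groupby("GEOID10_TRACT")[["score"]].mean().reset_index())
# A and C each average their single neighbor B (0.0);
# B averages its neighbors A and C (1.0)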