mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
Add donut hole calculation to score (#1828)
Adds adjacency index to the pipeline. Requires thorough QA
This commit is contained in:
parent
88dc2e5a8e
commit
6e41e0d9f0
17 changed files with 969 additions and 8 deletions
|
@ -20,9 +20,21 @@ fields:
|
||||||
- score_name: Total categories exceeded
|
- score_name: Total categories exceeded
|
||||||
label: Total categories exceeded
|
label: Total categories exceeded
|
||||||
format: int64
|
format: int64
|
||||||
- score_name: Definition M (communities)
|
- score_name: Definition N (communities)
|
||||||
label: Identified as disadvantaged
|
label: Identified as disadvantaged
|
||||||
format: bool
|
format: bool
|
||||||
|
- score_name: Definition N (communities) (including adjacency index)
|
||||||
|
label: Identified as disadvantaged (including adjacency index)
|
||||||
|
format: bool
|
||||||
|
- score_name: Is the tract surrounded by disadvantaged communities?
|
||||||
|
label: Is the tract surrounded by disadvantaged communities?
|
||||||
|
format: bool
|
||||||
|
- score_name: Meets the less stringent low income criterion for the adjacency index?
|
||||||
|
label: Meets the less stringent low income criterion for the adjacency index?
|
||||||
|
format: bool
|
||||||
|
- score_name: Definition N (communities) (average of neighbors)
|
||||||
|
label: Share of neighbors that are identified as disadvantaged
|
||||||
|
format: percentage
|
||||||
- score_name: Total population
|
- score_name: Total population
|
||||||
label: Total population
|
label: Total population
|
||||||
format: float
|
format: float
|
||||||
|
|
|
@ -24,9 +24,21 @@ sheets:
|
||||||
- score_name: Total categories exceeded
|
- score_name: Total categories exceeded
|
||||||
label: Total categories exceeded
|
label: Total categories exceeded
|
||||||
format: int64
|
format: int64
|
||||||
- score_name: Definition M (communities)
|
- score_name: Definition N (communities)
|
||||||
label: Identified as disadvantaged
|
label: Identified as disadvantaged
|
||||||
format: bool
|
format: bool
|
||||||
|
- score_name: Definition N (communities) (including adjacency index)
|
||||||
|
label: Identified as disadvantaged (including adjacency index)
|
||||||
|
format: bool
|
||||||
|
- score_name: Is the tract surrounded by disadvantaged communities?
|
||||||
|
label: Is the tract surrounded by disadvantaged communities?
|
||||||
|
format: bool
|
||||||
|
- score_name: Meets the less stringent low income criterion for the adjacency index?
|
||||||
|
label: Meets the less stringent low income criterion for the adjacency index?
|
||||||
|
format: bool
|
||||||
|
- score_name: Definition N (communities) (average of neighbors)
|
||||||
|
label: Share of neighbors that are identified as disadvantaged
|
||||||
|
format: percentage
|
||||||
- score_name: Total population
|
- score_name: Total population
|
||||||
label: Total population
|
label: Total population
|
||||||
format: float
|
format: float
|
||||||
|
@ -315,4 +327,3 @@ sheets:
|
||||||
- score_name: Does the tract have at least 35 acres in it?
|
- score_name: Does the tract have at least 35 acres in it?
|
||||||
label: Does the tract have at least 35 acres in it?
|
label: Does the tract have at least 35 acres in it?
|
||||||
format: bool
|
format: bool
|
||||||
|
|
|
@ -207,7 +207,8 @@ TILES_SCORE_COLUMNS = {
|
||||||
field_names.M_POLLUTION: "M_PLN",
|
field_names.M_POLLUTION: "M_PLN",
|
||||||
field_names.M_HEALTH: "M_HLTH",
|
field_names.M_HEALTH: "M_HLTH",
|
||||||
# temporarily update this so that it's the Narwhal score that gets visualized on the map
|
# temporarily update this so that it's the Narwhal score that gets visualized on the map
|
||||||
field_names.SCORE_N_COMMUNITIES: "SM_C",
|
# The NEW final score value INCLUDES the adjacency index.
|
||||||
|
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
|
||||||
field_names.SCORE_N_COMMUNITIES
|
field_names.SCORE_N_COMMUNITIES
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
|
+ field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
|
||||||
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
|
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
|
||||||
|
@ -305,6 +306,9 @@ TILES_SCORE_COLUMNS = {
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
|
+ field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
|
||||||
field_names.HIGH_FUTURE_FLOOD_RISK_FIELD: "FLD_ET",
|
field_names.HIGH_FUTURE_FLOOD_RISK_FIELD: "FLD_ET",
|
||||||
field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD: "WF_ET",
|
field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD: "WF_ET",
|
||||||
|
field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD: "ADJ_ET",
|
||||||
|
field_names.SCORE_N_COMMUNITIES
|
||||||
|
+ field_names.ADJACENCY_INDEX_SUFFIX: "ADJ_PFS",
|
||||||
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
|
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
|
+ field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
|
||||||
field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
|
field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
|
||||||
|
@ -364,6 +368,7 @@ TILES_SCORE_FLOAT_COLUMNS = [
|
||||||
field_names.FUTURE_FLOOD_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
field_names.FUTURE_FLOOD_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||||
field_names.FUTURE_WILDFIRE_RISK_FIELD
|
field_names.FUTURE_WILDFIRE_RISK_FIELD
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||||
|
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENCY_INDEX_SUFFIX,
|
||||||
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
|
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||||
]
|
]
|
||||||
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
715
data/data-pipeline/data_pipeline/ipython/explore_adjacency.ipynb
Normal file
715
data/data-pipeline/data_pipeline/ipython/explore_adjacency.ipynb
Normal file
File diff suppressed because one or more lines are too long
|
@ -1,6 +1,8 @@
|
||||||
# Suffixes
|
# Suffixes
|
||||||
PERCENTILE_FIELD_SUFFIX = " (percentile)"
|
PERCENTILE_FIELD_SUFFIX = " (percentile)"
|
||||||
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
|
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
|
||||||
|
ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
|
||||||
|
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
|
||||||
|
|
||||||
# Geographic field names
|
# Geographic field names
|
||||||
GEOID_TRACT_FIELD = "GEOID10_TRACT"
|
GEOID_TRACT_FIELD = "GEOID10_TRACT"
|
||||||
|
@ -691,6 +693,9 @@ CATEGORY_COUNT = "Total categories exceeded"
|
||||||
|
|
||||||
FPL_200_SERIES = "Is low income?"
|
FPL_200_SERIES = "Is low income?"
|
||||||
FPL_200_SERIES_IMPUTED_AND_ADJUSTED = "Is low income (imputed and adjusted)?"
|
FPL_200_SERIES_IMPUTED_AND_ADJUSTED = "Is low income (imputed and adjusted)?"
|
||||||
|
FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS = (
|
||||||
|
"Meets the less stringent low income criterion for the adjacency index?"
|
||||||
|
)
|
||||||
FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
|
FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
|
||||||
"Is low income and has a low percent of higher ed students?"
|
"Is low income and has a low percent of higher ed students?"
|
||||||
)
|
)
|
||||||
|
@ -715,5 +720,10 @@ HISTORIC_REDLINING_SCORE_EXCEEDED_LOW_INCOME_FIELD = (
|
||||||
"Tract-level redlining score meets or exceeds 3.25 and is low income"
|
"Tract-level redlining score meets or exceeds 3.25 and is low income"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD = (
|
||||||
|
"Is the tract surrounded by disadvantaged communities?"
|
||||||
|
)
|
||||||
|
|
||||||
# End of names for individual factors being exceeded
|
# End of names for individual factors being exceeded
|
||||||
####
|
####
|
||||||
|
|
|
@ -6,6 +6,7 @@ from data_pipeline.score.score import Score
|
||||||
import data_pipeline.score.field_names as field_names
|
import data_pipeline.score.field_names as field_names
|
||||||
from data_pipeline.utils import get_module_logger
|
from data_pipeline.utils import get_module_logger
|
||||||
import data_pipeline.etl.score.constants as constants
|
import data_pipeline.etl.score.constants as constants
|
||||||
|
from data_pipeline.score.utils import calculate_tract_adjacency_scores
|
||||||
|
|
||||||
logger = get_module_logger(__name__)
|
logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
@ -20,6 +21,12 @@ class ScoreNarwhal(Score):
|
||||||
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
||||||
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
|
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
|
||||||
|
|
||||||
|
# We define a donut hole DAC as a tract that is entirely surrounded by
|
||||||
|
# DACs (score threshold = 1) and above median for low income, as a starting
|
||||||
|
# point. As we ground-truth, these thresholds might change.
|
||||||
|
self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50
|
||||||
|
self.SCORE_THRESHOLD_DONUT: float = 1.00
|
||||||
|
|
||||||
super().__init__(df)
|
super().__init__(df)
|
||||||
|
|
||||||
def _combine_island_areas_with_states_and_set_thresholds(
|
def _combine_island_areas_with_states_and_set_thresholds(
|
||||||
|
@ -907,6 +914,54 @@ class ScoreNarwhal(Score):
|
||||||
| workforce_combined_criteria_for_island_areas
|
| workforce_combined_criteria_for_island_areas
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _mark_donut_hole_tracts(self) -> pd.DataFrame:
    """Mark tracts that do not qualify on their own, but are surrounded by those that do.

    A "donut hole" is a tract that meets a less stringent low-income
    threshold (``self.LOW_INCOME_THRESHOLD_DONUT``) AND whose adjacent tracts
    have a mean ``SCORE_N_COMMUNITIES`` value of at least
    ``self.SCORE_THRESHOLD_DONUT``.

    We calculate "donut holes" after the initial score generation, because
    the adjacency index is computed from ``SCORE_N_COMMUNITIES``.

    Columns written to ``self.df``:
        * ``FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS``
        * ``SCORE_N_COMMUNITIES + ADJACENCY_INDEX_SUFFIX`` (merged in)
        * ``ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD``
        * ``SCORE_N_COMMUNITIES + ADJACENT_MEAN_SUFFIX``

    Returns:
        pd.DataFrame: ``self.df`` with the columns above added.
    """
    logger.info("Marking donut hole tracts")

    # This is the boolean we pass to the front end for the donut-hole-specific
    # low income criterion
    self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS] = (
        self.df[
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.LOW_INCOME_THRESHOLD_DONUT
    )

    # Left merge so every tract already in self.df is kept; tracts for which
    # no adjacency score was computed get NaN, which compares False below.
    self.df = self.df.merge(
        calculate_tract_adjacency_scores(
            self.df, field_names.SCORE_N_COMMUNITIES
        ),
        how="left",
        on=field_names.GEOID_TRACT_FIELD,
    )

    # This is the boolean we pass to the front end for color
    self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD] = (
        self.df[
            field_names.SCORE_N_COMMUNITIES
            + field_names.ADJACENCY_INDEX_SUFFIX
        ]
        >= self.SCORE_THRESHOLD_DONUT
    )

    is_donut_hole = (
        self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
        & self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
    )

    # This is the "final list" of Score Narwhal communities: True if either
    # the tract is a donut hole community OR the tract is a DAC on its own.
    # Previously only the donut-hole condition was stored, which dropped every
    # tract that qualified without the adjacency index.
    self.df[
        field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
    ] = (
        # astype(bool) keeps the result boolean even if the score column is
        # stored as 0/1 integers.
        self.df[field_names.SCORE_N_COMMUNITIES].astype(bool)
        | is_donut_hole
    )

    return self.df
|
||||||
|
|
||||||
def add_columns(self) -> pd.DataFrame:
|
def add_columns(self) -> pd.DataFrame:
|
||||||
logger.info("Adding Score Narhwal")
|
logger.info("Adding Score Narhwal")
|
||||||
|
|
||||||
|
@ -946,5 +1001,6 @@ class ScoreNarwhal(Score):
|
||||||
field_names.SCORE_N_COMMUNITIES
|
field_names.SCORE_N_COMMUNITIES
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
|
] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
|
||||||
|
self._mark_donut_hole_tracts()
|
||||||
|
|
||||||
return self.df
|
return self.df
|
||||||
|
|
56
data/data-pipeline/data_pipeline/score/utils.py
Normal file
56
data/data-pipeline/data_pipeline/score/utils.py
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
"""Utilities to help generate the score."""
|
||||||
|
import pandas as pd
|
||||||
|
import geopandas as gpd
|
||||||
|
import data_pipeline.score.field_names as field_names
|
||||||
|
|
||||||
|
# XXX: @jorge I am torn between the coupling introduced by importing from
|
||||||
|
# etl.sources and keeping the code DRY. Thoughts?
|
||||||
|
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
|
||||||
|
from data_pipeline.utils import get_module_logger
|
||||||
|
|
||||||
|
logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_tract_adjacency_scores(
    df: pd.DataFrame, score_column: str
) -> pd.DataFrame:
    """Calculate, for each tract, the mean of score_column over the tracts it touches.

    Args:
        df (pandas.DataFrame): A dataframe with at least the following columns:
            * field_names.GEOID_TRACT_FIELD
            * score_column

        score_column (str): The name of the column that contains the scores
            to average

    Returns:
        pandas.DataFrame: A dataframe with two columns:
            * field_names.GEOID_TRACT_FIELD
            * f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}", which is
              the average of score_column over the scored tracts that touch
              the tract identified in field_names.GEOID_TRACT_FIELD
    """
    original_tract_column = "ORIGINAL_TRACT"
    logger.debug("Calculating tract adjacency scores")
    tract_data = get_tract_geojson()

    # Attach the scores to the tract geometries. This uses merge's default
    # join, so only tracts present in both the geometry data and df remain.
    # The tract id of the scored side is renamed so that, after the spatial
    # join, field_names.GEOID_TRACT_FIELD refers only to the joined-in tract.
    scored_tracts: gpd.GeoDataFrame = tract_data.merge(
        df, on=field_names.GEOID_TRACT_FIELD
    ).rename(columns={field_names.GEOID_TRACT_FIELD: original_tract_column})

    logger.debug("Perfoming spatial join to find all adjacent tracts")
    # Each resulting row pairs a scored tract (left side, carrying
    # score_column) with one tract it touches (right side, carrying
    # field_names.GEOID_TRACT_FIELD).
    adjacent_tracts: gpd.GeoDataFrame = scored_tracts.sjoin(
        tract_data, predicate="touches"
    )

    logger.debug("Calculating means based on adjacency")
    # Grouping by the right-side tract id and averaging the left-side scores
    # yields, for every tract, the mean score of the tracts touching it.
    adjacency_column = f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}"
    neighbor_means = (
        adjacent_tracts.groupby(field_names.GEOID_TRACT_FIELD)[[score_column]]
        .mean()
        .reset_index()
    )
    return neighbor_means.rename(columns={score_column: adjacency_column})
|
0
data/data-pipeline/data_pipeline/tests/score/__init__.py
Normal file
0
data/data-pipeline/data_pipeline/tests/score/__init__.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
GEOID10_TRACT,included
|
||||||
|
24027602100,True
|
||||||
|
24027602303,True
|
||||||
|
24027605503,True
|
||||||
|
24027605502,True
|
||||||
|
24027603004,False
|
||||||
|
24027605104,True
|
||||||
|
24027603003,True
|
||||||
|
24027603001,True
|
||||||
|
24027602201,True
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,71 @@
|
||||||
|
# pylint: disable=protected-access
|
||||||
|
# flake8: noqa=F841
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
from functools import partial
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pandas as pd
|
||||||
|
from data_pipeline.score.utils import (
|
||||||
|
calculate_tract_adjacency_scores as original_calculate_tract_adjacency_score,
|
||||||
|
)
|
||||||
|
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
|
||||||
|
from data_pipeline.score import field_names
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def patch_calculate_tract_adjacency_scores():
    """Yield calculate_tract_adjacency_scores with its geometry loader patched.

    While the context is active, ``data_pipeline.score.utils.get_tract_geojson``
    is replaced by a partial of the real loader whose ``_tract_data_path``
    points at the ``data/us.geojson`` file next to this test module.
    """
    fixture_geojson = Path(__file__).parent / "data" / "us.geojson"
    patched_loader = partial(
        get_tract_geojson, _tract_data_path=fixture_geojson
    )
    patcher = mock.patch(
        "data_pipeline.score.utils.get_tract_geojson",
        new=patched_loader,
    )
    with patcher:
        yield original_calculate_tract_adjacency_score
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def score_data():
    """Load data/scores.csv (next to this module) as a DataFrame.

    The tract id column is read as ``str`` and ``included`` as ``bool``.
    """
    column_types = {field_names.GEOID_TRACT_FIELD: str, "included": bool}
    csv_path = Path(__file__).parent / "data" / "scores.csv"
    return pd.read_csv(csv_path, dtype=column_types)
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_adjacent_are_true(score_data):
    """The adjacency score is 1.0 when every other tract is included."""
    target_tract = "24027603004"
    adjacency_column = "included" + field_names.ADJACENCY_INDEX_SUFFIX

    # Every tract is included except the target itself.
    score_data["included"] = True
    score_data.loc[score_data.GEOID10_TRACT == target_tract, "included"] = False

    with patch_calculate_tract_adjacency_scores() as calculate_tract_adjacency_scores:
        adjacency_scores = calculate_tract_adjacency_scores(
            score_data, "included"
        )

    is_target = adjacency_scores.GEOID10_TRACT == target_tract
    neighbor_mean = adjacency_scores.loc[is_target, adjacency_column].iloc[0]
    assert neighbor_mean == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_adjacent_are_false(score_data):
    """The adjacency score is 0.0 when no tract is included."""
    target_tract = "24027603004"
    adjacency_column = "included" + field_names.ADJACENCY_INDEX_SUFFIX

    # Every tract, the target included, is marked as not included.
    score_data["included"] = False

    with patch_calculate_tract_adjacency_scores() as calculate_tract_adjacency_scores:
        adjacency_scores = calculate_tract_adjacency_scores(
            score_data, "included"
        )

    is_target = adjacency_scores.GEOID10_TRACT == target_tract
    neighbor_mean = adjacency_scores.loc[is_target, adjacency_column].iloc[0]
    assert neighbor_mean == 0.0
|
Loading…
Add table
Reference in a new issue