mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 07:21:18 -07:00
Add donut hole calculation to score (#1828)
Adds adjacency index to the pipeline. Requires thorough QA
This commit is contained in:
parent
88dc2e5a8e
commit
6e41e0d9f0
17 changed files with 969 additions and 8 deletions
56
data/data-pipeline/data_pipeline/score/utils.py
Normal file
56
data/data-pipeline/data_pipeline/score/utils.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
"""Utilities to help generate the score."""
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
import data_pipeline.score.field_names as field_names
|
||||
|
||||
# XXX: @jorge I am torn about the coupling introduced by importing from
|
||||
# etl.sources vs keeping the code DRY. Thoughts?
|
||||
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
def calculate_tract_adjacency_scores(
    df: pd.DataFrame, score_column: str
) -> pd.DataFrame:
    """Calculate, for each tract, the mean score of the tracts that touch it.

    Args:
        df (pandas.DataFrame): A dataframe with at least the following columns:
            * field_names.GEOID_TRACT_FIELD
            * score_column
            Any other columns are ignored.
        score_column (str): The name of the column that contains the scores
            to average

    Returns:
        df (pandas.DataFrame): A dataframe with two columns:
            * field_names.GEOID_TRACT_FIELD
            * f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}", which is
              the average of score_column over each tract that touches the
              tract identified in field_names.GEOID_TRACT_FIELD

        Both the merge and the spatial join use their default (inner)
        behaviour, so a tract only appears in the result if at least one
        tract present in `df` touches it.

    Raises:
        KeyError: If df lacks field_names.GEOID_TRACT_FIELD or score_column.
    """
    ORIGINAL_TRACT = "ORIGINAL_TRACT"
    logger.debug("Calculating tract adjacency scores")
    tract_data = get_tract_geojson()

    # Only the tract ID and the score are needed downstream. Dropping every
    # other column fails fast on a missing column (before the expensive
    # spatial join) and keeps unrelated caller columns from colliding with
    # columns of the tract layer during the merge and the join.
    scores = df[[field_names.GEOID_TRACT_FIELD, score_column]]

    # Attach geometries to the scored tracts, then rename their ID column so
    # that after the join the GEOID column unambiguously belongs to the
    # *neighbouring* (right-hand) tract.
    scored_tracts: gpd.GeoDataFrame = tract_data.merge(
        scores, on=field_names.GEOID_TRACT_FIELD
    ).rename(columns={field_names.GEOID_TRACT_FIELD: ORIGINAL_TRACT})

    logger.debug("Performing spatial join to find all adjacent tracts")
    # One row per (scored tract, touching tract) pair.
    adjacent_tracts: gpd.GeoDataFrame = scored_tracts.sjoin(
        tract_data, predicate="touches"
    )

    logger.debug("Calculating means based on adjacency")
    adjacent_mean_column = (
        f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}"
    )
    # Grouping by the right-hand tract averages the scores of everything
    # that touches it, i.e. the mean score of its neighbours.
    return (
        adjacent_tracts.groupby(field_names.GEOID_TRACT_FIELD)[[score_column]]
        .mean()
        .reset_index()
        .rename(columns={score_column: adjacent_mean_column})
    )
|
Loading…
Add table
Add a link
Reference in a new issue