Issue 967: Calculate urban/rural percentiles (#1006)

Lucas Merrill Brown 2021-12-07 17:28:36 -05:00 committed by GitHub
parent 780d1126ff
commit 1a61026ecf
2 changed files with 108 additions and 15 deletions


@@ -1,6 +1,7 @@
 import functools
 from collections import namedtuple

+import numpy as np
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
@@ -253,6 +254,94 @@ class ScoreETL(ExtractTransformLoad):
                 f"Too many rows in the join: {len(df_to_check)} in {dataframe_descriptor}"
             )
+
+    @staticmethod
+    def _add_percentiles_to_df(
+        df: pd.DataFrame,
+        input_column_name: str,
+        output_column_name_root: str,
+        ascending: bool = True,
+    ) -> pd.DataFrame:
+        """Creates percentiles.
+
+        One percentile will be created and returned as
+        f"{output_column_name_root}{field_names.PERCENTILE_FIELD_SUFFIX}".
+        E.g., "PM2.5 exposure (percentile)".
+        This will be for the entire country.
+
+        For an "apples-to-apples" comparison, urban tracts are also compared only to
+        other urban tracts, and rural tracts only to other rural tracts.
+        This percentile will be created and returned as
+        f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
+        E.g., "PM2.5 exposure (percentile urban/rural)".
+        This field exists for every tract, but for urban tracts this value will be the
+        percentile compared to other urban tracts, and for rural tracts this value
+        will be the percentile compared to other rural tracts.
+
+        Specific methodology:
+        1. Decide a methodology for confirming whether a tract counts as urban or
+        rural. Currently in the codebase, we use Geocorr to identify the % rural of
+        a tract, and mark the tract as rural if that percentage is >50% and urban
+        otherwise. This may or may not be the right methodology.
+        2. Once tracts are marked as urban or rural, create one percentile rank
+        that only ranks urban tracts, and one percentile rank that only ranks rural
+        tracts.
+        3. Combine into a single field.
+
+        `output_column_name_root` is different from `input_column_name` to enable the
+        reverse percentile use case. In that use case, `input_column_name` may be
+        something like "3rd grade reading proficiency" and `output_column_name_root`
+        may be something like "Low 3rd grade reading proficiency".
+        """
+        # Create the "basic" percentile.
+        df[
+            f"{output_column_name_root}"
+            f"{field_names.PERCENTILE_FIELD_SUFFIX}"
+        ] = df[input_column_name].rank(pct=True, ascending=ascending)
+
+        # Create the urban/rural percentiles.
+        urban_rural_percentile_fields_to_combine = []
+        for (urban_or_rural_string, urban_heuristic_bool) in [
+            ("urban", True),
+            ("rural", False),
+        ]:
+            # Create a field with only those values
+            this_category_only_value_field = (
+                f"{input_column_name} (value {urban_or_rural_string} only)"
+            )
+            df[this_category_only_value_field] = np.where(
+                df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool,
+                df[input_column_name],
+                None,
+            )
+
+            # Calculate the percentile for only this category
+            this_category_only_percentile_field = (
+                f"{output_column_name_root} "
+                f"(percentile {urban_or_rural_string} only)"
+            )
+            df[this_category_only_percentile_field] = df[
+                this_category_only_value_field
+            ].rank(
+                pct=True,
+                # Set ascending to the parameter value.
+                ascending=ascending,
+            )
+
+            # Add the field name to this list. Later, we'll combine this list.
+            urban_rural_percentile_fields_to_combine.append(
+                this_category_only_percentile_field
+            )
+
+        # Combine both urban and rural into one field:
+        df[
+            f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
+        ] = df[urban_rural_percentile_fields_to_combine].mean(
+            axis=1, skipna=True
+        )
+
+        return df
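Editor's note: to see the combination behavior end to end, here is a minimal standalone sketch (not part of the commit). It uses literal column names in place of field_names.URBAN_HEURISTIC_FIELD and the suffix constants, and Series.where in place of the np.where call, but follows the same steps: a nationwide percentile, separate urban-only and rural-only percentiles, and a row-wise mean that keeps whichever of the two is non-null.

import pandas as pd

# Toy data: two urban tracts and two rural tracts with one indicator.
df = pd.DataFrame(
    {
        "Urban Heuristic Flag": [True, True, False, False],
        "PM2.5 exposure": [9.0, 4.0, 7.0, 2.0],
    }
)

# Nationwide percentile: each tract ranked against all tracts.
df["PM2.5 exposure (percentile)"] = df["PM2.5 exposure"].rank(pct=True)

# Group-specific percentiles: mask out the other group so rank() skips
# those rows (NaN values keep NaN ranks by default).
for label, flag in [("urban", True), ("rural", False)]:
    masked = df["PM2.5 exposure"].where(df["Urban Heuristic Flag"] == flag)
    df[f"PM2.5 exposure (percentile {label} only)"] = masked.rank(pct=True)

# Combine: the two group columns are disjoint, so the row-wise mean with
# skipna=True simply picks the non-null value for each tract.
df["PM2.5 exposure (percentile urban/rural)"] = df[
    [
        "PM2.5 exposure (percentile urban only)",
        "PM2.5 exposure (percentile rural only)",
    ]
].mean(axis=1, skipna=True)

# Urban tracts get 1.0 and 0.5; rural tracts also get 1.0 and 0.5,
# each compared only within their own group.
print(df)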
     # TODO Move a lot of this to the ETL part of the pipeline
     def _prepare_initial_df(self) -> pd.DataFrame:
         logger.info("Preparing initial dataframe")

@@ -393,11 +482,14 @@ class ScoreETL(ExtractTransformLoad):
         df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

         # Convert all columns to numeric and do math
-        for col in numeric_columns:
-            # Calculate percentiles
-            df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
-                col
-            ].rank(pct=True)
+        for numeric_column in numeric_columns:
+            df_copy = self._add_percentiles_to_df(
+                df=df_copy,
+                input_column_name=numeric_column,
+                # For this use case, the input name and output name root are the same.
+                output_column_name_root=numeric_column,
+                ascending=True,
+            )
             # Min-max normalization:
             # (
@@ -409,16 +501,16 @@
             # Maximum of all values
             # - minimum of all values
             # )
-            min_value = df_copy[col].min(skipna=True)
-            max_value = df_copy[col].max(skipna=True)
+            min_value = df_copy[numeric_column].min(skipna=True)
+            max_value = df_copy[numeric_column].max(skipna=True)

             logger.info(
-                f"For data set {col}, the min value is {min_value} and the max value is {max_value}."
+                f"For data set {numeric_column}, the min value is {min_value} and the max value is {max_value}."
             )

-            df_copy[f"{col}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
-                df_copy[col] - min_value
+            df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
+                df_copy[numeric_column] - min_value
             ) / (max_value - min_value)

         # Create reversed percentiles for these fields
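Editor's note: as a quick check of the min-max normalization in the hunk above, here is a tiny standalone example (not part of the commit; the column name is made up). Each value is shifted by the column minimum and scaled by the range, so the output always lands in [0, 1].

import pandas as pd

df_copy = pd.DataFrame({"Example indicator": [250.0, 300.0, 450.0, 500.0]})

min_value = df_copy["Example indicator"].min(skipna=True)  # 250.0
max_value = df_copy["Example indicator"].max(skipna=True)  # 500.0

# (observed value - minimum) / (maximum - minimum) -> 0.0, 0.2, 0.8, 1.0
df_copy["Example indicator (min-max normalized)"] = (
    df_copy["Example indicator"] - min_value
) / (max_value - min_value)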
@@ -427,11 +519,11 @@
             # For instance, for 3rd grade reading level (score from 0-500),
             # calculate reversed percentiles and give the result the name
             # `Low 3rd grade reading level (percentile)`.
-            df_copy[
-                f"{reverse_percentile.low_field_name}"
-                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = df_copy[reverse_percentile.field_name].rank(
-                pct=True, ascending=False
+            df_copy = self._add_percentiles_to_df(
+                df=df_copy,
+                input_column_name=reverse_percentile.field_name,
+                output_column_name_root=reverse_percentile.low_field_name,
+                ascending=False,
             )

         # Special logic: create a combined population field.
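Editor's note: the ascending=False call in the hunk above is the reverse percentile use case described in the new docstring: the lower the raw value, the higher the resulting percentile, reported under a "Low ..." column name. A minimal standalone sketch (not part of the commit), using the docstring's reading-proficiency example and a plain rank() call in place of the helper:

import pandas as pd

df_copy = pd.DataFrame({"3rd grade reading proficiency": [500.0, 400.0, 300.0, 200.0]})

# With ascending=False, the tract with the lowest proficiency gets the
# highest percentile: 0.25, 0.5, 0.75, 1.0.
df_copy["Low 3rd grade reading proficiency (percentile)"] = df_copy[
    "3rd grade reading proficiency"
].rank(pct=True, ascending=False)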


@@ -1,5 +1,6 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
+PERCENTILE_URBAN_RURAL_FIELD_SUFFIX = " (percentile urban/rural)"
 MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
 TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"