mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
Issue 967: Calculate urban/rural percentiles (#1006)
This commit is contained in:
parent
780d1126ff
commit
1a61026ecf
2 changed files with 108 additions and 15 deletions
|
@ -1,6 +1,7 @@
|
||||||
import functools
|
import functools
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from data_pipeline.etl.base import ExtractTransformLoad
|
from data_pipeline.etl.base import ExtractTransformLoad
|
||||||
|
@ -253,6 +254,94 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
f"Too many rows in the join: {len(df_to_check)} in {dataframe_descriptor}"
|
f"Too many rows in the join: {len(df_to_check)} in {dataframe_descriptor}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
def _add_percentiles_to_df(
    df: pd.DataFrame,
    input_column_name: str,
    output_column_name_root: str,
    ascending: bool = True,
) -> pd.DataFrame:
    """Add nationwide and urban/rural percentile columns for one input column.

    Two percentile columns are created.

    The first is the percentile across every row of `df` (i.e., the entire
    country). It is returned as
    f"{output_column_name_root}{field_names.PERCENTILE_FIELD_SUFFIX}".
    E.g., "PM2.5 exposure (percentile)".

    The second enables an "apples-to-apples" comparison: urban tracts are
    ranked only against other urban tracts, and rural tracts are ranked only
    against other rural tracts. It is returned as
    f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
    E.g., "PM2.5 exposure (percentile urban/rural)".
    This field exists for every tract, but for urban tracts this value will
    be the percentile compared to other urban tracts, and for rural tracts
    this value will be the percentile compared to other rural tracts.

    Specific methodology:
        1. Decide a methodology for confirming whether a tract counts as
        urban or rural. Currently in the codebase, we use Geocorr to identify
        the % rural of a tract, and mark the tract as rural if the percentage
        is >50% and urban otherwise. This may or may not be the right
        methodology.
        2. Once tracts are marked as urban or rural, create one percentile
        rank that only ranks urban tracts, and one percentile rank that only
        ranks rural tracts.
        3. Combine into a single field.

    `output_column_name_root` is different from `input_column_name` to enable
    the reverse percentile use case. In that use case, `input_column_name`
    may be something like "3rd grade reading proficiency" and
    `output_column_name_root` may be something like "Low 3rd grade reading
    proficiency".

    Args:
        df: Frame containing `input_column_name` and
            `field_names.URBAN_HEURISTIC_FIELD`. It is modified in place.
        input_column_name: Column whose values are ranked.
        output_column_name_root: Prefix used for the percentile columns
            that are written.
        ascending: Passed through to `Series.rank`; use False for the
            reverse percentile use case.

    Returns:
        The same `df` object, with the two percentile columns described
        above plus the intermediate per-category columns
        ("<input> (value urban only)", "<input> (value rural only)",
        "<root> (percentile urban only)", "<root> (percentile rural only)").
    """
    # Create the "basic" (whole-frame) percentile.
    df[
        f"{output_column_name_root}{field_names.PERCENTILE_FIELD_SUFFIX}"
    ] = df[input_column_name].rank(pct=True, ascending=ascending)

    # Create the urban/rural percentiles.
    urban_rural_percentile_fields_to_combine = []
    for urban_or_rural_string, urban_heuristic_bool in (
        ("urban", True),
        ("rural", False),
    ):
        # Create a field holding the input values for this category only.
        this_category_only_value_field = (
            f"{input_column_name} (value {urban_or_rural_string} only)"
        )
        is_in_this_category = (
            df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool
        )
        # `Series.where` blanks out rows outside the category with NaN and
        # keeps a numeric dtype. Filling with `None` through `np.where`
        # would yield an object-dtype column and force `rank` to operate on
        # Python objects.
        df[this_category_only_value_field] = df[input_column_name].where(
            is_in_this_category
        )

        # Calculate the percentile within this category only. Rows outside
        # the category are NaN and therefore receive a NaN rank.
        this_category_only_percentile_field = (
            f"{output_column_name_root} "
            f"(percentile {urban_or_rural_string} only)"
        )
        df[this_category_only_percentile_field] = df[
            this_category_only_value_field
        ].rank(pct=True, ascending=ascending)

        # Remember the field name so both categories can be combined below.
        urban_rural_percentile_fields_to_combine.append(
            this_category_only_percentile_field
        )

    # Combine both urban and rural into one field. A row has a value in at
    # most one of the two columns, so a NaN-skipping mean simply selects the
    # value that is present (and stays NaN when neither is).
    df[
        f"{output_column_name_root}"
        f"{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
    ] = df[urban_rural_percentile_fields_to_combine].mean(axis=1, skipna=True)

    return df
|
||||||
|
|
||||||
# TODO Move a lot of this to the ETL part of the pipeline
|
# TODO Move a lot of this to the ETL part of the pipeline
|
||||||
def _prepare_initial_df(self) -> pd.DataFrame:
|
def _prepare_initial_df(self) -> pd.DataFrame:
|
||||||
logger.info("Preparing initial dataframe")
|
logger.info("Preparing initial dataframe")
|
||||||
|
@ -393,11 +482,14 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
|
df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
|
||||||
|
|
||||||
# Convert all columns to numeric and do math
|
# Convert all columns to numeric and do math
|
||||||
for col in numeric_columns:
|
for numeric_column in numeric_columns:
|
||||||
# Calculate percentiles
|
df_copy = self._add_percentiles_to_df(
|
||||||
df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
|
df=df_copy,
|
||||||
col
|
input_column_name=numeric_column,
|
||||||
].rank(pct=True)
|
# For this use case, the input name and output name root are the same.
|
||||||
|
output_column_name_root=numeric_column,
|
||||||
|
ascending=True,
|
||||||
|
)
|
||||||
|
|
||||||
# Min-max normalization:
|
# Min-max normalization:
|
||||||
# (
|
# (
|
||||||
|
@ -409,16 +501,16 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
# Maximum of all values
|
# Maximum of all values
|
||||||
# - minimum of all values
|
# - minimum of all values
|
||||||
# )
|
# )
|
||||||
min_value = df_copy[col].min(skipna=True)
|
min_value = df_copy[numeric_column].min(skipna=True)
|
||||||
|
|
||||||
max_value = df_copy[col].max(skipna=True)
|
max_value = df_copy[numeric_column].max(skipna=True)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"For data set {col}, the min value is {min_value} and the max value is {max_value}."
|
f"For data set {numeric_column}, the min value is {min_value} and the max value is {max_value}."
|
||||||
)
|
)
|
||||||
|
|
||||||
df_copy[f"{col}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
|
df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
|
||||||
df_copy[col] - min_value
|
df_copy[numeric_column] - min_value
|
||||||
) / (max_value - min_value)
|
) / (max_value - min_value)
|
||||||
|
|
||||||
# Create reversed percentiles for these fields
|
# Create reversed percentiles for these fields
|
||||||
|
@ -427,11 +519,11 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
# For instance, for 3rd grade reading level (score from 0-500),
|
# For instance, for 3rd grade reading level (score from 0-500),
|
||||||
# calculate reversed percentiles and give the result the name
|
# calculate reversed percentiles and give the result the name
|
||||||
# `Low 3rd grade reading level (percentile)`.
|
# `Low 3rd grade reading level (percentile)`.
|
||||||
df_copy[
|
df_copy = self._add_percentiles_to_df(
|
||||||
f"{reverse_percentile.low_field_name}"
|
df=df_copy,
|
||||||
f"{field_names.PERCENTILE_FIELD_SUFFIX}"
|
input_column_name=reverse_percentile.field_name,
|
||||||
] = df_copy[reverse_percentile.field_name].rank(
|
output_column_name_root=reverse_percentile.low_field_name,
|
||||||
pct=True, ascending=False
|
ascending=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Special logic: create a combined population field.
|
# Special logic: create a combined population field.
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# Suffixes
|
# Suffixes
|
||||||
PERCENTILE_FIELD_SUFFIX = " (percentile)"
|
PERCENTILE_FIELD_SUFFIX = " (percentile)"
|
||||||
|
PERCENTILE_URBAN_RURAL_FIELD_SUFFIX = " (percentile urban/rural)"
|
||||||
MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
|
MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
|
||||||
TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"
|
TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue