mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-25 19:30:17 -07:00
Issue 954: Add various data sources from Child Opportunity Index (#986)
* Adds four fields:
  * Summer days above 90F
  * Percent low access to healthy food
  * Percent impenetrable surface areas
  * Low third grade reading proficiency
* Each of these four gets added into Definition L in various factors.
* Additionally, I add college attendance fields to the ETL for Census ACS.
* This PR also introduces the notion of "reverse percentiles", relevant to ticket #970.
This commit is contained in:
parent
df564658a5
commit
5a6d6d8557
8 changed files with 357 additions and 40 deletions
|
@ -1,4 +1,6 @@
|
|||
import functools
|
||||
from collections import namedtuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
|
@ -29,6 +31,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.persistent_poverty_df: pd.DataFrame
|
||||
self.census_decennial_df: pd.DataFrame
|
||||
self.census_2010_df: pd.DataFrame
|
||||
self.child_opportunity_index_df: pd.DataFrame
|
||||
|
||||
def extract(self) -> None:
|
||||
logger.info("Loading data sets from disk.")
|
||||
|
@ -162,6 +165,19 @@ class ScoreETL(ExtractTransformLoad):
|
|||
low_memory=False,
|
||||
)
|
||||
|
||||
# Load COI data
|
||||
child_opportunity_index_csv = (
|
||||
constants.DATA_PATH
|
||||
/ "dataset"
|
||||
/ "child_opportunity_index"
|
||||
/ "usa.csv"
|
||||
)
|
||||
self.child_opportunity_index_df = pd.read_csv(
|
||||
child_opportunity_index_csv,
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
)
|
||||
|
||||
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
|
||||
logger.info("Joining Census Tract dataframes")
|
||||
|
||||
|
@ -255,6 +271,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.census_acs_median_incomes_df,
|
||||
self.census_decennial_df,
|
||||
self.census_2010_df,
|
||||
self.child_opportunity_index_df,
|
||||
]
|
||||
|
||||
# Sanity check each data frame before merging.
|
||||
|
@ -323,6 +340,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||
field_names.UNEMPLOYMENT_FIELD,
|
||||
field_names.MEDIAN_HOUSE_VALUE_FIELD,
|
||||
field_names.COLLEGE_ATTENDANCE_FIELD,
|
||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
|
||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
|
||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
|
||||
|
@ -333,6 +351,9 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
|
||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
|
||||
field_names.EXTREME_HEAT_FIELD,
|
||||
field_names.HEALTHY_FOOD_FIELD,
|
||||
field_names.IMPENETRABLE_SURFACES_FIELD,
|
||||
]
|
||||
|
||||
non_numeric_columns = [
|
||||
|
@ -340,7 +361,32 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.PERSISTENT_POVERTY_FIELD,
|
||||
]
|
||||
|
||||
columns_to_keep = non_numeric_columns + numeric_columns
|
||||
# For some columns, high values are "good", so we want to reverse the percentile
|
||||
# so that high values are "bad" and any scoring logic can still check if it's
|
||||
# >= some threshold.
|
||||
# TODO: Add more fields here.
|
||||
# https://github.com/usds/justice40-tool/issues/970
|
||||
ReversePercentile = namedtuple(
|
||||
typename="ReversePercentile",
|
||||
field_names=["field_name", "low_field_name"],
|
||||
)
|
||||
reverse_percentiles = [
|
||||
# Each entry in this list follows the format:
|
||||
# ReversePercentile(field_name=<field name>, low_field_name=<field name for low values>)
|
||||
# for instance, 3rd grade reading level : Low 3rd grade reading level.
|
||||
# This low field will not exist yet; it is only calculated for the
|
||||
# percentile.
|
||||
ReversePercentile(
|
||||
field_name=field_names.READING_FIELD,
|
||||
low_field_name=field_names.LOW_READING_FIELD,
|
||||
)
|
||||
]
|
||||
|
||||
columns_to_keep = (
|
||||
non_numeric_columns
|
||||
+ numeric_columns
|
||||
+ [rp.field_name for rp in reverse_percentiles]
|
||||
)
|
||||
|
||||
df_copy = df[columns_to_keep].copy()
|
||||
|
||||
|
@ -375,6 +421,19 @@ class ScoreETL(ExtractTransformLoad):
|
|||
df_copy[col] - min_value
|
||||
) / (max_value - min_value)
|
||||
|
||||
# Create reversed percentiles for these fields
|
||||
for reverse_percentile in reverse_percentiles:
|
||||
# Calculate reverse percentiles
|
||||
# For instance, for 3rd grade reading level (score from 0-500),
|
||||
# calculate reversed percentiles and give the result the name
|
||||
# `Low 3rd grade reading level (percentile)`.
|
||||
df_copy[
|
||||
f"{reverse_percentile.low_field_name}"
|
||||
f"{field_names.PERCENTILE_FIELD_SUFFIX}"
|
||||
] = df_copy[reverse_percentile.field_name].rank(
|
||||
pct=True, ascending=False
|
||||
)
|
||||
|
||||
# Special logic: create a combined population field.
|
||||
# We sometimes run analytics on "population", and this makes a single field
|
||||
# that is either the island area's population in 2009 or the state's
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue