Issue 954: Add various data sources from Child Opportunity Index (#986)

* Adds four fields:
    * Summer days above 90F
    * Percent low access to healthy food
    * Percent impenetrable surface areas
    * Low third grade reading proficiency

* Each of these four gets added into Definition L in various factors.

* Additionally, I add college attendance fields to the ETL for Census ACS.

* This PR also introduces the notion of "reverse percentiles", relevant to ticket #970.
This commit is contained in:
Lucas Merrill Brown 2021-12-07 11:33:49 -05:00 committed by GitHub
commit 5a6d6d8557
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 357 additions and 40 deletions

View file

@ -1,4 +1,6 @@
import functools
from collections import namedtuple
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
@ -29,6 +31,7 @@ class ScoreETL(ExtractTransformLoad):
self.persistent_poverty_df: pd.DataFrame
self.census_decennial_df: pd.DataFrame
self.census_2010_df: pd.DataFrame
self.child_opportunity_index_df: pd.DataFrame
def extract(self) -> None:
logger.info("Loading data sets from disk.")
@ -162,6 +165,19 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load COI data
child_opportunity_index_csv = (
constants.DATA_PATH
/ "dataset"
/ "child_opportunity_index"
/ "usa.csv"
)
self.child_opportunity_index_df = pd.read_csv(
child_opportunity_index_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Tract dataframes")
@ -255,6 +271,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_acs_median_incomes_df,
self.census_decennial_df,
self.census_2010_df,
self.child_opportunity_index_df,
]
# Sanity check each data frame before merging.
@ -323,6 +340,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.MEDIAN_HOUSE_VALUE_FIELD,
field_names.COLLEGE_ATTENDANCE_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
@ -333,6 +351,9 @@ class ScoreETL(ExtractTransformLoad):
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
field_names.EXTREME_HEAT_FIELD,
field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD,
]
non_numeric_columns = [
@ -340,7 +361,32 @@ class ScoreETL(ExtractTransformLoad):
field_names.PERSISTENT_POVERTY_FIELD,
]
columns_to_keep = non_numeric_columns + numeric_columns
# For some columns, high values are "good", so we want to reverse the percentile
# so that high values are "bad" and any scoring logic can still check if it's
# >= some threshold.
# TODO: Add more fields here.
# https://github.com/usds/justice40-tool/issues/970
ReversePercentile = namedtuple(
typename="ReversePercentile",
field_names=["field_name", "low_field_name"],
)
reverse_percentiles = [
# This dictionary follows the format:
# <field name> : <field name for low values>
# for instance, 3rd grade reading level : Low 3rd grade reading level.
# This low field will not exist yet, it is only calculated for the
# percentile.
ReversePercentile(
field_name=field_names.READING_FIELD,
low_field_name=field_names.LOW_READING_FIELD,
)
]
columns_to_keep = (
non_numeric_columns
+ numeric_columns
+ [rp.field_name for rp in reverse_percentiles]
)
df_copy = df[columns_to_keep].copy()
@ -375,6 +421,19 @@ class ScoreETL(ExtractTransformLoad):
df_copy[col] - min_value
) / (max_value - min_value)
# Create reversed percentiles for these fields
for reverse_percentile in reverse_percentiles:
# Calculate reverse percentiles
# For instance, for 3rd grade reading level (score from 0-500),
# calculate reversed percentiles and give the result the name
# `Low 3rd grade reading level (percentile)`.
df_copy[
f"{reverse_percentile.low_field_name}"
f"{field_names.PERCENTILE_FIELD_SUFFIX}"
] = df_copy[reverse_percentile.field_name].rank(
pct=True, ascending=False
)
# Special logic: create a combined population field.
# We sometimes run analytics on "population", and this makes a single field
# that is either the island area's population in 2009 or the state's