Issue 954: Add various data sources from Child Opportunity Index (#986)

* Adds four fields:
    * Summer days above 90F
    * Percent low access to healthy food
    * Percent impenetrable surface areas
    * Low third grade reading proficiency

* Each of these four gets added into Definition L in various factors.

* Additionally, I add college attendance fields to the ETL for Census ACS.

* This PR also introduces the notion of "reverse percentiles", relevant to ticket #970.
This commit is contained in:
Lucas Merrill Brown 2021-12-07 11:33:49 -05:00 committed by GitHub
commit 5a6d6d8557
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 357 additions and 40 deletions

View file

@ -114,6 +114,27 @@ class CensusACSETL(ExtractTransformLoad):
)
self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree"
# College attendance fields
self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED = (
"B14004_001E" # Estimate!!Total
)
self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PUBLIC = "B14004_003E" # Estimate!!Total!!Male!!Enrolled in public college or graduate school
self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PRIVATE = "B14004_008E" # Estimate!!Total!!Male!!Enrolled in private college or graduate school
self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PUBLIC = "B14004_019E" # Estimate!!Total!!Female!!Enrolled in public college or graduate school
self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE = "B14004_024E" # Estimate!!Total!!Female!!Enrolled in private college or graduate school
self.COLLEGE_ATTENDANCE_FIELDS = [
self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED,
self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PUBLIC,
self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PRIVATE,
self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PUBLIC,
self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE,
]
self.COLLEGE_ATTENDANCE_FIELD = (
"Percent enrollment in college or graduate school"
)
self.RE_FIELDS = [
"B02001_001E",
"B02001_002E",
@ -156,15 +177,30 @@ class CensusACSETL(ExtractTransformLoad):
self.STATE_GEOID_FIELD_NAME = "GEOID2"
self.COLUMNS_TO_KEEP = (
[
self.GEOID_TRACT_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
self.MEDIAN_INCOME_FIELD_NAME,
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.HIGH_SCHOOL_ED_FIELD,
self.COLLEGE_ATTENDANCE_FIELD,
]
+ self.RE_OUTPUT_FIELDS
+ [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
)
self.df: pd.DataFrame
def extract(self) -> None:
# Define the variables to retrieve
variables = (
[
# Income field
self.MEDIAN_INCOME_FIELD,
# House value
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
@ -172,6 +208,7 @@ class CensusACSETL(ExtractTransformLoad):
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
)
self.df = retrieve_census_acs_data(
@ -308,6 +345,14 @@ class CensusACSETL(ExtractTransformLoad):
df["B03003_003E"] / df["B03003_001E"]
)
# Calculate college attendance:
df[self.COLLEGE_ATTENDANCE_FIELD] = (
df[self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PUBLIC]
+ df[self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PRIVATE]
+ df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PUBLIC]
+ df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE]
) / df[self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED]
# Save results to self.
self.df = df
@ -317,23 +362,7 @@ class CensusACSETL(ExtractTransformLoad):
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = (
[
self.GEOID_TRACT_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
self.MEDIAN_INCOME_FIELD_NAME,
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.HIGH_SCHOOL_ED_FIELD,
]
+ self.RE_OUTPUT_FIELDS
+ [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
)
self.df[columns_to_include].to_csv(
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)

View file

@ -0,0 +1,120 @@
from pathlib import Path
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger, unzip_file_from_url
logger = get_module_logger(__name__)
class ChildOpportunityIndex(ExtractTransformLoad):
"""ETL Child Opportunity Index data.
COI compiles a number of useful data sets. In the future, we could pull these
data sets in directly from their original creators.
Data dictionary available when you download zip from `self.COI_FILE_URL`.
Data source overview: https://data.diversitydatakids.org/dataset/coi20-child-opportunity-index-2-0-database.
Full technical documents: https://www.diversitydatakids.org/sites/default/files/2020-02/ddk_coi2.0_technical_documentation_20200212.pdf.
Github repo: https://github.com/diversitydatakids/COI/
"""
def __init__(self):
self.COI_FILE_URL = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv"
)
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "child_opportunity_index"
)
self.TRACT_INPUT_COLUMN_NAME = "geoid"
self.EXTREME_HEAT_INPUT_FIELD = "HE_HEAT"
self.HEALTHY_FOOD_INPUT_FIELD = "HE_FOOD"
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING"
# Constants for output
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
field_names.EXTREME_HEAT_FIELD,
field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD,
field_names.READING_FIELD,
]
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
logger.info("Starting 51MB data download.")
unzip_file_from_url(
file_url=self.COI_FILE_URL,
download_path=self.TMP_PATH,
unzipped_file_path=self.TMP_PATH / "child_opportunity_index",
)
self.raw_df = pd.read_csv(
filepath_or_buffer=self.TMP_PATH
/ "child_opportunity_index"
/ "raw.csv",
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
dtype={
self.TRACT_INPUT_COLUMN_NAME: "string",
},
low_memory=False,
)
def transform(self) -> None:
logger.info("Starting transforms.")
output_df = self.raw_df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
self.EXTREME_HEAT_INPUT_FIELD: field_names.EXTREME_HEAT_FIELD,
self.HEALTHY_FOOD_INPUT_FIELD: field_names.HEALTHY_FOOD_FIELD,
self.IMPENETRABLE_SURFACES_INPUT_FIELD: field_names.IMPENETRABLE_SURFACES_FIELD,
self.READING_INPUT_FIELD: field_names.READING_FIELD,
}
)
# Sanity check the tract field.
if len(output_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()) != 1:
raise ValueError("Wrong tract length.")
# COI has two rows per tract: one for 2010 and one for 2015.
output_df = output_df[output_df["year"] == 2015]
# Convert percents from 0-100 to 0-1 to standardize with our other fields.
percent_fields_to_convert = [
field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD,
]
for percent_field_to_convert in percent_fields_to_convert:
output_df[percent_field_to_convert] = (
output_df[percent_field_to_convert] / 100
)
self.output_df = output_df
def validate(self) -> None:
logger.info("Validating data.")
pass
def load(self) -> None:
logger.info("Saving CSV")
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.output_df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)