Update Census AMI to ETL into tracts, not CBGs (#900)

* Update Census AMI to ETL into tracts, not CBGs

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Shelby Switzer 2021-11-18 11:05:32 -05:00 committed by lucasmbrown-usds
parent 537844236a
commit 617f41526f
2 changed files with 81 additions and 27 deletions

View file

@ -225,7 +225,6 @@ class ScoreETL(ExtractTransformLoad):
# Join all the data sources that use census block groups
census_block_group_dfs = [
self.ejscreen_df,
self.census_acs_median_incomes_df,
]
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
@ -241,6 +240,7 @@ class ScoreETL(ExtractTransformLoad):
self.persistent_poverty_df,
self.housing_and_transportation_df,
self.national_risk_index_df,
self.census_acs_median_incomes_df,
]
census_tract_df = self._join_tract_dfs(census_tract_dfs)

View file

@ -1,5 +1,6 @@
import json
from pathlib import Path
import numpy as np
import pandas as pd
import requests
@ -29,6 +30,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
self.MSA_ID_FIELD_NAME: str = "MSA ID"
self.MSA_TYPE_FIELD_NAME: str = "MSA Type"
self.POPULATION_FIELD_NAME: str = "pop10"
self.TEMPORARY_SORT_FIELD: str = "temporary sort field"
# Set constants for MSA median incomes
self.MSA_MEDIAN_INCOME_URL: str = (
@ -46,7 +49,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
self.COLUMNS_TO_KEEP = [
self.GEOID_FIELD_NAME,
self.GEOID_TRACT_FIELD_NAME,
self.PLACE_FIELD_NAME,
self.COUNTY_FIELD_NAME,
self.STATE_ABBREVIATION_FIELD_NAME,
@ -76,15 +79,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
# Create the full GEOID out of the component parts.
geocorr_df[self.GEOID_FIELD_NAME] = (
geocorr_df["county"] + geocorr_df["tract"] + geocorr_df["bg"]
geocorr_df[self.GEOID_TRACT_FIELD_NAME] = (
geocorr_df["county"] + geocorr_df["tract"]
)
# QA the combined field:
tract_values = geocorr_df[self.GEOID_FIELD_NAME].str.len().unique()
if any(tract_values != [12]):
tract_values = (
geocorr_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()
)
if any(tract_values != [11]):
print(tract_values)
raise ValueError("Some of the census BG data has the wrong length.")
raise ValueError(
"Some of the census tract data has the wrong length."
)
# Rename some fields
geocorr_df.rename(
@ -101,18 +108,55 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
# Remove duplicated rows.
# Some rows appear twice: once for the population within a CBG that's also within a census place,
# and once for the population that's within a CBG that's *not* within a census place.
# Drop the row that's not within a census place.
# Some rows appear more than once: once for the population within a tract that's also within a census place,
# and once for the population that's within a tract that's *not* within a census place.
# Sort based on the following rule:
# For each tract, use the place name from the row with the highest population among that tract's rows that have a non-blank place name.
#
# Therefore if there are three place name entries for a tract, the tract
# will be labeled with the place name that has the highest population.
# E.g., for the following (real) data:
#
# | tract | Place Name | Population |
# |-------------|---------------------|------------|
# | 01001020802 | Pine Level CDP, AL | 2642 |
# | 01001020802 | Prattville city, AL | 2347 |
# | 01001020802 | | 5302 |
# |-------------|---------------------|------------|
#
# The largest share of this tract's population lives in an area that has no place name.
# Among the rows that do have a place name, the largest population is in `Pine Level CDP, AL`.
# Therefore the tract should be identified as `Pine Level CDP, AL`.
# Temporary sort field. This column is created purely as a convenience for sorting.
# For the example data above, it takes the following values:
# | tract | Place Name | Population | Temporary Sort Field |
# |-------------|---------------------|------------|------------|
# | 01001020802 | Pine Level CDP, AL | 2642 | 102642 |
# | 01001020802 | Prattville city, AL | 2347 | 102347 |
# | 01001020802 | | 5302 | 5302 |
# |-------------|---------------------|------------|------------|
#
geocorr_df[self.TEMPORARY_SORT_FIELD] = np.where(
geocorr_df[self.PLACE_FIELD_NAME].str.strip() != "",
# Give place names a major bonus in ranking.
100000 + geocorr_df[self.POPULATION_FIELD_NAME],
# Otherwise just use population.
geocorr_df[self.POPULATION_FIELD_NAME],
)
# Sort so that rows with a place name (and, among those, higher populations) come first:
geocorr_df.sort_values(
by=self.PLACE_FIELD_NAME, axis=0, ascending=True, inplace=True
# Sort by sort field descending, so the highest entry is first.
by=self.TEMPORARY_SORT_FIELD,
axis=0,
ascending=False,
inplace=True,
)
# Drop all the duplicated rows except for the first one (which, after the sort above, is the row with the highest sort-field value for that tract):
rows_to_drop = geocorr_df.duplicated(
keep="first", subset=[self.GEOID_FIELD_NAME]
keep="first", subset=[self.GEOID_TRACT_FIELD_NAME]
)
# Keep everything that's *not* a row to drop:
@ -121,11 +165,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# Sort by GEOID again to put the dataframe back to original order:
# Note: avoiding using inplace because of unusual `SettingWithCopyWarning` warning.
geocorr_df = geocorr_df.sort_values(
by=self.GEOID_FIELD_NAME, axis=0, ascending=True, inplace=False
by=self.GEOID_TRACT_FIELD_NAME,
axis=0,
ascending=True,
inplace=False,
)
if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
raise ValueError("Too many CBGs.")
if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
raise ValueError("Too many tracts.")
return geocorr_df
@ -175,12 +222,22 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run:
# https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g1_=bg&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
logger.info("Starting download of Geocorr information.")
# https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
#
# That query was constructed from the website https://mcdc.missouri.edu/applications/geocorr2014.html,
# with the "source geographies" selected being:
# - State
# - County
# - Place (City, Town, Village, CDP, etc)
# - Census Tract
# and with the "target geographies" selected being:
# - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro)
logger.info("Starting download of 1.5MB Geocorr information.")
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states.csv.zip",
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.TMP_PATH,
unzipped_file_path=self.TMP_PATH / "geocorr",
)
@ -188,7 +245,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.TMP_PATH
/ "geocorr"
/ "geocorr2014_all_states.csv",
/ "geocorr2014_all_states_tracts_only.csv",
# Skip second row, which has descriptions.
skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers.
@ -220,14 +277,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
msa_median_incomes_df = self._transform_msa_median_incomes()
state_median_incomes_df = self._transform_state_median_incomes()
# Join CBGs on MSA incomes
# Join tracts on MSA incomes
merged_df = geocorr_df.merge(
msa_median_incomes_df, on=self.MSA_ID_FIELD_NAME, how="left"
)
# Merge state income with CBGs
# Merge state income with tracts
merged_df[self.STATE_GEOID_FIELD_NAME] = (
merged_df[self.GEOID_FIELD_NAME].astype(str).str[0:2]
merged_df[self.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]
)
merged_with_state_income_df = merged_df.merge(
@ -236,11 +293,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
on=self.STATE_GEOID_FIELD_NAME,
)
if (
len(merged_with_state_income_df)
> self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS
):
raise ValueError("Too many CBGs in join.")
if len(merged_with_state_income_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
raise ValueError("Too many tracts in join.")
# Choose reference income: MSA if MSA type is Metro, otherwise use State.
merged_with_state_income_df[self.AMI_REFERENCE_FIELD_NAME] = [