Update Census AMI to ETL into tracts, not CBGs (#900)

* Update Census AMI to ETL into tracts, not CBGs

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Shelby Switzer 2021-11-18 11:05:32 -05:00 committed by lucasmbrown-usds
parent 537844236a
commit 617f41526f
2 changed files with 81 additions and 27 deletions


@@ -225,7 +225,6 @@ class ScoreETL(ExtractTransformLoad):
         # Join all the data sources that use census block groups
         census_block_group_dfs = [
             self.ejscreen_df,
-            self.census_acs_median_incomes_df,
         ]

         census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
@@ -241,6 +240,7 @@ class ScoreETL(ExtractTransformLoad):
             self.persistent_poverty_df,
             self.housing_and_transportation_df,
             self.national_risk_index_df,
+            self.census_acs_median_incomes_df,
         ]

         census_tract_df = self._join_tract_dfs(census_tract_dfs)
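With this change, the ACS median income data flows through the tract-level join rather than the block-group join. The body of `_join_tract_dfs` is not part of this diff; the snippet below is only a hypothetical sketch of an outer join of tract-level dataframes on a shared tract GEOID column (the column name here is illustrative, not the repo's).

import functools

import pandas as pd

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"  # hypothetical column name


def join_tract_dfs(census_tract_dfs: list) -> pd.DataFrame:
    """Outer-join a list of tract-level dataframes on the tract GEOID."""
    return functools.reduce(
        lambda left, right: left.merge(
            right, on=GEOID_TRACT_FIELD_NAME, how="outer"
        ),
        census_tract_dfs,
    )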


@@ -1,5 +1,6 @@
 import json
 from pathlib import Path

+import numpy as np
 import pandas as pd
 import requests
@@ -29,6 +30,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         )
         self.MSA_ID_FIELD_NAME: str = "MSA ID"
         self.MSA_TYPE_FIELD_NAME: str = "MSA Type"
+        self.POPULATION_FIELD_NAME: str = "pop10"
+        self.TEMPORARY_SORT_FIELD: str = "temporary sort field"

         # Set constants for MSA median incomes
         self.MSA_MEDIAN_INCOME_URL: str = (
@@ -46,7 +49,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
         self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
         self.COLUMNS_TO_KEEP = [
-            self.GEOID_FIELD_NAME,
+            self.GEOID_TRACT_FIELD_NAME,
             self.PLACE_FIELD_NAME,
             self.COUNTY_FIELD_NAME,
             self.STATE_ABBREVIATION_FIELD_NAME,
@@ -76,15 +79,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         )

         # Create the full GEOID out of the component parts.
-        geocorr_df[self.GEOID_FIELD_NAME] = (
-            geocorr_df["county"] + geocorr_df["tract"] + geocorr_df["bg"]
+        geocorr_df[self.GEOID_TRACT_FIELD_NAME] = (
+            geocorr_df["county"] + geocorr_df["tract"]
         )

         # QA the combined field:
-        tract_values = geocorr_df[self.GEOID_FIELD_NAME].str.len().unique()
-        if any(tract_values != [12]):
+        tract_values = (
+            geocorr_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()
+        )
+        if any(tract_values != [11]):
             print(tract_values)
-            raise ValueError("Some of the census BG data has the wrong length.")
+            raise ValueError(
+                "Some of the census tract data has the wrong length."
+            )

         # Rename some fields
         geocorr_df.rename(
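The new QA check relies on a full census tract GEOID being 11 characters (2-digit state + 3-digit county + 6-digit tract), where the geocorr `county` column already carries the state prefix; the old block-group GEOIDs were 12 characters, hence the previous check. A minimal standalone sketch of the same construction and check, with a hypothetical output column name and toy data:

import pandas as pd

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"  # hypothetical column name

# Toy frame: 5-character state+county FIPS plus 6-character tract code.
df = pd.DataFrame({"county": ["01001", "01001"], "tract": ["020801", "020802"]})

# Concatenate the components into the 11-character tract GEOID.
df[GEOID_TRACT_FIELD_NAME] = df["county"] + df["tract"]

# QA: every combined GEOID must be exactly 11 characters long.
lengths = df[GEOID_TRACT_FIELD_NAME].str.len().unique()
if any(lengths != [11]):
    raise ValueError("Some of the census tract data has the wrong length.")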
@@ -101,18 +108,55 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         )

         # Remove duplicated rows.
-        # Some rows appear twice: once for the population within a CBG that's also within a census place,
-        # and once for the population that's within a CBG that's *not* within a census place.
-        # Drop the row that's not within a census place.
+        # Some rows appear more than once: once for the population within a tract that's also within a census place,
+        # and once for the population that's within a tract that's *not* within a census place.
+        # Sort based on the following rule:
+        # assign each tract the place name from the row with the highest population among rows with a non-blank place name.
+        #
+        # Therefore if there are three place name entries for a tract, the tract
+        # will be labeled with the place name that has the highest population.
+        # E.g., for the following (real) data:
+        #
+        # | tract       | Place Name          | Population |
+        # |-------------|---------------------|------------|
+        # | 01001020802 | Pine Level CDP, AL  | 2642       |
+        # | 01001020802 | Prattville city, AL | 2347       |
+        # | 01001020802 |                     | 5302       |
+        # |-------------|---------------------|------------|
+        #
+        # The largest share of the population in this tract lives in a place that has no name.
+        # The largest share of the population in a *named* place is in `Pine Level CDP, AL`.
+        # Therefore the tract should be identified as `Pine Level CDP, AL`.
+
+        # Temporary sort field. This is created purely as a convenience for sorting purposes.
+        # This field is as follows:
+        # | tract       | Place Name          | Population | Temporary Sort Field |
+        # |-------------|---------------------|------------|----------------------|
+        # | 01001020802 | Pine Level CDP, AL  | 2642       | 102642               |
+        # | 01001020802 | Prattville city, AL | 2347       | 102347               |
+        # | 01001020802 |                     | 5302       | 5302                 |
+        # |-------------|---------------------|------------|----------------------|
+        #
+        geocorr_df[self.TEMPORARY_SORT_FIELD] = np.where(
+            geocorr_df[self.PLACE_FIELD_NAME].str.strip() != "",
+            # Give place names a major bonus in ranking.
+            100000 + geocorr_df[self.POPULATION_FIELD_NAME],
+            # Otherwise just use population.
+            geocorr_df[self.POPULATION_FIELD_NAME],
+        )

         # Sort by whether the place has a place name:
         geocorr_df.sort_values(
-            by=self.PLACE_FIELD_NAME, axis=0, ascending=True, inplace=True
+            # Sort by the sort field descending, so the highest entry is first.
+            by=self.TEMPORARY_SORT_FIELD,
+            axis=0,
+            ascending=False,
+            inplace=True,
         )

         # Drop all the duplicated rows except for the first one (which will have the place name):
         rows_to_drop = geocorr_df.duplicated(
-            keep="first", subset=[self.GEOID_FIELD_NAME]
+            keep="first", subset=[self.GEOID_TRACT_FIELD_NAME]
         )

         # Keep everything that's *not* a row to drop:
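The rule described in those comments can be exercised on its own. Below is a small self-contained sketch of the same technique, using hypothetical column names and `drop_duplicates` as a shorthand for the `duplicated` mask used in the diff:

import numpy as np
import pandas as pd

# Toy data mirroring the commented example (column names are illustrative).
df = pd.DataFrame(
    {
        "tract": ["01001020802"] * 3,
        "place": ["Pine Level CDP, AL", "Prattville city, AL", ""],
        "pop10": [2642, 2347, 5302],
    }
)

# Rows with a non-blank place name get a large bonus, so any named place
# outranks every unnamed row; ties are then broken by population.
# (The 100,000 bonus assumes per-row populations stay below 100,000.)
df["temporary sort field"] = np.where(
    df["place"].str.strip() != "",
    100000 + df["pop10"],
    df["pop10"],
)

# Keep exactly one row per tract: the highest-ranked one.
deduped = (
    df.sort_values(by="temporary sort field", ascending=False)
    .drop_duplicates(subset=["tract"], keep="first")
    .drop(columns=["temporary sort field"])
)

# Tract 01001020802 ends up labeled "Pine Level CDP, AL".
print(deduped)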
@@ -121,11 +165,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         # Sort by GEOID again to put the dataframe back to original order:
         # Note: avoiding using inplace because of unusual `SettingWithCopyWarning` warning.
         geocorr_df = geocorr_df.sort_values(
-            by=self.GEOID_FIELD_NAME, axis=0, ascending=True, inplace=False
+            by=self.GEOID_TRACT_FIELD_NAME,
+            axis=0,
+            ascending=True,
+            inplace=False,
         )

-        if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
-            raise ValueError("Too many CBGs.")
+        if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
+            raise ValueError("Too many tracts.")

         return geocorr_df
@@ -175,12 +222,22 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         # Load and clean GEOCORR data
         # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
         # The specific query used is the following, which takes a couple of minutes to run:
-        # https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g1_=bg&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
-        logger.info("Starting download of Geocorr information.")
+        # https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
+        #
+        # That query was constructed from the website https://mcdc.missouri.edu/applications/geocorr2014.html,
+        # with the "source geographies" selected being:
+        # - State
+        # - County
+        # - Place (City, Town, Village, CDP, etc)
+        # - Census Tract
+        # and with the "target geographies" selected being:
+        # - Core based statistical area (CBSA)
+        # - CBSA Type (Metro or Micro)
+        logger.info("Starting download of 1.5MB Geocorr information.")

         unzip_file_from_url(
             file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/geocorr2014_all_states.csv.zip",
+            + "/geocorr2014_all_states_tracts_only.csv.zip",
             download_path=self.TMP_PATH,
             unzipped_file_path=self.TMP_PATH / "geocorr",
         )
@@ -188,7 +245,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.raw_geocorr_df = pd.read_csv(
             filepath_or_buffer=self.TMP_PATH
             / "geocorr"
-            / "geocorr2014_all_states.csv",
+            / "geocorr2014_all_states_tracts_only.csv",
             # Skip second row, which has descriptions.
             skiprows=[1],
             # The following need to remain as strings for all of their digits, not get converted to numbers.
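Reading the identifier columns as strings matters because FIPS-based codes carry leading zeros; parsed as numbers, `01001` silently becomes `1001` and the later GEOID concatenation breaks. A minimal sketch of the pattern with an inline toy CSV (the columns shown are illustrative, not the full geocorr schema):

import io

import pandas as pd

# A toy stand-in for the geocorr export: a header row, a descriptions row
# (skipped below), then data rows.
csv_text = (
    "county,tract,pop10\n"
    "State+county code,Census tract,Population\n"
    "01001,020802,5302\n"
)

df = pd.read_csv(
    io.StringIO(csv_text),
    # Skip the second row, which has descriptions rather than data.
    skiprows=[1],
    # Keep identifier columns as strings so leading zeros survive.
    dtype={"county": str, "tract": str},
)

assert df.loc[0, "county"] == "01001"  # leading zero preserved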
@@ -220,14 +277,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         msa_median_incomes_df = self._transform_msa_median_incomes()
         state_median_incomes_df = self._transform_state_median_incomes()

-        # Join CBGs on MSA incomes
+        # Join tracts on MSA incomes
         merged_df = geocorr_df.merge(
             msa_median_incomes_df, on=self.MSA_ID_FIELD_NAME, how="left"
         )

-        # Merge state income with CBGs
+        # Merge state income with tracts
         merged_df[self.STATE_GEOID_FIELD_NAME] = (
-            merged_df[self.GEOID_FIELD_NAME].astype(str).str[0:2]
+            merged_df[self.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]
         )

         merged_with_state_income_df = merged_df.merge(
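A compact sketch of those two joins with toy values: MSA median incomes are attached by MSA ID, then state median incomes by the 2-digit state FIPS code taken from the front of the tract GEOID. Column names and figures here are illustrative, and the state join is shown as a left merge, which the diff does not spell out:

import pandas as pd

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"  # hypothetical column name

tracts = pd.DataFrame(
    {GEOID_TRACT_FIELD_NAME: ["01001020802"], "MSA ID": ["99999"]}
)
msa_incomes = pd.DataFrame({"MSA ID": ["99999"], "MSA median income": [60000]})
state_incomes = pd.DataFrame(
    {"State GEOID": ["01"], "State median income": [50000]}
)

# Join tracts on MSA incomes.
merged = tracts.merge(msa_incomes, on="MSA ID", how="left")

# The state GEOID is the first two characters of the 11-character tract GEOID.
merged["State GEOID"] = merged[GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]

# Merge state income with tracts.
merged = merged.merge(state_incomes, on="State GEOID", how="left")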
@@ -236,11 +293,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             on=self.STATE_GEOID_FIELD_NAME,
         )

-        if (
-            len(merged_with_state_income_df)
-            > self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS
-        ):
-            raise ValueError("Too many CBGs in join.")
+        if len(merged_with_state_income_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
+            raise ValueError("Too many tracts in join.")

         # Choose reference income: MSA if MSA type is Metro, otherwise use State.
         merged_with_state_income_df[self.AMI_REFERENCE_FIELD_NAME] = [
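The comment above states the selection rule: use the MSA median income as the reference when the MSA type is Metro, otherwise fall back to the state median income. A minimal sketch of that rule with `np.where` and toy values, reusing the field names defined earlier (the diff's own implementation may use a different construct):

import numpy as np
import pandas as pd

# Toy frame; the income figures are illustrative.
df = pd.DataFrame(
    {
        "MSA Type": ["Metro", "Micro", None],
        "MSA median income": [60000.0, 45000.0, np.nan],
        "State median income": [50000.0, 50000.0, 50000.0],
    }
)

# Reference is the MSA when its type is Metro, otherwise the state.
df["AMI Reference"] = np.where(df["MSA Type"] == "Metro", "MSA", "State")
df["Area Median Income (State or metropolitan)"] = np.where(
    df["AMI Reference"] == "MSA",
    df["MSA median income"],
    df["State median income"],
)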