mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 09:41:26 -08:00
Update Census AMI to ETL into tracts, not CBGs (#900)
* Update Census AMI to ETL into tracts, not CBGs Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov> Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
parent
537844236a
commit
617f41526f
2 changed files with 81 additions and 27 deletions
|
@ -225,7 +225,6 @@ class ScoreETL(ExtractTransformLoad):
|
|||
# Join all the data sources that use census block groups
|
||||
census_block_group_dfs = [
|
||||
self.ejscreen_df,
|
||||
self.census_acs_median_incomes_df,
|
||||
]
|
||||
|
||||
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
|
||||
|
@ -241,6 +240,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.persistent_poverty_df,
|
||||
self.housing_and_transportation_df,
|
||||
self.national_risk_index_df,
|
||||
self.census_acs_median_incomes_df,
|
||||
]
|
||||
census_tract_df = self._join_tract_dfs(census_tract_dfs)
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
||||
|
@ -29,6 +30,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
)
|
||||
self.MSA_ID_FIELD_NAME: str = "MSA ID"
|
||||
self.MSA_TYPE_FIELD_NAME: str = "MSA Type"
|
||||
self.POPULATION_FIELD_NAME: str = "pop10"
|
||||
self.TEMPORARY_SORT_FIELD: str = "temporary sort field"
|
||||
|
||||
# Set constants for MSA median incomes
|
||||
self.MSA_MEDIAN_INCOME_URL: str = (
|
||||
|
@ -46,7 +49,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
|
||||
self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
|
||||
self.COLUMNS_TO_KEEP = [
|
||||
self.GEOID_FIELD_NAME,
|
||||
self.GEOID_TRACT_FIELD_NAME,
|
||||
self.PLACE_FIELD_NAME,
|
||||
self.COUNTY_FIELD_NAME,
|
||||
self.STATE_ABBREVIATION_FIELD_NAME,
|
||||
|
@ -76,15 +79,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
)
|
||||
|
||||
# Create the full GEOID out of the component parts.
|
||||
geocorr_df[self.GEOID_FIELD_NAME] = (
|
||||
geocorr_df["county"] + geocorr_df["tract"] + geocorr_df["bg"]
|
||||
geocorr_df[self.GEOID_TRACT_FIELD_NAME] = (
|
||||
geocorr_df["county"] + geocorr_df["tract"]
|
||||
)
|
||||
|
||||
# QA the combined field:
|
||||
tract_values = geocorr_df[self.GEOID_FIELD_NAME].str.len().unique()
|
||||
if any(tract_values != [12]):
|
||||
tract_values = (
|
||||
geocorr_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()
|
||||
)
|
||||
if any(tract_values != [11]):
|
||||
print(tract_values)
|
||||
raise ValueError("Some of the census BG data has the wrong length.")
|
||||
raise ValueError(
|
||||
"Some of the census tract data has the wrong length."
|
||||
)
|
||||
|
||||
# Rename some fields
|
||||
geocorr_df.rename(
|
||||
|
@ -101,18 +108,55 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
)
|
||||
|
||||
# Remove duplicated rows.
|
||||
# Some rows appear twice: once for the population within a CBG that's also within a census place,
|
||||
# and once for the population that's within a CBG that's *not* within a census place.
|
||||
# Drop the row that's not within a census place.
|
||||
# Some rows appear more than once: once for the population within a tract that's also within a census place,
|
||||
# and once for the population that's within a tract that's *not* within a census place.
|
||||
# Sort based on the following rule:
|
||||
# Label each tract with the place name from the row that has the highest population among that tract's rows with a non-blank place name.
|
||||
#
|
||||
# Therefore if there are three place name entries for a tract, the tract
|
||||
# will be labeled with the place name that has the highest population.
|
||||
# E.g., for the following (real) data:
|
||||
#
|
||||
# | tract | Place Name | Population |
|
||||
# |-------------|---------------------|------------|
|
||||
# | 01001020802 | Pine Level CDP, AL | 2642 |
|
||||
# | 01001020802 | Prattville city, AL | 2347 |
|
||||
# | 01001020802 | | 5302 |
|
||||
# |-------------|---------------------|------------|
|
||||
#
|
||||
# The largest share of this tract's population is in the row with a blank place name (i.e., not within any census place).
|
||||
# Among the rows that do have a place name, the largest population is in `Pine Level CDP, AL`.
|
||||
# Therefore the tract should be identified as `Pine Level CDP, AL`.
|
||||
|
||||
# Sort field. This column is created purely as a convenience for sorting purposes.
|
||||
# For the example data above, the sort field takes the following values:
|
||||
# | tract | Place Name | Population | Temporary Sort Field |
|
||||
# |-------------|---------------------|------------|------------|
|
||||
# | 01001020802 | Pine Level CDP, AL | 2642 | 102642 |
|
||||
# | 01001020802 | Prattville city, AL | 2347 | 102347 |
|
||||
# | 01001020802 | | 5302 | 5302 |
|
||||
# |-------------|---------------------|------------|------------|
|
||||
#
|
||||
geocorr_df[self.TEMPORARY_SORT_FIELD] = np.where(
|
||||
geocorr_df[self.PLACE_FIELD_NAME].str.strip() != "",
|
||||
# Give place names a major bonus in ranking.
|
||||
100000 + geocorr_df[self.POPULATION_FIELD_NAME],
|
||||
# Otherwise just use population.
|
||||
geocorr_df[self.POPULATION_FIELD_NAME],
|
||||
)
|
||||
|
||||
# Sort by whether the place has a place name:
|
||||
geocorr_df.sort_values(
|
||||
by=self.PLACE_FIELD_NAME, axis=0, ascending=True, inplace=True
|
||||
# Sort by sort field descending, so the highest entry is first.
|
||||
by=self.TEMPORARY_SORT_FIELD,
|
||||
axis=0,
|
||||
ascending=False,
|
||||
inplace=True,
|
||||
)
|
||||
|
||||
# Drop all the duplicated rows except for the first one (which will have the place name):
|
||||
rows_to_drop = geocorr_df.duplicated(
|
||||
keep="first", subset=[self.GEOID_FIELD_NAME]
|
||||
keep="first", subset=[self.GEOID_TRACT_FIELD_NAME]
|
||||
)
|
||||
|
||||
# Keep everything that's *not* a row to drop:
|
||||
|
@ -121,11 +165,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
# Sort by GEOID again to put the dataframe back to original order:
|
||||
# Note: avoiding using inplace because of unusual `SettingWithCopyWarning` warning.
|
||||
geocorr_df = geocorr_df.sort_values(
|
||||
by=self.GEOID_FIELD_NAME, axis=0, ascending=True, inplace=False
|
||||
by=self.GEOID_TRACT_FIELD_NAME,
|
||||
axis=0,
|
||||
ascending=True,
|
||||
inplace=False,
|
||||
)
|
||||
|
||||
if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
|
||||
raise ValueError("Too many CBGs.")
|
||||
if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
|
||||
raise ValueError("Too many tracts.")
|
||||
|
||||
return geocorr_df
|
||||
|
||||
|
@ -175,12 +222,22 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
# Load and clean GEOCORR data
|
||||
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
|
||||
# The specific query used is the following, which takes a couple of minutes to run:
|
||||
# https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g1_=bg&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
|
||||
logger.info("Starting download of Geocorr information.")
|
||||
# https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
|
||||
#
|
||||
# That query was constructed from the website https://mcdc.missouri.edu/applications/geocorr2014.html,
|
||||
# with the "source geographies" selected being:
|
||||
# - State
|
||||
# - County
|
||||
# - Place (City, Town, Village, CDP, etc)
|
||||
# - Census Tract
|
||||
# and with the "target geographies" selected being:
|
||||
# - Core based statistical area (CBSA)
|
||||
# - CBSA Type (Metro or Micro)
|
||||
logger.info("Starting download of 1.5MB Geocorr information.")
|
||||
|
||||
unzip_file_from_url(
|
||||
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
|
||||
+ "/geocorr2014_all_states.csv.zip",
|
||||
+ "/geocorr2014_all_states_tracts_only.csv.zip",
|
||||
download_path=self.TMP_PATH,
|
||||
unzipped_file_path=self.TMP_PATH / "geocorr",
|
||||
)
|
||||
|
@ -188,7 +245,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
self.raw_geocorr_df = pd.read_csv(
|
||||
filepath_or_buffer=self.TMP_PATH
|
||||
/ "geocorr"
|
||||
/ "geocorr2014_all_states.csv",
|
||||
/ "geocorr2014_all_states_tracts_only.csv",
|
||||
# Skip second row, which has descriptions.
|
||||
skiprows=[1],
|
||||
# The following need to remain as strings for all of their digits, not get converted to numbers.
|
||||
|
@ -220,14 +277,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
msa_median_incomes_df = self._transform_msa_median_incomes()
|
||||
state_median_incomes_df = self._transform_state_median_incomes()
|
||||
|
||||
# Join CBGs on MSA incomes
|
||||
# Join tracts on MSA incomes
|
||||
merged_df = geocorr_df.merge(
|
||||
msa_median_incomes_df, on=self.MSA_ID_FIELD_NAME, how="left"
|
||||
)
|
||||
|
||||
# Merge state income with CBGs
|
||||
# Merge state income with tracts
|
||||
merged_df[self.STATE_GEOID_FIELD_NAME] = (
|
||||
merged_df[self.GEOID_FIELD_NAME].astype(str).str[0:2]
|
||||
merged_df[self.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]
|
||||
)
|
||||
|
||||
merged_with_state_income_df = merged_df.merge(
|
||||
|
@ -236,11 +293,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
on=self.STATE_GEOID_FIELD_NAME,
|
||||
)
|
||||
|
||||
if (
|
||||
len(merged_with_state_income_df)
|
||||
> self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS
|
||||
):
|
||||
raise ValueError("Too many CBGs in join.")
|
||||
if len(merged_with_state_income_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
|
||||
raise ValueError("Too many tracts in join.")
|
||||
|
||||
# Choose reference income: MSA if MSA type is Metro, otherwise use State.
|
||||
merged_with_state_income_df[self.AMI_REFERENCE_FIELD_NAME] = [
|
||||
|
|
Loading…
Add table
Reference in a new issue