Update Census AMI to ETL into tracts, not CBGs (#900)

* Update Census AMI to ETL into tracts, not CBGs

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
2025-07-30 00:01:16 -07:00 · 2021-11-18 11:05:32 -05:00 · 2021-11-18 11:05:32 -05:00 · 617f41526f
commit 617f41526f
parent 537844236a
2 changed files with 81 additions and 27 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -225,7 +225,6 @@ class ScoreETL(ExtractTransformLoad):
        # Join all the data sources that use census block groups
        census_block_group_dfs = [
            self.ejscreen_df,
-            self.census_acs_median_incomes_df,
        ]

        census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
@@ -241,6 +240,7 @@ class ScoreETL(ExtractTransformLoad):
            self.persistent_poverty_df,
            self.housing_and_transportation_df,
            self.national_risk_index_df,
+            self.census_acs_median_incomes_df,
        ]
        census_tract_df = self._join_tract_dfs(census_tract_dfs)

--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py
@@ -1,5 +1,6 @@
 import json
 from pathlib import Path
+import numpy as np
 import pandas as pd
 import requests

@@ -29,6 +30,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        )
        self.MSA_ID_FIELD_NAME: str = "MSA ID"
        self.MSA_TYPE_FIELD_NAME: str = "MSA Type"
+        self.POPULATION_FIELD_NAME: str = "pop10"
+        self.TEMPORARY_SORT_FIELD: str = "temporary sort field"

        # Set constants for MSA median incomes
        self.MSA_MEDIAN_INCOME_URL: str = (
@@ -46,7 +49,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
        self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
        self.COLUMNS_TO_KEEP = [
-            self.GEOID_FIELD_NAME,
+            self.GEOID_TRACT_FIELD_NAME,
            self.PLACE_FIELD_NAME,
            self.COUNTY_FIELD_NAME,
            self.STATE_ABBREVIATION_FIELD_NAME,
@@ -76,15 +79,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        )

        # Create the full GEOID out of the component parts.
-        geocorr_df[self.GEOID_FIELD_NAME] = (
-            geocorr_df["county"] + geocorr_df["tract"] + geocorr_df["bg"]
+        geocorr_df[self.GEOID_TRACT_FIELD_NAME] = (
+            geocorr_df["county"] + geocorr_df["tract"]
        )

        # QA the combined field:
-        tract_values = geocorr_df[self.GEOID_FIELD_NAME].str.len().unique()
-        if any(tract_values != [12]):
+        tract_values = (
+            geocorr_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()
+        )
+        if any(tract_values != [11]):
            print(tract_values)
-            raise ValueError("Some of the census BG data has the wrong length.")
+            raise ValueError(
+                "Some of the census tract data has the wrong length."
+            )

        # Rename some fields
        geocorr_df.rename(
@@ -101,18 +108,55 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        )

        # Remove duplicated rows.
-        # Some rows appear twice: once for the population within a CBG that's also within a census place,
-        # and once for the population that's within a CBG that's *not* within a census place.
-        # Drop the row that's not within a census place.
+        # Some rows appear more than once: once for the population within a tract that's also within a census place,
+        # and once for the population that's within a tract that's *not* within a census place.
+        # Sort based on the following rule:
+        # Label the tract with the place name of the row that has the highest population among rows with a non-blank place name.
+        #
+        # Therefore if there are three place name entries for a tract, the tract
+        # will be labeled with the place name that has the highest population.
+        # E.g., for the following (real) data:
+        #
+        #     | tract       | Place Name          | Population |
+        #     |-------------|---------------------|------------|
+        #     | 01001020802 | Pine Level CDP, AL  | 2642       |
+        #     | 01001020802 | Prattville city, AL | 2347       |
+        #     | 01001020802 |                     | 5302       |
+        #     |-------------|---------------------|------------|
+        #
+        # The largest share of this tract's population (5302) lives outside of any named place.
+        # Among the rows that do have a place name, `Pine Level CDP, AL` has the largest population.
+        # Therefore the tract should be identified as `Pine Level CDP, AL`.
+
+        # Sort field. This is created purely as a convenience column for sorting purposes.
+        # This field is as follows:
+        #     | tract       | Place Name          | Population | Temporary Sort Field |
+        #     |-------------|---------------------|------------|----------------------|
+        #     | 01001020802 | Pine Level CDP, AL  | 2642       | 102642               |
+        #     | 01001020802 | Prattville city, AL | 2347       | 102347               |
+        #     | 01001020802 |                     | 5302       | 5302                 |
+        #     |-------------|---------------------|------------|----------------------|
+        #
+        geocorr_df[self.TEMPORARY_SORT_FIELD] = np.where(
+            geocorr_df[self.PLACE_FIELD_NAME].str.strip() != "",
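+            # Give rows with a place name a major bonus in ranking so they sort ahead of unnamed rows.
+            # NOTE(review): this relies on unnamed-row populations staying below 100000 — confirm.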
+            100000 + geocorr_df[self.POPULATION_FIELD_NAME],
+            # Otherwise just use population.
+            geocorr_df[self.POPULATION_FIELD_NAME],
+        )

        # Sort by whether the place has a place name:
        geocorr_df.sort_values(
-            by=self.PLACE_FIELD_NAME, axis=0, ascending=True, inplace=True
+            # Sort by sort field descending, so the highest entry is first.
+            by=self.TEMPORARY_SORT_FIELD,
+            axis=0,
+            ascending=False,
+            inplace=True,
        )

        # Drop all the duplicated rows except for the first one (which will have the place name):
        rows_to_drop = geocorr_df.duplicated(
-            keep="first", subset=[self.GEOID_FIELD_NAME]
+            keep="first", subset=[self.GEOID_TRACT_FIELD_NAME]
        )

        # Keep everything that's *not* a row to drop:
@@ -121,11 +165,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        # Sort by GEOID again to put the dataframe back to original order:
        # Note: avoiding using inplace because of unusual `SettingWithCopyWarning` warning.
        geocorr_df = geocorr_df.sort_values(
-            by=self.GEOID_FIELD_NAME, axis=0, ascending=True, inplace=False
+            by=self.GEOID_TRACT_FIELD_NAME,
+            axis=0,
+            ascending=True,
+            inplace=False,
        )

-        if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
-            raise ValueError("Too many CBGs.")
+        if len(geocorr_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
+            raise ValueError("Too many tracts.")

        return geocorr_df

@@ -175,12 +222,22 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        # Load and clean GEOCORR data
        # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
        # The specific query used is the following, which takes a couple of minutes to run:
-        # https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g1_=bg&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
-        logger.info("Starting download of Geocorr information.")
+        # https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2014.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=state&g1_=county&g1_=placefp&g1_=tract&g2_=cbsa10&g2_=cbsatype10&wtvar=pop10&nozerob=1&title=&csvout=1&namoptf=b&listout=1&lstfmt=html&namoptr=b&oropt=&counties=&metros=&places=&latitude=&longitude=&locname=&distance=&kiloms=0&nrings=&r1=&r2=&r3=&r4=&r5=&r6=&r7=&r8=&r9=&r10=&lathi=&latlo=&longhi=&longlo=
+        #
+        # That query was constructed from the website https://mcdc.missouri.edu/applications/geocorr2014.html,
+        # with the "source geographies" selected being:
+        # - State
+        # - County
+        # - Place (City, Town, Village, CDP, etc)
+        # - Census Tract
+        # and with the "target geographies" selected being:
+        # - Core based statistical area (CBSA)
+        # - CBSA Type (Metro or Micro)
+        logger.info("Starting download of 1.5MB Geocorr information.")

        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/geocorr2014_all_states.csv.zip",
+            + "/geocorr2014_all_states_tracts_only.csv.zip",
            download_path=self.TMP_PATH,
            unzipped_file_path=self.TMP_PATH / "geocorr",
        )
@@ -188,7 +245,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        self.raw_geocorr_df = pd.read_csv(
            filepath_or_buffer=self.TMP_PATH
            / "geocorr"
-            / "geocorr2014_all_states.csv",
+            / "geocorr2014_all_states_tracts_only.csv",
            # Skip second row, which has descriptions.
            skiprows=[1],
            # The following need to remain as strings for all of their digits, not get converted to numbers.
@@ -220,14 +277,14 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        msa_median_incomes_df = self._transform_msa_median_incomes()
        state_median_incomes_df = self._transform_state_median_incomes()

-        # Join CBGs on MSA incomes
+        # Join tracts on MSA incomes
        merged_df = geocorr_df.merge(
            msa_median_incomes_df, on=self.MSA_ID_FIELD_NAME, how="left"
        )

-        # Merge state income with CBGs
+        # Merge state income with tracts
        merged_df[self.STATE_GEOID_FIELD_NAME] = (
-            merged_df[self.GEOID_FIELD_NAME].astype(str).str[0:2]
+            merged_df[self.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]
        )

        merged_with_state_income_df = merged_df.merge(
@@ -236,11 +293,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
            on=self.STATE_GEOID_FIELD_NAME,
        )

-        if (
-            len(merged_with_state_income_df)
-            > self.EXPECTED_MAX_CENSUS_BLOCK_GROUPS
-        ):
-            raise ValueError("Too many CBGs in join.")
+        if len(merged_with_state_income_df) > self.EXPECTED_MAX_CENSUS_TRACTS:
+            raise ValueError("Too many tracts in join.")

        # Choose reference income: MSA if MSA type is Metro, otherwise use State.
        merged_with_state_income_df[self.AMI_REFERENCE_FIELD_NAME] = [