County mapping to Tracts (#923)

* County mapping to Tracts
* passing tests
* last traces of cbg
2025-07-25 08:20:16 -07:00 · 2021-11-26 11:23:40 -05:00 · 2021-11-26 11:23:40 -05:00 · cc7bf0d73d
commit cc7bf0d73d
parent a4108d24c0
11 changed files with 50 additions and 44 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -59,7 +59,7 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
 # Column subsets
 CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
 TILES_SCORE_COLUMNS = [
-    "GEOID10",
+    "GEOID10_TRACT",
    "State Name",
    "County Name",
    "Total population",
@@ -155,7 +155,7 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(

 # Finally we augment with the GEOID10, county, and state
 DOWNLOADABLE_SCORE_COLUMNS = [
-    "GEOID10",
+    "GEOID10_TRACT",
    "County Name",
    "State Name",
    "Score G (communities)",
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -23,7 +23,7 @@ class PostScoreETL(ExtractTransformLoad):
        self.input_counties_df: pd.DataFrame
        self.input_states_df: pd.DataFrame
        self.input_score_df: pd.DataFrame
-        self.input_national_cbg_df: pd.DataFrame
+        self.input_national_tract_df: pd.DataFrame

        self.output_score_county_state_merged_df: pd.DataFrame
        self.output_score_tiles_df: pd.DataFrame
@@ -50,7 +50,9 @@ class PostScoreETL(ExtractTransformLoad):

    def _extract_score(self, score_path: Path) -> pd.DataFrame:
        logger.info("Reading Score CSV")
-        df = pd.read_csv(score_path, dtype={"GEOID10": "string"})
+        df = pd.read_csv(
+            score_path, dtype={self.GEOID_TRACT_FIELD_NAME: "string"}
+        )

        # Convert total population to an int:
        df["Total population"] = df["Total population"].astype(
@@ -59,12 +61,14 @@ class PostScoreETL(ExtractTransformLoad):

        return df

-    def _extract_national_cbg(self, national_cbg_path: Path) -> pd.DataFrame:
-        logger.info("Reading national CBG")
+    def _extract_national_tract(
+        self, national_tract_path: Path
+    ) -> pd.DataFrame:
+        logger.info("Reading national tract file")
        return pd.read_csv(
-            national_cbg_path,
-            names=["GEOID10"],
-            dtype={"GEOID10": "string"},
+            national_tract_path,
+            names=[self.GEOID_TRACT_FIELD_NAME],
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
            header=None,
        )
@@ -91,7 +95,7 @@ class PostScoreETL(ExtractTransformLoad):
        self.input_score_df = self._extract_score(
            constants.DATA_SCORE_CSV_FULL_FILE_PATH
        )
-        self.input_national_cbg_df = self._extract_national_cbg(
+        self.input_national_tract_df = self._extract_national_tract(
            constants.DATA_CENSUS_CSV_FILE_PATH
        )

@@ -130,21 +134,22 @@ class PostScoreETL(ExtractTransformLoad):

    def _transform_score(self, initial_score_df: pd.DataFrame) -> pd.DataFrame:
        """
-        Necessary modifications to the score dataframe
+        Add the GEOID field to the score dataframe to do the merge with counties
        """
-        # Add the tract level column
-        new_df = initial_score_df.copy()
-        new_df["GEOID"] = initial_score_df.GEOID10.str[:5]
-        return new_df
+        # add GEOID column for counties
+        initial_score_df["GEOID"] = initial_score_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ].str[:5]
+
+        return initial_score_df

    def _create_score_data(
        self,
-        national_cbg_df: pd.DataFrame,
+        national_tract_df: pd.DataFrame,
        counties_df: pd.DataFrame,
        states_df: pd.DataFrame,
        score_df: pd.DataFrame,
    ) -> pd.DataFrame:
-
        # merge state with counties
        logger.info("Merging state with county info")
        county_state_merged = counties_df.merge(
@@ -153,15 +158,19 @@ class PostScoreETL(ExtractTransformLoad):

        # merge state + county with score
        score_county_state_merged = score_df.merge(
-            county_state_merged, on="GEOID", how="left"
+            county_state_merged,
+            on="GEOID",  # GEOID is the county ID
+            how="left",
        )

-        # check if there are census cbgs without score
-        logger.info("Removing CBG rows without score")
+        # check if there are census tracts without score
+        logger.info("Removing tract rows without score")

-        # merge census cbgs with score
-        merged_df = national_cbg_df.merge(
-            score_county_state_merged, on="GEOID10", how="left"
+        # merge census tracts with score
+        merged_df = national_tract_df.merge(
+            score_county_state_merged,
+            on=self.GEOID_TRACT_FIELD_NAME,
+            how="left",
        )

        # recast population to integer
@@ -169,14 +178,14 @@ class PostScoreETL(ExtractTransformLoad):
            merged_df["Total population"].fillna(0.0).astype(int)
        )

-        # list the null score cbgs
-        null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()]
+        # list the null score tracts
+        null_tract_df = merged_df[merged_df["Score E (percentile)"].isnull()]

        # subtract data sets
        # this follows the XOR pattern outlined here:
        # https://stackoverflow.com/a/37313953
        de_duplicated_df = pd.concat(
-            [merged_df, null_cbg_df, null_cbg_df]
+            [merged_df, null_tract_df, null_tract_df]
        ).drop_duplicates(keep=False)

        # set the score to the new df
@@ -212,7 +221,7 @@ class PostScoreETL(ExtractTransformLoad):
        transformed_score = self._transform_score(self.input_score_df)

        output_score_county_state_merged_df = self._create_score_data(
-            self.input_national_cbg_df,
+            self.input_national_tract_df,
            transformed_counties,
            transformed_states,
            transformed_score,
--- a/data/data-pipeline/data_pipeline/etl/score/tests/conftest.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/conftest.py
@@ -75,11 +75,9 @@ def score_pdf_initial(sample_data_dir):
 def counties_transformed_expected():
    return pd.DataFrame.from_dict(
        data={
-            "State Abbreviation": pd.Series(["AL", "AL"], dtype="string"),
-            "GEOID": pd.Series(["01001", "01003"], dtype="string"),
-            "County Name": pd.Series(
-                ["AutaugaCounty", "BaldwinCounty"], dtype="object"
-            ),
+            "State Abbreviation": pd.Series(["AL"], dtype="string"),
+            "GEOID": pd.Series(["01073"], dtype="string"),
+            "County Name": pd.Series(["Jefferson County"], dtype="object"),
        },
    )

@@ -105,11 +103,11 @@ def score_transformed_expected():


@pytest.fixture()
-def national_cbg_df():
+def national_tract_df():
    return pd.DataFrame.from_dict(
        data={
-            "GEOID10": pd.Series(
-                ["010010201001", "010010201002"], dtype="string"
+            "GEOID10_TRACT": pd.Series(
+                ["01073001100", "01073001400"], dtype="string"
            ),
        },
    )
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/county_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/county_data_initial.csv
@@ -1,3 +1,2 @@
 USPS	GEOID	ANSICODE	NAME	POP10	HU10	ALAND	AWATER	ALAND_SQMI	AWATER_SQMI	INTPTLAT	INTPTLONG
-AL	01001	00161526	AutaugaCounty	54571	22135	1539582278	25775735	594.436	9.952	32.536382	-86.644490
-AL	01003	00161527	BaldwinCounty	182265	104061	4117521611	1133190229	1589.784	437.527	30.659218	-87.746067
+AL	01073	00161562	Jefferson County	658466	300552	2878192209	32474487	    1111.276	      12.538	 33.553444	 -86.896536
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py
@@ -29,7 +29,7 @@ def test_extract_states(etl, state_data_initial):

 def test_extract_score(etl, score_data_initial):
    extracted = etl._extract_score(score_data_initial)
-    string_cols = ["GEOID10"]
+    string_cols = ["GEOID10_TRACT"]
    assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)


@@ -63,14 +63,14 @@ def test_transform_score(etl, score_data_initial, score_transformed_expected):
 # pylint: disable=too-many-arguments
 def test_create_score_data(
    etl,
-    national_cbg_df,
+    national_tract_df,
    counties_transformed_expected,
    states_transformed_expected,
    score_transformed_expected,
    score_data_expected,
 ):
    score_data_actual = etl._create_score_data(
-        national_cbg_df,
+        national_tract_df,
        counties_transformed_expected,
        states_transformed_expected,
        score_transformed_expected,