Adding persistent poverty tracts (#738)

* persistent poverty working
* fixing left-padding
* running black and adding persistent poverty to comp tool
* fixing bug
* running black and fixing linter
* fixing linter
* fixing linter error
2025-09-30 07:23:17 -07:00 · 2021-09-22 16:57:08 -05:00 · 2021-09-22 16:57:08 -05:00 · b1a4d26be8
commit b1a4d26be8
parent d1ced6d584
15 changed files with 518 additions and 201 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -83,6 +83,9 @@ class ScoreETL(ExtractTransformLoad):
        # Urban Rural Map
        self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"

+        # Persistent poverty
+        self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
+
        # dataframes
        self.df: pd.DataFrame
        self.ejscreen_df: pd.DataFrame
@@ -95,6 +98,7 @@ class ScoreETL(ExtractTransformLoad):
        self.doe_energy_burden_df: pd.DataFrame
        self.national_risk_index_df: pd.DataFrame
        self.geocorr_urban_rural_df: pd.DataFrame
+        self.persistent_poverty_df: pd.DataFrame

    def data_sets(self) -> list:
        # Define a named tuple that will be used for each data set input.
@@ -206,6 +210,11 @@ class ScoreETL(ExtractTransformLoad):
                renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
                bucket=None,
            ),
+            DataSet(
+                input_field=self.PERSISTENT_POVERTY_FIELD,
+                renamed_field=self.PERSISTENT_POVERTY_FIELD,
+                bucket=None,
+            ),
            # The following data sets have buckets, because they're used in Score C
            DataSet(
                input_field="CANCER",
@@ -405,6 +414,16 @@ class ScoreETL(ExtractTransformLoad):
            low_memory=False,
        )

+        # Load persistent poverty
+        persistent_poverty_csv = (
+            self.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv"
+        )
+        self.persistent_poverty_df = pd.read_csv(
+            persistent_poverty_csv,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            low_memory=False,
+        )
+
    def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
        logger.info("Joining Census Block Group dataframes")
        census_block_group_df = functools.reduce(
@@ -692,6 +711,7 @@ class ScoreETL(ExtractTransformLoad):
            self.cdc_life_expectancy_df,
            self.doe_energy_burden_df,
            self.geocorr_urban_rural_df,
+            self.persistent_poverty_df,
        ]
        census_tract_df = self._join_tract_dfs(census_tract_dfs)

@@ -743,7 +763,11 @@ class ScoreETL(ExtractTransformLoad):
        # TODO do this at the same time as calculating percentiles in future refactor
        for data_set in data_sets:
            # Skip GEOID_FIELD_NAME, because it's a string.
-            if data_set.renamed_field == self.GEOID_FIELD_NAME:
+            # Skip `PERSISTENT_POVERTY_FIELD` because it's a straight pass-through.
+            if data_set.renamed_field in (
+                self.GEOID_FIELD_NAME,
+                self.PERSISTENT_POVERTY_FIELD,
+            ):
                continue

            df[data_set.renamed_field] = pd.to_numeric(