Adding persistent poverty tracts (#738)

* persistent poverty working

* fixing left-padding

* running black and adding persistent poverty to comp tool

* fixing bug

* running black and fixing linter

* fixing linter

* fixing linter error
Lucas Merrill Brown 2021-09-22 16:57:08 -05:00 committed by GitHub
parent d1ced6d584
commit b1a4d26be8
15 changed files with 518 additions and 201 deletions


@ -22,8 +22,9 @@ class ExtractTransformLoad:
FILES_PATH: Path = settings.APP_ROOT / "files"
GEOID_FIELD_NAME: str = "GEOID10"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
# TODO: investigate. Census says there are only 217,740 CBGs in the US.
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
EXPECTED_MAX_CENSUS_TRACTS: int = 73076
def get_yaml_config(self) -> None:
"""Reads the YAML configuration file for the dataset and stores


@ -64,6 +64,11 @@ DATASET_LIST = [
"module_dir": "geocorr",
"class_name": "GeoCorrETL",
},
{
"name": "persistent_poverty",
"module_dir": "persistent_poverty",
"class_name": "PersistentPovertyETL",
},
]
CENSUS_INFO = {
"name": "census",


@ -83,6 +83,9 @@ class ScoreETL(ExtractTransformLoad):
# Urban Rural Map
self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
# Persistent poverty
self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
# dataframes
self.df: pd.DataFrame
self.ejscreen_df: pd.DataFrame
@ -95,6 +98,7 @@ class ScoreETL(ExtractTransformLoad):
self.doe_energy_burden_df: pd.DataFrame
self.national_risk_index_df: pd.DataFrame
self.geocorr_urban_rural_df: pd.DataFrame
self.persistent_poverty_df: pd.DataFrame
def data_sets(self) -> list:
# Define a named tuple that will be used for each data set input.
@ -206,6 +210,11 @@ class ScoreETL(ExtractTransformLoad):
renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.PERSISTENT_POVERTY_FIELD,
renamed_field=self.PERSISTENT_POVERTY_FIELD,
bucket=None,
),
# The following data sets have buckets, because they're used in Score C
DataSet(
input_field="CANCER",
@ -405,6 +414,16 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load persistent poverty
persistent_poverty_csv = (
self.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv"
)
self.persistent_poverty_df = pd.read_csv(
persistent_poverty_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Block Group dataframes")
census_block_group_df = functools.reduce(
@ -692,6 +711,7 @@ class ScoreETL(ExtractTransformLoad):
self.cdc_life_expectancy_df,
self.doe_energy_burden_df,
self.geocorr_urban_rural_df,
self.persistent_poverty_df,
]
census_tract_df = self._join_tract_dfs(census_tract_dfs)
@ -743,7 +763,11 @@ class ScoreETL(ExtractTransformLoad):
# TODO do this at the same time as calculating percentiles in future refactor
for data_set in data_sets:
# Skip GEOID_FIELD_NAME, because it's a string.
if data_set.renamed_field == self.GEOID_FIELD_NAME:
# Skip `PERSISTENT_POVERTY_FIELD` because it's a straight pass-through.
if data_set.renamed_field in (
self.GEOID_FIELD_NAME,
self.PERSISTENT_POVERTY_FIELD,
):
continue
df[data_set.renamed_field] = pd.to_numeric(


@ -0,0 +1,174 @@
import functools
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import (
get_module_logger,
unzip_file_from_url,
)
logger = get_module_logger(__name__)
class PersistentPovertyETL(ExtractTransformLoad):
"""Persistent poverty data.
Loaded from `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTDB.htm`.
Codebook: `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTBDDload/Dfiles/codebooks.pdf`.
"""
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
# Need to change hyperlink to S3
# self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/persistent_poverty_urban_rural.csv.zip"
self.GEOID_TRACT_INPUT_FIELD_NAME_1 = "TRTID10"
self.GEOID_TRACT_INPUT_FIELD_NAME_2 = "tractid"
# self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.POVERTY_PREFIX = "Individuals in Poverty (percent)"
self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
f"{self.POVERTY_PREFIX} (1990)",
f"{self.POVERTY_PREFIX} (2000)",
f"{self.POVERTY_PREFIX} (2010)",
self.PERSISTENT_POVERTY_FIELD,
]
self.df: pd.DataFrame
def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
df = functools.reduce(
lambda df_a, df_b: pd.merge(
left=df_a,
right=df_b,
# All data frames will now have this field for tract.
on=self.GEOID_TRACT_FIELD_NAME,
how="outer",
),
dfs,
)
# Left-pad the tracts with 0s
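# For example, a tract ID parsed as the number 1001020200 has lost Alabama's leading zero; padding restores "01001020200".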
expected_length_of_census_tract_field = 11
df[self.GEOID_TRACT_FIELD_NAME] = (
df[self.GEOID_TRACT_FIELD_NAME]
.astype(str)
.apply(lambda x: x.zfill(expected_length_of_census_tract_field))
)
# Sanity check the join.
if len(df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()) != 1:
raise ValueError(
f"One of the input CSVs uses {self.GEOID_TRACT_FIELD_NAME} with a different length."
)
if len(df) > self.EXPECTED_MAX_CENSUS_TRACTS:
raise ValueError(f"Too many rows in the join: {len(df)}")
return df
def extract(self) -> None:
logger.info("Starting to download 86MB persistent poverty file.")
unzipped_file_path = self.TMP_PATH / "persistent_poverty"
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/LTDB_Std_All_Sample.zip",
download_path=self.TMP_PATH,
unzipped_file_path=unzipped_file_path,
)
file_names = [
"ltdb_std_1990_sample.csv",
"ltdb_std_2000_sample.csv",
"ltdb_std_2010_sample.csv",
]
temporary_input_dfs = []
for file_name in file_names:
logger.info(f"Reading {file_name}")
temporary_input_df = pd.read_csv(
filepath_or_buffer=unzipped_file_path
/ f"ltdb_std_all_sample/{file_name}",
dtype={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",
},
low_memory=False,
encoding="latin1",
)
# Some CSVs have self.GEOID_TRACT_INPUT_FIELD_NAME_1 as the name of the tract field,
# and some have self.GEOID_TRACT_INPUT_FIELD_NAME_2. Rename them both to the same tract name.
temporary_input_df.rename(
columns={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: self.GEOID_TRACT_FIELD_NAME,
self.GEOID_TRACT_INPUT_FIELD_NAME_2: self.GEOID_TRACT_FIELD_NAME,
},
inplace=True,
# Ignore errors b/c of the different field names in different CSVs.
errors="ignore",
)
temporary_input_dfs.append(temporary_input_df)
self.df = self._join_input_dfs(temporary_input_dfs)
def transform(self) -> None:
logger.info("Starting persistent poverty transform")
transformed_df = self.df
# Note: the fields are defined as following.
# dpovXX Description: persons for whom poverty status is determined
# npovXX Description: persons in poverty
transformed_df[f"{self.POVERTY_PREFIX} (1990)"] = (
transformed_df["NPOV90"] / transformed_df["DPOV90"]
)
transformed_df[f"{self.POVERTY_PREFIX} (2000)"] = (
transformed_df["NPOV00"] / transformed_df["DPOV00"]
)
# Note: for 2010, they use ACS data ending in 2012 that has 2010 as its midpoint year.
transformed_df[f"{self.POVERTY_PREFIX} (2010)"] = (
transformed_df["npov12"] / transformed_df["dpov12"]
)
poverty_threshold = 0.2
transformed_df[self.PERSISTENT_POVERTY_FIELD] = (
(
transformed_df[f"{self.POVERTY_PREFIX} (1990)"]
>= poverty_threshold
)
& (
transformed_df[f"{self.POVERTY_PREFIX} (2000)"]
>= poverty_threshold
)
& (
transformed_df[f"{self.POVERTY_PREFIX} (2010)"]
>= poverty_threshold
)
)
self.df = transformed_df
def load(self) -> None:
logger.info("Saving persistent poverty data.")
# Create the output directory if it doesn't exist.
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
def validate(self) -> None:
logger.info("Validating persistent poverty data.")
pass
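In sum, a tract is flagged as persistently poor when its poverty rate is at or above the 20% threshold in 1990, in 2000, and in the ACS sample ending in 2012 (midpoint 2010). A toy illustration of the flag logic, using the column names above with made-up rates:

    import pandas as pd

    PREFIX = "Individuals in Poverty (percent)"
    toy_df = pd.DataFrame(
        {
            f"{PREFIX} (1990)": [0.25, 0.30, 0.15],
            f"{PREFIX} (2000)": [0.22, 0.19, 0.40],
            f"{PREFIX} (2010)": [0.21, 0.35, 0.50],
        }
    )
    threshold = 0.2
    toy_df["Persistent Poverty Census Tract"] = (
        (toy_df[f"{PREFIX} (1990)"] >= threshold)
        & (toy_df[f"{PREFIX} (2000)"] >= threshold)
        & (toy_df[f"{PREFIX} (2010)"] >= threshold)
    )
    # Only the first row qualifies: the second dips to 19% in 2000,
    # and the third sat at 15% in 1990.
    print(toy_df["Persistent Poverty Census Tract"].tolist())  # [True, False, False]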


@ -36,12 +36,8 @@
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"ACS_YEAR = \"2019\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
"OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
@ -52,12 +48,12 @@
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
@ -163,10 +159,10 @@
"outputs": [],
"source": [
"acs_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
@ -292,9 +288,7 @@
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" acs_df, on=\"GEOID10\", how=\"left\"\n",
" )"
"merged_df = cbg_usa_df.merge(acs_df, on=\"GEOID10\", how=\"left\")"
]
},
{


@ -35,12 +35,8 @@
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
@ -51,12 +47,12 @@
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
@ -162,10 +158,10 @@
"outputs": [],
"source": [
"ejscreen_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"ID\": \"string\"},\n",
" low_memory=False,\n",
" )"
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"ID\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
@ -176,9 +172,9 @@
"outputs": [],
"source": [
"ejscreen_df.rename(\n",
" columns={\"ID\": \"GEOID10\"},\n",
" inplace=True,\n",
" )"
" columns={\"ID\": \"GEOID10\"},\n",
" inplace=True,\n",
")"
]
},
{
@ -458,9 +454,7 @@
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" ejscreen_df, on=\"GEOID10\", how=\"left\"\n",
" )"
"merged_df = cbg_usa_df.merge(ejscreen_df, on=\"GEOID10\", how=\"left\")"
]
},
{
@ -1092,9 +1086,7 @@
"id": "d1a7b71d",
"metadata": {},
"outputs": [],
"source": [
"\n"
]
"source": []
}
],
"metadata": {


@ -35,12 +35,8 @@
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
"OUTPUT_PATH = DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
@ -51,12 +47,12 @@
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
@ -162,10 +158,10 @@
"outputs": [],
"source": [
"score_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
@ -381,9 +377,7 @@
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" score_df, on=\"GEOID10\", how=\"left\"\n",
" )"
"merged_df = cbg_usa_df.merge(score_df, on=\"GEOID10\", how=\"left\")"
]
},
{


@ -33,7 +33,9 @@
"source": [
"def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
" state_gdf = gpd.read_file(file_name)\n",
" state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n",
" state_repr = state_gdf.to_crs(\n",
" \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\"\n",
" )\n",
" state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n",
" state_merged_simplified = state_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
@ -67,9 +69,9 @@
"\n",
"def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
" # dissolve tracts by bucket\n",
" state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n",
" drop=True\n",
" )\n",
" state_attr = state_tracts[\n",
" [\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]\n",
" ].reset_index(drop=True)\n",
" state_dissolve = state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
" return state_dissolve\n",
"\n",
@ -91,10 +93,12 @@
" gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )\n",
" gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n",
" gdf_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\"\n",
" )\n",
"\n",
"\n",
"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets: int):\n",
" print(f\"Processing file {file_name}...\")\n",
" state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n",
" state_tracts = aggregate_to_tracts(state_merged_simplified)\n",
@ -115,7 +119,9 @@
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
"CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
"score_df = pd.read_csv(\n",
" CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False\n",
")"
]
},
{
@ -185,9 +191,9 @@
},
"outputs": [],
"source": [
"for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
"for file_name in CENSUS_GEOJSON_DIR.rglob(\"*.json\"):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
]
},
{
@ -672,7 +678,9 @@
},
"outputs": [],
"source": [
"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
"usa_merged_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\"\n",
")"
]
},
{
@ -684,8 +692,8 @@
"outputs": [],
"source": [
"usa_simplified = usa_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
" ].reset_index(drop=True)"
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
"].reset_index(drop=True)"
]
},
{
@ -696,9 +704,7 @@
},
"outputs": [],
"source": [
"usa_simplified.rename(\n",
" columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
" )"
"usa_simplified.rename(columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True)"
]
},
{
@ -714,8 +720,8 @@
"outputs": [],
"source": [
"usa_cbg_compressed = gpd.GeoDataFrame(\n",
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
")"
]
},
{
@ -726,7 +732,9 @@
},
"outputs": [],
"source": [
"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
"usa_cbg_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\"\n",
")"
]
},
{
@ -764,8 +772,8 @@
"outputs": [],
"source": [
"tracts_compressed = gpd.GeoDataFrame(\n",
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
")"
]
},
{
@ -776,7 +784,9 @@
},
"outputs": [],
"source": [
"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
"tracts_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\"\n",
")"
]
},
{
@ -877,8 +887,8 @@
"outputs": [],
"source": [
"gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
")"
]
},
{
@ -917,7 +927,9 @@
},
"outputs": [],
"source": [
"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")"
"gdf_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\"\n",
")"
]
}
],


@ -39,7 +39,9 @@
"source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
"censusdata.printtable(\n",
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n",
")"
],
"outputs": [],
"metadata": {
@ -65,8 +67,8 @@
" year=ACS_YEAR,\n",
" geo=censusdata.censusgeo(\n",
" [\n",
" (\"state\", fips) \n",
" #, (\"county\", \"*\"), (\"block group\", \"*\")\n",
" (\"state\", fips)\n",
" # , (\"county\", \"*\"), (\"block group\", \"*\")\n",
" ]\n",
" ),\n",
" var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n",
@ -75,7 +77,9 @@
"\n",
"df = pd.concat(dfs)\n",
"\n",
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(\n",
" func=fips_from_censusdata_censusgeo\n",
")\n",
"\n",
"df.head()"
],
@ -90,7 +94,13 @@
"source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n",
"df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
"df.rename(\n",
" columns={\n",
" \"GEOID10\": \"GEOID2\",\n",
" \"B19013_001E\": \"Median household income (State)\",\n",
" },\n",
" inplace=True,\n",
")\n",
"\n",
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
],


@ -20,7 +20,7 @@
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
" \n",
"\n",
"from data_pipeline.utils import unzip_file_from_url\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes"
],
@ -57,9 +57,16 @@
"cell_type": "code",
"execution_count": null,
"source": [
"counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
"counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
"counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
"counties_df = pd.read_csv(\n",
" CENSUS_COUNTIES_TXT,\n",
" sep=\"\\t\",\n",
" dtype={\"GEOID\": \"string\", \"USPS\": \"string\"},\n",
" low_memory=False,\n",
")\n",
"counties_df = counties_df[[\"USPS\", \"GEOID\", \"NAME\"]]\n",
"counties_df.rename(\n",
" columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True\n",
")\n",
"counties_df.head()"
],
"outputs": [],
@ -69,8 +76,17 @@
"cell_type": "code",
"execution_count": null,
"source": [
"states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
"states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
"states_df = pd.read_csv(\n",
" STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"}\n",
")\n",
"states_df.rename(\n",
" columns={\n",
" \"fips\": \"State Code\",\n",
" \"state_name\": \"State Name\",\n",
" \"state_abbreviation\": \"State Abbreviation\",\n",
" },\n",
" inplace=True,\n",
")\n",
"states_df.head()"
],
"outputs": [],
@ -80,7 +96,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n",
"county_state_merged = counties_df.join(states_df, rsuffix=\" Other\")\n",
"del county_state_merged[\"State Abbreviation Other\"]\n",
"county_state_merged.head()"
],
@ -102,7 +118,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n",
"score_county_state_merged = score_df.join(county_state_merged, rsuffix=\"_OTHER\")\n",
"del score_county_state_merged[\"GEOID_OTHER\"]\n",
"score_county_state_merged.head()"
],


@ -35,7 +35,7 @@
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()\n"
"tqdm_notebook.pandas()"
]
},
{
@ -89,14 +89,12 @@
" \"Poverty (Less than 200% of federal poverty line)\",\n",
" \"Percent individuals age 25 or over with less than high school degree\",\n",
" \"Unemployed civilians (percent)\",\n",
" \"Linguistic isolation (percent)\"\n",
" \"Linguistic isolation (percent)\",\n",
"]\n",
"\n",
"column_to_plot = columns_to_plot[0]\n",
"print(f\"Plotting {column_to_plot}\")\n",
"print(cejst_df[\n",
" column_to_plot\n",
"].hist())"
"print(cejst_df[column_to_plot].hist())"
]
},
{


@ -152,15 +152,17 @@
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
"\n",
"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
"calenviroscreen_data_path = (\n",
" DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
")\n",
"calenviroscreen_df = pd.read_csv(\n",
" calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
")\n",
"\n",
"# Convert priority community field to a bool.\n",
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
"calenviroscreen_df[\n",
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
"].astype(bool)\n",
"] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
"\n",
"calenviroscreen_df.head()"
]
@ -168,19 +170,33 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1bf54af1",
"metadata": {
"scrolled": true
},
"id": "df458f08",
"metadata": {},
"outputs": [],
"source": [
"# Load HUD data\n",
"hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n",
"hud_recap_df = pd.read_csv(\n",
" hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
"# Load persistent poverty data\n",
"persistent_poverty_path = (\n",
" DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
")\n",
"persistent_poverty_df = pd.read_csv(\n",
" persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
")\n",
"\n",
"hud_recap_df.head()"
"# Since \"Persistent Poverty Census Tract\" is labeled in both the score file (at the CBG level) and this tract file,\n",
"# rename this field so it's easy to access the tract-level scores directly.\n",
"\n",
"PERSISTENT_POVERTY_TRACT_LEVEL_FIELD = \"Persistent Poverty, Tract Level\"\n",
"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
"\n",
"persistent_poverty_df.rename(\n",
" columns={\n",
" PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
" },\n",
" inplace=True,\n",
" errors=\"raise\",\n",
")\n",
"\n",
"persistent_poverty_df"
]
},
{
@ -193,7 +209,7 @@
"outputs": [],
"source": [
"# Join all dataframes that use tracts\n",
"census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n",
"census_tract_dfs = [calenviroscreen_df, persistent_poverty_df]\n",
"\n",
"census_tract_df = functools.reduce(\n",
" lambda left, right: pd.merge(\n",
@ -231,7 +247,6 @@
" on=GEOID_TRACT_FIELD_NAME,\n",
")\n",
"\n",
"\n",
"if len(merged_df) > 220405:\n",
" raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
"\n",
@ -314,10 +329,20 @@
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Persistent Poverty (CBG)\",\n",
" priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"]\n",
"\n",
"census_tract_indices = [\n",
" Index(\n",
" method_name=\"Persistent Poverty\",\n",
" priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"CalEnviroScreen 4.0\",\n",
" priority_communities_field=\"calenviroscreen_priority_community\",\n",
" other_census_tract_fields_to_keep=[\n",
@ -325,11 +350,6 @@
" CALENVIROSCREEN_PERCENTILE_FIELD,\n",
" ],\n",
" ),\n",
" Index(\n",
" method_name=\"HUD RECAP\",\n",
" priority_communities_field=\"hud_recap_priority_community\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"]"
]
},
@ -354,7 +374,8 @@
"\n",
" # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n",
" df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
" df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
" df[priority_communities_field]\n",
" * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
" )\n",
"\n",
" def calculate_state_comparison(\n",
@ -393,7 +414,9 @@
" summary_dict[\"Geography name\"] = division_id\n",
"\n",
" total_cbgs_in_geography = len(frame)\n",
" total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
" total_population_in_geography = frame[\n",
" CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
" ].sum()\n",
"\n",
" if geography_field == URBAN_HEURISTIC_FIELD:\n",
" urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",
@ -401,9 +424,9 @@
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
"\n",
" for priority_communities_field in priority_communities_fields:\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
" summary_dict[\n",
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
" ].sum()\n",
" ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
"\n",
" summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n",
" f\"{priority_communities_field}\"\n",
@ -415,7 +438,9 @@
" / total_cbgs_in_geography\n",
" )\n",
"\n",
" summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
" summary_dict[\n",
" f\"{priority_communities_field} (percent population)\"\n",
" ] = (\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
" / total_population_in_geography\n",
" )\n",
@ -461,7 +486,9 @@
"\n",
" # Run the comparison function on the groups.\n",
" region_distribution_df = region_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=\"region\"\n",
" )\n",
" )\n",
"\n",
" # Next, run the comparison by division\n",
@ -469,7 +496,9 @@
"\n",
" # Run the comparison function on the groups.\n",
" division_distribution_df = division_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=\"division\"\n",
" )\n",
" )\n",
"\n",
" # Next, run the comparison by urban/rural\n",
@ -524,7 +553,9 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
@ -539,9 +570,7 @@
"\n",
" # Special formatting for columns that capture the percent of population considered priority.\n",
" if \"(percent population)\" in column:\n",
" column_ranges = (\n",
" f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
" )\n",
" column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
"\n",
" # Add green to red conditional formatting.\n",
" worksheet.conditional_format(\n",
@ -654,7 +683,9 @@
"\n",
" # Put criteria description column first.\n",
" new_column_order = [criteria_description_field_name] + [\n",
" col for col in comparison_df.columns if col != criteria_description_field_name\n",
" col\n",
" for col in comparison_df.columns\n",
" if col != criteria_description_field_name\n",
" ]\n",
"\n",
" comparison_df = comparison_df[new_column_order]\n",
@ -700,12 +731,12 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
"\n",
" # Add green to red conditional formatting.\n",
" column_ranges = (\n",
" f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
" )\n",
" column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
" worksheet.conditional_format(\n",
" column_ranges,\n",
" # Min: green, max: red.\n",
@ -718,7 +749,11 @@
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
" if (\n",
" \"percent \" in column\n",
" or \"(percent)\" in column\n",
" or \"Percent \" in column\n",
" ):\n",
" # Make these columns percentages.\n",
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
" worksheet.set_column(\n",
@ -756,9 +791,7 @@
" )\n",
"\n",
" # Write secondary comparison to CSV.\n",
" file_name_part = (\n",
" f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
" )\n",
" file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
" output_dir.mkdir(parents=True, exist_ok=True)\n",
" file_path = output_dir / (file_name_part + \".csv\")\n",
" file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n",
@ -770,7 +803,8 @@
" )\n",
"\n",
" write_cbg_score_comparison_excel(\n",
" cbg_score_comparison_df=cbg_score_comparison_df, file_path=file_path_xlsx\n",
" cbg_score_comparison_df=cbg_score_comparison_df,\n",
" file_path=file_path_xlsx,\n",
" )\n",
"\n",
"\n",
@ -801,11 +835,15 @@
"cell_type": "code",
"execution_count": null,
"id": "eeb9699d",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def write_markdown_and_docx_content(\n",
" markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n",
" markdown_content: str,\n",
" file_dir: pathlib.PosixPath,\n",
" file_name_without_extension: str,\n",
") -> pathlib.PosixPath:\n",
" \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n",
" # Set the file paths for both files.\n",
@ -837,7 +875,9 @@
"\n",
" # List of all states/territories in their FIPS codes:\n",
" state_ids = sorted(df[state_field].unique())\n",
" state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
" state_names = \", \".join(\n",
" [us.states.lookup(state_id).name for state_id in state_ids]\n",
" )\n",
"\n",
" # Create markdown content for comparisons.\n",
" markdown_content = f\"\"\"\n",
@ -851,11 +891,16 @@
"\n",
"\"\"\"\n",
"\n",
" for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n",
" for (index1, index2) in itertools.combinations(\n",
" census_block_group_indices, 2\n",
" ):\n",
" # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n",
" count_df = (\n",
" df.groupby(\n",
" [index1.priority_communities_field, index2.priority_communities_field]\n",
" [\n",
" index1.priority_communities_field,\n",
" index2.priority_communities_field,\n",
" ]\n",
" )[GEOID_FIELD_NAME]\n",
" .count()\n",
" .reset_index(name=count_field_name)\n",
@ -887,16 +932,24 @@
"\n",
" # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n",
" true_true_cbgs = (\n",
" true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n",
" true_true_cbgs_series.iloc[0]\n",
" if len(true_true_cbgs_series) > 0\n",
" else 0\n",
" )\n",
" true_false_cbgs = (\n",
" true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n",
" true_false_cbgs_series.iloc[0]\n",
" if len(true_false_cbgs_series) > 0\n",
" else 0\n",
" )\n",
" false_true_cbgs = (\n",
" false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n",
" false_true_cbgs_series.iloc[0]\n",
" if len(false_true_cbgs_series) > 0\n",
" else 0\n",
" )\n",
" false_false_cbgs = (\n",
" false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n",
" false_false_cbgs_series.iloc[0]\n",
" if len(false_false_cbgs_series) > 0\n",
" else 0\n",
" )\n",
"\n",
" markdown_content += (\n",
@ -1088,15 +1141,20 @@
"\n",
" # Calculate comparison\n",
" # A comparison priority tract has at least one CBG that is a priority CBG.\n",
" df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n",
" df[\n",
" comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n",
" ] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
" if is_a_method_b_priority_tract\n",
" else None\n",
" )\n",
"\n",
" # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n",
" df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
" df[\n",
" comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n",
" ] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
" == 1\n",
" if is_a_method_b_priority_tract\n",
" else None\n",
" )\n",
@ -1115,7 +1173,8 @@
" df[\n",
" comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n",
" ] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
" == 1\n",
" if not is_a_method_b_priority_tract\n",
" else None\n",
" )\n",
@ -1156,14 +1215,20 @@
"\n",
" # List of all states/territories in their FIPS codes:\n",
" state_ids = sorted(original_df[state_field].unique())\n",
" state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
" state_names = \", \".join(\n",
" [us.states.lookup(state_id).name for state_id in state_ids]\n",
" )\n",
"\n",
" # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n",
" # TODO: investigate why sums are sometimes series and sometimes scalar.\n",
" method_a_priority_cbgs = (\n",
" original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n",
" original_df.loc[:, method_a_priority_census_block_groups_field]\n",
" .sum()\n",
" .squeeze()\n",
" )\n",
" method_a_priority_cbgs_percent = (\n",
" f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
" )\n",
" method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
"\n",
" total_tracts_count = len(comparison_df)\n",
"\n",
@ -1185,7 +1250,9 @@
" .sum()\n",
" .squeeze()\n",
" )\n",
" method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
" method_a_tracts_count_percent = (\n",
" f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
" )\n",
"\n",
" # Method A priority community stats\n",
" method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
@ -1316,7 +1383,8 @@
"\n",
" # Write comparison to CSV.\n",
" file_path = (\n",
" output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
" output_dir\n",
" / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
" )\n",
" comparison_df.to_csv(\n",
" path_or_buf=file_path,\n",


@ -101,17 +101,25 @@
"outputs": [],
"source": [
"geocorr_urban_rural_map = pd.read_csv(\n",
" os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
" encoding = \"ISO-8859-1\",\n",
" os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n",
" encoding=\"ISO-8859-1\",\n",
" skiprows=[1],\n",
" dtype='str',\n",
" dtype=\"str\",\n",
")\n",
"\n",
"geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
"geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
"geocorr_urban_rural_map[\"pop10\"] = pd.to_numeric(\n",
" geocorr_urban_rural_map[\"pop10\"]\n",
")\n",
"geocorr_urban_rural_map[\"afact\"] = pd.to_numeric(\n",
" geocorr_urban_rural_map[\"afact\"]\n",
")\n",
"\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n",
" geocorr_urban_rural_map[\"county\"] + geocorr_urban_rural_map[\"tract\"]\n",
") # + geocorr_urban_rural_map['bg']\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[\n",
" GEOID_TRACT_FIELD_NAME\n",
"].str.replace(\".\", \"\", regex=False)"
]
},
{
@ -139,15 +147,9 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
" GEOID_TRACT_FIELD_NAME,\n",
" 'ur',\n",
" 'ua',\n",
" 'cntyname',\n",
" 'uaname',\n",
" 'pop10',\n",
" 'afact'\n",
"]]"
"geocorr_urban_rural_map = geocorr_urban_rural_map[\n",
" [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\", \"cntyname\", \"uaname\", \"pop10\", \"afact\"]\n",
"]"
]
},
{
@ -165,7 +167,9 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
"geocorr_urban_rural_map.groupby(\n",
" [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\"], dropna=False\n",
").size().sort_values(ascending=False)"
]
},
{
@ -175,7 +179,9 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
"geocorr_urban_rural_map.loc[\n",
" geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == \"36117020302\"\n",
"]"
]
},
{
@ -185,8 +191,12 @@
"metadata": {},
"outputs": [],
"source": [
"total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
"total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
"total_geo_population = (\n",
" geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME)\n",
" .agg({\"pop10\": np.sum})\n",
" .reset_index()\n",
")\n",
"total_geo_population.rename(columns={\"pop10\": \"total_population\"}, inplace=True)\n",
"total_geo_population.head()"
]
},
@ -197,8 +207,16 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
"geocorr_urban_rural_with_total_pop_map = (\n",
" geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, \"ur\"])\n",
" .agg({\"pop10\": np.sum})\n",
" .reset_index()\n",
")\n",
"geocorr_urban_rural_with_total_pop_map = (\n",
" geocorr_urban_rural_with_total_pop_map.merge(\n",
" total_geo_population, how=\"inner\", on=GEOID_TRACT_FIELD_NAME\n",
" )\n",
")\n",
"geocorr_urban_rural_with_total_pop_map.head()"
]
},
@ -209,7 +227,10 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
"geocorr_urban_rural_with_total_pop_map[\"afact\"] = (\n",
" geocorr_urban_rural_with_total_pop_map[\"pop10\"]\n",
" / geocorr_urban_rural_with_total_pop_map[\"total_population\"]\n",
")"
]
},
{
@ -229,7 +250,10 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
"geocorr_urban_rural_with_total_pop_map.loc[\n",
" geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME]\n",
" == \"01001020200\"\n",
"]"
]
},
{
@ -239,12 +263,16 @@
"metadata": {},
"outputs": [],
"source": [
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
"urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n",
" index=GEOID_TRACT_FIELD_NAME, columns=\"ur\", values=[\"pop10\", \"afact\"]\n",
")\n",
"urban_rural_map.columns = [\n",
" \"_\".join(col).strip() for col in urban_rural_map.columns.values\n",
"]\n",
"urban_rural_map.reset_index(inplace=True)\n",
"urban_rural_map['urban_heuristic_flag'] = 0\n",
"mask = urban_rural_map['afact_U'] >= 0.5\n",
"urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
"urban_rural_map[\"urban_heuristic_flag\"] = 0\n",
"mask = urban_rural_map[\"afact_U\"] >= 0.5\n",
"urban_rural_map.loc[mask, \"urban_heuristic_flag\"] = 1"
]
},
{
@ -256,12 +284,13 @@
"source": [
"urban_rural_map.rename(\n",
" columns={\n",
" 'pop10_R': 'population_in_rural_areas',\n",
" 'pop10_U': 'population_in_urban_areas',\n",
" 'afact_R': 'perc_population_in_rural_areas',\n",
" 'afact_U': 'perc_population_in_urban_areas',\n",
" }, \n",
" inplace=True)"
" \"pop10_R\": \"population_in_rural_areas\",\n",
" \"pop10_U\": \"population_in_urban_areas\",\n",
" \"afact_R\": \"perc_population_in_rural_areas\",\n",
" \"afact_U\": \"perc_population_in_urban_areas\",\n",
" },\n",
" inplace=True,\n",
")"
]
},
{