Score F, testing methodology (#510)

* fixing dependency issue

* fixing more dependencies

* including fraction of state AMI

* wip

* nitpick whitespace

* etl working now

* wip on scoring

* fix rename error

* reducing metrics

* fixing score f

* fixing readme

* adding dependency

* passing tests

* linting/black

* removing unnecessary sample

* fixing error

* adding verify flag on etl/base

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
Lucas Merrill Brown 2021-08-24 15:40:54 -05:00 committed by GitHub
commit 65ceb7900f
23 changed files with 557 additions and 153 deletions


@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Optional
from data_pipeline.config import settings
from data_pipeline.utils import unzip_file_from_url, remove_all_from_dir
@@ -33,14 +34,21 @@ class ExtractTransformLoad:
pass
def extract(self, source_url: str = None, extract_path: Path = None) -> None:
def extract(
self,
source_url: str = None,
extract_path: Path = None,
verify: Optional[bool] = True,
) -> None:
"""Extract the data from
a remote source. By default it provides code to get the file from a source url,
unzips it and stores it on an extract_path."""
# this can be accessed via super().extract()
if source_url and extract_path:
unzip_file_from_url(source_url, self.TMP_PATH, extract_path)
unzip_file_from_url(
source_url, self.TMP_PATH, extract_path, verify=verify
)
def transform(self) -> None:
"""Transform the data extracted into a format that can be consumed by the


@@ -34,6 +34,11 @@ DATASET_LIST = [
"module_dir": "hud_recap",
"class_name": "HudRecapETL",
},
{
"name": "cdc_places",
"module_dir": "cdc_places",
"class_name": "CDCPlacesETL",
},
]
CENSUS_INFO = {
"name": "census",


@@ -50,6 +50,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_df: pd.DataFrame
self.housing_and_transportation_df: pd.DataFrame
self.hud_housing_df: pd.DataFrame
self.cdc_places_df: pd.DataFrame
def data_sets(self) -> list:
# Define a named tuple that will be used for each data set input.
@@ -81,6 +82,36 @@ class ScoreETL(ExtractTransformLoad):
renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
bucket=None,
),
DataSet(
input_field="Current asthma among adults aged >=18 years",
renamed_field="Current asthma among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Coronary heart disease among adults aged >=18 years",
renamed_field="Coronary heart disease among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Cancer (excluding skin cancer) among adults aged >=18 years",
renamed_field="Cancer (excluding skin cancer) among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Current lack of health insurance among adults aged 18-64 years",
renamed_field="Current lack of health insurance among adults aged 18-64 years",
bucket=None,
),
DataSet(
input_field="Diagnosed diabetes among adults aged >=18 years",
renamed_field="Diagnosed diabetes among adults aged >=18 years",
bucket=None,
),
DataSet(
input_field="Physical health not good for >=14 days among adults aged >=18 years",
renamed_field="Physical health not good for >=14 days among adults aged >=18 years",
bucket=None,
),
# The following data sets have buckets, because they're used in Score C
DataSet(
input_field="CANCER",
@@ -218,6 +249,14 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load CDC Places data
cdc_places_csv = self.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
self.cdc_places_df = pd.read_csv(
cdc_places_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
## IMPORTANT: THIS METHOD IS CLOSE TO THE LIMIT OF STATEMENTS
@@ -247,8 +286,28 @@
)
# Join all the data sources that use census tracts
# TODO: when there's more than one data source using census tract, reduce/merge them here.
census_tract_df = self.hud_housing_df
census_tract_dfs = [
self.hud_housing_df,
self.cdc_places_df,
]
census_tract_df = functools.reduce(
lambda left, right: pd.merge(
left=left,
right=right,
on=self.GEOID_TRACT_FIELD_NAME,
how="outer",
),
census_tract_dfs,
)
# Sanity check the join.
if (
len(census_tract_df[self.GEOID_TRACT_FIELD_NAME].str.len().unique())
!= 1
):
raise ValueError(
f"One of the input CSVs uses {self.GEOID_TRACT_FIELD_NAME} with a different length."
)
# Calculate the tract for the CBG data.
census_block_group_df[
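
The functools.reduce call generalizes the old single-source assignment to an outer merge over any number of tract-level dataframes, and the length check guards against a source whose tract IDs use a different scheme (for example the CHAS data's 14000US-prefixed IDs mentioned in the HUD housing ETL). A toy illustration of the pattern, with made-up values in real column names:

    import functools

    import pandas as pd

    GEOID = "GEOID10_TRACT"  # stand-in for self.GEOID_TRACT_FIELD_NAME
    hud = pd.DataFrame(
        {GEOID: ["01001020100"], "Housing burden (percent)": [0.4]}
    )
    cdc = pd.DataFrame(
        {
            GEOID: ["01001020100"],
            "Diagnosed diabetes among adults aged >=18 years": [11.2],
        }
    )

    merged = functools.reduce(
        lambda left, right: pd.merge(
            left=left, right=right, on=GEOID, how="outer"
        ),
        [hud, cdc],
    )
    # All tract IDs are 11 characters, so the sanity check passes.
    assert merged[GEOID].str.len().unique().tolist() == [11]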
@@ -437,12 +496,56 @@
)
self.df[meets_burden_field_name] = (
self.df["Particulate matter (PM2.5)"] > 10
) | (self.df["Respiratory hazard " "index"] > 0.75)
(self.df["Particulate matter (PM2.5) (percentile)"] > 0.9)
| (self.df["Respiratory hazard index (percentile)"] > 0.9)
| (self.df["Traffic proximity and volume (percentile)"] > 0.9)
| (
self.df[
"Percent pre-1960s housing (lead paint indicator) (percentile)"
]
> 0.9
)
| (self.df["Proximity to RMP sites (percentile)"] > 0.9)
| (
self.df[
"Current asthma among adults aged >=18 years (percentile)"
]
> 0.9
)
| (
self.df[
"Coronary heart disease among adults aged >=18 years (percentile)"
]
> 0.9
)
| (
self.df[
"Cancer (excluding skin cancer) among adults aged >=18 years (percentile)"
]
> 0.9
)
# | (
# self.df[
# "Current lack of health insurance among adults aged 18-64 years (percentile)"
# ]
# > 0.9
# )
| (
self.df[
"Diagnosed diabetes among adults aged >=18 years (percentile)"
]
> 0.9
)
# | (
# self.df[
# "Physical health not good for >=14 days among adults aged >=18 years (percentile)"
# ]
# > 0.9
# )
)
self.df["Score F (communities)"] = (
self.df[ami_and_high_school_field_name]
& self.df[meets_burden_field_name]
self.df[meets_socio_field_name] & self.df[meets_burden_field_name]
)
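
Net effect: a row (census block group) gets Score F when it meets the socioeconomic criteria and crosses the 90th percentile on at least one enumerated burden. A toy illustration of the boolean composition, using just two of the burden columns:

    import pandas as pd

    df = pd.DataFrame(
        {
            "Particulate matter (PM2.5) (percentile)": [0.95, 0.50, 0.92],
            "Respiratory hazard index (percentile)": [0.40, 0.30, 0.91],
            "Meets socioeconomic criteria": [True, True, False],
        }
    )
    df["Meets burden criteria"] = (
        df["Particulate matter (PM2.5) (percentile)"] > 0.9
    ) | (df["Respiratory hazard index (percentile)"] > 0.9)
    df["Score F (communities)"] = (
        df["Meets socioeconomic criteria"] & df["Meets burden criteria"]
    )
    # Row 0 qualifies (socio + PM2.5); row 1 fails every burden test;
    # row 2 exceeds two burdens but misses the socioeconomic criteria.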
def load(self) -> None:
@@ -450,10 +553,4 @@ class ScoreETL(ExtractTransformLoad):
# write nationwide csv
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
# TODO: drop
self.df[0:10000].to_csv(
self.SCORE_CSV_PATH / "usa-10000.csv", index=False
)
self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)


@@ -10,14 +10,19 @@ logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip"
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = (
self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
)
self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
"calenviroscreen_percentile"
)
self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = (
"calenviroscreen_priority_community"
)


@@ -0,0 +1,66 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, download_file_from_url
logger = get_module_logger(__name__)
class CDCPlacesETL(ExtractTransformLoad):
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
self.CDC_GEOID_FIELD_NAME = "LocationID"
self.CDC_VALUE_FIELD_NAME = "Data_Value"
self.CDC_MEASURE_FIELD_NAME = "Measure"
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Starting to download 520MB CDC Places file.")
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.TMP_PATH
/ "cdc_places"
/ "census_tract.csv",
)
self.df = pd.read_csv(
filepath_or_buffer=file_path,
dtype={self.CDC_GEOID_FIELD_NAME: "string"},
low_memory=False,
)
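
Parsing LocationID with a string dtype matters: tract GEOIDs carry leading zeros (Alabama's FIPS prefix is 01), and pandas' default numeric inference would silently strip them. A quick demonstration of the failure mode:

    import io

    import pandas as pd

    raw = "LocationID,Data_Value\n01001020100,11.2\n"
    bad = pd.read_csv(io.StringIO(raw))
    assert bad["LocationID"][0] == 1001020100  # leading zero lost
    good = pd.read_csv(io.StringIO(raw), dtype={"LocationID": "string"})
    assert good["LocationID"][0] == "01001020100"  # preserved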
def transform(self) -> None:
logger.info("Starting CDC Places transform")
# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},
inplace=True,
errors="raise",
)
# Note: Puerto Rico not included.
self.df = self.df.pivot(
index=self.GEOID_TRACT_FIELD_NAME,
columns=self.CDC_MEASURE_FIELD_NAME,
values=self.CDC_VALUE_FIELD_NAME,
)
# Make the index (the census tract ID) a column, not the index.
self.df.reset_index(inplace=True)
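
The pivot reshapes CDC's long format (one row per tract-and-measure pair) into the wide, one-row-per-tract frame the score join expects. A toy version with two of the measures:

    import pandas as pd

    long_df = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100", "01001020100"],
            "Measure": [
                "Current asthma among adults aged >=18 years",
                "Diagnosed diabetes among adults aged >=18 years",
            ],
            "Data_Value": [9.9, 11.2],
        }
    )
    wide_df = long_df.pivot(
        index="GEOID10_TRACT", columns="Measure", values="Data_Value"
    )
    wide_df.reset_index(inplace=True)
    # wide_df: one row, with the tract ID plus one column per measure.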
def load(self) -> None:
logger.info("Saving CDC Places Data")
# Create the output directory if it doesn't exist.
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")
pass


@@ -33,7 +33,9 @@ class CensusETL(ExtractTransformLoad):
self.NATIONAL_CBG_CSV_PATH = self.CSV_BASE_PATH / "us.csv"
self.NATIONAL_CBG_JSON_PATH = self.GEOJSON_BASE_PATH / "us.json"
def _path_for_fips_file(self, fips_code: str, file_type: GeoFileType) -> Path:
def _path_for_fips_file(
self, fips_code: str, file_type: GeoFileType
) -> Path:
"""Get paths for associated geospatial files for the provided FIPS code
Args:
@@ -93,7 +95,9 @@ class CensusETL(ExtractTransformLoad):
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
geojson_file_path = self._path_for_fips_file(fips_code, GeoFileType.GEOJSON)
geojson_file_path = self._path_for_fips_file(
fips_code, GeoFileType.GEOJSON
)
logger.info(f"Checking if {fips_code} geoJSON file exists ")
if not geojson_file_path.is_file():
logger.info(
@@ -176,7 +180,9 @@ class CensusETL(ExtractTransformLoad):
if not self.NATIONAL_CBG_CSV_PATH.is_file():
logger.info(f"Creating {self.NATIONAL_CBG_CSV_PATH}")
with open(self.NATIONAL_CBG_CSV_PATH, mode="w", newline="") as cbg_csv_file:
with open(
self.NATIONAL_CBG_CSV_PATH, mode="w", newline=""
) as cbg_csv_file:
cbg_csv_file_writer = csv.writer(
cbg_csv_file,
delimiter=",",
@@ -205,7 +211,9 @@ class CensusETL(ExtractTransformLoad):
state_gdf = gpd.read_file(file_name)
usa_df = usa_df.append(state_gdf)
usa_df = usa_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
usa_df = usa_df.to_crs(
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
logger.info("Writing national geojson file")
usa_df.to_file(self.NATIONAL_CBG_JSON_PATH, driver="GeoJSON")


@@ -41,10 +41,10 @@ class CensusACSETL(ExtractTransformLoad):
self.STATE_MEDIAN_INCOME_FTP_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/2014_to_2019_state_median_income.zip"
+ "/2015_to_2019_state_median_income.zip"
)
self.STATE_MEDIAN_INCOME_FILE_PATH = (
self.TMP_PATH / "2014_to_2019_state_median_income.csv"
self.TMP_PATH / "2015_to_2019_state_median_income.csv"
)
def _fips_from_censusdata_censusgeo(


@@ -8,9 +8,7 @@ logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
def __init__(self):
self.EJSCREEN_FTP_URL = (
"https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
)
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
self.df: pd.DataFrame
@@ -20,6 +18,7 @@ class EJScreenETL(ExtractTransformLoad):
super().extract(
self.EJSCREEN_FTP_URL,
self.TMP_PATH,
verify=False, # EPA EJScreen end point has certificate issues often
)
def transform(self) -> None:


@@ -35,7 +35,9 @@ class HousingTransportationETL(ExtractTransformLoad):
)
# New file name:
tmp_csv_file_path = zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
tmp_csv_file_path = (
zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
)
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
dfs.append(tmp_df)
@@ -47,9 +49,9 @@ class HousingTransportationETL(ExtractTransformLoad):
# Rename and reformat block group ID
self.df.rename(columns={"blkgrp": self.GEOID_FIELD_NAME}, inplace=True)
self.df[self.GEOID_FIELD_NAME] = self.df[self.GEOID_FIELD_NAME].str.replace(
'"', ""
)
self.df[self.GEOID_FIELD_NAME] = self.df[
self.GEOID_FIELD_NAME
].str.replace('"', "")
def load(self) -> None:
logger.info("Saving Housing and Transportation Data")


@@ -9,16 +9,16 @@ class HudHousingETL(ExtractTransformLoad):
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "hud_housing"
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_FTP_URL = (
"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
)
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip"
self.HOUSING_ZIP_FILE_DIR = self.TMP_PATH / "hud_housing"
# We measure households earning less than 80% of HUD Area Median Family Income by county
# and paying greater than 30% of their income to housing costs.
self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME = "HOUSING_BURDEN_NUMERATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = "HOUSING_BURDEN_DENOMINATOR"
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = (
"HOUSING_BURDEN_DENOMINATOR"
)
# Note: some variable definitions.
# HUD-adjusted median family income (HAMFI).
@@ -55,7 +55,9 @@ class HudHousingETL(ExtractTransformLoad):
)
# Rename and reformat block group ID
self.df.rename(columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True)
self.df.rename(
columns={"geoid": self.GEOID_TRACT_FIELD_NAME}, inplace=True
)
# The CHAS data has census tract ids such as `14000US01001020100`
# Whereas the rest of our data uses, for the same tract, `01001020100`.
@@ -273,7 +275,9 @@ class HudHousingETL(ExtractTransformLoad):
# TODO: add small sample size checks
self.df[self.HOUSING_BURDEN_FIELD_NAME] = self.df[
self.HOUSING_BURDEN_NUMERATOR_FIELD_NAME
].astype(float) / self.df[self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(
].astype(float) / self.df[
self.HOUSING_BURDEN_DENOMINATOR_FIELD_NAME
].astype(
float
)
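
Per the comment at the top of this ETL, the ratio captures the share of households earning under 80% of HAMFI that pay more than 30% of income toward housing. With hypothetical counts for a single tract (the real numerator and denominator are sums over several CHAS table cells):

    import pandas as pd

    df = pd.DataFrame(
        {
            "HOUSING_BURDEN_NUMERATOR": [250],
            "HOUSING_BURDEN_DENOMINATOR": [1000],
        }
    )
    df["Housing burden (percent)"] = df["HOUSING_BURDEN_NUMERATOR"].astype(
        float
    ) / df["HOUSING_BURDEN_DENOMINATOR"].astype(float)
    # -> 0.25 for this hypothetical tract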


@@ -18,7 +18,9 @@ class HudRecapETL(ExtractTransformLoad):
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
# Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = "hud_recap_priority_community"
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
"hud_recap_priority_community"
)
self.df: pd.DataFrame


@@ -8,9 +8,7 @@ logger = get_module_logger(__name__)
class TreeEquityScoreETL(ExtractTransformLoad):
def __init__(self):
self.TES_URL = (
"https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
)
self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
self.TES_CSV = self.TMP_PATH / "tes_2021_data.csv"
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
@@ -78,8 +76,12 @@ class TreeEquityScoreETL(ExtractTransformLoad):
logger.info("Transforming Tree Equity Score Data")
tes_state_dfs = []
for state in self.states:
tes_state_dfs.append(gpd.read_file(f"{self.TMP_PATH}/{state}/{state}.shp"))
self.df = gpd.GeoDataFrame(pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs)
tes_state_dfs.append(
gpd.read_file(f"{self.TMP_PATH}/{state}/{state}.shp")
)
self.df = gpd.GeoDataFrame(
pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
)
def load(self) -> None:
logger.info("Saving Tree Equity Score GeoJSON")


@@ -3,9 +3,6 @@
{
"cell_type": "code",
"execution_count": null,
"id": "0491828b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
@@ -32,30 +29,26 @@
"# Some display settings to make pandas outputs more readable.\n",
"pd.set_option(\"display.expand_frame_repr\", False)\n",
"pd.set_option(\"display.precision\", 2)"
]
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"id": "654f25a1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
]
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"id": "8999cea4",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
" \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
@@ -85,31 +78,33 @@
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
"\n",
"df.head()"
]
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a269bb1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n",
"df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
"\n",
"df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
]
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"id": "91932af5",
"metadata": {},
"source": [],
"outputs": [],
"source": []
"metadata": {}
}
],
"metadata": {
@@ -133,4 +128,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}


@@ -28,7 +28,7 @@
"from datetime import datetime\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
@@ -215,7 +215,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8b795fb4",
"id": "274f6bc6",
"metadata": {},
"outputs": [],
"source": [
@@ -234,6 +234,21 @@
"# (`census_tract_indices`).\n",
"census_block_group_indices = [\n",
" Index(\n",
" method_name=\"Score F\",\n",
" priority_communities_field=\"Score F (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F (socioeconomic only)\",\n",
" priority_communities_field=\"Meets socioeconomic criteria\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F (burden only)\",\n",
" priority_communities_field=\"Meets burden criteria\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score A\",\n",
" priority_communities_field=\"Score A (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
@@ -253,21 +268,21 @@
" priority_communities_field=\"Score D (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (30th percentile)\",\n",
" priority_communities_field=\"Score D (top 30th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (35th percentile)\",\n",
" priority_communities_field=\"Score D (top 35th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (40th percentile)\",\n",
" priority_communities_field=\"Score D (top 40th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"# Index(\n",
"# method_name=\"Score D (30th percentile)\",\n",
"# priority_communities_field=\"Score D (top 30th percentile)\",\n",
"# other_census_tract_fields_to_keep=[],\n",
"# ),\n",
"# Index(\n",
"# method_name=\"Score D (35th percentile)\",\n",
"# priority_communities_field=\"Score D (top 35th percentile)\",\n",
"# other_census_tract_fields_to_keep=[],\n",
"# ),\n",
"# Index(\n",
"# method_name=\"Score D (40th percentile)\",\n",
"# priority_communities_field=\"Score D (top 40th percentile)\",\n",
"# other_census_tract_fields_to_keep=[],\n",
"# ),\n",
" Index(\n",
" method_name=\"Poverty\",\n",
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
@@ -534,7 +549,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d7acf80d",
"id": "eeb9699d",
"metadata": {},
"outputs": [],
"source": [
@@ -682,7 +697,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "777a4623",
"id": "4f44426c",
"metadata": {},
"outputs": [],
"source": [
@@ -1140,14 +1155,6 @@
"\n",
"print(file_paths)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e679502a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {


@@ -98,11 +98,50 @@ def remove_all_dirs_from_dir(dir_path: Path) -> None:
logging.info(f"Removing directory {file_path}")
def download_file_from_url(
file_url: str,
download_file_name: Path,
verify: bool = True,
) -> str:
"""Downloads a file from a remote URL location and returns the file location.
Args:
file_url (str): URL where the zip file is located
download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default)
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an error (optional, default to False)
Returns:
None
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
if not os.path.isdir(download_file_name.parent):
os.mkdir(download_file_name.parent)
logger.info(f"Downloading {file_url}")
response = requests.get(file_url, verify=verify)
if response.status_code == 200:
file_contents = response.content
else:
sys.exit(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
# Write the contents to disk.
with open(download_file_name, "wb") as file:
file.write(file_contents)
return download_file_name
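
A short usage sketch of the new helper (the URL and destination path are placeholders):

    from pathlib import Path

    from data_pipeline.utils import download_file_from_url

    # Download a CSV, skipping certificate verification for a host
    # with known TLS issues.
    saved_path = download_file_from_url(
        file_url="https://example.com/data.csv",  # placeholder
        download_file_name=Path("/tmp/data_pipeline/data.csv"),
        verify=False,
    )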
def unzip_file_from_url(
file_url: str,
download_path: Path,
unzipped_file_path: Path,
verify: bool = False,
verify: bool = True,
) -> None:
"""Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after
@@ -116,23 +155,11 @@ def unzip_file_from_url(
None
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logger.info(f"Downloading {file_url}")
response = requests.get(file_url, verify=verify)
if response.status_code == 200:
file_contents = response.content
else:
sys.exit(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
zip_file_path = download_path / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
zip_file_path = download_file_from_url(
file_url=file_url,
download_file_name=download_path / "downloaded.zip",
verify=verify,
)
logger.info(f"Extracting {zip_file_path}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref: