Imputing income using geographic neighbors (#1559)

Imputes the income field with a light refactor. Needs more refactoring and more tests (I spot-checked). The next ticket will check and address that, but a lot of the "narwhal" architecture is here.
Emma Nechamkin 2022-04-27 15:59:10 -04:00 committed by Emma Nechamkin
parent 218fa48b85
commit f680d867c7
16 changed files with 1245 additions and 81 deletions


@@ -40,7 +40,7 @@ def validate_new_data(
     assert (
         checking_df[score_col].nunique() <= 3
     ), f"Error: there are too many values possible in {score_col}"
-    assert (True in checking_df[score_col].unique()) & (
+    assert (True in checking_df[score_col].unique()) | (
         False in checking_df[score_col].unique()
     ), f"Error: {score_col} should be a boolean"


@@ -26,6 +26,9 @@ fields:
 - score_name: Total population
   label: Total population
   format: float
+- score_name: Percent of individuals below 200% Federal Poverty Line, imputed and adjusted
+  label: Adjusted percent of individuals below 200% Federal Poverty Line
+  format: float
 - score_name: Is low income and has a low percent of higher ed students?
   label: Is low income and high percent of residents that are not higher ed students?
   format: bool


@@ -30,6 +30,9 @@ sheets:
 - score_name: Total population
   label: Total population
   format: float
+- score_name: Percent of individuals below 200% Federal Poverty Line, imputed and adjusted
+  label: Adjusted percent of individuals below 200% Federal Poverty Line
+  format: float
 - score_name: Is low income and has a low percent of higher ed students?
   label: Is low income and high percent of residents that are not higher ed students?
   format: bool


@@ -14,16 +14,6 @@ DATASET_LIST = [
         "module_dir": "tree_equity_score",
         "class_name": "TreeEquityScoreETL",
     },
-    {
-        "name": "census_acs",
-        "module_dir": "census_acs",
-        "class_name": "CensusACSETL",
-    },
-    {
-        "name": "census_acs_2010",
-        "module_dir": "census_acs_2010",
-        "class_name": "CensusACS2010ETL",
-    },
     {
         "name": "census_decennial",
         "module_dir": "census_decennial",
@@ -124,6 +114,17 @@ DATASET_LIST = [
         "module_dir": "maryland_ejscreen",
         "class_name": "MarylandEJScreenETL",
     },
+    # This has to come after us.json exists
+    {
+        "name": "census_acs",
+        "module_dir": "census_acs",
+        "class_name": "CensusACSETL",
+    },
+    {
+        "name": "census_acs_2010",
+        "module_dir": "census_acs_2010",
+        "class_name": "CensusACS2010ETL",
+    },
 ]
 CENSUS_INFO = {


@@ -5,6 +5,9 @@ from data_pipeline.config import settings
 from data_pipeline.score import field_names
+## note: to keep map porting "right" fields, keeping descriptors the same.
 # Base Paths
 DATA_PATH = Path(settings.APP_ROOT) / "data"
 TMP_PATH = DATA_PATH / "tmp"
@@ -179,6 +182,8 @@ TILES_SCORE_COLUMNS = {
     + field_names.PERCENTILE_FIELD_SUFFIX: "P100_PFS",
     field_names.POVERTY_LESS_THAN_200_FPL_FIELD
     + field_names.PERCENTILE_FIELD_SUFFIX: "P200_PFS",
+    field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "P200_I_PFS",
     field_names.LEAD_PAINT_FIELD
     + field_names.PERCENTILE_FIELD_SUFFIX: "LPF_PFS",
     field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX: "NPL_PFS",
@@ -198,7 +203,8 @@ TILES_SCORE_COLUMNS = {
     field_names.M_HOUSING: "M_HSG",
     field_names.M_POLLUTION: "M_PLN",
     field_names.M_HEALTH: "M_HLTH",
-    field_names.SCORE_M_COMMUNITIES: "SM_C",
+    # temporarily update this so that it's the Narwhal score that gets visualized on the map
+    field_names.SCORE_N_COMMUNITIES: "SM_C",
     field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
     field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
     field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
@@ -283,7 +289,7 @@ TILES_SCORE_COLUMNS = {
     ## Low high school and low higher ed for t&wd
     field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
     ## FPL 200 and low higher ed for all others
-    field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES: "M_EBSI",
+    field_names.FPL_200_SERIES: "M_EBSI",
 }
 # columns to round floats to 2 decimals
@@ -311,6 +317,8 @@ TILES_SCORE_FLOAT_COLUMNS = [
     + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.POVERTY_LESS_THAN_200_FPL_FIELD
     + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
@@ -332,7 +340,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
     field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD,
     field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
     field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
-    field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.COLLEGE_NON_ATTENDANCE_FIELD,
     field_names.COLLEGE_ATTENDANCE_FIELD,
 ]


@@ -405,6 +405,7 @@ class ScoreETL(ExtractTransformLoad):
             df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD]
         )
+        # QQ: why don't we just filter to the numeric columns by type?
         numeric_columns = [
             field_names.HOUSING_BURDEN_FIELD,
             field_names.TOTAL_POP_FIELD,
@@ -458,6 +459,7 @@
             field_names.IMPENETRABLE_SURFACES_FIELD,
             # We have to pass this boolean here in order to include it in ag value loss percentiles.
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
+            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
         ]
         non_numeric_columns = [
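
The QQ comment in the hunk above asks why the numeric columns are listed by hand rather than selected by dtype. A minimal sketch of that alternative (the column names here are made up, not the pipeline's real fields):

    import pandas as pd

    df = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100"],
            "housing_burden": [0.30],
            "total_population": [1500],
        }
    )
    # Select int/float columns by dtype instead of an explicit name list.
    numeric_columns = df.select_dtypes(include="number").columns.tolist()
    # -> ['housing_burden', 'total_population']

One reason to keep the explicit list is visible in the same hunk: booleans such as AGRICULTURAL_VALUE_BOOL_FIELD are deliberately opted into the percentile columns, and a pure dtype filter would exclude them.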


@@ -29,7 +29,7 @@ from . import constants
 logger = get_module_logger(__name__)
 # Define the DAC variable
-DISADVANTAGED_COMMUNITIES_FIELD = field_names.SCORE_M_COMMUNITIES
+DISADVANTAGED_COMMUNITIES_FIELD = field_names.SCORE_N_COMMUNITIES
 class PostScoreETL(ExtractTransformLoad):


@@ -1,14 +1,26 @@
+from collections import namedtuple
+import os
 import pandas as pd
+import geopandas as gpd
+from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census_acs.etl_utils import (
     retrieve_census_acs_data,
 )
-from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.sources.census_acs.etl_imputations import (
+    calculate_income_measures,
+)
+from data_pipeline.utils import get_module_logger, unzip_file_from_url
 from data_pipeline.score import field_names
 logger = get_module_logger(__name__)
+# because now there is a requirement for the us.json, this will port from
+# AWS when a local copy does not exist.
+CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip"
 class CensusACSETL(ExtractTransformLoad):
     def __init__(self):
@@ -59,6 +71,23 @@ class CensusACSETL(ExtractTransformLoad):
         self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
             "Percent of individuals < 200% Federal Poverty Line"
         )
+        self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
+            "Percent of individuals < 200% Federal Poverty Line, imputed"
+        )
+        self.ADJUSTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
+            "Adjusted percent of individuals < 200% Federal Poverty Line"
+        )
+        self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME_PRELIMINARY = (
+            "Preliminary adjusted percent of individuals < 200% Federal Poverty Line,"
+            + " imputed"
+        )
+        self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
+            "Adjusted percent of individuals < 200% Federal Poverty Line,"
+            + " imputed"
+        )
         self.MEDIAN_HOUSE_VALUE_FIELD = "B25077_001E"
         self.MEDIAN_HOUSE_VALUE_FIELD_NAME = (
@@ -136,6 +165,10 @@
             "Percent enrollment in college or graduate school"
         )
+        self.IMPUTED_COLLEGE_ATTENDANCE_FIELD = (
+            "Percent enrollment in college or graduate school, imputed"
+        )
         self.COLLEGE_NON_ATTENDANCE_FIELD = "Percent of population not currently enrolled in college or graduate school"
         self.RE_FIELDS = [
@@ -188,18 +221,50 @@
                 self.MEDIAN_INCOME_FIELD_NAME,
                 self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
                 self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
-                self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
+                self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
                 self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
                 self.HIGH_SCHOOL_ED_FIELD,
                 self.COLLEGE_ATTENDANCE_FIELD,
                 self.COLLEGE_NON_ATTENDANCE_FIELD,
+                self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
             ]
             + self.RE_OUTPUT_FIELDS
             + [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
+            + [
+                field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
+                field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
+            ]
         )
         self.df: pd.DataFrame
+    def _merge_geojson(
+        self,
+        df: pd.DataFrame,
+        usa_geo_df: gpd.GeoDataFrame,
+        geoid_field: str = "GEOID10",
+        geometry_field: str = "geometry",
+        state_code_field: str = "STATEFP10",
+        county_code_field: str = "COUNTYFP10",
+    ) -> gpd.GeoDataFrame:
+        usa_geo_df[geoid_field] = (
+            usa_geo_df[geoid_field].astype(str).str.zfill(11)
+        )
+        return gpd.GeoDataFrame(
+            df.merge(
+                usa_geo_df[
+                    [
+                        geoid_field,
+                        geometry_field,
+                        state_code_field,
+                        county_code_field,
+                    ]
+                ],
+                left_on=[self.GEOID_TRACT_FIELD_NAME],
+                right_on=[geoid_field],
+            )
+        )
     def extract(self) -> None:
         # Define the variables to retrieve
         variables = (
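
The str.zfill(11) in _merge_geojson above guards against tract GEOIDs that lost their leading zero (states with FIPS codes below 10) somewhere upstream, e.g. by being parsed as integers. A tiny sketch with a made-up Alabama tract ID:

    import pandas as pd

    geoids = pd.Series([1001020100]).astype(str)
    geoids = geoids.str.zfill(11)
    # "1001020100" (10 digits) -> "01001020100" (11 digits)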
@@ -227,6 +292,27 @@
         df = self.df
+        # Here we join the geometry of the US to the dataframe so that we can impute
+        # The income of neighbors. first this looks locally; if there's no local
+        # geojson file for all of the US, this will read it off of S3
+        logger.info("Reading in geojson for the country")
+        if not os.path.exists(
+            self.DATA_PATH / "census" / "geojson" / "us.json"
+        ):
+            logger.info("Fetching Census data from AWS S3")
+            unzip_file_from_url(
+                CENSUS_DATA_S3_URL,
+                self.DATA_PATH / "tmp",
+                self.DATA_PATH,
+            )
+        geo_df = gpd.read_file(
+            self.DATA_PATH / "census" / "geojson" / "us.json"
+        )
+        df = self._merge_geojson(
+            df=df,
+            usa_geo_df=geo_df,
+        )
         # Rename two fields.
         df = df.rename(
             columns={
@@ -349,7 +435,7 @@
             df["B03003_003E"] / df["B03003_001E"]
         )
-        # Calculate college attendance:
+        # Calculate college attendance and adjust low income
         df[self.COLLEGE_ATTENDANCE_FIELD] = (
             df[self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PUBLIC]
             + df[self.COLLEGE_ATTENDANCE_MALE_ENROLLED_PRIVATE]
@@ -361,22 +447,64 @@
             1 - df[self.COLLEGE_ATTENDANCE_FIELD]
         )
-        # strip columns
-        df = df[self.COLUMNS_TO_KEEP]
-        # Save results to self.
-        self.df = df
-        # rename columns to be used in score
-        rename_fields = {
-            "Percent of individuals < 200% Federal Poverty Line": field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
-        }
-        self.df.rename(
-            columns=rename_fields,
-            inplace=True,
-            errors="raise",
-        )
+        # we impute income for both income measures
+        ## TODO: Convert to pydantic for clarity
+        logger.info("Imputing income information")
+        ImputeVariables = namedtuple(
+            "ImputeVariables", ["raw_field_name", "imputed_field_name"]
+        )
+        df = calculate_income_measures(
+            impute_var_named_tup_list=[
+                ImputeVariables(
+                    raw_field_name=self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
+                    imputed_field_name=self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
+                ),
+                ImputeVariables(
+                    raw_field_name=self.COLLEGE_ATTENDANCE_FIELD,
+                    imputed_field_name=self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
+                ),
+            ],
+            geo_df=df,
+            geoid_field=self.GEOID_TRACT_FIELD_NAME,
+        )
+        logger.info("Calculating with imputed values")
+        df[
+            self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
+        ] = (
+            df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME].fillna(
+                df[self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME]
+            )
+            - df[self.COLLEGE_ATTENDANCE_FIELD].fillna(
+                df[self.IMPUTED_COLLEGE_ATTENDANCE_FIELD]
+            )
+        ).clip(
+            lower=0
+        )
+        # All values should have a value at this point
+        assert (
+            df[
+                self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
+            ]
+            .isna()
+            .sum()
+            == 0
+        ), "Error: not all values were filled..."
+        logger.info("Renaming columns...")
+        df = df.rename(
+            columns={
+                self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME: field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
+                self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME: field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
+            }
+        )
+        # Strip columns and save results to self.
+        self.df = df[self.COLUMNS_TO_KEEP]
     def load(self) -> None:
         logger.info("Saving Census ACS Data")


@@ -0,0 +1,127 @@
from typing import List, NamedTuple
import pandas as pd
import geopandas as gpd
import numpy as np
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
def _get_fips_mask(
geo_df: gpd.GeoDataFrame,
row: gpd.GeoSeries,
fips_digits: int,
geoid_field: str = "GEOID10_TRACT",
) -> pd.Series:
return (
geo_df[geoid_field].str[:fips_digits] == row[geoid_field][:fips_digits]
)
def _get_neighbor_mask(
geo_df: gpd.GeoDataFrame, row: gpd.GeoSeries
) -> pd.Series:
return geo_df["geometry"].touches(row["geometry"])
def _choose_best_mask(
geo_df: gpd.GeoDataFrame,
masks_in_priority_order: List[pd.Series],
column_to_impute: str,
) -> pd.Series:
for mask in masks_in_priority_order:
if any(geo_df[mask][column_to_impute].notna()):
return mask
raise Exception("No mask found")
def _prepare_dataframe_for_imputation(
impute_var_named_tup_list: List[NamedTuple],
geo_df: gpd.GeoDataFrame,
geoid_field: str = "GEOID10_TRACT",
) -> tuple[list, gpd.GeoDataFrame]:
imputing_cols = [
impute_var_pair.raw_field_name
for impute_var_pair in impute_var_named_tup_list
]
# prime column to exist
for impute_var_pair in impute_var_named_tup_list:
geo_df[impute_var_pair.imputed_field_name] = geo_df[
impute_var_pair.raw_field_name
].copy()
# generate a list of tracts for which at least one of the imputation
# columns is null
tract_list = geo_df[geo_df[imputing_cols].isna().any(axis=1)][
geoid_field
].unique()
# Check that imputation is a valid choice for this set of fields
logger.info(f"Imputing values for {len(tract_list)} unique tracts.")
assert len(tract_list) > 0, "Error: No missing values to impute"
return tract_list, geo_df
def calculate_income_measures(
impute_var_named_tup_list: list,
geo_df: gpd.GeoDataFrame,
geoid_field: str,
) -> pd.DataFrame:
"""Impute values based on geographic neighbors
We only want to check neighbors a single time, so all variables
that we impute get imputed here.
Takes in:
required:
impute_var_named_tup_list: list of named tuples (imputed field, raw field)
geo_df: geo dataframe that already has the census shapefiles merged
geoid field: tract level ID
Returns: non-geometry pd.DataFrame
"""
# Determine where to impute variables and fill a column with nulls
tract_list, geo_df = _prepare_dataframe_for_imputation(
impute_var_named_tup_list=impute_var_named_tup_list,
geo_df=geo_df,
geoid_field=geoid_field,
)
# Iterate through the dataframe to impute in place
for index, row in geo_df.iterrows():
if row[geoid_field] in tract_list:
neighbor_mask = _get_neighbor_mask(geo_df, row)
county_mask = _get_fips_mask(
geo_df=geo_df, row=row, fips_digits=5, geoid_field=geoid_field
)
state_mask = _get_fips_mask(
geo_df=geo_df, row=row, fips_digits=2, geoid_field=geoid_field
)
# Impute fields for every row missing at least one value using the best possible set of neighbors
# Note that later, we will pull raw.fillna(imputed), so the mechanics of this step aren't critical
for impute_var_pair in impute_var_named_tup_list:
mask_to_use = _choose_best_mask(
geo_df=geo_df,
masks_in_priority_order=[
neighbor_mask,
county_mask,
state_mask,
],
column_to_impute=impute_var_pair.raw_field_name,
)
geo_df.loc[index, impute_var_pair.imputed_field_name] = geo_df[
mask_to_use
][impute_var_pair.raw_field_name].mean()
logger.info("Casting geodataframe as a typical dataframe")
# get rid of the geometry column and cast as a typical df
df = pd.DataFrame(
geo_df[[col for col in geo_df.columns if col != "geometry"]]
)
# finally, return the df
return df
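
The core of the new module is _choose_best_mask: masks are tried in priority order (touching neighbors, then same county, then same state) and the first mask that covers any non-null value wins. A minimal sketch of that fallback with toy geometries (the GEOIDs, values, and column name are made up):

    import geopandas as gpd
    from shapely.geometry import box

    geo_df = gpd.GeoDataFrame(
        {
            "GEOID10_TRACT": ["36061000100", "36061000200", "36047000100"],
            "income_measure": [None, 0.42, 0.55],
            "geometry": [box(0, 0, 1, 1), box(1, 0, 2, 1), box(5, 0, 6, 1)],
        }
    )
    row = geo_df.iloc[0]  # the tract with a missing value
    neighbor_mask = geo_df["geometry"].touches(row["geometry"])
    county_mask = geo_df["GEOID10_TRACT"].str[:5] == row["GEOID10_TRACT"][:5]
    # The neighbor mask already covers a tract with data (the second box,
    # which shares an edge), so the county and state fallbacks are not needed:
    imputed_value = geo_df[neighbor_mask]["income_measure"].mean()  # 0.42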


@@ -4,6 +4,7 @@ from typing import List
 import censusdata
 import pandas as pd
+from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger


@@ -56,6 +56,19 @@ M_HEALTH = "Health Factor (Definition M)"
 M_WORKFORCE = "Workforce Factor (Definition M)"
 M_NON_WORKFORCE = "Any Non-Workforce Factor (Definition M)"
+# Definition Narwhal fields
+SCORE_N = "Definition N"
+SCORE_N_COMMUNITIES = "Definition N (communities)"
+N_CLIMATE = "Climate Factor (Definition N)"
+N_ENERGY = "Energy Factor (Definition N)"
+N_TRANSPORTATION = "Transportation Factor (Definition N)"
+N_HOUSING = "Housing Factor (Definition N)"
+N_POLLUTION = "Pollution Factor (Definition N)"
+N_WATER = "Water Factor (Definition N)"
+N_HEALTH = "Health Factor (Definition N)"
+N_WORKFORCE = "Workforce Factor (Definition N)"
+N_NON_WORKFORCE = "Any Non-Workforce Factor (Definition N)"
 PERCENTILE = 90
 MEDIAN_HOUSE_VALUE_PERCENTILE = 90
@@ -93,9 +106,19 @@ HEALTH_SOCIO_INDICATORS_EXCEEDED = (
 # Poverty / Income
 POVERTY_FIELD = "Poverty (Less than 200% of federal poverty line)"
+# this is the raw, unadjusted variable
 POVERTY_LESS_THAN_200_FPL_FIELD = (
     "Percent of individuals below 200% Federal Poverty Line"
 )
+# this is for use in the donuts
+ADJUSTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
+    "Adjusted percent of individuals < 200% Federal Poverty Line"
+)
+# this is what gets used in the score
+POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD = "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted"
 POVERTY_LESS_THAN_150_FPL_FIELD = (
     "Percent of individuals < 150% Federal Poverty Line"
 )
@@ -412,6 +435,7 @@ SCORE_M_LOW_INCOME_SUFFIX = (
     ", is low income, and has a low percent of higher ed students"
 )
+
 COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD = (
     "Percent higher ed enrollment rate is less than 20%"
 )
@@ -651,6 +675,7 @@ THRESHOLD_COUNT = "Total threshold criteria exceeded"
 CATEGORY_COUNT = "Total categories exceeded"
 FPL_200_SERIES = "Is low income?"
+FPL_200_SERIES_IMPUTED_AND_ADJUSTED = "Is low income (imputed and adjusted)?"
 FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
     "Is low income and has a low percent of higher ed students?"
 )


@@ -0,0 +1,808 @@
from typing import Tuple
import numpy as np
import pandas as pd
from data_pipeline.score.score import Score
import data_pipeline.score.field_names as field_names
from data_pipeline.utils import get_module_logger
import data_pipeline.etl.score.constants as constants
logger = get_module_logger(__name__)
class ScoreNarwhal(Score):
"""Very similar to Score M, at present."""
def __init__(self, df: pd.DataFrame) -> None:
self.LOW_INCOME_THRESHOLD: float = 0.65
self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
super().__init__(df)
def _combine_island_areas_with_states_and_set_thresholds(
self,
df: pd.DataFrame,
column_from_island_areas: str,
column_from_decennial_census: str,
combined_column_name: str,
threshold_cutoff_for_island_areas: float,
) -> Tuple[pd.DataFrame, str]:
"""Steps to set thresholds for island areas.
This function is fairly logically complicated. It takes the following steps:
1. Combine the two different fields into a single field.
2. Calculate the 90th percentile for the combined field.
3. Create a boolean series that is true for any census tract in the island
areas (and only the island areas) that exceeds this percentile.
For step one, it combines data that is either the island area's Decennial Census
value in 2009 or the state's value in 5-year ACS ending in 2010.
This will be used to generate the percentile cutoff for the 90th percentile.
The stateside decennial census stopped asking economic comparisons,
so this is as close to apples-to-apples as we get. We use 5-year ACS for data
robustness over 1-year ACS.
"""
# Create the combined field.
# TODO: move this combined field percentile calculation to `etl_score`,
# since most other percentile logic is there.
# There should only be one entry in either 2009 or 2019 fields, not one in both.
# But just to be safe, we take the mean and ignore null values so if there
# *were* entries in both, this result would make sense.
df[combined_column_name] = df[
[column_from_island_areas, column_from_decennial_census]
].mean(axis=1, skipna=True)
# Create a percentile field for use in the Islands / PR visualization
# TODO: move this code
# In the code below, percentiles are constructed based on the combined column
# of census and island data, but only reported for the island areas (where there
# is no other comprehensive percentile information)
return_series_name = (
column_from_island_areas
+ field_names.ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
)
df[return_series_name] = np.where(
df[column_from_decennial_census].isna(),
df[combined_column_name].rank(pct=True),
np.nan,
)
threshold_column_name = (
f"{column_from_island_areas} exceeds "
f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
)
df[threshold_column_name] = (
df[return_series_name] >= threshold_cutoff_for_island_areas
)
return df, threshold_column_name
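
The three steps in the docstring above boil down to a skipna row-mean plus a percentile rank that is only reported for island rows. A compact standalone sketch (the column names are hypothetical):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "unemployment_islands_2009": [0.12, np.nan, np.nan],
            "unemployment_states_2010": [np.nan, 0.08, 0.05],
        }
    )
    # Step 1: each row has a value in only one column, so the skipna mean
    # simply picks whichever value is present.
    df["combined"] = df[
        ["unemployment_islands_2009", "unemployment_states_2010"]
    ].mean(axis=1, skipna=True)
    # Steps 2-3: rank over the combined column, reported only where the
    # stateside column is null (i.e., the island rows).
    df["island_pctile"] = np.where(
        df["unemployment_states_2010"].isna(),
        df["combined"].rank(pct=True),
        np.nan,
    )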
def _increment_total_eligibility_exceeded(
self, columns_for_subset: list, skip_fips: tuple = ()
) -> None:
"""
Increments the total eligible factors for a given tract
The new skip_fips argument specifies which (if any) fips codes to
skip over for incrementing.
This allows us to essentially skip data we think is of limited veracity,
without overriding any values in the data.
THIS IS A TEMPORARY FIX.
"""
if skip_fips:
self.df[field_names.THRESHOLD_COUNT] += np.where(
self.df[field_names.GEOID_TRACT_FIELD].str.startswith(
skip_fips
),
0,
self.df[columns_for_subset].sum(axis=1, skipna=True),
)
else:
self.df[field_names.THRESHOLD_COUNT] += self.df[
columns_for_subset
].sum(axis=1, skipna=True)
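
The skip_fips branch above zeroes the increment for tracts whose GEOID starts with any skipped FIPS prefix, leaving the underlying indicator columns untouched. A small sketch with made-up tract IDs (the "72" prefix, Puerto Rico, is just an illustrative choice):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "GEOID10_TRACT": ["72001040100", "06037206050"],
            "factor_a": [True, True],
            "factor_b": [False, True],
        }
    )
    skip_fips = ("72",)
    increment = np.where(
        df["GEOID10_TRACT"].str.startswith(skip_fips),
        0,
        df[["factor_a", "factor_b"]].sum(axis=1, skipna=True),
    )
    # -> [0, 2]: the skipped tract contributes nothing to THRESHOLD_COUNT.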
def _climate_factor(self) -> bool:
# In Xth percentile or above for FEMAs Risk Index (Source: FEMA
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and there is low higher ed attendance
# Source: Census's American Community Survey
climate_eligibility_columns = [
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
]
self.df[
field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD
] = (
self.df[
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[
field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD
] = (
self.df[
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD] = (
self.df[
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.CLIMATE_THRESHOLD_EXCEEDED] = (
self.df[
field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD
]
| self.df[
field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD
]
| self.df[
field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD
]
)
self.df[field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD] = (
self.df[
field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD
]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD] = (
self.df[
field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD
]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD] = (
self.df[field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
climate_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[climate_eligibility_columns].any(axis="columns")
def _energy_factor(self) -> bool:
# In Xth percentile or above for DOEs energy cost burden score (Source: LEAD Score)
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and has low higher ed attendance.
# Source: Census's American Community Survey
energy_eligibility_columns = [
field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
]
self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD] = (
self.df[
field_names.ENERGY_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD] = (
self.df[
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.ENERGY_THRESHOLD_EXCEEDED] = (
self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD]
| self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD]
)
self.df[field_names.PM25_EXPOSURE_LOW_INCOME_FIELD] = (
self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.ENERGY_BURDEN_LOW_INCOME_FIELD] = (
self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
energy_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[energy_eligibility_columns].any(axis="columns")
def _transportation_factor(self) -> bool:
# In Xth percentile or above for diesel particulate matter (Source: EPA National Air Toxics Assessment (NATA)
# or
# In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)]
# or
# In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and has a low percent of higher ed students.
# Source: Census's American Community Survey
transportion_eligibility_columns = [
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
]
self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] = (
self.df[
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] = (
self.df[
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.TRAFFIC_THRESHOLD_EXCEEDED] = (
self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD]
| self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD]
)
self.df[field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD] = (
self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD] = (
self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
transportion_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[transportion_eligibility_columns].any(axis="columns")
def _housing_factor(self) -> bool:
# (
# In Xth percentile or above for lead paint (Source: Census's American Community Surveys
# percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
# AND
# In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
# )
# or
# In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and has a low percent of higher ed students.
# Source: Census's American Community Survey
housing_eligibility_columns = [
field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
]
self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD] = (
self.df[
field_names.LEAD_PAINT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
) & (
self.df[
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
<= self.MEDIAN_HOUSE_VALUE_THRESHOLD
)
self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD] = (
self.df[
field_names.HOUSING_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.HOUSING_THREHSOLD_EXCEEDED] = (
self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD]
| self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD]
)
# series by series indicators
self.df[field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD] = (
self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.HOUSING_BURDEN_LOW_INCOME_FIELD] = (
self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
housing_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[housing_eligibility_columns].any(axis="columns")
def _pollution_factor(self) -> bool:
# Proximity to Risk Management Plan sites is > X
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and has a low percent of higher ed students.
# Source: Census's American Community Survey
pollution_eligibility_columns = [
field_names.RMP_LOW_INCOME_FIELD,
field_names.SUPERFUND_LOW_INCOME_FIELD,
field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
]
self.df[field_names.RMP_PCTILE_THRESHOLD] = (
self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.NPL_PCTILE_THRESHOLD] = (
self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.TSDF_PCTILE_THRESHOLD] = (
self.df[
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = (
self.df[field_names.RMP_PCTILE_THRESHOLD]
| self.df[field_names.NPL_PCTILE_THRESHOLD]
) | self.df[field_names.TSDF_PCTILE_THRESHOLD]
# individual series-by-series
self.df[field_names.RMP_LOW_INCOME_FIELD] = (
self.df[field_names.RMP_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.SUPERFUND_LOW_INCOME_FIELD] = (
self.df[field_names.NPL_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD] = (
self.df[field_names.TSDF_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
pollution_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[pollution_eligibility_columns].any(axis="columns")
def _water_factor(self) -> bool:
# In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model)
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and has a low percent of higher ed students
# Source: Census's American Community Survey
self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = (
self.df[
field_names.WASTEWATER_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
# Straight copy here in case we add additional water fields.
self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
field_names.WASTEWATER_PCTILE_THRESHOLD
].copy()
self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD] = (
self.df[field_names.WASTEWATER_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD],
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD]
def _health_factor(self) -> bool:
# In Xth percentile or above for diabetes (Source: CDC Places)
# or
# In Xth percentile or above for asthma (Source: CDC Places)
# or
# In Xth percentile or above for heart disease
# or
# In Xth percentile or above for low life expectancy (Source: CDC Places)
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
# poverty level and has a low percent of higher ed students
# Source: Census's American Community Survey
health_eligibility_columns = [
field_names.DIABETES_LOW_INCOME_FIELD,
field_names.ASTHMA_LOW_INCOME_FIELD,
field_names.HEART_DISEASE_LOW_INCOME_FIELD,
field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
]
self.df[field_names.DIABETES_PCTILE_THRESHOLD] = (
self.df[
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.ASTHMA_PCTILE_THRESHOLD] = (
self.df[
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD] = (
self.df[
field_names.HEART_DISEASE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD] = (
self.df[
field_names.LOW_LIFE_EXPECTANCY_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.HEALTH_THRESHOLD_EXCEEDED] = (
(
self.df[field_names.DIABETES_PCTILE_THRESHOLD]
| self.df[field_names.ASTHMA_PCTILE_THRESHOLD]
)
| self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD]
) | self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD]
self.df[field_names.DIABETES_LOW_INCOME_FIELD] = (
self.df[field_names.DIABETES_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.ASTHMA_LOW_INCOME_FIELD] = (
self.df[field_names.ASTHMA_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.HEART_DISEASE_LOW_INCOME_FIELD] = (
self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD] = (
self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
health_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
)
return self.df[health_eligibility_columns].any(axis="columns")
def _workforce_factor(self) -> bool:
# Where unemployment is above Xth percentile
# or
# Where median income as a percent of area median income is above Xth percentile
# or
# Where the percent of households at or below 100% of the federal poverty level
# is above Xth percentile
# or
# Where linguistic isolation is above Xth percentile
# AND
# Where the high school degree achievement rates for adults 25 years and older
# is less than Y%
# AND the higher ed attendance rates are under Z%
# (necessary to screen out university tracts)
# Workforce criteria for states fields.
workforce_eligibility_columns = [
field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
]
self.df[field_names.LOW_HS_EDUCATION_FIELD] = (
self.df[field_names.HIGH_SCHOOL_ED_FIELD]
>= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
)
self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD] = (
self.df[
field_names.UNEMPLOYMENT_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] = (
self.df[
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD] = (
self.df[
field_names.LINGUISTIC_ISO_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.POVERTY_PCTILE_THRESHOLD] = (
self.df[
field_names.POVERTY_LESS_THAN_100_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD] = (
self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD]
& self.df[field_names.LOW_HS_EDUCATION_FIELD]
)
self.df[field_names.POVERTY_LOW_HS_EDUCATION_FIELD] = (
self.df[field_names.POVERTY_PCTILE_THRESHOLD]
& self.df[field_names.LOW_HS_EDUCATION_FIELD]
)
self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD] = (
self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
& self.df[field_names.LOW_HS_EDUCATION_FIELD]
)
self.df[field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD] = (
self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD]
& self.df[field_names.LOW_HS_EDUCATION_FIELD]
)
workforce_combined_criteria_for_states = self.df[
workforce_eligibility_columns
].any(axis="columns")
self._increment_total_eligibility_exceeded(
workforce_eligibility_columns
)
# Now, calculate workforce criteria for island territories.
island_areas_workforce_eligibility_columns = [
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
]
# First, combine unemployment.
# This will include an adjusted percentile column for the island areas
# to be used by the front end.
(
self.df,
island_areas_unemployment_criteria_field_name,
) = self._combine_island_areas_with_states_and_set_thresholds(
df=self.df,
column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
# TODO: Remove this, it's for checking only
assert (
island_areas_unemployment_criteria_field_name
== field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD
), "Error combining island columns"
# Next, combine poverty.
# This will include an adjusted percentile column for the island areas
# to be used by the front end.
(
self.df,
island_areas_poverty_criteria_field_name,
) = self._combine_island_areas_with_states_and_set_thresholds(
df=self.df,
column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
# TODO: Remove this, it's for checking only
assert (
island_areas_poverty_criteria_field_name
== field_names.ISLAND_POVERTY_PCTILE_THRESHOLD
), "Error combining island columns"
# Also check whether low area median income is 90th percentile or higher
# within the islands.
# Note that because the field for low median does not have to be combined,
# unlike the other fields, we do not need to create a new percentile
# column. This code should probably be refactored when (TODO) we do the big
# refactor.
self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] = (
self.df[
field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] = (
self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
>= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
)
self.df[
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD
] = (
self.df[island_areas_unemployment_criteria_field_name]
& self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
)
self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = (
self.df[island_areas_poverty_criteria_field_name]
& self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
)
self.df[
field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD
] = (
self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
& self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
)
workforce_combined_criteria_for_island_areas = self.df[
island_areas_workforce_eligibility_columns
].any(axis="columns")
self._increment_total_eligibility_exceeded(
island_areas_workforce_eligibility_columns
)
percent_of_island_tracts_highlighted = (
100
* workforce_combined_criteria_for_island_areas.sum()
# Choosing a random column from island areas to calculate the denominator.
/ self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
.notnull()
.sum()
)
logger.info(
f"For workforce criteria in island areas, "
f"{workforce_combined_criteria_for_island_areas.sum()} ("
f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
f"in the column) have a value of TRUE."
)
# Because these criteria are calculated differently for the islands, we also calculate the
# thresholds to pass to the FE slightly differently
self.df[field_names.WORKFORCE_THRESHOLD_EXCEEDED] = (
## First we calculate for the non-island areas
(
(
self.df[field_names.POVERTY_PCTILE_THRESHOLD]
| self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD]
)
| self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
)
| self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD]
) | (
## then we calculate just for the island areas
(
self.df[field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD]
| self.df[field_names.ISLAND_POVERTY_PCTILE_THRESHOLD]
)
| self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
)
# Because of the island complications, we also have to separately calculate the threshold for
# socioeconomic thresholds
self.df[field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED] = (
self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
| self.df[field_names.LOW_HS_EDUCATION_FIELD]
)
# A tract is included if it meets either the states tract criteria or the
# island areas tract criteria.
return (
workforce_combined_criteria_for_states
| workforce_combined_criteria_for_island_areas
)
def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score M")
self.df[field_names.THRESHOLD_COUNT] = 0
# TODO: move this inside of
# `_create_low_income_and_low_college_attendance_threshold`
# and change the return signature of that method.
# Create a standalone field that captures the college attendance boolean
# threshold.
self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = (
self.df[
# UPDATE: Pull the imputed poverty statistic
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.LOW_INCOME_THRESHOLD
)
self.df[field_names.N_CLIMATE] = self._climate_factor()
self.df[field_names.N_ENERGY] = self._energy_factor()
self.df[field_names.N_TRANSPORTATION] = self._transportation_factor()
self.df[field_names.N_HOUSING] = self._housing_factor()
self.df[field_names.N_POLLUTION] = self._pollution_factor()
self.df[field_names.N_WATER] = self._water_factor()
self.df[field_names.N_HEALTH] = self._health_factor()
self.df[field_names.N_WORKFORCE] = self._workforce_factor()
factors = [
field_names.N_CLIMATE,
field_names.N_ENERGY,
field_names.N_TRANSPORTATION,
field_names.N_HOUSING,
field_names.N_POLLUTION,
field_names.N_WATER,
field_names.N_HEALTH,
field_names.N_WORKFORCE,
]
self.df[field_names.CATEGORY_COUNT] = self.df[factors].sum(axis=1)
self.df[field_names.SCORE_N_COMMUNITIES] = self.df[factors].any(axis=1)
return self.df
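
In add_columns, the low-income gate is a percentile comparison against LOW_INCOME_THRESHOLD (0.65), and each burden factor ANDs against it. A minimal sketch of that gating with hypothetical percentile values:

    import pandas as pd

    df = pd.DataFrame(
        {
            "poverty_imputed_pctile": [0.70, 0.50, 0.66],
            "energy_burden_pctile": [0.95, 0.95, 0.80],
        }
    )
    low_income = df["poverty_imputed_pctile"] >= 0.65
    burdened = df["energy_burden_pctile"] >= 0.90
    df["energy_burden_low_income"] = burdened & low_income
    # -> [True, False, False]: only the first tract is both burdened and
    #    low income under the imputed/adjusted poverty measure.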


@@ -10,6 +10,7 @@ from data_pipeline.score.score_i import ScoreI
 from data_pipeline.score.score_k import ScoreK
 from data_pipeline.score.score_l import ScoreL
 from data_pipeline.score.score_m import ScoreM
+from data_pipeline.score.score_narwhal import ScoreNarwhal
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -35,6 +36,7 @@ class ScoreRunner:
         self.df = ScoreK(df=self.df).add_columns()
         self.df = ScoreL(df=self.df).add_columns()
         self.df = ScoreM(df=self.df).add_columns()
+        self.df = ScoreNarwhal(df=self.df).add_columns()
         # TODO do this with each score instead of in a bundle
         # Create percentiles for these index scores


@ -50,7 +50,7 @@ tests = ["pytest"]
[[package]] [[package]]
name = "astroid" name = "astroid"
version = "2.11.2" version = "2.11.3"
description = "An abstract syntax tree for Python with inference support." description = "An abstract syntax tree for Python with inference support."
category = "main" category = "main"
optional = false optional = false
@ -411,19 +411,20 @@ pyflakes = ">=2.3.0,<2.4.0"
[[package]] [[package]]
name = "fonttools" name = "fonttools"
version = "4.32.0" version = "4.33.3"
description = "Tools to manipulate font files" description = "Tools to manipulate font files"
category = "main" category = "main"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
[package.extras] [package.extras]
all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"] all = ["fs (>=2.2.0,<3)", "lxml (>=4.0,<5)", "zopfli (>=0.1.4)", "lz4 (>=1.7.4.2)", "matplotlib", "sympy", "skia-pathops (>=0.5.0)", "uharfbuzz (>=0.23.0)", "brotlicffi (>=0.8.0)", "scipy", "brotli (>=1.0.1)", "munkres", "unicodedata2 (>=14.0.0)", "xattr"]
graphite = ["lz4 (>=1.7.4.2)"] graphite = ["lz4 (>=1.7.4.2)"]
interpolatable = ["scipy", "munkres"] interpolatable = ["scipy", "munkres"]
lxml = ["lxml (>=4.0,<5)"] lxml = ["lxml (>=4.0,<5)"]
pathops = ["skia-pathops (>=0.5.0)"] pathops = ["skia-pathops (>=0.5.0)"]
plot = ["matplotlib"] plot = ["matplotlib"]
repacker = ["uharfbuzz (>=0.23.0)"]
symfont = ["sympy"] symfont = ["sympy"]
type1 = ["xattr"] type1 = ["xattr"]
ufo = ["fs (>=2.2.0,<3)"] ufo = ["fs (>=2.2.0,<3)"]
@ -657,7 +658,7 @@ qtconsole = "*"
[[package]] [[package]]
name = "jupyter-client" name = "jupyter-client"
version = "7.2.2" version = "7.3.0"
description = "Jupyter protocol implementation and client libraries" description = "Jupyter protocol implementation and client libraries"
category = "main" category = "main"
optional = false optional = false
@ -879,7 +880,7 @@ tests = ["pytest", "pytz", "simplejson"]
[[package]] [[package]]
name = "marshmallow-dataclass" name = "marshmallow-dataclass"
version = "8.5.3" version = "8.5.7"
description = "Python library to convert dataclasses into marshmallow schemas." description = "Python library to convert dataclasses into marshmallow schemas."
category = "main" category = "main"
optional = false optional = false
@ -890,11 +891,11 @@ marshmallow = ">=3.13.0,<4.0"
typing-inspect = ">=0.7.1" typing-inspect = ">=0.7.1"
[package.extras] [package.extras]
dev = ["marshmallow-enum", "typeguard", "pre-commit (>=1.18,<2.0)", "sphinx", "pytest (>=5.4)", "pytest-mypy-plugins (>=1.2.0)", "typing-extensions (>=3.7.2,<3.8.0)"] dev = ["marshmallow-enum", "typeguard", "pre-commit (>=2.17,<3.0)", "sphinx", "pytest (>=5.4)", "pytest-mypy-plugins (>=1.2.0)", "typing-extensions (>=3.7.2)"]
docs = ["sphinx"] docs = ["sphinx"]
enum = ["marshmallow-enum"] enum = ["marshmallow-enum"]
lint = ["pre-commit (>=1.18,<2.0)"] lint = ["pre-commit (>=2.17,<3.0)"]
tests = ["pytest (>=5.4)", "pytest-mypy-plugins (>=1.2.0)", "typing-extensions (>=3.7.2,<3.8.0)"] tests = ["pytest (>=5.4)", "pytest-mypy-plugins (>=1.2.0)", "typing-extensions (>=3.7.2)"]
union = ["typeguard"] union = ["typeguard"]
[[package]] [[package]]
@ -1348,6 +1349,21 @@ category = "main"
optional = false optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "pydantic"
version = "1.9.0"
description = "Data validation and settings management using python 3.6 type hinting"
category = "main"
optional = false
python-versions = ">=3.6.1"
[package.dependencies]
typing-extensions = ">=3.7.4.3"
[package.extras]
dotenv = ["python-dotenv (>=0.10.4)"]
email = ["email-validator (>=1.0.3)"]
[[package]] [[package]]
name = "pyflakes" name = "pyflakes"
version = "2.3.1" version = "2.3.1"
@ -1358,22 +1374,22 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]] [[package]]
name = "pygments" name = "pygments"
version = "2.11.2" version = "2.12.0"
description = "Pygments is a syntax highlighting package written in Python." description = "Pygments is a syntax highlighting package written in Python."
category = "main" category = "main"
optional = false optional = false
python-versions = ">=3.5" python-versions = ">=3.6"
[[package]] [[package]]
name = "pylint" name = "pylint"
version = "2.13.5" version = "2.13.7"
description = "python code static checker" description = "python code static checker"
category = "main" category = "main"
optional = false optional = false
python-versions = ">=3.6.2" python-versions = ">=3.6.2"
[package.dependencies] [package.dependencies]
astroid = ">=2.11.2,<=2.12.0-dev0" astroid = ">=2.11.3,<=2.12.0-dev0"
colorama = {version = "*", markers = "sys_platform == \"win32\""} colorama = {version = "*", markers = "sys_platform == \"win32\""}
dill = ">=0.2" dill = ">=0.2"
isort = ">=4.2.5,<6" isort = ">=4.2.5,<6"
@ -1406,7 +1422,7 @@ diagrams = ["railroad-diagrams", "jinja2"]
[[package]] [[package]]
name = "pyproj" name = "pyproj"
version = "3.3.0" version = "3.3.1"
description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" description = "Python interface to PROJ (cartographic projections and coordinate transformations library)"
category = "main" category = "main"
optional = false optional = false
@ -1828,7 +1844,7 @@ test = ["pytest"]
[[package]] [[package]]
name = "types-requests" name = "types-requests"
version = "2.27.19" version = "2.27.22"
description = "Typing stubs for requests" description = "Typing stubs for requests"
category = "main" category = "main"
optional = false optional = false
@ -1965,7 +1981,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.8" python-versions = "^3.8"
content-hash = "120a7d23ab8c6bb5f17e226f844627d124e7e3a986d1b7fe72b41ce5b45bbb78" content-hash = "e4462f3e9a5d1cf2449ac9ad0d9ed250a5fda5d03d04e2845e4be3526d943b2b"
[metadata.files] [metadata.files]
ansiwrap = [ ansiwrap = [
@ -2004,8 +2020,8 @@ argon2-cffi-bindings = [
{file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5e00316dabdaea0b2dd82d141cc66889ced0cdcbfa599e8b471cf22c620c329a"}, {file = "argon2_cffi_bindings-21.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5e00316dabdaea0b2dd82d141cc66889ced0cdcbfa599e8b471cf22c620c329a"},
] ]
astroid = [ astroid = [
{file = "astroid-2.11.2-py3-none-any.whl", hash = "sha256:cc8cc0d2d916c42d0a7c476c57550a4557a083081976bf42a73414322a6411d9"}, {file = "astroid-2.11.3-py3-none-any.whl", hash = "sha256:f1af57483cd17e963b2eddce8361e00fc593d1520fe19948488e94ff6476bd71"},
{file = "astroid-2.11.2.tar.gz", hash = "sha256:8d0a30fe6481ce919f56690076eafbb2fb649142a89dc874f1ec0e7a011492d0"}, {file = "astroid-2.11.3.tar.gz", hash = "sha256:4e5ba10571e197785e312966ea5efb2f5783176d4c1a73fa922d474ae2be59f7"},
] ]
atomicwrites = [ atomicwrites = [
{file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
@@ -2196,8 +2212,8 @@ flake8 = [
{file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"},
] ]
fonttools = [ fonttools = [
{file = "fonttools-4.32.0-py3-none-any.whl", hash = "sha256:b038d1a0dee0079de7ade57071e2e2aced6e35bd697de244ac62938b2b1628c1"}, {file = "fonttools-4.33.3-py3-none-any.whl", hash = "sha256:f829c579a8678fa939a1d9e9894d01941db869de44390adb49ce67055a06cc2a"},
{file = "fonttools-4.32.0.zip", hash = "sha256:59a90de72149893167e3d552ae2402c6874e006b9adc3feaf5f6d706fe20d392"}, {file = "fonttools-4.33.3.zip", hash = "sha256:c0fdcfa8ceebd7c1b2021240bd46ef77aa8e7408cf10434be55df52384865f8e"},
] ]
geopandas = [ geopandas = [
{file = "geopandas-0.9.0-py2.py3-none-any.whl", hash = "sha256:79f6e557ba0dba76eec44f8351b1c6b42a17c38f5f08fef347e98fe4dae563c7"}, {file = "geopandas-0.9.0-py2.py3-none-any.whl", hash = "sha256:79f6e557ba0dba76eec44f8351b1c6b42a17c38f5f08fef347e98fe4dae563c7"},
@@ -2259,8 +2275,8 @@ jupyter = [
{file = "jupyter-1.0.0.zip", hash = "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7"}, {file = "jupyter-1.0.0.zip", hash = "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7"},
] ]
jupyter-client = [ jupyter-client = [
{file = "jupyter_client-7.2.2-py3-none-any.whl", hash = "sha256:44045448eadc12493d819d965eb1dc9d10d1927698adbb9b14eb9a3a4a45ba53"}, {file = "jupyter_client-7.3.0-py3-none-any.whl", hash = "sha256:671dd2d90d03f41716b09627a4eb06bb37875f92bf6563cc2ce4fe71c61c5cda"},
{file = "jupyter_client-7.2.2.tar.gz", hash = "sha256:8fdbad344a8baa6a413d86d25bbf87ce21cb2b4aa5a8e0413863b9754eb8eb8a"}, {file = "jupyter_client-7.3.0.tar.gz", hash = "sha256:3bcc8e08a294d0fa9406e48cfe17e11ef0efdb7c504fe8cc335128e3ef8f3dac"},
] ]
jupyter-console = [ jupyter-console = [
{file = "jupyter_console-6.4.3-py3-none-any.whl", hash = "sha256:e630bcb682c0088dda45688ad7c2424d4a825c8acf494cb036ced03ed0424841"}, {file = "jupyter_console-6.4.3-py3-none-any.whl", hash = "sha256:e630bcb682c0088dda45688ad7c2424d4a825c8acf494cb036ced03ed0424841"},
@@ -2503,8 +2519,8 @@ marshmallow = [
{file = "marshmallow-3.15.0.tar.gz", hash = "sha256:2aaaab4f01ef4f5a011a21319af9fce17ab13bf28a026d1252adab0e035648d5"}, {file = "marshmallow-3.15.0.tar.gz", hash = "sha256:2aaaab4f01ef4f5a011a21319af9fce17ab13bf28a026d1252adab0e035648d5"},
] ]
marshmallow-dataclass = [ marshmallow-dataclass = [
{file = "marshmallow_dataclass-8.5.3-py3-none-any.whl", hash = "sha256:eefeff62ee975c64d293d2db9370e7e748a2ff83dcb5109416b75e087a2ac02e"}, {file = "marshmallow_dataclass-8.5.7-py3-none-any.whl", hash = "sha256:da530f92f806673b9f40d8dc671ca18848b6cebded0eaecef720e256b5143e69"},
{file = "marshmallow_dataclass-8.5.3.tar.gz", hash = "sha256:c0c5e1ea8d0e557b6fa00343799a9a9e60757b948fb096076beb6aa76bd68d30"}, {file = "marshmallow_dataclass-8.5.7.tar.gz", hash = "sha256:0bdb779939b4656a40430a6a8390af698676eef89c2e583deb06e3585bf81bba"},
] ]
marshmallow-enum = [ marshmallow-enum = [
{file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"}, {file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"},
@@ -2799,17 +2815,54 @@ pycparser = [
{file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"},
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
] ]
pydantic = [
{file = "pydantic-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb23bcc093697cdea2708baae4f9ba0e972960a835af22560f6ae4e7e47d33f5"},
{file = "pydantic-1.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d5278bd9f0eee04a44c712982343103bba63507480bfd2fc2790fa70cd64cf4"},
{file = "pydantic-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab624700dc145aa809e6f3ec93fb8e7d0f99d9023b713f6a953637429b437d37"},
{file = "pydantic-1.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c8d7da6f1c1049eefb718d43d99ad73100c958a5367d30b9321b092771e96c25"},
{file = "pydantic-1.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3c3b035103bd4e2e4a28da9da7ef2fa47b00ee4a9cf4f1a735214c1bcd05e0f6"},
{file = "pydantic-1.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:3011b975c973819883842c5ab925a4e4298dffccf7782c55ec3580ed17dc464c"},
{file = "pydantic-1.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:086254884d10d3ba16da0588604ffdc5aab3f7f09557b998373e885c690dd398"},
{file = "pydantic-1.9.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0fe476769acaa7fcddd17cadd172b156b53546ec3614a4d880e5d29ea5fbce65"},
{file = "pydantic-1.9.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8e9dcf1ac499679aceedac7e7ca6d8641f0193c591a2d090282aaf8e9445a46"},
{file = "pydantic-1.9.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1e4c28f30e767fd07f2ddc6f74f41f034d1dd6bc526cd59e63a82fe8bb9ef4c"},
{file = "pydantic-1.9.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:c86229333cabaaa8c51cf971496f10318c4734cf7b641f08af0a6fbf17ca3054"},
{file = "pydantic-1.9.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:c0727bda6e38144d464daec31dff936a82917f431d9c39c39c60a26567eae3ed"},
{file = "pydantic-1.9.0-cp36-cp36m-win_amd64.whl", hash = "sha256:dee5ef83a76ac31ab0c78c10bd7d5437bfdb6358c95b91f1ba7ff7b76f9996a1"},
{file = "pydantic-1.9.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d9c9bdb3af48e242838f9f6e6127de9be7063aad17b32215ccc36a09c5cf1070"},
{file = "pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ee7e3209db1e468341ef41fe263eb655f67f5c5a76c924044314e139a1103a2"},
{file = "pydantic-1.9.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b6037175234850ffd094ca77bf60fb54b08b5b22bc85865331dd3bda7a02fa1"},
{file = "pydantic-1.9.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b2571db88c636d862b35090ccf92bf24004393f85c8870a37f42d9f23d13e032"},
{file = "pydantic-1.9.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8b5ac0f1c83d31b324e57a273da59197c83d1bb18171e512908fe5dc7278a1d6"},
{file = "pydantic-1.9.0-cp37-cp37m-win_amd64.whl", hash = "sha256:bbbc94d0c94dd80b3340fc4f04fd4d701f4b038ebad72c39693c794fd3bc2d9d"},
{file = "pydantic-1.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e0896200b6a40197405af18828da49f067c2fa1f821491bc8f5bde241ef3f7d7"},
{file = "pydantic-1.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bdfdadb5994b44bd5579cfa7c9b0e1b0e540c952d56f627eb227851cda9db77"},
{file = "pydantic-1.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:574936363cd4b9eed8acdd6b80d0143162f2eb654d96cb3a8ee91d3e64bf4cf9"},
{file = "pydantic-1.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c556695b699f648c58373b542534308922c46a1cda06ea47bc9ca45ef5b39ae6"},
{file = "pydantic-1.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f947352c3434e8b937e3aa8f96f47bdfe6d92779e44bb3f41e4c213ba6a32145"},
{file = "pydantic-1.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5e48ef4a8b8c066c4a31409d91d7ca372a774d0212da2787c0d32f8045b1e034"},
{file = "pydantic-1.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:96f240bce182ca7fe045c76bcebfa0b0534a1bf402ed05914a6f1dadff91877f"},
{file = "pydantic-1.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:815ddebb2792efd4bba5488bc8fde09c29e8ca3227d27cf1c6990fc830fd292b"},
{file = "pydantic-1.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c5b77947b9e85a54848343928b597b4f74fc364b70926b3c4441ff52620640c"},
{file = "pydantic-1.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c68c3bc88dbda2a6805e9a142ce84782d3930f8fdd9655430d8576315ad97ce"},
{file = "pydantic-1.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a79330f8571faf71bf93667d3ee054609816f10a259a109a0738dac983b23c3"},
{file = "pydantic-1.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f5a64b64ddf4c99fe201ac2724daada8595ada0d102ab96d019c1555c2d6441d"},
{file = "pydantic-1.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a733965f1a2b4090a5238d40d983dcd78f3ecea221c7af1497b845a9709c1721"},
{file = "pydantic-1.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cc6a4cb8a118ffec2ca5fcb47afbacb4f16d0ab8b7350ddea5e8ef7bcc53a16"},
{file = "pydantic-1.9.0-py3-none-any.whl", hash = "sha256:085ca1de245782e9b46cefcf99deecc67d418737a1fd3f6a4f511344b613a5b3"},
{file = "pydantic-1.9.0.tar.gz", hash = "sha256:742645059757a56ecd886faf4ed2441b9c0cd406079c2b4bee51bcc3fbcd510a"},
]
pyflakes = [ pyflakes = [
{file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"},
{file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"},
] ]
pygments = [ pygments = [
{file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"}, {file = "Pygments-2.12.0-py3-none-any.whl", hash = "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"},
{file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"}, {file = "Pygments-2.12.0.tar.gz", hash = "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb"},
] ]
pylint = [ pylint = [
{file = "pylint-2.13.5-py3-none-any.whl", hash = "sha256:c149694cfdeaee1aa2465e6eaab84c87a881a7d55e6e93e09466be7164764d1e"}, {file = "pylint-2.13.7-py3-none-any.whl", hash = "sha256:13ddbbd8872c804574149e81197c28877eba75224ba6b76cd8652fc31df55c1c"},
{file = "pylint-2.13.5.tar.gz", hash = "sha256:dab221658368c7a05242e673c275c488670144123f4bd262b2777249c1c0de9b"}, {file = "pylint-2.13.7.tar.gz", hash = "sha256:911d3a97c808f7554643bcc5416028cfdc42eae34ed129b150741888c688d5d5"},
] ]
pypandoc = [ pypandoc = [
{file = "pypandoc-1.7.5.tar.gz", hash = "sha256:802c26aae17b64136c6d006949d8ce183a7d4d9fbd4f2d051e66f4fb9f45ca50"}, {file = "pypandoc-1.7.5.tar.gz", hash = "sha256:802c26aae17b64136c6d006949d8ce183a7d4d9fbd4f2d051e66f4fb9f45ca50"},
@@ -2819,26 +2872,29 @@ pyparsing = [
{file = "pyparsing-3.0.8.tar.gz", hash = "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954"}, {file = "pyparsing-3.0.8.tar.gz", hash = "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954"},
] ]
pyproj = [ pyproj = [
{file = "pyproj-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2c41c9b7b5e1a1b0acc2b7b2f5de65b226f7b96c870888e4f679ff96322b1ed0"}, {file = "pyproj-3.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:473961faef7a9fd723c5d432f65220ea6ab3854e606bf84b4d409a75a4261c78"},
{file = "pyproj-3.3.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0e1fd560b509b722db6566fa9685252f25640e93464d09e13d5190ed7ab491ba"}, {file = "pyproj-3.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fef9c1e339f25c57f6ae0558b5ab1bbdf7994529a30d8d7504fc6302ea51c03"},
{file = "pyproj-3.3.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277072176a17471c0b1d25d6cae75401d81e9b50ea625ba546f5b79acd757dfc"}, {file = "pyproj-3.3.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:140fa649fedd04f680a39f8ad339799a55cb1c49f6a84e1b32b97e49646647aa"},
{file = "pyproj-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eca8ecf2b6b3225d93c723e6a2f51143d9195ac407f69e979363cdde344b93bb"}, {file = "pyproj-3.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b59c08aea13ee428cf8a919212d55c036cc94784805ed77c8f31a4d1f541058c"},
{file = "pyproj-3.3.0-cp310-cp310-win32.whl", hash = "sha256:4d2fc49c73d9f34e932bf37926d56916ba1b6f2f693cd4d8cc1d0d9eacc0e537"}, {file = "pyproj-3.3.1-cp310-cp310-win32.whl", hash = "sha256:1adc9ccd1bf04998493b6a2e87e60656c75ab790653b36cfe351e9ef214828ed"},
{file = "pyproj-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:ce1adec823738e2d7c6af019fc38f58b4204bacfc782e4430373578c672f3833"}, {file = "pyproj-3.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:42eea10afc750fccd1c5c4ba56de29ab791ab4d83c1f7db72705566282ac5396"},
{file = "pyproj-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e70a1ea6f198cace1a492397bdd0a46e640201120973293d6c48031e370d6a87"}, {file = "pyproj-3.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:531ea36519fa7b581466d4b6ab32f66ae4dadd9499d726352f71ee5e19c3d1c5"},
{file = "pyproj-3.3.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:99f171da5f885efeec8d7fb2e2557175ffa8834eeb488842b1f52ac78a9a98e5"}, {file = "pyproj-3.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67025e37598a6bbed2c9c6c9e4c911f6dd39315d3e1148ead935a5c4d64309d5"},
{file = "pyproj-3.3.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3d28b84913cd849832a8f154c0e0c2ee4618057f7389ee68bfdb2145e7ed78cc"}, {file = "pyproj-3.3.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aed1a3c0cd4182425f91b48d5db39f459bc2fe0d88017ead6425a1bc85faee33"},
{file = "pyproj-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab4baf781721640659db83a6b4da636fc403008f4978c668275754284c946778"}, {file = "pyproj-3.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cc4771403db54494e1e55bca8e6d33cde322f8cf0ed39f1557ff109c66d2cd1"},
{file = "pyproj-3.3.0-cp38-cp38-win32.whl", hash = "sha256:4125e6704751d0e82d8d912d9851da097e8d38599d4c45f9944faaeb21771938"}, {file = "pyproj-3.3.1-cp38-cp38-win32.whl", hash = "sha256:c99f7b5757a28040a2dd4a28c9805fdf13eef79a796f4a566ab5cb362d10630d"},
{file = "pyproj-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:b15e199c1da8fd132e11dfa68d8cf65d4812dedabc776b308df778ecd0d07658"}, {file = "pyproj-3.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:5dac03d4338a4c8bd0f69144c527474f517b4cbd7d2d8c532cd8937799723248"},
{file = "pyproj-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fcceb6736085bf19291b707bc67c8cebe05330bd02268e9b8eba6d28a1905fce"}, {file = "pyproj-3.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:56b0f9ee2c5b2520b18db30a393a7b86130cf527ddbb8c96e7f3c837474a9d79"},
{file = "pyproj-3.3.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:dbf479bd481774ad217e9db5674868eee8f01dfe3868f61753328895ae7da61a"}, {file = "pyproj-3.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f92d8f6514516124abb714dce912b20867831162cfff9fae2678ef07b6fcf0f"},
{file = "pyproj-3.3.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:067a5c6099949edd66e9a10b139af4e2f65ebadb9f59583923a1d3feefac749a"}, {file = "pyproj-3.3.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ef1bfbe2dcc558c7a98e2f1836abdcd630390f3160724a6f4f5c818b2be0ad5"},
{file = "pyproj-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:235b52d8700ffb6be1f3638b1e25d83a9c13edcdb793236d8a98fd39227c5c27"}, {file = "pyproj-3.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ca5f32b56210429b367ca4f9a57ffe67975c487af82e179a24370879a3daf68"},
{file = "pyproj-3.3.0-cp39-cp39-win32.whl", hash = "sha256:44b5590c0b8dd002154916e170ef88f57abf91005b34bcb23faef97abb4d42c2"}, {file = "pyproj-3.3.1-cp39-cp39-win32.whl", hash = "sha256:aba199704c824fb84ab64927e7bc9ef71e603e483130ec0f7e09e97259b8f61f"},
{file = "pyproj-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:b48dd9e5736957707fce1d9253fb0772bcf80480198c7790e21fed73fee61240"}, {file = "pyproj-3.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:120d45ed73144c65e9677dc73ba8a531c495d179dd9f9f0471ac5acc02d7ac4b"},
{file = "pyproj-3.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5a105bfe37c78416d2641cd5d3368c99057d041f15f8d51ea3898953b21395c9"}, {file = "pyproj-3.3.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:52efb681647dfac185cc655a709bc0caaf910031a0390f816f5fc8ce150cbedc"},
{file = "pyproj-3.3.0.tar.gz", hash = "sha256:ce8bfbc212729e9a643f5f5d77f7a93394e032eda1e2d8799ae902d08add747e"}, {file = "pyproj-3.3.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ab0d6e38fda7c13726afacaf62e9f9dd858089d67910471758afd9cb24e0ecd"},
{file = "pyproj-3.3.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45487942c19c5a8b09c91964ea3201f4e094518e34743cae373889a36e3d9260"},
{file = "pyproj-3.3.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:797ad5655d484feac14b0fbb4a4efeaac0cf780a223046e2465494c767fd1c3b"},
{file = "pyproj-3.3.1.tar.gz", hash = "sha256:b3d8e14d91cc95fb3dbc03a9d0588ac58326803eefa5bbb0978d109de3304fbe"},
] ]
pyrsistent = [ pyrsistent = [
{file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"},
@@ -3161,8 +3217,8 @@ traitlets = [
{file = "traitlets-5.1.1.tar.gz", hash = "sha256:059f456c5a7c1c82b98c2e8c799f39c9b8128f6d0d46941ee118daace9eb70c7"}, {file = "traitlets-5.1.1.tar.gz", hash = "sha256:059f456c5a7c1c82b98c2e8c799f39c9b8128f6d0d46941ee118daace9eb70c7"},
] ]
types-requests = [ types-requests = [
{file = "types-requests-2.27.19.tar.gz", hash = "sha256:795e378117088d1e4bf41a2c01a153b73d6ea40aa9d7c0ac753abde84c0d3a8f"}, {file = "types-requests-2.27.22.tar.gz", hash = "sha256:2e81a74d2db1e6d06baa4a9e1896720543739297a23daac0436a34e2fc732574"},
{file = "types_requests-2.27.19-py3-none-any.whl", hash = "sha256:c6c5384677d98f212516de50c4b2c38ef659b93008fbc5bb4b81726138bc8485"}, {file = "types_requests-2.27.22-py3-none-any.whl", hash = "sha256:58730c31469fb959a21496d97d2e59c06ca6de2ccdfecb583cb924b83cb0811e"},
] ]
types-urllib3 = [ types-urllib3 = [
{file = "types-urllib3-1.26.13.tar.gz", hash = "sha256:40f8fb5e8cd7d57e8aefdee3fdd5e930aa1a1bb4179cdadd55226cea588af790"}, {file = "types-urllib3-1.26.13.tar.gz", hash = "sha256:40f8fb5e8cd7d57e8aefdee3fdd5e930aa1a1bb4179cdadd55226cea588af790"},

View file

@@ -39,6 +39,7 @@ tqdm = "4.62.0"
types-requests = "^2.25.0" types-requests = "^2.25.0"
us = "^2.0.2" us = "^2.0.2"
xlsxwriter = "^2.0.0" xlsxwriter = "^2.0.0"
pydantic = "^1.9.0"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
black = {version = "^21.6b0", allow-prereleases = true} black = {version = "^21.6b0", allow-prereleases = true}
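
The visible pyproject.toml hunk adds a single runtime requirement, pydantic = "^1.9.0"; the rest of the diff above is the lockfile absorbing that addition plus routine patch bumps (pygments, pylint, pyproj, types-requests, and friends) and refreshing the content-hash accordingly. As a hedged illustration of what pydantic 1.9 provides (type-hint-driven data validation), here is a minimal sketch; the DatasetSpec model, its field names, and the ETL-suffix check are hypothetical examples for illustration, not code from this commit:

    # Hypothetical sketch only: DatasetSpec and its fields are illustrative.
    # pydantic 1.9 validates the fields from their type hints at construction time.
    from pydantic import BaseModel, ValidationError, validator


    class DatasetSpec(BaseModel):
        name: str
        module_dir: str
        class_name: str

        @validator("class_name")
        def class_name_looks_like_etl(cls, value: str) -> str:
            # Illustrative convention check, not a rule taken from the codebase.
            if not value.endswith("ETL"):
                raise ValueError(f"{value!r} does not end in 'ETL'")
            return value


    # Valid: passes both the type checks and the custom validator.
    spec = DatasetSpec(
        name="census_acs", module_dir="census_acs", class_name="CensusACSETL"
    )

    try:
        DatasetSpec(name="census_acs", module_dir="census_acs", class_name="CensusACS")
    except ValidationError as err:
        print(err)  # pydantic reports per-field errors with their locations

The validator decorator shown is the pydantic v1 interface, which matches the ^1.9.0 caret pin: poetry will resolve any 1.x release but not a future 2.0.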