Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-23 01:54:18 -08:00

Merge branch 'emma-nechamkin/release/score-narwhal' of github.com:usds/justice40-tool into emma-nechamkin/release/score-narwhal

This commit is contained in: commit 6e575c6110

20 changed files with 1428 additions and 31 deletions
.github/workflows/deploy_be_staging.yml (vendored, 3 changes)

@@ -62,6 +62,9 @@ jobs:
       - name: Generate Score Post
         run: |
           poetry run python3 data_pipeline/application.py generate-score-post -s aws
+      - name: Run Smoketests
+        run: |
+          poetry run pytest data_pipeline/ -m smoketest
       - name: Deploy Score to Geoplatform AWS
         run: |
           poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
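The new step selects tests by pytest marker. A minimal sketch of the same selection invoked from Python rather than the shell (equivalent to `poetry run pytest data_pipeline/ -m smoketest`); `pytest.main` is pytest's standard programmatic entry point:

    import pytest

    # Run only tests marked @pytest.mark.smoketest under data_pipeline/
    exit_code = pytest.main(["data_pipeline/", "-m", "smoketest"])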
@@ -196,7 +196,7 @@ Here's a list of commands:

 ## Local development

-You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. Also to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS.
+You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. For score generation, you will need [libspatialindex](https://libspatialindex.org/en/latest/). And to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS.

 ### VSCode
@@ -218,6 +218,7 @@ To install the above-named executables:

 - gdal: `brew install gdal`
 - Tippecanoe: `brew install tippecanoe`
+- spatialindex: `brew install spatialindex`

 Note: For MacOS Monterey or M1 Macs, [you might need to follow these steps](https://stackoverflow.com/a/70880741) to install Scipy.
@@ -229,7 +230,7 @@ If you want to run tile generation, please install TippeCanoe [following these i

 - Start a terminal
 - Change to this directory (`/data/data-pipeline/`)
-- Make sure you have at least Python 3.7 installed: `python -V` or `python3 -V`
+- Make sure you have at least Python 3.8 installed: `python -V` or `python3 -V`
 - We use [Poetry](https://python-poetry.org/) for managing dependencies and building the application. Please follow the instructions on their site to download.
 - Install Poetry requirements with `poetry install`
@@ -12,7 +12,7 @@ settings = Dynaconf(

 # set root dir
 settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
+settings.REQUESTS_DEFAULT_TIMOUT = 3600
 # To set an environment use:
 # Linux/OSX: export ENV_FOR_DYNACONF=staging
 # Windows: set ENV_FOR_DYNACONF=staging
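The new setting gives every downloader in the pipeline a shared timeout. A minimal sketch of the intended usage (the URL here is hypothetical, and `REQUESTS_DEFAULT_TIMOUT` is spelled exactly as in the source):

    import requests

    from data_pipeline.config import settings

    # Fail after 3600 seconds instead of hanging forever on a stalled download
    download = requests.get(
        "https://example.com/data.json",  # hypothetical URL for illustration
        timeout=settings.REQUESTS_DEFAULT_TIMOUT,
    )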
@@ -1,5 +1,5 @@

 import functools
-from collections import namedtuple
+from dataclasses import dataclass

 import numpy as np
 import pandas as pd
@@ -496,10 +496,11 @@ class ScoreETL(ExtractTransformLoad):

         # >= some threshold.
         # TODO: Add more fields here.
         # https://github.com/usds/justice40-tool/issues/970
-        ReversePercentile = namedtuple(
-            typename="ReversePercentile",
-            field_names=["field_name", "low_field_name"],
-        )
+        @dataclass
+        class ReversePercentile:
+            field_name: str
+            low_field_name: str

         reverse_percentiles = [
             # This dictionary follows the format:
             # <field name> : <field name for low values>
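The dataclass is a drop-in replacement here because the surrounding code only reads the two attributes by name. A minimal sketch of the equivalence, under that assumption (toy values):

    from dataclasses import dataclass

    @dataclass
    class ReversePercentile:
        field_name: str
        low_field_name: str

    rp = ReversePercentile("some field", "some low field")  # toy values
    print(rp.field_name, rp.low_field_name)  # attribute access works as before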
@@ -51,7 +51,7 @@ class GeoScoreETL(ExtractTransformLoad):

         ## TODO: We really should not have this any longer changing
         self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
-            field_names.SCORE_N
+            field_names.FINAL_SCORE_N_BOOLEAN
         ]
         self.TARGET_SCORE_RENAME_TO = "SCORE"
@@ -1,4 +1,4 @@

-from typing import List, NamedTuple
+from typing import Any, List, NamedTuple, Tuple

 import pandas as pd
 import geopandas as gpd

@@ -41,7 +41,7 @@ def _prepare_dataframe_for_imputation(

     impute_var_named_tup_list: List[NamedTuple],
     geo_df: gpd.GeoDataFrame,
     geoid_field: str = "GEOID10_TRACT",
-) -> tuple[list, gpd.GeoDataFrame]:
+) -> Tuple[Any, gpd.GeoDataFrame]:
     imputing_cols = [
         impute_var_pair.raw_field_name
         for impute_var_pair in impute_var_named_tup_list
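The annotation change matters on Python 3.8: subscripting the builtin `tuple` in annotations (`tuple[list, ...]`) raises a TypeError at function-definition time before Python 3.9, while `typing.Tuple` works on 3.8. A minimal sketch:

    from typing import Any, Tuple

    def returns_pair() -> Tuple[Any, str]:  # valid on Python 3.8
        return [1, 2, 3], "ok"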
@@ -282,12 +282,20 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):

         # Download MSA median incomes
         logger.info("Starting download of MSA median incomes.")
-        download = requests.get(self.MSA_MEDIAN_INCOME_URL, verify=None)
+        download = requests.get(
+            self.MSA_MEDIAN_INCOME_URL,
+            verify=None,
+            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+        )
         self.msa_median_incomes = json.loads(download.content)

         # Download state median incomes
         logger.info("Starting download of state median incomes.")
-        download_state = requests.get(self.STATE_MEDIAN_INCOME_URL, verify=None)
+        download_state = requests.get(
+            self.STATE_MEDIAN_INCOME_URL,
+            verify=None,
+            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+        )
         self.state_median_incomes = json.loads(download_state.content)
         ## NOTE we already have PR's MI here
@@ -7,6 +7,7 @@ import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
 from data_pipeline.score import field_names
+from data_pipeline.config import settings

 pd.options.mode.chained_assignment = "raise"

@@ -270,7 +271,8 @@ class CensusDecennialETL(ExtractTransformLoad):

                     island["var_list"],
                     island["fips"],
                     county,
-                )
+                ),
+                timeout=settings.REQUESTS_DEFAULT_TIMOUT,
             )

             df = json.loads(download.content)
@@ -3,6 +3,7 @@ import requests

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)

@@ -26,7 +27,11 @@ class HudRecapETL(ExtractTransformLoad):

     def extract(self) -> None:
         logger.info("Downloading HUD Recap Data")
-        download = requests.get(self.HUD_RECAP_CSV_URL, verify=None)
+        download = requests.get(
+            self.HUD_RECAP_CSV_URL,
+            verify=None,
+            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+        )
         file_contents = download.content
         csv_file = open(self.HUD_RECAP_CSV, "wb")
         csv_file.write(file_contents)
(New Jupyter notebook, 354 lines; cell sources and saved outputs below.)

@@ -0,0 +1,354 @@

# %load_ext lab_black
import json
import pandas as pd
import geopandas as gpd

# Read in the score geojson file
from data_pipeline.etl.score.constants import DATA_SCORE_CSV_TILES_FILE_PATH
nation = gpd.read_file(DATA_SCORE_CSV_TILES_FILE_PATH)

Saved output: DriverError: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory

nation

# get the columns of the df and sort the list:
sorted_nation = sorted(nation.columns.to_list())

Markdown cell: CLI to convert a pbf into a json file (requires tippecanoe and jq to be installed):

curl https://justice40-data.s3.amazonaws.com/data-pipeline-staging/1822/e6385c172f1d2adf588050375b7c0985035cfb24/data/score/tiles/high/8/67/101.pbf -o uh-1822-e638-8-67-101.pbf | tippecanoe-decode uh-1822-e638-8-67-101.pbf 8 67 101 | jq > cat uh-1822-e638-8-67-101.json

# load a random high-tile json (after decoding a pbf) file using json.loads()
with open("/Users/vims/Downloads/uh-1822-e638-8-67-101.json", "r") as f:
    random_tile_features = json.loads(f.read())

# Flatten data around the features key:
flatten_features = pd.json_normalize(random_tile_features, record_path=["features"])

# index into the feature properties, get keys and turn into a sorted list
random_tile = sorted(list(flatten_features["features"][0][0]["properties"].keys()))

set_dif = set(sorted_nation).symmetric_difference(set(random_tile))
list(set_dif)

nation

nation_HRS_GEO = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'FUDS_ET']]
nation_HRS_GEO.loc[nation_HRS_GEO['FUDS_ET'] == '0']

Saved output, 3170 rows × 6 columns:

       GEOID10      SF         CF                        HRS_ET  AML_ET  FUDS_ET
71     27061480300  Minnesota  Itasca County             None    None    0
75     27061940000  Minnesota  Itasca County             None    None    0
115    27077460400  Minnesota  Lake of the Woods County  None    None    0
127    27123042001  Minnesota  Ramsey County             None    None    0
160    27123033400  Minnesota  Ramsey County             0       None    0
...    ...          ...        ...                       ...     ...     ...
74047  16055000200  Idaho      Kootenai County           None    None    0
74068  16011950500  Idaho      Bingham County            None    None    0
74076  16001010503  Idaho      Ada County                None    None    0
74107  16001001000  Idaho      Ada County                None    None    0
74123  16001002100  Idaho      Ada County                None    None    0

nation['HRS_ET'].unique()

Saved output: array([None, '0', '1'], dtype=object)

Kernel: Python 3.8.10 ('data-pipeline-WziHKidv-py3.8')
(New Jupyter notebook, 496 lines; cell sources and saved outputs below.)

@@ -0,0 +1,496 @@

# %load_ext lab_black
import json
import pandas as pd
import geopandas as gpd

# Read in the above json file
nation = gpd.read_file("/Users/vims/Downloads/usa-high-1822-637b.json")

nation['FUDS_RAW']

Saved output:
0        None
1        None
2        None
3        None
4        None
         ...
74129    None
74130    None
74131    None
74132    None
74133    None
Name: FUDS_RAW, Length: 74134, dtype: object

nation_new_ind = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'AML_RAW', 'FUDS_ET', 'FUDS_RAW']]
nation_new_ind

Saved output, 74134 rows × 8 columns:

       GEOID10      SF         CF              HRS_ET  AML_ET  AML_RAW  FUDS_ET  FUDS_RAW
0      27139080202  Minnesota  Scott County    None    False   None     False    None
1      27139080204  Minnesota  Scott County    None    False   None     False    None
2      27139080100  Minnesota  Scott County    None    False   None     False    None
3      27139080302  Minnesota  Scott County    None    False   None     False    None
4      27139080400  Minnesota  Scott County    None    False   None     False    None
...    ...          ...        ...             ...     ...     ...      ...      ...
74129  16005001601  Idaho      Bannock County  None    False   None     False    None
74130  16005001300  Idaho      Bannock County  None    False   None     False    None
74131  16005001000  Idaho      Bannock County  None    False   None     False    None
74132  16005000900  Idaho      Bannock County  None    False   None     False    None
74133  16005000800  Idaho      Bannock County  None    False   None     False    None

nation_new_ind['HRS_ET'].unique()
Saved output: array([None, '0', '1'], dtype=object)

nation_new_ind['HRS_ET'].value_counts()
Saved output:
0    8843
1    4045
Name: HRS_ET, dtype: int64

nation_new_ind['AML_ET'].unique()
Saved output: array([False, True])

nation_new_ind['AML_ET'].value_counts()
Saved output:
False    72100
True      2034
Name: AML_ET, dtype: int64

nation_new_ind['AML_RAW'].unique()
Saved output: array([None, '1'], dtype=object)

nation_new_ind['AML_RAW'].value_counts()
Saved output:
1    2034
Name: AML_RAW, dtype: int64

nation_new_ind['FUDS_ET'].unique()
Saved output: array([False, True])

nation_new_ind['FUDS_ET'].value_counts()
Saved output:
False    72056
True      2078
Name: FUDS_ET, dtype: int64

nation_new_ind['FUDS_RAW'].unique()
Saved output: array([None, '0', '1'], dtype=object)

nation_new_ind['FUDS_RAW'].value_counts()
Saved output:
0    3170
1    2078
Name: FUDS_RAW, dtype: int64

Kernel: Python 3 (ipykernel), Python 3.9.13
@@ -10,7 +10,9 @@ STATE_FIELD = "State/Territory"

 COUNTY_FIELD = "County Name"

 # Definition Narwhal fields
-SCORE_N = "Definition N (communities)"
+FINAL_SCORE_N_BOOLEAN = (
+    "Definition M community, including adjacency index tracts"
+)
 SCORE_N_COMMUNITIES = "Definition N (communities)"
 N_CLIMATE = "Climate Factor (Definition N)"
 N_ENERGY = "Energy Factor (Definition N)"
@@ -14,20 +14,17 @@ logger = get_module_logger(__name__)

 class ScoreNarwhal(Score):
     """Very similar to Score M, at present."""

-    def __init__(self, df: pd.DataFrame) -> None:
-        self.LOW_INCOME_THRESHOLD: float = 0.65
-        self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
-        self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
-        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
-        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
+    LOW_INCOME_THRESHOLD: float = 0.65
+    MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
+    ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
+    MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
+    LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10

     # We define a donut hole DAC as a tract that is entirely surrounded by
     # DACs (score threshold = 1) and above median for low income, as a starting
     # point. As we ground-truth, these thresholds might change.
-    self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50
-    self.SCORE_THRESHOLD_DONUT: float = 1.00
-
-    super().__init__(df)
+    LOW_INCOME_THRESHOLD_DONUT: float = 0.50
+    SCORE_THRESHOLD_DONUT: float = 1.00

     def _combine_island_areas_with_states_and_set_thresholds(
         self,
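Moving the thresholds from `__init__` to class attributes means they can be read without constructing a `ScoreNarwhal`, which requires a full DataFrame; the new smoketests below rely on exactly that. A minimal sketch:

    from data_pipeline.score.score_narwhal import ScoreNarwhal

    # No instance (and hence no DataFrame) is needed to read the constants
    assert ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD == 0.90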
@@ -52,3 +52,16 @@ def mock_etl(monkeypatch, mock_paths) -> None:

     data_path, tmp_path = mock_paths
     monkeypatch.setattr(ExtractTransformLoad, "DATA_PATH", data_path)
     monkeypatch.setattr(ExtractTransformLoad, "TMP_PATH", tmp_path)
+
+
+def pytest_collection_modifyitems(config, items):
+    keywordexpr = config.option.keyword
+    markexpr = config.option.markexpr
+    if keywordexpr or markexpr:
+        return  # let pytest handle this
+
+    smoketest = "smoketest"
+    skip_mymarker = pytest.mark.skip(reason=f"{smoketest} not selected")
+    for item in items:
+        if smoketest in item.keywords:
+            item.add_marker(skip_mymarker)
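With this hook, a plain `pytest` run skips anything marked `smoketest`; passing `-m smoketest` (or a `-k` expression) sets `markexpr` or `keywordexpr`, the hook returns early, and pytest's own selection applies. A minimal sketch of a test that participates:

    import pytest

    @pytest.mark.smoketest
    def test_runs_only_when_selected():
        # Skipped by default; runs under `pytest -m smoketest`
        assert True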
data/data-pipeline/data_pipeline/tests/score/fixtures.py (new file, 12 lines)

@@ -0,0 +1,12 @@

import pandas as pd
import pytest
from data_pipeline.config import settings
from data_pipeline.score import field_names


@pytest.fixture(scope="session")
def final_score_df():
    return pd.read_csv(
        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
        dtype={field_names.GEOID_TRACT_FIELD: str},
    )
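The fixture is session-scoped, so the full `usa.csv` score file is read once per test run and shared across the smoketest modules. A minimal usage sketch (the assertion is illustrative only):

    from .fixtures import final_score_df  # pylint: disable=unused-import

    def test_score_file_is_nonempty(final_score_df):
        assert len(final_score_df) > 0  # hypothetical assertion for illustration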
291
data/data-pipeline/data_pipeline/tests/score/test_calculation.py
Normal file
291
data/data-pipeline/data_pipeline/tests/score/test_calculation.py
Normal file
|
@ -0,0 +1,291 @@
|
||||||
|
# flake8: noqa: W0613,W0611,F811
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import pytest
|
||||||
|
from data_pipeline.score import field_names
|
||||||
|
from data_pipeline.utils import get_module_logger
|
||||||
|
from data_pipeline.score.score_narwhal import ScoreNarwhal
|
||||||
|
from .fixtures import final_score_df # pylint: disable=unused-import
|
||||||
|
|
||||||
|
logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.smoketest
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PercentileTestConfig:
|
||||||
|
percentile_column_name: str
|
||||||
|
threshold_column_name: str
|
||||||
|
threshold: float
|
||||||
|
percentile_column_need_suffix: bool = True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def full_percentile_column_name(self):
|
||||||
|
if self.percentile_column_need_suffix:
|
||||||
|
return (
|
||||||
|
self.percentile_column_name
|
||||||
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
|
)
|
||||||
|
return self.percentile_column_name
|
||||||
|
|
||||||
|
|
||||||
|
### TODO: we need to blow this out for all eight categories
|
||||||
|
def _check_percentile_against_threshold(df, config: PercentileTestConfig):
|
||||||
|
"""Note - for the purpose of testing, this fills with False"""
|
||||||
|
is_minimum_flagged_ok = (
|
||||||
|
df[df[config.threshold_column_name].fillna(False)][
|
||||||
|
config.full_percentile_column_name
|
||||||
|
].min()
|
||||||
|
>= config.threshold
|
||||||
|
)
|
||||||
|
|
||||||
|
is_maximum_not_flagged_ok = (
|
||||||
|
df[~df[config.threshold_column_name].fillna(False)][
|
||||||
|
config.full_percentile_column_name
|
||||||
|
].max()
|
||||||
|
< config.threshold
|
||||||
|
)
|
||||||
|
errors = []
|
||||||
|
if not is_minimum_flagged_ok:
|
||||||
|
errors.append(
|
||||||
|
f"For column {config.threshold_column_name}, there is someone flagged below {config.threshold} percentile!"
|
||||||
|
)
|
||||||
|
if not is_maximum_not_flagged_ok:
|
||||||
|
errors.append(
|
||||||
|
f"For column {config.threshold_column_name}, there is someone not flagged above {config.threshold} percentile!"
|
||||||
|
)
|
||||||
|
return errors
|
||||||
|
|
||||||
|
|
||||||
|
def test_percentile_columns(final_score_df):
|
||||||
|
low_income = PercentileTestConfig(
|
||||||
|
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
|
||||||
|
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED,
|
||||||
|
ScoreNarwhal.LOW_INCOME_THRESHOLD,
|
||||||
|
)
|
||||||
|
population_loss = PercentileTestConfig(
|
||||||
|
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
|
||||||
|
field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
agricultural_loss = PercentileTestConfig(
|
||||||
|
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
|
||||||
|
field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
building_loss = PercentileTestConfig(
|
||||||
|
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
|
||||||
|
field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
flood = PercentileTestConfig(
|
||||||
|
field_names.FUTURE_FLOOD_RISK_FIELD,
|
||||||
|
field_names.HIGH_FUTURE_FLOOD_RISK_FIELD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
wildfire = PercentileTestConfig(
|
||||||
|
field_names.FUTURE_WILDFIRE_RISK_FIELD,
|
||||||
|
field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
low_high_school = PercentileTestConfig(
|
||||||
|
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||||
|
field_names.LOW_HS_EDUCATION_FIELD,
|
||||||
|
ScoreNarwhal.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD,
|
||||||
|
percentile_column_need_suffix=False,
|
||||||
|
)
|
||||||
|
donut_hole_income = PercentileTestConfig(
|
||||||
|
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
|
||||||
|
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS,
|
||||||
|
ScoreNarwhal.LOW_INCOME_THRESHOLD_DONUT,
|
||||||
|
)
|
||||||
|
donut_hole_adjacency = PercentileTestConfig(
|
||||||
|
(field_names.SCORE_N_COMMUNITIES + field_names.ADJACENCY_INDEX_SUFFIX),
|
||||||
|
field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD,
|
||||||
|
ScoreNarwhal.SCORE_THRESHOLD_DONUT,
|
||||||
|
percentile_column_need_suffix=False,
|
||||||
|
)
|
||||||
|
diesel = PercentileTestConfig(
|
||||||
|
field_names.DIESEL_FIELD,
|
||||||
|
field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
dot_burden = PercentileTestConfig(
|
||||||
|
field_names.DOT_TRAVEL_BURDEN_FIELD,
|
||||||
|
field_names.DOT_BURDEN_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
traffic_proximity = PercentileTestConfig(
|
||||||
|
field_names.TRAFFIC_FIELD,
|
||||||
|
field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
energy_burden = PercentileTestConfig(
|
||||||
|
field_names.ENERGY_BURDEN_FIELD,
|
||||||
|
field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
pm25 = PercentileTestConfig(
|
||||||
|
field_names.PM25_FIELD,
|
||||||
|
field_names.PM25_EXCEEDS_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
kitchen_plumbing = PercentileTestConfig(
|
||||||
|
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD,
|
||||||
|
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
# Leadpaint is handled below in a separate method
|
||||||
|
housing = PercentileTestConfig(
|
||||||
|
field_names.HOUSING_BURDEN_FIELD,
|
||||||
|
field_names.HOUSING_BURDEN_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
non_natural_space = PercentileTestConfig(
|
||||||
|
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME,
|
||||||
|
field_names.NON_NATURAL_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
rmp = PercentileTestConfig(
|
||||||
|
field_names.RMP_FIELD,
|
||||||
|
field_names.RMP_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
npl = PercentileTestConfig(
|
||||||
|
field_names.NPL_FIELD,
|
||||||
|
field_names.NPL_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
tsdf = PercentileTestConfig(
|
||||||
|
field_names.TSDF_FIELD,
|
||||||
|
field_names.TSDF_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
wastewater = PercentileTestConfig(
|
||||||
|
field_names.WASTEWATER_FIELD,
|
||||||
|
field_names.WASTEWATER_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
ust = PercentileTestConfig(
|
||||||
|
field_names.UST_FIELD,
|
||||||
|
field_names.UST_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
diabetes = PercentileTestConfig(
|
||||||
|
field_names.DIABETES_FIELD,
|
||||||
|
field_names.DIABETES_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
asthma = PercentileTestConfig(
|
||||||
|
field_names.ASTHMA_FIELD,
|
||||||
|
field_names.ASTHMA_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
heart_disease = PercentileTestConfig(
|
||||||
|
field_names.HEART_DISEASE_FIELD,
|
||||||
|
field_names.HEART_DISEASE_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
low_life_expectancy = PercentileTestConfig(
|
||||||
|
field_names.LOW_LIFE_EXPECTANCY_FIELD,
|
||||||
|
field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
unemployment = PercentileTestConfig(
|
||||||
|
field_names.UNEMPLOYMENT_FIELD,
|
||||||
|
field_names.UNEMPLOYMENT_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
low_median_income = PercentileTestConfig(
|
||||||
|
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
|
||||||
|
field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
linguist_isolation = PercentileTestConfig(
|
||||||
|
field_names.LINGUISTIC_ISO_FIELD,
|
||||||
|
field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
poverty = PercentileTestConfig(
|
||||||
|
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
|
||||||
|
field_names.POVERTY_PCTILE_THRESHOLD,
|
||||||
|
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
errors = []
|
||||||
|
for threshhold_config in (
|
||||||
|
low_income,
|
||||||
|
population_loss,
|
||||||
|
agricultural_loss,
|
||||||
|
building_loss,
|
||||||
|
flood,
|
||||||
|
wildfire,
|
||||||
|
low_high_school,
|
||||||
|
donut_hole_income,
|
||||||
|
donut_hole_adjacency,
|
||||||
|
dot_burden,
|
||||||
|
diesel,
|
||||||
|
traffic_proximity,
|
||||||
|
energy_burden,
|
||||||
|
pm25,
|
||||||
|
kitchen_plumbing,
|
||||||
|
housing,
|
||||||
|
non_natural_space,
|
||||||
|
rmp,
|
||||||
|
npl,
|
||||||
|
tsdf,
|
||||||
|
wastewater,
|
||||||
|
ust,
|
||||||
|
diabetes,
|
||||||
|
asthma,
|
||||||
|
heart_disease,
|
||||||
|
low_life_expectancy,
|
||||||
|
unemployment,
|
||||||
|
low_median_income,
|
||||||
|
linguist_isolation,
|
||||||
|
poverty,
|
||||||
|
):
|
||||||
|
errors.extend(
|
||||||
|
_check_percentile_against_threshold(
|
||||||
|
final_score_df, threshhold_config
|
||||||
|
)
|
||||||
|
)
|
||||||
|
error_text = "\n".join(errors)
|
||||||
|
assert not errors, error_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_lead_paint_indicator(
    final_score_df,
):
    """We need special logic here because lead paint is a combined threshold, so this test has two parts:

    1. Construct our own threshold columns.
    2. Check that they match the threshold column already in the dataframe.
    """
    lead_pfs = (
        field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
    )
    home_val_pfs = (
        field_names.MEDIAN_HOUSE_VALUE_FIELD
        + field_names.PERCENTILE_FIELD_SUFFIX
    )
    combined_proxy_boolean = field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD

    tmp_lead_threshold = (
        final_score_df[lead_pfs] >= ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    tmp_mhv_threshold = (
        final_score_df[home_val_pfs]
        <= ScoreNarwhal.MEDIAN_HOUSE_VALUE_THRESHOLD
    )

    true_combined_proxy = tmp_lead_threshold & tmp_mhv_threshold

    assert (
        tmp_mhv_threshold.sum() > 0
    ), "MHV threshold alone does not capture any homes"

    assert final_score_df[combined_proxy_boolean].equals(
        true_combined_proxy
    ), "Lead proxy calculated improperly"
    assert (
        tmp_lead_threshold.sum() > true_combined_proxy.sum()
    ), "House value is not further limiting this proxy"

205
data/data-pipeline/data_pipeline/tests/score/test_output.py
Normal file

@ -0,0 +1,205 @@
# flake8: noqa: W0613,W0611,F811
from dataclasses import dataclass
from typing import List
import pytest
import pandas as pd
from data_pipeline.score import field_names
from .fixtures import final_score_df  # pylint: disable=unused-import

pytestmark = pytest.mark.smoketest


def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
    """Fills NA with False"""
    return df[df[col].fillna(False)].shape[0] >= error_check


def _helper_single_threshold_test(df, col, socioeconomic_column, score_column):
    """Note that this fills nulls in the threshold column where nulls exist"""
    nulls_dont_exist = (
        df[df[col].fillna(False) & df[socioeconomic_column]][score_column]
        .isna()
        .sum()
        == 0
    )
    only_trues = df[df[col].fillna(False) & df[socioeconomic_column]][
        score_column
    ].min()
    return nulls_dont_exist, only_trues

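To make the helper's return values concrete, a hypothetical mini-example (invented column names and data; the nullable boolean dtype stands in for the real frame's flags with missing values): `nulls_dont_exist` should be True when no score is missing among rows passing both flags, and `only_trues` is the minimum score among those rows, so any False there means a tract met both thresholds without being scored.

    import pandas as pd

    # Hypothetical frame: a burden flag, a socioeconomic flag, and a boolean score.
    toy = pd.DataFrame(
        {
            "burden_flag": pd.array(
                [True, True, False, pd.NA], dtype="boolean"
            ),
            "ses_flag": [True, True, True, True],
            "score": [True, False, False, False],
        }
    )

    nulls_dont_exist, only_trues = _helper_single_threshold_test(
        toy, "burden_flag", "ses_flag", "score"
    )
    # nulls_dont_exist -> True: no NaN scores among the flagged rows.
    # only_trues -> False: row 1 passes both thresholds but is not scored,
    # which check_for_threshhold_errors (below) reports as an error.
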
@dataclass
class ThresholdTestConfig:
    name: str
    threshhold_columns: List[str]
    ses_column_name: str = field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED
    score_column_name: str = field_names.SCORE_N_COMMUNITIES

    @property
    def error_message(self):
        return f"Eligibility columns have an error, {self.name}"


def check_for_threshhold_errors(
    df: pd.DataFrame, config: ThresholdTestConfig
) -> List[str]:
    errors = []
    for col in config.threshhold_columns:
        nulls_dont_exist, only_trues = _helper_single_threshold_test(
            df,
            col,
            config.ses_column_name,
            config.score_column_name,
        )
        proper_threshold_identification = (
            _helper_test_count_exceeding_threshold(df, col)
        )
        if not nulls_dont_exist:
            errors.append(
                f"For {col}, threshold is not calculated right -- there are NaNs in Score"
            )
        if not only_trues:
            errors.append(
                f"For {col} and {config.ses_column_name}, threshold is not calculated right "
                f"-- there are Falses where there should only be Trues"
            )
        if not proper_threshold_identification:
            errors.append(
                f"Threshold {col} returns too few tracts, are you sure it's nationally-representative?"
            )
    if errors:
        errors.append(config.error_message)
    return errors


def test_threshholds(final_score_df):
    climate_thresholds = ThresholdTestConfig(
        "climate",
        [
            field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD,
            field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD,
            field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD,
            field_names.HIGH_FUTURE_FLOOD_RISK_FIELD,
            field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD,
        ],
    )
    energy_thresholds = ThresholdTestConfig(
        "energy",
        [
            field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD,
            field_names.PM25_EXCEEDS_PCTILE_THRESHOLD,
        ],
    )
    transportation_thresholds = ThresholdTestConfig(
        "transportation",
        [
            field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD,
            field_names.DOT_BURDEN_PCTILE_THRESHOLD,
            field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD,
        ],
    )
    housing_thresholds = ThresholdTestConfig(
        "housing",
        [
            field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
            field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD,
            field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD,
            field_names.HOUSING_BURDEN_PCTILE_THRESHOLD,
            field_names.NON_NATURAL_PCTILE_THRESHOLD,
        ],
    )
    pollution_thresholds = ThresholdTestConfig(
        "pollution",
        [
            field_names.RMP_PCTILE_THRESHOLD,
            field_names.NPL_PCTILE_THRESHOLD,
            field_names.TSDF_PCTILE_THRESHOLD,
            field_names.AML_BOOLEAN,
            field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
        ],
    )
    water_thresholds = ThresholdTestConfig(
        "water",
        [
            field_names.WASTEWATER_PCTILE_THRESHOLD,
            field_names.UST_PCTILE_THRESHOLD,
        ],
    )
    health_thresholds = ThresholdTestConfig(
        "health",
        [
            field_names.DIABETES_PCTILE_THRESHOLD,
            field_names.ASTHMA_PCTILE_THRESHOLD,
            field_names.HEART_DISEASE_PCTILE_THRESHOLD,
            field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD,
        ],
    )
    workforce_base_thresholds = ThresholdTestConfig(
        "workforce (not island areas)",
        [
            field_names.UNEMPLOYMENT_PCTILE_THRESHOLD,
            field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD,
            field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD,
            field_names.POVERTY_PCTILE_THRESHOLD,
        ],
        ses_column_name=field_names.LOW_HS_EDUCATION_FIELD,
    )
    errors = []
    for threshhold_config in [
        climate_thresholds,
        energy_thresholds,
        transportation_thresholds,
        housing_thresholds,
        pollution_thresholds,
        water_thresholds,
        health_thresholds,
        workforce_base_thresholds,
    ]:
        errors.extend(
            check_for_threshhold_errors(final_score_df, threshhold_config)
        )
    error_text = "\n".join(errors)
    assert not errors, error_text


def test_max_40_percent_DAC(final_score_df):
    score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
    total_population_col = field_names.TOTAL_POP_FIELD
    assert (
        final_score_df[score_col_with_donuts].isna().sum() == 0
    ), f"Error: {score_col_with_donuts} contains NULLs"
    assert (
        final_score_df[final_score_df[score_col_with_donuts]][
            total_population_col
        ].sum()
        / final_score_df[total_population_col].sum()
    ) < 0.4, "Error: the scoring methodology identifies >40% of people in the US as disadvantaged"
    assert (
        final_score_df[score_col_with_donuts].sum() > 0
    ), "FYI: You've identified no tracts at all!"

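Note that the 40% ceiling is population-weighted rather than tract-weighted; a toy calculation (numbers invented) of the quantity being asserted:

    # Hypothetical totals: population in flagged tracts vs. nationwide population.
    flagged_population = 120_000_000
    total_population = 330_000_000
    share_identified = flagged_population / total_population  # ~0.36
    assert share_identified < 0.4  # what test_max_40_percent_DAC enforces
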
def test_donut_hole_addition_to_score_n(final_score_df):
    score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
    score_col = field_names.SCORE_N_COMMUNITIES
    donut_hole_score_only = (
        field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
    )
    count_donuts = final_score_df[donut_hole_score_only].sum()
    count_n = final_score_df[score_col].sum()
    count_n_with_donuts = final_score_df[score_col_with_donuts].sum()
    new_donuts = final_score_df[
        final_score_df[donut_hole_score_only] & ~final_score_df[score_col]
    ].shape[0]

    assert (
        new_donuts + count_n == count_n_with_donuts
    ), "The math doesn't work! The number of new donut hole tracts plus score tracts (base) does not equal the total number of tracts identified"

    assert (
        count_donuts < count_n
    ), "There are more donut hole tracts than base tracts. How can it be?"

    assert (
        new_donuts > 0
    ), "FYI: The adjacency index is doing nothing. Consider removing it?"

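A hypothetical four-tract example (invented data) of the accounting identity asserted above: donut-hole tracts that already pass the base score must not be double-counted, so only the genuinely new ones are added.

    import pandas as pd

    # donut = adjacency-based flag, base = base score, combined = final flag.
    toy = pd.DataFrame(
        {
            "donut": [True, True, False, False],
            "base": [True, False, True, False],
            "combined": [True, True, True, False],
        }
    )
    count_n = toy["base"].sum()  # 2 base-score tracts
    new_donuts = toy[toy["donut"] & ~toy["base"]].shape[0]  # 1 genuinely new
    assert new_donuts + count_n == toy["combined"].sum()  # 1 + 2 == 3
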
@ -87,6 +87,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
        logger.info("Generating Tribal mbtiles file")
        cmd = "tippecanoe "
        cmd += "--layer=blocks "
+       cmd += "--base-zoom=3 "
        cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} "
        cmd += f"--output={tribal_tiles_path}/usa.mbtiles "
        cmd += str(tribal_geojson_dir / "usa.json")

@ -95,10 +96,12 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
        # generate mvts
        logger.info("Generating Tribal mvt folders and files")
        cmd = "tippecanoe "
+       cmd += "--layer=blocks "
+       cmd += "--base-zoom=3 "
        cmd += "--no-tile-compression "
        cmd += "--drop-densest-as-needed "
        cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} "
-       cmd += f"--output-to-directory={tribal_tiles_path} --layer=blocks "
+       cmd += f"--output-to-directory={tribal_tiles_path} "
        cmd += str(tribal_geojson_dir / "usa.json")
        call(cmd, shell=True)

@ -149,7 +149,9 @@ def download_file_from_url(
        os.mkdir(download_file_name.parent)

    logger.info(f"Downloading {file_url}")
-   response = requests.get(file_url, verify=verify)
+   response = requests.get(
+       file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
+   )
    if response.status_code == 200:
        file_contents = response.content
    else:

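The added timeout turns a stalled download into a recoverable error instead of an indefinite hang, since requests.get blocks forever without one. A minimal sketch of the failure mode it guards against (the URL and 30-second value are illustrative, not the pipeline's actual setting):

    import requests

    try:
        # A finite timeout makes a silent, stalled server raise instead of block.
        response = requests.get("https://example.com/large-file.zip", timeout=30)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        print("Download timed out; retry or fail this pipeline step explicitly.")
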
@ -1,2 +1,4 @@
[pytest]
norecursedirs = .git data
+markers =
+    smoketest: marks a test as depending on the full score output
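Registering the marker both documents it and silences pytest's unknown-marker warnings. With it in place, the slow end-to-end checks can be selected or skipped from the command line: `pytest data_pipeline/ -m smoketest` runs only tests that depend on the full score output, while `pytest -m "not smoketest"` leaves them out of a quick local run.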