Score tests (#1847)

* update Python version on README; tuple typing fix

* Alaska tribal points fix (#1821)

* Bump mistune from 0.8.4 to 2.0.3 in /data/data-pipeline (#1777)

Bumps [mistune](https://github.com/lepture/mistune) from 0.8.4 to 2.0.3.
- [Release notes](https://github.com/lepture/mistune/releases)
- [Changelog](https://github.com/lepture/mistune/blob/master/docs/changes.rst)
- [Commits](https://github.com/lepture/mistune/compare/v0.8.4...v2.0.3)

---
updated-dependencies:
- dependency-name: mistune
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* poetry update

* initial pass of score tests

* add threshold tests

* added SES threshold (not donut, not island)

* testing suite -- stopping for the day

* added test for lead proxy indicator

* Refactor score tests to make them less verbose and more direct (#1865)

* Cleanup tests slightly before refactor (#1846)

* Refactor score calculations tests

* Feedback from review

* Refactor output tests like calculation tests (#1846) (#1870)

* Reorganize files (#1846)

* Switch from lru_cache to fixture scopes (#1846)

* Add tests for all factors (#1846)

* Mark smoketests and run as part of BE deploy (#1846)

* Update renamed var (#1846)

* Switch from named tuple to dataclass (#1846)

This is annoying, but pylint on Python 3.8 was crashing while parsing the
namedtuple. We weren't using any namedtuple-specific features, so I made the
type a dataclass just to get pylint to behave.

* Add default timeout to requests (#1846)

* Fix type (#1846)

* Fix merge mistake on poetry.lock (#1846)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
Co-authored-by: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Matt Bowen <83967628+mattbowen-usds@users.noreply.github.com>
Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov>
Author: Emma Nechamkin
Date: 2022-08-26 15:23:20 -04:00 (committed via GitHub)
Parent: e539db86ab
Commit: 1c4d3e4142
19 changed files with 1425 additions and 29 deletions


@@ -62,6 +62,9 @@ jobs:
       - name: Generate Score Post
         run: |
           poetry run python3 data_pipeline/application.py generate-score-post -s aws
+      - name: Run Smoketests
+        run: |
+          poetry run pytest data_pipeline/ -m smoketest
       - name: Deploy Score to Geoplatform AWS
         run: |
           poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read


@@ -12,7 +12,7 @@ settings = Dynaconf(
 # set root dir
 settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
+settings.REQUESTS_DEFAULT_TIMOUT = 3600

 # To set an environment use:
 # Linux/OSX: export ENV_FOR_DYNACONF=staging
 # Windows: set ENV_FOR_DYNACONF=staging
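For context, a minimal sketch of the download pattern this setting enables in the ETL modules below (the URL is a hypothetical placeholder; the setting name deliberately keeps the "TIMOUT" spelling used throughout this PR):

```python
import requests

from data_pipeline.config import settings

# Without a timeout, requests.get() can block indefinitely on a stalled
# connection; the configured default bounds the connect and read phases.
url = "https://example.com/some-download.csv"  # hypothetical URL
response = requests.get(url, timeout=settings.REQUESTS_DEFAULT_TIMOUT)
```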


@@ -1,5 +1,5 @@
 import functools
-from collections import namedtuple
+from dataclasses import dataclass

 import numpy as np
 import pandas as pd
@@ -496,10 +496,11 @@ class ScoreETL(ExtractTransformLoad):
         # >= some threshold.
         # TODO: Add more fields here.
         # https://github.com/usds/justice40-tool/issues/970
-        ReversePercentile = namedtuple(
-            typename="ReversePercentile",
-            field_names=["field_name", "low_field_name"],
-        )
+        @dataclass
+        class ReversePercentile:
+            field_name: str
+            low_field_name: str
+
         reverse_percentiles = [
             # This dictionary follows the format:
             # <field name> : <field name for low values>
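A minimal sketch (with hypothetical field values) of why this swap is drop-in: the ETL only ever reads the two attributes by name, which a namedtuple and a dataclass expose identically.

```python
from dataclasses import dataclass


@dataclass
class ReversePercentile:
    field_name: str
    low_field_name: str


# Attribute access matches the old namedtuple one-for-one; nothing relied
# on tuple unpacking, indexing, or _replace(), so only pylint notices.
rp = ReversePercentile(
    field_name="Some raw field",      # hypothetical value
    low_field_name="Some low field",  # hypothetical value
)
print(rp.field_name, rp.low_field_name)
```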


@@ -51,7 +51,7 @@ class GeoScoreETL(ExtractTransformLoad):
         ## TODO: We really should not have this any longer changing
         self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
-            field_names.SCORE_N
+            field_names.FINAL_SCORE_N_BOOLEAN
         ]
         self.TARGET_SCORE_RENAME_TO = "SCORE"


@@ -1,4 +1,4 @@
-from typing import List, NamedTuple, Tuple
+from typing import Any, List, NamedTuple, Tuple

 import pandas as pd
 import geopandas as gpd
@@ -41,7 +41,7 @@ def _prepare_dataframe_for_imputation(
     impute_var_named_tup_list: List[NamedTuple],
     geo_df: gpd.GeoDataFrame,
     geoid_field: str = "GEOID10_TRACT",
-) -> Tuple[list, gpd.GeoDataFrame]:
+) -> Tuple[Any, gpd.GeoDataFrame]:
     imputing_cols = [
         impute_var_pair.raw_field_name
         for impute_var_pair in impute_var_named_tup_list


@@ -282,12 +282,20 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         # Download MSA median incomes
         logger.info("Starting download of MSA median incomes.")
-        download = requests.get(self.MSA_MEDIAN_INCOME_URL, verify=None)
+        download = requests.get(
+            self.MSA_MEDIAN_INCOME_URL,
+            verify=None,
+            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+        )
         self.msa_median_incomes = json.loads(download.content)

         # Download state median incomes
         logger.info("Starting download of state median incomes.")
-        download_state = requests.get(self.STATE_MEDIAN_INCOME_URL, verify=None)
+        download_state = requests.get(
+            self.STATE_MEDIAN_INCOME_URL,
+            verify=None,
+            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+        )
         self.state_median_incomes = json.loads(download_state.content)

         ## NOTE we already have PR's MI here


@@ -7,6 +7,7 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
 from data_pipeline.score import field_names
+from data_pipeline.config import settings

 pd.options.mode.chained_assignment = "raise"
@@ -270,7 +271,8 @@ class CensusDecennialETL(ExtractTransformLoad):
                     island["var_list"],
                     island["fips"],
                     county,
-                )
+                ),
+                timeout=settings.REQUESTS_DEFAULT_TIMOUT,
             )
             df = json.loads(download.content)


@@ -3,6 +3,7 @@ import requests
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings

 logger = get_module_logger(__name__)
@@ -26,7 +27,11 @@ class HudRecapETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Downloading HUD Recap Data")
-        download = requests.get(self.HUD_RECAP_CSV_URL, verify=None)
+        download = requests.get(
+            self.HUD_RECAP_CSV_URL,
+            verify=None,
+            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+        )
         file_contents = download.content
         csv_file = open(self.HUD_RECAP_CSV, "wb")
         csv_file.write(file_contents)


@@ -0,0 +1,354 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c9fab286",
"metadata": {},
"outputs": [],
"source": [
"# %load_ext lab_black\n",
"import json\n",
"import pandas as pd\n",
"import geopandas as gpd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "dbd84e10",
"metadata": {},
"outputs": [
{
"ename": "DriverError",
"evalue": "/mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mCPLE_OpenFailedError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32mfiona/_shim.pyx\u001b[0m in \u001b[0;36mfiona._shim.gdal_open_vector\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mfiona/_err.pyx\u001b[0m in \u001b[0;36mfiona._err.exc_wrap_pointer\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mCPLE_OpenFailedError\u001b[0m: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mDriverError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_10603/1449522338.py\u001b[0m in \u001b[0;36m<cell line: 3>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Read in the score geojson file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mdata_pipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0metl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstants\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDATA_SCORE_CSV_TILES_FILE_PATH\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_SCORE_CSV_TILES_FILE_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/geopandas/io/file.py\u001b[0m in \u001b[0;36m_read_file\u001b[0;34m(filename, bbox, mask, rows, **kwargs)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mfiona_env\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_or_bytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfeatures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;31m# In a future Fiona release the crs attribute of features will\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/env.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlocal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_env\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 408\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 409\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/__init__.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m c = Collection(path, mode, driver=driver, encoding=encoding,\n\u001b[0m\u001b[1;32m 265\u001b[0m layer=layer, enabled_drivers=enabled_drivers, **kwargs)\n\u001b[1;32m 266\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/collection.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mWritingSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mfiona/ogrext.pyx\u001b[0m in \u001b[0;36mfiona.ogrext.Session.start\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mfiona/_shim.pyx\u001b[0m in \u001b[0;36mfiona._shim.gdal_open_vector\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mDriverError\u001b[0m: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory"
]
}
],
"source": [
"# Read in the score geojson file\n",
"from data_pipeline.etl.score.constants import DATA_SCORE_CSV_TILES_FILE_PATH\n",
"nation = gpd.read_file(DATA_SCORE_CSV_TILES_FILE_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f850529",
"metadata": {},
"outputs": [],
"source": [
"nation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f342d36",
"metadata": {},
"outputs": [],
"source": [
"# get the columns of the df and sort the list:\n",
"sorted_nation = sorted(nation.columns.to_list())"
]
},
{
"cell_type": "markdown",
"id": "97aac08f",
"metadata": {},
"source": [
"CLI to convert a pbf into a json file (requires tippecanoe and jq to be installed)\n",
"\n",
"```bash\n",
"curl https://justice40-data.s3.amazonaws.com/data-pipeline-staging/1822/e6385c172f1d2adf588050375b7c0985035cfb24/data/score/tiles/high/8/67/101.pbf -o uh-1822-e638-8-67-101.pbf && tippecanoe-decode uh-1822-e638-8-67-101.pbf 8 67 101 | jq . > uh-1822-e638-8-67-101.json\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbe37ccb",
"metadata": {},
"outputs": [],
"source": [
"# load a random high-tile json (after decoding a pbf) file using json.loads()\n",
"with open(\"/Users/vims/Downloads/uh-1822-e638-8-67-101.json\", \"r\") as f:\n",
" random_tile_features = json.loads(f.read())\n",
"\n",
"# Flatten data around the features key:\n",
"flatten_features = pd.json_normalize(random_tile_features, record_path=[\"features\"])\n",
"\n",
"# index into the feature properties, get keys and turn into a sorted list\n",
"random_tile = sorted(list(flatten_features[\"features\"][0][0][\"properties\"].keys()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a33f5126",
"metadata": {},
"outputs": [],
"source": [
"set_dif = set(sorted_nation).symmetric_difference(set(random_tile))\n",
"list(set_dif)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d228360b",
"metadata": {},
"outputs": [],
"source": [
"nation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6925138",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f2d7ba0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>SF</th>\n",
" <th>CF</th>\n",
" <th>HRS_ET</th>\n",
" <th>AML_ET</th>\n",
" <th>FUDS_ET</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>27061480300</td>\n",
" <td>Minnesota</td>\n",
" <td>Itasca County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>27061940000</td>\n",
" <td>Minnesota</td>\n",
" <td>Itasca County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>27077460400</td>\n",
" <td>Minnesota</td>\n",
" <td>Lake of the Woods County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>27123042001</td>\n",
" <td>Minnesota</td>\n",
" <td>Ramsey County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>27123033400</td>\n",
" <td>Minnesota</td>\n",
" <td>Ramsey County</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74047</th>\n",
" <td>16055000200</td>\n",
" <td>Idaho</td>\n",
" <td>Kootenai County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74068</th>\n",
" <td>16011950500</td>\n",
" <td>Idaho</td>\n",
" <td>Bingham County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74076</th>\n",
" <td>16001010503</td>\n",
" <td>Idaho</td>\n",
" <td>Ada County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74107</th>\n",
" <td>16001001000</td>\n",
" <td>Idaho</td>\n",
" <td>Ada County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74123</th>\n",
" <td>16001002100</td>\n",
" <td>Idaho</td>\n",
" <td>Ada County</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3170 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 SF CF HRS_ET AML_ET FUDS_ET\n",
"71 27061480300 Minnesota Itasca County None None 0\n",
"75 27061940000 Minnesota Itasca County None None 0\n",
"115 27077460400 Minnesota Lake of the Woods County None None 0\n",
"127 27123042001 Minnesota Ramsey County None None 0\n",
"160 27123033400 Minnesota Ramsey County 0 None 0\n",
"... ... ... ... ... ... ...\n",
"74047 16055000200 Idaho Kootenai County None None 0\n",
"74068 16011950500 Idaho Bingham County None None 0\n",
"74076 16001010503 Idaho Ada County None None 0\n",
"74107 16001001000 Idaho Ada County None None 0\n",
"74123 16001002100 Idaho Ada County None None 0\n",
"\n",
"[3170 rows x 6 columns]"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_HRS_GEO = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'FUDS_ET']]\n",
"nation_HRS_GEO.loc[nation_HRS_GEO['FUDS_ET'] == '0']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02eef4b5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "678bea72",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([None, '0', '1'], dtype=object)"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation['HRS_ET'].unique()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.10 ('data-pipeline-WziHKidv-py3.8')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"vscode": {
"interpreter": {
"hash": "c28609757c27a373a12dad8bc3a2aec46aa91130799a09665fba7d386f9c3756"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
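The core check in the notebook above, distilled: a symmetric difference between the score CSV's columns and a decoded tile's properties surfaces any field present in only one artifact. A toy sketch with hypothetical column sets:

```python
# Hypothetical stand-ins for nation.columns and the decoded tile keys.
csv_columns = {"GEOID10", "SF", "CF", "HRS_ET"}
tile_properties = {"GEOID10", "SF", "CF", "AML_ET"}

# Fields present in exactly one of the two artifacts.
mismatch = csv_columns.symmetric_difference(tile_properties)
print(sorted(mismatch))  # ['AML_ET', 'HRS_ET']
```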


@@ -0,0 +1,496 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "27da604f",
"metadata": {},
"outputs": [],
"source": [
"# %load_ext lab_black\n",
"import json\n",
"import pandas as pd\n",
"import geopandas as gpd\n",
"\n",
"# Read in the above json file\n",
"nation=gpd.read_file(\"/Users/vims/Downloads/usa-high-1822-637b.json\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7b7083fd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 None\n",
"2 None\n",
"3 None\n",
"4 None\n",
" ... \n",
"74129 None\n",
"74130 None\n",
"74131 None\n",
"74132 None\n",
"74133 None\n",
"Name: FUDS_RAW, Length: 74134, dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation['FUDS_RAW']"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "117477e6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>SF</th>\n",
" <th>CF</th>\n",
" <th>HRS_ET</th>\n",
" <th>AML_ET</th>\n",
" <th>AML_RAW</th>\n",
" <th>FUDS_ET</th>\n",
" <th>FUDS_RAW</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>27139080202</td>\n",
" <td>Minnesota</td>\n",
" <td>Scott County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>27139080204</td>\n",
" <td>Minnesota</td>\n",
" <td>Scott County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>27139080100</td>\n",
" <td>Minnesota</td>\n",
" <td>Scott County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>27139080302</td>\n",
" <td>Minnesota</td>\n",
" <td>Scott County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>27139080400</td>\n",
" <td>Minnesota</td>\n",
" <td>Scott County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74129</th>\n",
" <td>16005001601</td>\n",
" <td>Idaho</td>\n",
" <td>Bannock County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74130</th>\n",
" <td>16005001300</td>\n",
" <td>Idaho</td>\n",
" <td>Bannock County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74131</th>\n",
" <td>16005001000</td>\n",
" <td>Idaho</td>\n",
" <td>Bannock County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74132</th>\n",
" <td>16005000900</td>\n",
" <td>Idaho</td>\n",
" <td>Bannock County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74133</th>\n",
" <td>16005000800</td>\n",
" <td>Idaho</td>\n",
" <td>Bannock County</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>74134 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 SF CF HRS_ET AML_ET AML_RAW FUDS_ET \\\n",
"0 27139080202 Minnesota Scott County None False None False \n",
"1 27139080204 Minnesota Scott County None False None False \n",
"2 27139080100 Minnesota Scott County None False None False \n",
"3 27139080302 Minnesota Scott County None False None False \n",
"4 27139080400 Minnesota Scott County None False None False \n",
"... ... ... ... ... ... ... ... \n",
"74129 16005001601 Idaho Bannock County None False None False \n",
"74130 16005001300 Idaho Bannock County None False None False \n",
"74131 16005001000 Idaho Bannock County None False None False \n",
"74132 16005000900 Idaho Bannock County None False None False \n",
"74133 16005000800 Idaho Bannock County None False None False \n",
"\n",
" FUDS_RAW \n",
"0 None \n",
"1 None \n",
"2 None \n",
"3 None \n",
"4 None \n",
"... ... \n",
"74129 None \n",
"74130 None \n",
"74131 None \n",
"74132 None \n",
"74133 None \n",
"\n",
"[74134 rows x 8 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'AML_RAW','FUDS_ET', 'FUDS_RAW']]\n",
"nation_new_ind"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "0f37acf4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([None, '0', '1'], dtype=object)"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['HRS_ET'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "4ae865ae",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 8843\n",
"1 4045\n",
"Name: HRS_ET, dtype: int64"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['HRS_ET'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "2f0d29db",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, True])"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['AML_ET'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "646b3754",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 72100\n",
"True 2034\n",
"Name: AML_ET, dtype: int64"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['AML_ET'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "0571df6d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([None, '1'], dtype=object)"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['AML_RAW'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "171fa3c9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 2034\n",
"Name: AML_RAW, dtype: int64"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['AML_RAW'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "370b0769",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, True])"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['FUDS_ET'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "f8afb668",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 72056\n",
"True 2078\n",
"Name: FUDS_ET, dtype: int64"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['FUDS_ET'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "f2e3b78a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([None, '0', '1'], dtype=object)"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['FUDS_RAW'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "b722e802",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 3170\n",
"1 2078\n",
"Name: FUDS_RAW, dtype: int64"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nation_new_ind['FUDS_RAW'].value_counts()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -10,7 +10,9 @@ STATE_FIELD = "State/Territory"
 COUNTY_FIELD = "County Name"

 # Definition Narwhal fields
-SCORE_N = "Definition N (communities)"
+FINAL_SCORE_N_BOOLEAN = (
+    "Definition M community, including adjacency index tracts"
+)
 SCORE_N_COMMUNITIES = "Definition N (communities)"
 N_CLIMATE = "Climate Factor (Definition N)"
 N_ENERGY = "Energy Factor (Definition N)"


@@ -14,20 +14,17 @@ logger = get_module_logger(__name__)
 class ScoreNarwhal(Score):
     """Very similar to Score M, at present."""

-    def __init__(self, df: pd.DataFrame) -> None:
-        self.LOW_INCOME_THRESHOLD: float = 0.65
-        self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
-        self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
-        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
-        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
+    LOW_INCOME_THRESHOLD: float = 0.65
+    MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
+    ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
+    MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
+    LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
     # We define a donut hole DAC as a tract that is entirely surrounded by
     # DACs (score threshold = 1) and above median for low income, as a starting
     # point. As we ground-truth, these thresholds might change.
-        self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50
-        self.SCORE_THRESHOLD_DONUT: float = 1.00
-
-        super().__init__(df)
+    LOW_INCOME_THRESHOLD_DONUT: float = 0.50
+    SCORE_THRESHOLD_DONUT: float = 1.00

     def _combine_island_areas_with_states_and_set_thresholds(
         self,
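A practical payoff of hoisting these to class attributes: the smoketests below can reference the exact production thresholds without constructing a `ScoreNarwhal`, which would require a full score DataFrame. A minimal sketch:

```python
from data_pipeline.score.score_narwhal import ScoreNarwhal

# Class attributes are readable without instantiation, so the test
# configs below can point at the same constants the score itself uses.
assert ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD == 0.90
assert ScoreNarwhal.LOW_INCOME_THRESHOLD == 0.65
```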


@@ -52,3 +52,16 @@ def mock_etl(monkeypatch, mock_paths) -> None:
     data_path, tmp_path = mock_paths
     monkeypatch.setattr(ExtractTransformLoad, "DATA_PATH", data_path)
     monkeypatch.setattr(ExtractTransformLoad, "TMP_PATH", tmp_path)
+
+
+def pytest_collection_modifyitems(config, items):
+    keywordexpr = config.option.keyword
+    markexpr = config.option.markexpr
+    if keywordexpr or markexpr:
+        return  # let pytest handle this
+
+    smoketest = "smoketest"
+    skip_mymarker = pytest.mark.skip(reason=f"{smoketest} not selected")
+    for item in items:
+        if smoketest in item.keywords:
+            item.add_marker(skip_mymarker)
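In effect, this hook makes the smoketests opt-in: with no `-k`/`-m` expression on the command line, every test carrying the `smoketest` marker gets a skip marker, while `-m smoketest` bypasses the hook and runs only those tests. A minimal sketch of a participating test:

```python
import pytest

# Skipped by a plain `pytest data_pipeline/`; selected (and run) by
# `pytest data_pipeline/ -m smoketest`, as in the deploy workflow above.
@pytest.mark.smoketest
def test_requires_full_score_output():
    assert True  # placeholder body for illustration
```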


@@ -0,0 +1,12 @@
import pandas as pd
import pytest

from data_pipeline.config import settings
from data_pipeline.score import field_names


@pytest.fixture(scope="session")
def final_score_df():
    return pd.read_csv(
        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
        dtype={field_names.GEOID_TRACT_FIELD: str},
    )
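Per the commit list above, this session-scoped fixture replaces the earlier `functools.lru_cache` approach: the full score CSV is read once per pytest run, and the same DataFrame is shared by every test that names the fixture. A minimal sketch of a consuming test (the assertion itself is illustrative):

```python
from data_pipeline.score import field_names
from .fixtures import final_score_df  # pylint: disable=unused-import

# Tests simply declare the fixture as a parameter; pytest caches the
# DataFrame for the whole session, playing the role lru_cache did before.
def test_tract_ids_are_strings(final_score_df):
    assert final_score_df[field_names.GEOID_TRACT_FIELD].dtype == object
```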


@@ -0,0 +1,291 @@
# flake8: noqa: W0613,W0611,F811
from dataclasses import dataclass

import pytest

from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.score.score_narwhal import ScoreNarwhal
from .fixtures import final_score_df  # pylint: disable=unused-import

logger = get_module_logger(__name__)

pytestmark = pytest.mark.smoketest


@dataclass
class PercentileTestConfig:
    percentile_column_name: str
    threshold_column_name: str
    threshold: float
    percentile_column_need_suffix: bool = True

    @property
    def full_percentile_column_name(self):
        if self.percentile_column_need_suffix:
            return (
                self.percentile_column_name
                + field_names.PERCENTILE_FIELD_SUFFIX
            )
        return self.percentile_column_name


### TODO: we need to blow this out for all eight categories
def _check_percentile_against_threshold(df, config: PercentileTestConfig):
    """Note - for the purpose of testing, this fills with False"""
    is_minimum_flagged_ok = (
        df[df[config.threshold_column_name].fillna(False)][
            config.full_percentile_column_name
        ].min()
        >= config.threshold
    )
    is_maximum_not_flagged_ok = (
        df[~df[config.threshold_column_name].fillna(False)][
            config.full_percentile_column_name
        ].max()
        < config.threshold
    )
    errors = []
    if not is_minimum_flagged_ok:
        errors.append(
            f"For column {config.threshold_column_name}, there is someone flagged below {config.threshold} percentile!"
        )
    if not is_maximum_not_flagged_ok:
        errors.append(
            f"For column {config.threshold_column_name}, there is someone not flagged above {config.threshold} percentile!"
        )
    return errors
def test_percentile_columns(final_score_df):
low_income = PercentileTestConfig(
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED,
ScoreNarwhal.LOW_INCOME_THRESHOLD,
)
population_loss = PercentileTestConfig(
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
agricultural_loss = PercentileTestConfig(
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
building_loss = PercentileTestConfig(
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
flood = PercentileTestConfig(
field_names.FUTURE_FLOOD_RISK_FIELD,
field_names.HIGH_FUTURE_FLOOD_RISK_FIELD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
wildfire = PercentileTestConfig(
field_names.FUTURE_WILDFIRE_RISK_FIELD,
field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
low_high_school = PercentileTestConfig(
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.LOW_HS_EDUCATION_FIELD,
ScoreNarwhal.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD,
percentile_column_need_suffix=False,
)
donut_hole_income = PercentileTestConfig(
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS,
ScoreNarwhal.LOW_INCOME_THRESHOLD_DONUT,
)
donut_hole_adjacency = PercentileTestConfig(
(field_names.SCORE_N_COMMUNITIES + field_names.ADJACENCY_INDEX_SUFFIX),
field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD,
ScoreNarwhal.SCORE_THRESHOLD_DONUT,
percentile_column_need_suffix=False,
)
diesel = PercentileTestConfig(
field_names.DIESEL_FIELD,
field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
dot_burden = PercentileTestConfig(
field_names.DOT_TRAVEL_BURDEN_FIELD,
field_names.DOT_BURDEN_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
traffic_proximity = PercentileTestConfig(
field_names.TRAFFIC_FIELD,
field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
energy_burden = PercentileTestConfig(
field_names.ENERGY_BURDEN_FIELD,
field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
pm25 = PercentileTestConfig(
field_names.PM25_FIELD,
field_names.PM25_EXCEEDS_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
kitchen_plumbing = PercentileTestConfig(
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD,
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
# Leadpaint is handled below in a separate method
housing = PercentileTestConfig(
field_names.HOUSING_BURDEN_FIELD,
field_names.HOUSING_BURDEN_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
non_natural_space = PercentileTestConfig(
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME,
field_names.NON_NATURAL_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
rmp = PercentileTestConfig(
field_names.RMP_FIELD,
field_names.RMP_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
npl = PercentileTestConfig(
field_names.NPL_FIELD,
field_names.NPL_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
tsdf = PercentileTestConfig(
field_names.TSDF_FIELD,
field_names.TSDF_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
wastewater = PercentileTestConfig(
field_names.WASTEWATER_FIELD,
field_names.WASTEWATER_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
ust = PercentileTestConfig(
field_names.UST_FIELD,
field_names.UST_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
diabetes = PercentileTestConfig(
field_names.DIABETES_FIELD,
field_names.DIABETES_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
asthma = PercentileTestConfig(
field_names.ASTHMA_FIELD,
field_names.ASTHMA_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
heart_disease = PercentileTestConfig(
field_names.HEART_DISEASE_FIELD,
field_names.HEART_DISEASE_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
low_life_expectancy = PercentileTestConfig(
field_names.LOW_LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
unemployment = PercentileTestConfig(
field_names.UNEMPLOYMENT_FIELD,
field_names.UNEMPLOYMENT_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
low_median_income = PercentileTestConfig(
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
linguist_isolation = PercentileTestConfig(
field_names.LINGUISTIC_ISO_FIELD,
field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
poverty = PercentileTestConfig(
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.POVERTY_PCTILE_THRESHOLD,
ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD,
)
errors = []
for threshold_config in (
low_income,
population_loss,
agricultural_loss,
building_loss,
flood,
wildfire,
low_high_school,
donut_hole_income,
donut_hole_adjacency,
dot_burden,
diesel,
traffic_proximity,
energy_burden,
pm25,
kitchen_plumbing,
housing,
non_natural_space,
rmp,
npl,
tsdf,
wastewater,
ust,
diabetes,
asthma,
heart_disease,
low_life_expectancy,
unemployment,
low_median_income,
linguist_isolation,
poverty,
):
errors.extend(
_check_percentile_against_threshold(
final_score_df, threshold_config
)
)
error_text = "\n".join(errors)
assert not errors, error_text
def test_lead_paint_indicator(
final_score_df,
):
"""We need special logic here because this is a combined threshold, so we need this test to have two parts.
1. We construct our own threshold columns
2. We make sure it's the same as the threshold column in the dataframe
"""
lead_pfs = (
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
)
home_val_pfs = (
field_names.MEDIAN_HOUSE_VALUE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
)
combined_proxy_boolean = field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD
tmp_lead_threshold = (
final_score_df[lead_pfs] >= ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD
)
tmp_mhv_threshold = (
final_score_df[home_val_pfs]
<= ScoreNarwhal.MEDIAN_HOUSE_VALUE_THRESHOLD
)
true_combined_proxy = tmp_lead_threshold & tmp_mhv_threshold
assert (
tmp_mhv_threshold.sum() > 0
), "MHV threshold alone does not capture any homes"
assert final_score_df[combined_proxy_boolean].equals(
true_combined_proxy
), "Lead proxy calculated improperly"
assert (
tmp_lead_threshold.sum() > true_combined_proxy.sum()
), "House value is not further limiting this proxy"
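To make the flagged/unflagged invariant concrete, a toy example (hypothetical data and column names) of exactly what `_check_percentile_against_threshold` enforces at a 0.90 threshold:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "burden (percentile)": [0.95, 0.91, 0.50, None],
        "burden exceeds threshold": [True, True, False, None],
    }
)
flagged = df["burden exceeds threshold"].fillna(False).astype(bool)

# Every flagged row sits at or above the cut; every unflagged row below it.
assert df.loc[flagged, "burden (percentile)"].min() >= 0.90
assert df.loc[~flagged, "burden (percentile)"].max() < 0.90
```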


@@ -0,0 +1,205 @@
# flake8: noqa: W0613,W0611,F811
from dataclasses import dataclass
from typing import List

import pytest
import pandas as pd

from data_pipeline.score import field_names
from .fixtures import final_score_df  # pylint: disable=unused-import

pytestmark = pytest.mark.smoketest


def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
    """Fills NA with False"""
    return df[df[col].fillna(False)].shape[0] >= error_check


def _helper_single_threshold_test(df, col, socioeconomic_column, score_column):
    """Note that this fills nulls in the threshold column where nulls exist"""
    nulls_dont_exist = (
        df[df[col].fillna(False) & df[socioeconomic_column]][score_column]
        .isna()
        .sum()
        == 0
    )
    only_trues = df[df[col].fillna(False) & df[socioeconomic_column]][
        score_column
    ].min()
    return nulls_dont_exist, only_trues


@dataclass
class ThresholdTestConfig:
    name: str
    threshold_columns: List[str]
    ses_column_name: str = field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED
    score_column_name: str = field_names.SCORE_N_COMMUNITIES

    @property
    def error_message(self):
        return f"Eligibility columns have an error, {self.name}"


def check_for_threshold_errors(
    df: pd.DataFrame, config: ThresholdTestConfig
) -> List[str]:
    errors = []
    for col in config.threshold_columns:
        nulls_dont_exist, only_trues = _helper_single_threshold_test(
            df,
            col,
            config.ses_column_name,
            config.score_column_name,
        )
        proper_threshold_identification = (
            _helper_test_count_exceeding_threshold(df, col)
        )
        if not nulls_dont_exist:
            errors.append(
                f"For {col}, threshold is not calculated right -- there are NaNs in Score"
            )
        if not only_trues:
            errors.append(
                f"For {col} and {config.ses_column_name}, threshold is not calculated right "
                f"-- there are Falses where there should only be Trues"
            )
        if not proper_threshold_identification:
            errors.append(
                f"Threshold {col} returns too few tracts, are you sure it's nationally-representative?"
            )
    if errors:
        errors.append(config.error_message)
    return errors
def test_thresholds(final_score_df):
climate_thresholds = ThresholdTestConfig(
"climate",
[
field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD,
field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD,
field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD,
field_names.HIGH_FUTURE_FLOOD_RISK_FIELD,
field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD,
],
)
energy_thresholds = ThresholdTestConfig(
"energy",
[
field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD,
field_names.PM25_EXCEEDS_PCTILE_THRESHOLD,
],
)
transportation_thresholds = ThresholdTestConfig(
"transportation",
[
field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD,
field_names.DOT_BURDEN_PCTILE_THRESHOLD,
field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD,
],
)
housing_thresholds = ThresholdTestConfig(
"housing",
[
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD,
field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD,
field_names.HOUSING_BURDEN_PCTILE_THRESHOLD,
field_names.NON_NATURAL_PCTILE_THRESHOLD,
],
)
pollution_thresholds = ThresholdTestConfig(
"pollution",
[
field_names.RMP_PCTILE_THRESHOLD,
field_names.NPL_PCTILE_THRESHOLD,
field_names.TSDF_PCTILE_THRESHOLD,
field_names.AML_BOOLEAN,
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
],
)
water_thresholds = ThresholdTestConfig(
"water",
[
field_names.WASTEWATER_PCTILE_THRESHOLD,
field_names.UST_PCTILE_THRESHOLD,
],
)
health_thresholds = ThresholdTestConfig(
"health",
[
field_names.DIABETES_PCTILE_THRESHOLD,
field_names.ASTHMA_PCTILE_THRESHOLD,
field_names.HEART_DISEASE_PCTILE_THRESHOLD,
field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD,
],
)
workforce_base_thresholds = ThresholdTestConfig(
"workforce (not island areas)",
[
field_names.UNEMPLOYMENT_PCTILE_THRESHOLD,
field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD,
field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD,
field_names.POVERTY_PCTILE_THRESHOLD,
],
ses_column_name=field_names.LOW_HS_EDUCATION_FIELD,
)
errors = []
for threshold_config in [
climate_thresholds,
energy_thresholds,
transportation_thresholds,
housing_thresholds,
pollution_thresholds,
water_thresholds,
health_thresholds,
workforce_base_thresholds,
]:
errors.extend(
check_for_threshold_errors(final_score_df, threshold_config)
)
error_text = "\n".join(errors)
assert not errors, error_text
def test_max_40_percent_DAC(final_score_df):
score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
total_population_col = field_names.TOTAL_POP_FIELD
assert (
final_score_df[score_col_with_donuts].isna().sum() == 0
), f"Error: {score_col_with_donuts} contains NULLs"
assert (
final_score_df[final_score_df[score_col_with_donuts]][
total_population_col
].sum()
/ final_score_df[total_population_col].sum()
) < 0.4, "Error: the scoring methodology identifies >40% of people in the US as disadvantaged"
assert (
final_score_df[score_col_with_donuts].sum() > 0
), "FYI: You've identified no tracts at all!"
def test_donut_hole_addition_to_score_n(final_score_df):
score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
score_col = field_names.SCORE_N_COMMUNITIES
donut_hole_score_only = (
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
)
count_donuts = final_score_df[donut_hole_score_only].sum()
count_n = final_score_df[score_col].sum()
count_n_with_donuts = final_score_df[score_col_with_donuts].sum()
new_donuts = final_score_df[
final_score_df[donut_hole_score_only] & ~final_score_df[score_col]
].shape[0]
assert (
new_donuts + count_n == count_n_with_donuts
), "The math doesn't work! The number of new donut hole tracts plus score tracts (base) does not equal the total number of tracts identified"
assert (
count_donuts < count_n
), "There are more donut hole tracts than base tracts. How can it be?"
assert (
new_donuts > 0
), "FYI: The adjacency index is doing nothing. Consider removing it?"
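The arithmetic behind `test_donut_hole_addition_to_score_n`, on toy data: the combined flag is the union of the base score and the adjacency-only (donut) flag, so newly added donut tracts plus base tracts must equal the combined count.

```python
import pandas as pd

# Hypothetical four-tract universe.
base = pd.Series([True, True, False, False])   # SCORE_N_COMMUNITIES
donut = pd.Series([True, False, True, False])  # adjacency (donut) flag
combined = base | donut                        # FINAL_SCORE_N_BOOLEAN

new_donuts = (donut & ~base).sum()             # 1 tract added by donuts
assert new_donuts + base.sum() == combined.sum()  # 1 + 2 == 3
```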


@@ -87,6 +87,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
         logger.info("Generating Tribal mbtiles file")
         cmd = "tippecanoe "
         cmd += "--layer=blocks "
+        cmd += "--base-zoom=3 "
         cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} "
         cmd += f"--output={tribal_tiles_path}/usa.mbtiles "
         cmd += str(tribal_geojson_dir / "usa.json")
@@ -95,10 +96,12 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
         # generate mvts
         logger.info("Generating Tribal mvt folders and files")
         cmd = "tippecanoe "
+        cmd += "--layer=blocks "
+        cmd += "--base-zoom=3 "
         cmd += "--no-tile-compression "
         cmd += "--drop-densest-as-needed "
         cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} "
-        cmd += f"--output-to-directory={tribal_tiles_path} --layer=blocks "
+        cmd += f"--output-to-directory={tribal_tiles_path} "
         cmd += str(tribal_geojson_dir / "usa.json")

         call(cmd, shell=True)


@@ -149,7 +149,9 @@ def download_file_from_url(
         os.mkdir(download_file_name.parent)

     logger.info(f"Downloading {file_url}")
-    response = requests.get(file_url, verify=verify)
+    response = requests.get(
+        file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
+    )
     if response.status_code == 200:
         file_contents = response.content
     else:


@@ -1,2 +1,4 @@
 [pytest]
 norecursedirs = .git data
+markers =
+    smoketest: marks a test as depending on the full score output