mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 01:54:18 -08:00
Score tests (#1847)
* update Python version on README; tuple typing fix * Alaska tribal points fix (#1821) * Bump mistune from 0.8.4 to 2.0.3 in /data/data-pipeline (#1777) Bumps [mistune](https://github.com/lepture/mistune) from 0.8.4 to 2.0.3. - [Release notes](https://github.com/lepture/mistune/releases) - [Changelog](https://github.com/lepture/mistune/blob/master/docs/changes.rst) - [Commits](https://github.com/lepture/mistune/compare/v0.8.4...v2.0.3) --- updated-dependencies: - dependency-name: mistune dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * poetry update * initial pass of score tests * add threshold tests * added ses threshold (not donut, not island) * testing suite -- stopping for the day * added test for lead proxy indicator * Refactor score tests to make them less verbose and more direct (#1865) * Cleanup tests slightly before refactor (#1846) * Refactor score calculations tests * Feedback from review * Refactor output tests like calculatoin tests (#1846) (#1870) * Reorganize files (#1846) * Switch from lru_cache to fixture scorpes (#1846) * Add tests for all factors (#1846) * Mark smoketests and run as part of be deply (#1846) * Update renamed var (#1846) * Switch from named tuple to dataclass (#1846) This is annoying, but pylint in python3.8 was crashing parsing the named tuple. We weren't using any namedtuple-specific features, so I made the type a dataclass just to get pylint to behave. 
* Add default timout to requests (#1846) * Fix type (#1846) * Fix merge mistake on poetry.lock (#1846) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov> Co-authored-by: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matt Bowen <83967628+mattbowen-usds@users.noreply.github.com> Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov>
This commit is contained in:
parent
e539db86ab
commit
1c4d3e4142
19 changed files with 1425 additions and 29 deletions
3
.github/workflows/deploy_be_staging.yml
vendored
3
.github/workflows/deploy_be_staging.yml
vendored
|
@ -62,6 +62,9 @@ jobs:
|
|||
- name: Generate Score Post
|
||||
run: |
|
||||
poetry run python3 data_pipeline/application.py generate-score-post -s aws
|
||||
- name: Run Smoketests
|
||||
run: |
|
||||
poetry run pytest data_pipeline/ -m smoketest
|
||||
- name: Deploy Score to Geoplatform AWS
|
||||
run: |
|
||||
poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
|
||||
|
|
|
@ -12,7 +12,7 @@ settings = Dynaconf(
|
|||
|
||||
# set root dir
|
||||
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
|
||||
|
||||
settings.REQUESTS_DEFAULT_TIMOUT = 3600
|
||||
# To set an environment use:
|
||||
# Linux/OSX: export ENV_FOR_DYNACONF=staging
|
||||
# Windows: set ENV_FOR_DYNACONF=staging
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import functools
|
||||
from collections import namedtuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
@ -496,10 +496,11 @@ class ScoreETL(ExtractTransformLoad):
|
|||
# >= some threshold.
|
||||
# TODO: Add more fields here.
|
||||
# https://github.com/usds/justice40-tool/issues/970
|
||||
ReversePercentile = namedtuple(
|
||||
typename="ReversePercentile",
|
||||
field_names=["field_name", "low_field_name"],
|
||||
)
|
||||
@dataclass
|
||||
class ReversePercentile:
|
||||
field_name: str
|
||||
low_field_name: str
|
||||
|
||||
reverse_percentiles = [
|
||||
# This dictionary follows the format:
|
||||
# <field name> : <field name for low values>
|
||||
|
|
|
@ -51,7 +51,7 @@ class GeoScoreETL(ExtractTransformLoad):
|
|||
|
||||
## TODO: We really should not have this any longer changing
|
||||
self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
|
||||
field_names.SCORE_N
|
||||
field_names.FINAL_SCORE_N_BOOLEAN
|
||||
]
|
||||
self.TARGET_SCORE_RENAME_TO = "SCORE"
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import List, NamedTuple, Tuple
|
||||
from typing import Any, List, NamedTuple, Tuple
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
|
||||
|
@ -41,7 +41,7 @@ def _prepare_dataframe_for_imputation(
|
|||
impute_var_named_tup_list: List[NamedTuple],
|
||||
geo_df: gpd.GeoDataFrame,
|
||||
geoid_field: str = "GEOID10_TRACT",
|
||||
) -> Tuple[list, gpd.GeoDataFrame]:
|
||||
) -> Tuple[Any, gpd.GeoDataFrame]:
|
||||
imputing_cols = [
|
||||
impute_var_pair.raw_field_name
|
||||
for impute_var_pair in impute_var_named_tup_list
|
||||
|
|
|
@ -282,12 +282,20 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
|
|||
|
||||
# Download MSA median incomes
|
||||
logger.info("Starting download of MSA median incomes.")
|
||||
download = requests.get(self.MSA_MEDIAN_INCOME_URL, verify=None)
|
||||
download = requests.get(
|
||||
self.MSA_MEDIAN_INCOME_URL,
|
||||
verify=None,
|
||||
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
|
||||
)
|
||||
self.msa_median_incomes = json.loads(download.content)
|
||||
|
||||
# Download state median incomes
|
||||
logger.info("Starting download of state median incomes.")
|
||||
download_state = requests.get(self.STATE_MEDIAN_INCOME_URL, verify=None)
|
||||
download_state = requests.get(
|
||||
self.STATE_MEDIAN_INCOME_URL,
|
||||
verify=None,
|
||||
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
|
||||
)
|
||||
self.state_median_incomes = json.loads(download_state.content)
|
||||
## NOTE we already have PR's MI here
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ import pandas as pd
|
|||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.utils import get_module_logger
|
||||
from data_pipeline.score import field_names
|
||||
from data_pipeline.config import settings
|
||||
|
||||
pd.options.mode.chained_assignment = "raise"
|
||||
|
||||
|
@ -270,7 +271,8 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
island["var_list"],
|
||||
island["fips"],
|
||||
county,
|
||||
)
|
||||
),
|
||||
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
|
||||
)
|
||||
|
||||
df = json.loads(download.content)
|
||||
|
|
|
@ -3,6 +3,7 @@ import requests
|
|||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.utils import get_module_logger
|
||||
from data_pipeline.config import settings
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
@ -26,7 +27,11 @@ class HudRecapETL(ExtractTransformLoad):
|
|||
|
||||
def extract(self) -> None:
|
||||
logger.info("Downloading HUD Recap Data")
|
||||
download = requests.get(self.HUD_RECAP_CSV_URL, verify=None)
|
||||
download = requests.get(
|
||||
self.HUD_RECAP_CSV_URL,
|
||||
verify=None,
|
||||
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
|
||||
)
|
||||
file_contents = download.content
|
||||
csv_file = open(self.HUD_RECAP_CSV, "wb")
|
||||
csv_file.write(file_contents)
|
||||
|
|
|
@ -0,0 +1,354 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c9fab286",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %load_ext lab_black\n",
|
||||
"import json\n",
|
||||
"import pandas as pd\n",
|
||||
"import geopandas as gpd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "dbd84e10",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "DriverError",
|
||||
"evalue": "/mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mCPLE_OpenFailedError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32mfiona/_shim.pyx\u001b[0m in \u001b[0;36mfiona._shim.gdal_open_vector\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mfiona/_err.pyx\u001b[0m in \u001b[0;36mfiona._err.exc_wrap_pointer\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;31mCPLE_OpenFailedError\u001b[0m: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory",
|
||||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||
"\u001b[0;31mDriverError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m/tmp/ipykernel_10603/1449522338.py\u001b[0m in \u001b[0;36m<cell line: 3>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Read in the score geojson file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mdata_pipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0metl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstants\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDATA_SCORE_CSV_TILES_FILE_PATH\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_SCORE_CSV_TILES_FILE_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/geopandas/io/file.py\u001b[0m in \u001b[0;36m_read_file\u001b[0;34m(filename, bbox, mask, rows, **kwargs)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mfiona_env\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_or_bytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfeatures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;31m# In a future Fiona release the crs attribute of features will\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/env.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlocal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_env\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 408\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 409\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/__init__.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m c = Collection(path, mode, driver=driver, encoding=encoding,\n\u001b[0m\u001b[1;32m 265\u001b[0m layer=layer, enabled_drivers=enabled_drivers, **kwargs)\n\u001b[1;32m 266\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/collection.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mWritingSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32mfiona/ogrext.pyx\u001b[0m in \u001b[0;36mfiona.ogrext.Session.start\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;32mfiona/_shim.pyx\u001b[0m in \u001b[0;36mfiona._shim.gdal_open_vector\u001b[0;34m()\u001b[0m\n",
|
||||
"\u001b[0;31mDriverError\u001b[0m: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Read in the score geojson file\n",
|
||||
"from data_pipeline.etl.score.constants import DATA_SCORE_CSV_TILES_FILE_PATH\n",
|
||||
"nation = gpd.read_file(DATA_SCORE_CSV_TILES_FILE_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2f850529",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f342d36",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get the columns of the df and sort the list:\n",
|
||||
"sorted_nation = sorted(nation.columns.to_list())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "97aac08f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"CLI to covert a pbf into a json file (requires tippecannoe and jq to be installed)\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"curl https://justice40-data.s3.amazonaws.com/data-pipeline-staging/1822/e6385c172f1d2adf588050375b7c0985035cfb24/data/score/tiles/high/8/67/101.pbf -o uh-1822-e638-8-67-101.pbf | tippecanoe-decode uh-1822-e638-8-67-101.pbf 8 67 101 | jq > cat uh-1822-e638-8-67-101.json\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cbe37ccb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load a random high-tile json (after decoding a pbf) file using json.loads()\n",
|
||||
"with open(\"/Users/vims/Downloads/uh-1822-e638-8-67-101.json\", \"r\") as f:\n",
|
||||
" random_tile_features = json.loads(f.read())\n",
|
||||
"\n",
|
||||
"# Flatten data around the features key:\n",
|
||||
"flatten_features = pd.json_normalize(random_tile_features, record_path=[\"features\"])\n",
|
||||
"\n",
|
||||
"# index into the feature properties, get keys and turn into a sorted list\n",
|
||||
"random_tile = sorted(list(flatten_features[\"features\"][0][0][\"properties\"].keys()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a33f5126",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"set_dif = set(sorted_nation).symmetric_difference(set(random_tile))\n",
|
||||
"list(set_dif)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d228360b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b6925138",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2f2d7ba0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>GEOID10</th>\n",
|
||||
" <th>SF</th>\n",
|
||||
" <th>CF</th>\n",
|
||||
" <th>HRS_ET</th>\n",
|
||||
" <th>AML_ET</th>\n",
|
||||
" <th>FUDS_ET</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>71</th>\n",
|
||||
" <td>27061480300</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Itasca County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75</th>\n",
|
||||
" <td>27061940000</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Itasca County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>115</th>\n",
|
||||
" <td>27077460400</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Lake of the Woods County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>127</th>\n",
|
||||
" <td>27123042001</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Ramsey County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>160</th>\n",
|
||||
" <td>27123033400</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Ramsey County</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74047</th>\n",
|
||||
" <td>16055000200</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Kootenai County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74068</th>\n",
|
||||
" <td>16011950500</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Bingham County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74076</th>\n",
|
||||
" <td>16001010503</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Ada County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74107</th>\n",
|
||||
" <td>16001001000</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Ada County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74123</th>\n",
|
||||
" <td>16001002100</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Ada County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>3170 rows × 6 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" GEOID10 SF CF HRS_ET AML_ET FUDS_ET\n",
|
||||
"71 27061480300 Minnesota Itasca County None None 0\n",
|
||||
"75 27061940000 Minnesota Itasca County None None 0\n",
|
||||
"115 27077460400 Minnesota Lake of the Woods County None None 0\n",
|
||||
"127 27123042001 Minnesota Ramsey County None None 0\n",
|
||||
"160 27123033400 Minnesota Ramsey County 0 None 0\n",
|
||||
"... ... ... ... ... ... ...\n",
|
||||
"74047 16055000200 Idaho Kootenai County None None 0\n",
|
||||
"74068 16011950500 Idaho Bingham County None None 0\n",
|
||||
"74076 16001010503 Idaho Ada County None None 0\n",
|
||||
"74107 16001001000 Idaho Ada County None None 0\n",
|
||||
"74123 16001002100 Idaho Ada County None None 0\n",
|
||||
"\n",
|
||||
"[3170 rows x 6 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 75,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_HRS_GEO = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'FUDS_ET']]\n",
|
||||
"nation_HRS_GEO.loc[nation_HRS_GEO['FUDS_ET'] == '0']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "02eef4b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "678bea72",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([None, '0', '1'], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation['HRS_ET'].unique()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8.10 ('data-pipeline-WziHKidv-py3.8')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "c28609757c27a373a12dad8bc3a2aec46aa91130799a09665fba7d386f9c3756"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,496 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "27da604f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %load_ext lab_black\n",
|
||||
"import json\n",
|
||||
"import pandas as pd\n",
|
||||
"import geopandas as gpd\n",
|
||||
"\n",
|
||||
"# Read in the above json file\n",
|
||||
"nation=gpd.read_file(\"/Users/vims/Downloads/usa-high-1822-637b.json\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "7b7083fd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0 None\n",
|
||||
"1 None\n",
|
||||
"2 None\n",
|
||||
"3 None\n",
|
||||
"4 None\n",
|
||||
" ... \n",
|
||||
"74129 None\n",
|
||||
"74130 None\n",
|
||||
"74131 None\n",
|
||||
"74132 None\n",
|
||||
"74133 None\n",
|
||||
"Name: FUDS_RAW, Length: 74134, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation['FUDS_RAW']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "117477e6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>GEOID10</th>\n",
|
||||
" <th>SF</th>\n",
|
||||
" <th>CF</th>\n",
|
||||
" <th>HRS_ET</th>\n",
|
||||
" <th>AML_ET</th>\n",
|
||||
" <th>AML_RAW</th>\n",
|
||||
" <th>FUDS_ET</th>\n",
|
||||
" <th>FUDS_RAW</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>27139080202</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Scott County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>27139080204</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Scott County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>27139080100</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Scott County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>27139080302</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Scott County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>27139080400</td>\n",
|
||||
" <td>Minnesota</td>\n",
|
||||
" <td>Scott County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74129</th>\n",
|
||||
" <td>16005001601</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Bannock County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74130</th>\n",
|
||||
" <td>16005001300</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Bannock County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74131</th>\n",
|
||||
" <td>16005001000</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Bannock County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74132</th>\n",
|
||||
" <td>16005000900</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Bannock County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>74133</th>\n",
|
||||
" <td>16005000800</td>\n",
|
||||
" <td>Idaho</td>\n",
|
||||
" <td>Bannock County</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>74134 rows × 8 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" GEOID10 SF CF HRS_ET AML_ET AML_RAW FUDS_ET \\\n",
|
||||
"0 27139080202 Minnesota Scott County None False None False \n",
|
||||
"1 27139080204 Minnesota Scott County None False None False \n",
|
||||
"2 27139080100 Minnesota Scott County None False None False \n",
|
||||
"3 27139080302 Minnesota Scott County None False None False \n",
|
||||
"4 27139080400 Minnesota Scott County None False None False \n",
|
||||
"... ... ... ... ... ... ... ... \n",
|
||||
"74129 16005001601 Idaho Bannock County None False None False \n",
|
||||
"74130 16005001300 Idaho Bannock County None False None False \n",
|
||||
"74131 16005001000 Idaho Bannock County None False None False \n",
|
||||
"74132 16005000900 Idaho Bannock County None False None False \n",
|
||||
"74133 16005000800 Idaho Bannock County None False None False \n",
|
||||
"\n",
|
||||
" FUDS_RAW \n",
|
||||
"0 None \n",
|
||||
"1 None \n",
|
||||
"2 None \n",
|
||||
"3 None \n",
|
||||
"4 None \n",
|
||||
"... ... \n",
|
||||
"74129 None \n",
|
||||
"74130 None \n",
|
||||
"74131 None \n",
|
||||
"74132 None \n",
|
||||
"74133 None \n",
|
||||
"\n",
|
||||
"[74134 rows x 8 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'AML_RAW','FUDS_ET', 'FUDS_RAW']]\n",
|
||||
"nation_new_ind"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"id": "0f37acf4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([None, '0', '1'], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 68,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['HRS_ET'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"id": "4ae865ae",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0 8843\n",
|
||||
"1 4045\n",
|
||||
"Name: HRS_ET, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['HRS_ET'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"id": "2f0d29db",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([False, True])"
|
||||
]
|
||||
},
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['AML_ET'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"id": "646b3754",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False 72100\n",
|
||||
"True 2034\n",
|
||||
"Name: AML_ET, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['AML_ET'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "0571df6d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([None, '1'], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['AML_RAW'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"id": "171fa3c9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1 2034\n",
|
||||
"Name: AML_RAW, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['AML_RAW'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 60,
|
||||
"id": "370b0769",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([False, True])"
|
||||
]
|
||||
},
|
||||
"execution_count": 60,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['FUDS_ET'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"id": "f8afb668",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False 72056\n",
|
||||
"True 2078\n",
|
||||
"Name: FUDS_ET, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 62,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['FUDS_ET'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"id": "f2e3b78a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([None, '0', '1'], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 63,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['FUDS_RAW'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"id": "b722e802",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0 3170\n",
|
||||
"1 2078\n",
|
||||
"Name: FUDS_RAW, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nation_new_ind['FUDS_RAW'].value_counts()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -10,7 +10,9 @@ STATE_FIELD = "State/Territory"
|
|||
COUNTY_FIELD = "County Name"
|
||||
|
||||
# Definition Narwhal fields
# Output column-name constants for the "Narwhal" (Definition N) score.
SCORE_N = "Definition N (communities)"
# NOTE(review): the label below says "Definition M" even though this is
# the Definition N boolean -- presumably retained for downstream output
# compatibility; confirm before renaming the column string.
FINAL_SCORE_N_BOOLEAN = (
    "Definition M community, including adjacency index tracts"
)
# NOTE(review): same column string as SCORE_N above -- verify the two
# constants are intentionally aliases of one column.
SCORE_N_COMMUNITIES = "Definition N (communities)"
N_CLIMATE = "Climate Factor (Definition N)"
N_ENERGY = "Energy Factor (Definition N)"
|
||||
|
|
|
@ -14,20 +14,17 @@ logger = get_module_logger(__name__)
|
|||
class ScoreNarwhal(Score):
|
||||
"""Very similar to Score M, at present."""
|
||||
|
||||
def __init__(self, df: pd.DataFrame) -> None:
|
||||
self.LOW_INCOME_THRESHOLD: float = 0.65
|
||||
self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
|
||||
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
||||
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
||||
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
|
||||
LOW_INCOME_THRESHOLD: float = 0.65
|
||||
MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
|
||||
ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
||||
MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
||||
LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
|
||||
|
||||
# We define a donut hole DAC as a tract that is entirely surrounded by
|
||||
# DACs (score threshold = 1) and above median for low income, as a starting
|
||||
# point. As we ground-truth, these thresholds might change.
|
||||
self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50
|
||||
self.SCORE_THRESHOLD_DONUT: float = 1.00
|
||||
|
||||
super().__init__(df)
|
||||
# We define a donut hole DAC as a tract that is entirely surrounded by
|
||||
# DACs (score threshold = 1) and above median for low income, as a starting
|
||||
# point. As we ground-truth, these thresholds might change.
|
||||
LOW_INCOME_THRESHOLD_DONUT: float = 0.50
|
||||
SCORE_THRESHOLD_DONUT: float = 1.00
|
||||
|
||||
def _combine_island_areas_with_states_and_set_thresholds(
|
||||
self,
|
||||
|
|
|
@ -52,3 +52,16 @@ def mock_etl(monkeypatch, mock_paths) -> None:
|
|||
data_path, tmp_path = mock_paths
|
||||
monkeypatch.setattr(ExtractTransformLoad, "DATA_PATH", data_path)
|
||||
monkeypatch.setattr(ExtractTransformLoad, "TMP_PATH", tmp_path)
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Skip smoketest-marked tests unless the user selected them.

    When a keyword (-k) or marker (-m) expression was supplied, defer
    entirely to pytest's own selection. Otherwise attach a skip marker
    to every collected test carrying the "smoketest" keyword, so the
    slow, full-score-dependent smoketests do not run by default.
    """
    # An explicit -k/-m expression means the user is choosing tests;
    # let pytest's built-in filtering decide what runs.
    if config.option.keyword or config.option.markexpr:
        return

    marker_name = "smoketest"
    skip_marker = pytest.mark.skip(reason=f"{marker_name} not selected")
    for test_item in items:
        if marker_name in test_item.keywords:
            test_item.add_marker(skip_marker)
|
||||
|
|
12
data/data-pipeline/data_pipeline/tests/score/fixtures.py
Normal file
12
data/data-pipeline/data_pipeline/tests/score/fixtures.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.score import field_names
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def final_score_df():
    """Load the full national score CSV once per test session.

    The tract GEOID column is read as a string so leading zeros in
    census tract IDs are preserved.
    """
    score_csv_path = (
        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv"
    )
    return pd.read_csv(
        score_csv_path,
        dtype={field_names.GEOID_TRACT_FIELD: str},
    )
|
291
data/data-pipeline/data_pipeline/tests/score/test_calculation.py
Normal file
291
data/data-pipeline/data_pipeline/tests/score/test_calculation.py
Normal file
|
@ -0,0 +1,291 @@
|
|||
# flake8: noqa: W0613,W0611,F811
|
||||
from dataclasses import dataclass
|
||||
import pytest
|
||||
from data_pipeline.score import field_names
|
||||
from data_pipeline.utils import get_module_logger
|
||||
from data_pipeline.score.score_narwhal import ScoreNarwhal
|
||||
from .fixtures import final_score_df # pylint: disable=unused-import
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
pytestmark = pytest.mark.smoketest
|
||||
|
||||
|
||||
@dataclass
|
||||
class PercentileTestConfig:
|
||||
percentile_column_name: str
|
||||
threshold_column_name: str
|
||||
threshold: float
|
||||
percentile_column_need_suffix: bool = True
|
||||
|
||||
@property
|
||||
def full_percentile_column_name(self):
|
||||
if self.percentile_column_need_suffix:
|
||||
return (
|
||||
self.percentile_column_name
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||
)
|
||||
return self.percentile_column_name
|
||||
|
||||
|
||||
### TODO: we need to blow this out for all eight categories
|
||||
def _check_percentile_against_threshold(df, config: PercentileTestConfig):
|
||||
"""Note - for the purpose of testing, this fills with False"""
|
||||
is_minimum_flagged_ok = (
|
||||
df[df[config.threshold_column_name].fillna(False)][
|
||||
config.full_percentile_column_name
|
||||
].min()
|
||||
>= config.threshold
|
||||
)
|
||||
|
||||
is_maximum_not_flagged_ok = (
|
||||
df[~df[config.threshold_column_name].fillna(False)][
|
||||
config.full_percentile_column_name
|
||||
].max()
|
||||
< config.threshold
|
||||
)
|
||||
errors = []
|
||||
if not is_minimum_flagged_ok:
|
||||
errors.append(
|
||||
f"For column {config.threshold_column_name}, there is someone flagged below {config.threshold} percentile!"
|
||||
)
|
||||
if not is_maximum_not_flagged_ok:
|
||||
errors.append(
|
||||
f"For column {config.threshold_column_name}, there is someone not flagged above {config.threshold} percentile!"
|
||||
)
|
||||
return errors
|
||||
|
||||
|
||||
def test_percentile_columns(final_score_df):
    """Every percentile-driven boolean threshold column must agree
    with its underlying percentile column.

    Most indicators share the environmental-burden percentile cutoff;
    the income, education, and donut-hole indicators use their own
    thresholds. The lead paint proxy is a combined threshold and is
    covered separately in test_lead_paint_indicator.
    """
    env_threshold = ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD
    # (percentile column, boolean threshold column) pairs that all use
    # the shared environmental-burden percentile cutoff.
    environmental_burden_pairs = [
        (
            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
            field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD,
        ),
        (
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
            field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD,
        ),
        (
            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
            field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD,
        ),
        (
            field_names.FUTURE_FLOOD_RISK_FIELD,
            field_names.HIGH_FUTURE_FLOOD_RISK_FIELD,
        ),
        (
            field_names.FUTURE_WILDFIRE_RISK_FIELD,
            field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD,
        ),
        (
            field_names.DOT_TRAVEL_BURDEN_FIELD,
            field_names.DOT_BURDEN_PCTILE_THRESHOLD,
        ),
        (
            field_names.DIESEL_FIELD,
            field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD,
        ),
        (
            field_names.TRAFFIC_FIELD,
            field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD,
        ),
        (
            field_names.ENERGY_BURDEN_FIELD,
            field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD,
        ),
        (
            field_names.PM25_FIELD,
            field_names.PM25_EXCEEDS_PCTILE_THRESHOLD,
        ),
        (
            field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD,
            field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD,
        ),
        # Leadpaint is handled below in a separate method
        (
            field_names.HOUSING_BURDEN_FIELD,
            field_names.HOUSING_BURDEN_PCTILE_THRESHOLD,
        ),
        (
            field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME,
            field_names.NON_NATURAL_PCTILE_THRESHOLD,
        ),
        (
            field_names.RMP_FIELD,
            field_names.RMP_PCTILE_THRESHOLD,
        ),
        (
            field_names.NPL_FIELD,
            field_names.NPL_PCTILE_THRESHOLD,
        ),
        (
            field_names.TSDF_FIELD,
            field_names.TSDF_PCTILE_THRESHOLD,
        ),
        (
            field_names.WASTEWATER_FIELD,
            field_names.WASTEWATER_PCTILE_THRESHOLD,
        ),
        (
            field_names.UST_FIELD,
            field_names.UST_PCTILE_THRESHOLD,
        ),
        (
            field_names.DIABETES_FIELD,
            field_names.DIABETES_PCTILE_THRESHOLD,
        ),
        (
            field_names.ASTHMA_FIELD,
            field_names.ASTHMA_PCTILE_THRESHOLD,
        ),
        (
            field_names.HEART_DISEASE_FIELD,
            field_names.HEART_DISEASE_PCTILE_THRESHOLD,
        ),
        (
            field_names.LOW_LIFE_EXPECTANCY_FIELD,
            field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD,
        ),
        (
            field_names.UNEMPLOYMENT_FIELD,
            field_names.UNEMPLOYMENT_PCTILE_THRESHOLD,
        ),
        (
            field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
            field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD,
        ),
        (
            field_names.LINGUISTIC_ISO_FIELD,
            field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD,
        ),
        (
            field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
            field_names.POVERTY_PCTILE_THRESHOLD,
        ),
    ]
    configs = [
        PercentileTestConfig(percentile_col, threshold_col, env_threshold)
        for percentile_col, threshold_col in environmental_burden_pairs
    ]
    # Indicators with their own cutoffs or pre-suffixed column names.
    configs.extend(
        [
            PercentileTestConfig(
                field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
                field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED,
                ScoreNarwhal.LOW_INCOME_THRESHOLD,
            ),
            PercentileTestConfig(
                field_names.HIGH_SCHOOL_ED_FIELD,
                field_names.LOW_HS_EDUCATION_FIELD,
                ScoreNarwhal.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD,
                percentile_column_need_suffix=False,
            ),
            PercentileTestConfig(
                field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
                field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS,
                ScoreNarwhal.LOW_INCOME_THRESHOLD_DONUT,
            ),
            PercentileTestConfig(
                (
                    field_names.SCORE_N_COMMUNITIES
                    + field_names.ADJACENCY_INDEX_SUFFIX
                ),
                field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD,
                ScoreNarwhal.SCORE_THRESHOLD_DONUT,
                percentile_column_need_suffix=False,
            ),
        ]
    )
    errors = []
    for config in configs:
        errors.extend(
            _check_percentile_against_threshold(final_score_df, config)
        )
    error_text = "\n".join(errors)
    assert not errors, error_text
|
||||
|
||||
|
||||
def test_lead_paint_indicator(
    final_score_df,
):
    """We need special logic here because this is a combined threshold, so we need this test to have two parts.

    1. We construct our own threshold columns
    2. We make sure it's the same as the threshold column in the dataframe
    """
    lead_percentile_col = (
        field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
    )
    house_value_percentile_col = (
        field_names.MEDIAN_HOUSE_VALUE_FIELD
        + field_names.PERCENTILE_FIELD_SUFFIX
    )

    # Recompute each half of the combined proxy from the raw
    # percentile columns.
    expected_lead_flag = (
        final_score_df[lead_percentile_col]
        >= ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    expected_house_value_flag = (
        final_score_df[house_value_percentile_col]
        <= ScoreNarwhal.MEDIAN_HOUSE_VALUE_THRESHOLD
    )
    expected_combined_proxy = expected_lead_flag & expected_house_value_flag

    assert (
        expected_house_value_flag.sum() > 0
    ), "MHV threshold alone does not capture any homes"

    assert final_score_df[
        field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD
    ].equals(expected_combined_proxy), "Lead proxy calculated improperly"
    assert (
        expected_lead_flag.sum() > expected_combined_proxy.sum()
    ), "House value is not further limiting this proxy"
|
205
data/data-pipeline/data_pipeline/tests/score/test_output.py
Normal file
205
data/data-pipeline/data_pipeline/tests/score/test_output.py
Normal file
|
@ -0,0 +1,205 @@
|
|||
# flake8: noqa: W0613,W0611,F811
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from data_pipeline.score import field_names
|
||||
from .fixtures import final_score_df # pylint: disable=unused-import
|
||||
|
||||
pytestmark = pytest.mark.smoketest
|
||||
|
||||
|
||||
def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
|
||||
"""Fills NA with False"""
|
||||
return df[df[col].fillna(False)].shape[0] >= error_check
|
||||
|
||||
|
||||
def _helper_single_threshold_test(df, col, socioeconomic_column, score_column):
|
||||
"""Note that this fills nulls in the threshold column where nulls exist"""
|
||||
nulls_dont_exist = (
|
||||
df[df[col].fillna(False) & df[socioeconomic_column]][score_column]
|
||||
.isna()
|
||||
.sum()
|
||||
== 0
|
||||
)
|
||||
only_trues = df[df[col].fillna(False) & df[socioeconomic_column]][
|
||||
score_column
|
||||
].min()
|
||||
return nulls_dont_exist, only_trues
|
||||
|
||||
|
||||
@dataclass
class ThresholdTestConfig:
    """One factor's threshold columns plus the socioeconomic and score
    columns they must be checked against.

    NOTE(review): "threshhold" is misspelled, but the name is used by
    callers in this file, so it is kept as-is.
    """

    # Human-readable factor name, used only in the error message.
    name: str
    # Boolean eligibility columns belonging to this factor.
    threshhold_columns: List[str]
    # Socioeconomic criterion column; defaults to the adjusted
    # 200%-FPL series used by most factors.
    ses_column_name: str = field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED
    # Final score boolean column the thresholds must imply.
    score_column_name: str = field_names.SCORE_N_COMMUNITIES

    @property
    def error_message(self):
        """Summary line appended when any column in this config fails."""
        return f"Eligibility columns have an error, {self.name}"
|
||||
|
||||
|
||||
def check_for_threshhold_errors(
    df: pd.DataFrame, config: ThresholdTestConfig
) -> List[str]:
    """Validate every threshold column in `config` against the data.

    For each column, checks that eligible rows have no NaN scores, that
    eligible rows are all scored True, and that the column flags a
    nationally-representative number of tracts. Returns the collected
    error strings (empty when everything is consistent).
    """
    errors = []
    for col in config.threshhold_columns:
        no_score_nulls, flagged_score_min = _helper_single_threshold_test(
            df,
            col,
            config.ses_column_name,
            config.score_column_name,
        )
        enough_tracts_flagged = _helper_test_count_exceeding_threshold(
            df, col
        )
        if not no_score_nulls:
            errors.append(
                f"For {col}, threshold is not calculated right -- there are NaNs in Score"
            )
        if not flagged_score_min:
            errors.append(
                f"For {col} and {config.ses_column_name}, threshold is not calculated right "
                f"-- there are Falses where there should only be Trues"
            )
        if not enough_tracts_flagged:
            errors.append(
                f"Threshold {col} returns too few tracts, are you sure it's nationally-representative?"
            )
    if errors:
        errors.append(config.error_message)
    return errors
|
||||
|
||||
|
||||
def test_threshholds(final_score_df):
    """Every factor's eligibility columns must be consistent with the
    socioeconomic criterion and the final score, and must each flag a
    nationally-representative number of tracts."""
    factor_configs = [
        ThresholdTestConfig(
            "climate",
            [
                field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD,
                field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD,
                field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD,
                field_names.HIGH_FUTURE_FLOOD_RISK_FIELD,
                field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD,
            ],
        ),
        ThresholdTestConfig(
            "energy",
            [
                field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD,
                field_names.PM25_EXCEEDS_PCTILE_THRESHOLD,
            ],
        ),
        ThresholdTestConfig(
            "transportation",
            [
                field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD,
                field_names.DOT_BURDEN_PCTILE_THRESHOLD,
                field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD,
            ],
        ),
        ThresholdTestConfig(
            "housing",
            [
                field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
                field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD,
                field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD,
                field_names.HOUSING_BURDEN_PCTILE_THRESHOLD,
                field_names.NON_NATURAL_PCTILE_THRESHOLD,
            ],
        ),
        ThresholdTestConfig(
            "pollution",
            [
                field_names.RMP_PCTILE_THRESHOLD,
                field_names.NPL_PCTILE_THRESHOLD,
                field_names.TSDF_PCTILE_THRESHOLD,
                field_names.AML_BOOLEAN,
                field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
            ],
        ),
        ThresholdTestConfig(
            "water",
            [
                field_names.WASTEWATER_PCTILE_THRESHOLD,
                field_names.UST_PCTILE_THRESHOLD,
            ],
        ),
        ThresholdTestConfig(
            "health",
            [
                field_names.DIABETES_PCTILE_THRESHOLD,
                field_names.ASTHMA_PCTILE_THRESHOLD,
                field_names.HEART_DISEASE_PCTILE_THRESHOLD,
                field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD,
            ],
        ),
        # Workforce uses low high-school education (not income) as its
        # socioeconomic criterion.
        ThresholdTestConfig(
            "workforce (not island areas)",
            [
                field_names.UNEMPLOYMENT_PCTILE_THRESHOLD,
                field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD,
                field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD,
                field_names.POVERTY_PCTILE_THRESHOLD,
            ],
            ses_column_name=field_names.LOW_HS_EDUCATION_FIELD,
        ),
    ]
    errors = []
    for factor_config in factor_configs:
        errors.extend(
            check_for_threshhold_errors(final_score_df, factor_config)
        )
    error_text = "\n".join(errors)
    assert not errors, error_text
|
||||
|
||||
|
||||
def test_max_40_percent_DAC(final_score_df):
    """The final (donut-inclusive) score must identify a non-empty,
    NULL-free set of tracts covering under 40% of the US population."""
    score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
    total_population_col = field_names.TOTAL_POP_FIELD

    assert (
        final_score_df[score_col_with_donuts].isna().sum() == 0
    ), f"Error: {score_col_with_donuts} contains NULLs"

    identified_population = final_score_df[
        final_score_df[score_col_with_donuts]
    ][total_population_col].sum()
    national_population = final_score_df[total_population_col].sum()
    assert (
        identified_population / national_population
    ) < 0.4, "Error: the scoring methodology identifies >40% of people in the US as disadvantaged"

    assert (
        final_score_df[score_col_with_donuts].sum() > 0
    ), "FYI: You've identified no tracts at all!"
|
||||
|
||||
|
||||
def test_donut_hole_addition_to_score_n(final_score_df):
    """The donut-hole adjustment must add tracts to the base score
    without breaking the arithmetic.

    Tracts flagged only by the adjacency ("donut hole") column, plus
    base-score tracts, must exactly equal the combined total; the
    donut set must stay smaller than the base set; and the adjustment
    must add at least one tract.
    """
    combined_score_col = field_names.FINAL_SCORE_N_BOOLEAN
    base_score_col = field_names.SCORE_N_COMMUNITIES
    donut_hole_score_only = (
        field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
    )

    donut_count = final_score_df[donut_hole_score_only].sum()
    base_count = final_score_df[base_score_col].sum()
    combined_count = final_score_df[combined_score_col].sum()
    # Tracts flagged by the donut-hole column but not by the base score.
    newly_added_donuts = final_score_df[
        final_score_df[donut_hole_score_only]
        & ~final_score_df[base_score_col]
    ].shape[0]

    assert (
        newly_added_donuts + base_count == combined_count
    ), "The math doesn't work! The number of new donut hole tracts plus score tracts (base) does not equal the total number of tracts identified"

    assert (
        donut_count < base_count
    ), "There are more donut hole tracts than base tracts. How can it be?"

    assert (
        newly_added_donuts > 0
    ), "FYI: The adjacency index is doing nothing. Consider removing it?"
|
|
@ -87,6 +87,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
|
|||
logger.info("Generating Tribal mbtiles file")
|
||||
cmd = "tippecanoe "
|
||||
cmd += "--layer=blocks "
|
||||
cmd += "--base-zoom=3 "
|
||||
cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} "
|
||||
cmd += f"--output={tribal_tiles_path}/usa.mbtiles "
|
||||
cmd += str(tribal_geojson_dir / "usa.json")
|
||||
|
@ -95,10 +96,12 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
|
|||
# generate mvts
|
||||
logger.info("Generating Tribal mvt folders and files")
|
||||
cmd = "tippecanoe "
|
||||
cmd += "--layer=blocks "
|
||||
cmd += "--base-zoom=3 "
|
||||
cmd += "--no-tile-compression "
|
||||
cmd += "--drop-densest-as-needed "
|
||||
cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} "
|
||||
cmd += f"--output-to-directory={tribal_tiles_path} --layer=blocks "
|
||||
cmd += f"--output-to-directory={tribal_tiles_path} "
|
||||
cmd += str(tribal_geojson_dir / "usa.json")
|
||||
call(cmd, shell=True)
|
||||
|
||||
|
|
|
@ -149,7 +149,9 @@ def download_file_from_url(
|
|||
os.mkdir(download_file_name.parent)
|
||||
|
||||
logger.info(f"Downloading {file_url}")
|
||||
response = requests.get(file_url, verify=verify)
|
||||
response = requests.get(
|
||||
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
|
||||
)
|
||||
if response.status_code == 200:
|
||||
file_contents = response.content
|
||||
else:
|
||||
|
|
|
@ -1,2 +1,4 @@
|
|||
[pytest]
|
||||
norecursedirs = .git data
|
||||
markers =
|
||||
smoketest: marks a test as depending on the full score output
|
||||
|
|
Loading…
Add table
Reference in a new issue