From e539db86ab4c031e2c3d35ef51ee0a49990865dd Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Fri, 26 Aug 2022 13:11:51 -0400 Subject: [PATCH 1/2] tuple type --- data/data-pipeline/README.md | 5 +++-- .../data_pipeline/etl/sources/census_acs/etl_imputations.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index 517c3ccb..3f46b22a 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -196,7 +196,7 @@ Here's a list of commands: ## Local development -You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. Also to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS. +You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. For score generation, you will need [libspatialindex](https://libspatialindex.org/en/latest/). And to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS. ### VSCode @@ -218,6 +218,7 @@ To install the above-named executables: - gdal: `brew install gdal` - Tippecanoe: `brew install tippecanoe` +- spatialindex: `brew install spatialindex` Note: For MacOS Monterey or M1 Macs, [you might need to follow these steps](https://stackoverflow.com/a/70880741) to install Scipy. @@ -229,7 +230,7 @@ If you want to run tile generation, please install TippeCanoe [following these i - Start a terminal - Change to this directory (`/data/data-pipeline/`) -- Make sure you have at least Python 3.7 installed: `python -V` or `python3 -V` +- Make sure you have at least Python 3.8 installed: `python -V` or `python3 -V` - We use [Poetry](https://python-poetry.org/) for managing dependencies and building the application. Please follow the instructions on their site to download. 
- Install Poetry requirements with `poetry install` diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py index 408a3341..22381477 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py @@ -1,4 +1,4 @@ -from typing import List, NamedTuple +from typing import List, NamedTuple, Tuple import pandas as pd import geopandas as gpd @@ -41,7 +41,7 @@ def _prepare_dataframe_for_imputation( impute_var_named_tup_list: List[NamedTuple], geo_df: gpd.GeoDataFrame, geoid_field: str = "GEOID10_TRACT", -) -> tuple[list, gpd.GeoDataFrame]: +) -> Tuple[list, gpd.GeoDataFrame]: imputing_cols = [ impute_var_pair.raw_field_name for impute_var_pair in impute_var_named_tup_list From 1c4d3e4142d83bf86cec32b307c8d33b833549a3 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com> Date: Fri, 26 Aug 2022 15:23:20 -0400 Subject: [PATCH 2/2] Score tests (#1847) * update Python version on README; tuple typing fix * Alaska tribal points fix (#1821) * Bump mistune from 0.8.4 to 2.0.3 in /data/data-pipeline (#1777) Bumps [mistune](https://github.com/lepture/mistune) from 0.8.4 to 2.0.3. - [Release notes](https://github.com/lepture/mistune/releases) - [Changelog](https://github.com/lepture/mistune/blob/master/docs/changes.rst) - [Commits](https://github.com/lepture/mistune/compare/v0.8.4...v2.0.3) --- updated-dependencies: - dependency-name: mistune dependency-type: indirect ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * poetry update * initial pass of score tests * add threshold tests * added ses threshold (not donut, not island) * testing suite -- stopping for the day * added test for lead proxy indicator * Refactor score tests to make them less verbose and more direct (#1865) * Cleanup tests slightly before refactor (#1846) * Refactor score calculations tests * Feedback from review * Refactor output tests like calculation tests (#1846) (#1870) * Reorganize files (#1846) * Switch from lru_cache to fixture scopes (#1846) * Add tests for all factors (#1846) * Mark smoketests and run as part of BE deploy (#1846) * Update renamed var (#1846) * Switch from named tuple to dataclass (#1846) This is annoying, but pylint in python3.8 was crashing parsing the named tuple. We weren't using any namedtuple-specific features, so I made the type a dataclass just to get pylint to behave (see the sketch below).
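For context, here is a minimal, self-contained sketch of the two typing fixes in this patch series, assuming only that the pipeline targets Python 3.8 (`ReversePercentile` mirrors the etl_score.py diff below; `as_pair` and the field values are hypothetical illustrations):

```python
from dataclasses import dataclass
from typing import Tuple


@dataclass
class ReversePercentile:
    # Same two fields the replaced namedtuple carried; pylint on
    # Python 3.8 crashed while parsing the namedtuple definition, and
    # no namedtuple-specific behavior was in use.
    field_name: str
    low_field_name: str


def as_pair(record: ReversePercentile) -> Tuple[str, str]:
    # typing.Tuple keeps the annotation usable on Python 3.8; the
    # builtin generic `tuple[str, str]` is only supported at runtime
    # from Python 3.9, which is why the diffs swap tuple[...] for
    # Tuple[...].
    return (record.field_name, record.low_field_name)


print(as_pair(ReversePercentile("some raw field", "some low field")))
```

A frozen dataclass (`@dataclass(frozen=True)`) would also preserve the namedtuple's immutability; the plain dataclass above is simply the smallest change that keeps pylint parsing.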
* Add default timeout to requests (#1846) * Fix type (#1846) * Fix merge mistake on poetry.lock (#1846) Signed-off-by: dependabot[bot] Co-authored-by: Jorge Escobar Co-authored-by: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matt Bowen <83967628+mattbowen-usds@users.noreply.github.com> Co-authored-by: matt bowen --- .github/workflows/deploy_be_staging.yml | 3 + data/data-pipeline/data_pipeline/config.py | 2 +- .../data_pipeline/etl/score/etl_score.py | 11 +- .../data_pipeline/etl/score/etl_score_geo.py | 2 +- .../etl/sources/census_acs/etl_imputations.py | 4 +- .../sources/census_acs_median_income/etl.py | 12 +- .../etl/sources/census_decennial/etl.py | 4 +- .../etl/sources/hud_recap/etl.py | 7 +- .../compare_tiles_and_geoJson_files.ipynb | 354 +++++++++++++ .../ipython/geojson_compare_tiles.ipynb | 496 ++++++++++++++++++ .../data_pipeline/score/field_names.py | 4 +- .../data_pipeline/score/score_narwhal.py | 23 +- .../data_pipeline/tests/conftest.py | 13 + .../data_pipeline/tests/score/fixtures.py | 12 + .../tests/score/test_calculation.py | 291 ++++++++++ .../data_pipeline/tests/score/test_output.py | 205 ++++++++ .../data_pipeline/tile/generate.py | 5 +- data/data-pipeline/data_pipeline/utils.py | 4 +- data/data-pipeline/pytest.ini | 2 + 19 files changed, 1425 insertions(+), 29 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/ipython/compare_tiles_and_geoJson_files.ipynb create mode 100644 data/data-pipeline/data_pipeline/ipython/geojson_compare_tiles.ipynb create mode 100644 data/data-pipeline/data_pipeline/tests/score/fixtures.py create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_calculation.py create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_output.py diff --git a/.github/workflows/deploy_be_staging.yml b/.github/workflows/deploy_be_staging.yml index fd324c73..8a10cf38 100644 --- a/.github/workflows/deploy_be_staging.yml +++ b/.github/workflows/deploy_be_staging.yml @@ -62,6 +62,9 @@ jobs: - name: Generate Score Post run: | poetry run python3 data_pipeline/application.py generate-score-post -s aws + - name: Run Smoketests + run: | + poetry run pytest data_pipeline/ -m smoketest - name: Deploy Score to Geoplatform AWS run: | poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read diff --git a/data/data-pipeline/data_pipeline/config.py b/data/data-pipeline/data_pipeline/config.py index c32389ca..23e550a8 100644 --- a/data/data-pipeline/data_pipeline/config.py +++ b/data/data-pipeline/data_pipeline/config.py @@ -12,7 +12,7 @@ settings = Dynaconf( # set root dir settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent - +settings.REQUESTS_DEFAULT_TIMOUT = 3600 # To set an environment use: # Linux/OSX: export ENV_FOR_DYNACONF=staging # Windows: set ENV_FOR_DYNACONF=staging diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 663dc8d7..cfcd123d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -1,5 +1,5 @@ import functools -from collections import namedtuple +from dataclasses import dataclass import numpy as np import pandas as pd @@ -496,10 +496,11 @@ class ScoreETL(ExtractTransformLoad): # >= some threshold.
# TODO: Add more fields here. # https://github.com/usds/justice40-tool/issues/970 - ReversePercentile = namedtuple( - typename="ReversePercentile", - field_names=["field_name", "low_field_name"], - ) + @dataclass + class ReversePercentile: + field_name: str + low_field_name: str + reverse_percentiles = [ # This dictionary follows the format: # : diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index 14f72ad2..4ad3cb58 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -51,7 +51,7 @@ class GeoScoreETL(ExtractTransformLoad): ## TODO: We really should not have this any longer changing self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[ - field_names.SCORE_N + field_names.FINAL_SCORE_N_BOOLEAN ] self.TARGET_SCORE_RENAME_TO = "SCORE" diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py index 22381477..17180026 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py @@ -1,4 +1,4 @@ -from typing import List, NamedTuple, Tuple +from typing import Any, List, NamedTuple, Tuple import pandas as pd import geopandas as gpd @@ -41,7 +41,7 @@ def _prepare_dataframe_for_imputation( impute_var_named_tup_list: List[NamedTuple], geo_df: gpd.GeoDataFrame, geoid_field: str = "GEOID10_TRACT", -) -> Tuple[list, gpd.GeoDataFrame]: +) -> Tuple[Any, gpd.GeoDataFrame]: imputing_cols = [ impute_var_pair.raw_field_name for impute_var_pair in impute_var_named_tup_list diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py index 32325842..a39f8891 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py @@ -282,12 +282,20 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): # Download MSA median incomes logger.info("Starting download of MSA median incomes.") - download = requests.get(self.MSA_MEDIAN_INCOME_URL, verify=None) + download = requests.get( + self.MSA_MEDIAN_INCOME_URL, + verify=None, + timeout=settings.REQUESTS_DEFAULT_TIMOUT, + ) self.msa_median_incomes = json.loads(download.content) # Download state median incomes logger.info("Starting download of state median incomes.") - download_state = requests.get(self.STATE_MEDIAN_INCOME_URL, verify=None) + download_state = requests.get( + self.STATE_MEDIAN_INCOME_URL, + verify=None, + timeout=settings.REQUESTS_DEFAULT_TIMOUT, + ) self.state_median_incomes = json.loads(download_state.content) ## NOTE we already have PR's MI here diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 56aa4745..ea503f62 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -7,6 +7,7 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.utils import get_module_logger from data_pipeline.score import field_names +from data_pipeline.config import settings pd.options.mode.chained_assignment = "raise" @@ 
-270,7 +271,8 @@ class CensusDecennialETL(ExtractTransformLoad): island["var_list"], island["fips"], county, - ) + ), + timeout=settings.REQUESTS_DEFAULT_TIMOUT, ) df = json.loads(download.content) diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py index c5f6ce63..cf611137 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py @@ -3,6 +3,7 @@ import requests from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.utils import get_module_logger +from data_pipeline.config import settings logger = get_module_logger(__name__) @@ -26,7 +27,11 @@ class HudRecapETL(ExtractTransformLoad): def extract(self) -> None: logger.info("Downloading HUD Recap Data") - download = requests.get(self.HUD_RECAP_CSV_URL, verify=None) + download = requests.get( + self.HUD_RECAP_CSV_URL, + verify=None, + timeout=settings.REQUESTS_DEFAULT_TIMOUT, + ) file_contents = download.content csv_file = open(self.HUD_RECAP_CSV, "wb") csv_file.write(file_contents) diff --git a/data/data-pipeline/data_pipeline/ipython/compare_tiles_and_geoJson_files.ipynb b/data/data-pipeline/data_pipeline/ipython/compare_tiles_and_geoJson_files.ipynb new file mode 100644 index 00000000..f3585578 --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/compare_tiles_and_geoJson_files.ipynb @@ -0,0 +1,354 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c9fab286", + "metadata": {}, + "outputs": [], + "source": [ + "# %load_ext lab_black\n", + "import json\n", + "import pandas as pd\n", + "import geopandas as gpd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dbd84e10", + "metadata": {}, + "outputs": [ + { + "ename": "DriverError", + "evalue": "/mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mCPLE_OpenFailedError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32mfiona/_shim.pyx\u001b[0m in \u001b[0;36mfiona._shim.gdal_open_vector\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mfiona/_err.pyx\u001b[0m in \u001b[0;36mfiona._err.exc_wrap_pointer\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mCPLE_OpenFailedError\u001b[0m: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mDriverError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_10603/1449522338.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Read in the score geojson file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mdata_pipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0metl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstants\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDATA_SCORE_CSV_TILES_FILE_PATH\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnation\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mgpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_SCORE_CSV_TILES_FILE_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/geopandas/io/file.py\u001b[0m in \u001b[0;36m_read_file\u001b[0;34m(filename, bbox, mask, rows, **kwargs)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mfiona_env\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_or_bytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfeatures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;31m# In a future Fiona release the crs attribute of features will\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/env.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlocal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_env\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 408\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 409\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/__init__.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(fp, mode, driver, schema, crs, encoding, layer, vfs, enabled_drivers, crs_wkt, **kwargs)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m c = Collection(path, mode, driver=driver, encoding=encoding,\n\u001b[0m\u001b[1;32m 265\u001b[0m layer=layer, enabled_drivers=enabled_drivers, **kwargs)\n\u001b[1;32m 
266\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.cache/pypoetry/virtualenvs/data-pipeline-WziHKidv-py3.8/lib/python3.8/site-packages/fiona/collection.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, mode, driver, schema, crs, encoding, layer, vsi, archive, enabled_drivers, crs_wkt, ignore_fields, ignore_geometry, **kwargs)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mWritingSession\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mfiona/ogrext.pyx\u001b[0m in \u001b[0;36mfiona.ogrext.Session.start\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mfiona/_shim.pyx\u001b[0m in \u001b[0;36mfiona._shim.gdal_open_vector\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mDriverError\u001b[0m: /mnt/e/opt/justice40-tool/data/data-pipeline/data_pipeline/data/score/csv/tiles/usa.csv: No such file or directory" + ] + } + ], + "source": [ + "# Read in the score geojson file\n", + "from data_pipeline.etl.score.constants import DATA_SCORE_CSV_TILES_FILE_PATH\n", + "nation = gpd.read_file(DATA_SCORE_CSV_TILES_FILE_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f850529", + "metadata": {}, + "outputs": [], + "source": [ + "nation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f342d36", + "metadata": {}, + "outputs": [], + "source": [ + "# get the columns of the df and sort the list:\n", + "sorted_nation = sorted(nation.columns.to_list())" + ] + }, + { + "cell_type": "markdown", + "id": "97aac08f", + "metadata": {}, + "source": [ + "CLI to covert a pbf into a json file (requires tippecannoe and jq to be installed)\n", + "\n", + "```bash\n", + "curl https://justice40-data.s3.amazonaws.com/data-pipeline-staging/1822/e6385c172f1d2adf588050375b7c0985035cfb24/data/score/tiles/high/8/67/101.pbf -o uh-1822-e638-8-67-101.pbf | tippecanoe-decode uh-1822-e638-8-67-101.pbf 8 67 101 | jq > cat uh-1822-e638-8-67-101.json\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbe37ccb", + "metadata": {}, + "outputs": [], + "source": [ + "# load a random high-tile json (after decoding a 
pbf) file using json.loads()\n", + "with open(\"/Users/vims/Downloads/uh-1822-e638-8-67-101.json\", \"r\") as f:\n", + " random_tile_features = json.loads(f.read())\n", + "\n", + "# Flatten data around the features key:\n", + "flatten_features = pd.json_normalize(random_tile_features, record_path=[\"features\"])\n", + "\n", + "# index into the feature properties, get keys and turn into a sorted list\n", + "random_tile = sorted(list(flatten_features[\"features\"][0][0][\"properties\"].keys()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a33f5126", + "metadata": {}, + "outputs": [], + "source": [ + "set_dif = set(sorted_nation).symmetric_difference(set(random_tile))\n", + "list(set_dif)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d228360b", + "metadata": {}, + "outputs": [], + "source": [ + "nation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6925138", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f2d7ba0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10SFCFHRS_ETAML_ETFUDS_ET
7127061480300MinnesotaItasca CountyNoneNone0
7527061940000MinnesotaItasca CountyNoneNone0
11527077460400MinnesotaLake of the Woods CountyNoneNone0
12727123042001MinnesotaRamsey CountyNoneNone0
16027123033400MinnesotaRamsey County0None0
.....................
7404716055000200IdahoKootenai CountyNoneNone0
7406816011950500IdahoBingham CountyNoneNone0
7407616001010503IdahoAda CountyNoneNone0
7410716001001000IdahoAda CountyNoneNone0
7412316001002100IdahoAda CountyNoneNone0
\n", + "

3170 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 SF CF HRS_ET AML_ET FUDS_ET\n", + "71 27061480300 Minnesota Itasca County None None 0\n", + "75 27061940000 Minnesota Itasca County None None 0\n", + "115 27077460400 Minnesota Lake of the Woods County None None 0\n", + "127 27123042001 Minnesota Ramsey County None None 0\n", + "160 27123033400 Minnesota Ramsey County 0 None 0\n", + "... ... ... ... ... ... ...\n", + "74047 16055000200 Idaho Kootenai County None None 0\n", + "74068 16011950500 Idaho Bingham County None None 0\n", + "74076 16001010503 Idaho Ada County None None 0\n", + "74107 16001001000 Idaho Ada County None None 0\n", + "74123 16001002100 Idaho Ada County None None 0\n", + "\n", + "[3170 rows x 6 columns]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_HRS_GEO = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'FUDS_ET']]\n", + "nation_HRS_GEO.loc[nation_HRS_GEO['FUDS_ET'] == '0']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02eef4b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "678bea72", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([None, '0', '1'], dtype=object)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation['HRS_ET'].unique()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('data-pipeline-WziHKidv-py3.8')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "c28609757c27a373a12dad8bc3a2aec46aa91130799a09665fba7d386f9c3756" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/data_pipeline/ipython/geojson_compare_tiles.ipynb b/data/data-pipeline/data_pipeline/ipython/geojson_compare_tiles.ipynb new file mode 100644 index 00000000..f134f9a6 --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/geojson_compare_tiles.ipynb @@ -0,0 +1,496 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "27da604f", + "metadata": {}, + "outputs": [], + "source": [ + "# %load_ext lab_black\n", + "import json\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "\n", + "# Read in the above json file\n", + "nation=gpd.read_file(\"/Users/vims/Downloads/usa-high-1822-637b.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7b7083fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 None\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + " ... \n", + "74129 None\n", + "74130 None\n", + "74131 None\n", + "74132 None\n", + "74133 None\n", + "Name: FUDS_RAW, Length: 74134, dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation['FUDS_RAW']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "117477e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10SFCFHRS_ETAML_ETAML_RAWFUDS_ETFUDS_RAW
027139080202MinnesotaScott CountyNoneFalseNoneFalseNone
127139080204MinnesotaScott CountyNoneFalseNoneFalseNone
227139080100MinnesotaScott CountyNoneFalseNoneFalseNone
327139080302MinnesotaScott CountyNoneFalseNoneFalseNone
427139080400MinnesotaScott CountyNoneFalseNoneFalseNone
...........................
7412916005001601IdahoBannock CountyNoneFalseNoneFalseNone
7413016005001300IdahoBannock CountyNoneFalseNoneFalseNone
7413116005001000IdahoBannock CountyNoneFalseNoneFalseNone
7413216005000900IdahoBannock CountyNoneFalseNoneFalseNone
7413316005000800IdahoBannock CountyNoneFalseNoneFalseNone
\n", + "

74134 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 SF CF HRS_ET AML_ET AML_RAW FUDS_ET \\\n", + "0 27139080202 Minnesota Scott County None False None False \n", + "1 27139080204 Minnesota Scott County None False None False \n", + "2 27139080100 Minnesota Scott County None False None False \n", + "3 27139080302 Minnesota Scott County None False None False \n", + "4 27139080400 Minnesota Scott County None False None False \n", + "... ... ... ... ... ... ... ... \n", + "74129 16005001601 Idaho Bannock County None False None False \n", + "74130 16005001300 Idaho Bannock County None False None False \n", + "74131 16005001000 Idaho Bannock County None False None False \n", + "74132 16005000900 Idaho Bannock County None False None False \n", + "74133 16005000800 Idaho Bannock County None False None False \n", + "\n", + " FUDS_RAW \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 None \n", + "4 None \n", + "... ... \n", + "74129 None \n", + "74130 None \n", + "74131 None \n", + "74132 None \n", + "74133 None \n", + "\n", + "[74134 rows x 8 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind = nation[['GEOID10', 'SF', 'CF', 'HRS_ET', 'AML_ET', 'AML_RAW','FUDS_ET', 'FUDS_RAW']]\n", + "nation_new_ind" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "0f37acf4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([None, '0', '1'], dtype=object)" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['HRS_ET'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "4ae865ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 8843\n", + "1 4045\n", + "Name: HRS_ET, dtype: int64" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['HRS_ET'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "2f0d29db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False, True])" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['AML_ET'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "646b3754", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 72100\n", + "True 2034\n", + "Name: AML_ET, dtype: int64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['AML_ET'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "0571df6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([None, '1'], dtype=object)" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['AML_RAW'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "171fa3c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 2034\n", + "Name: AML_RAW, dtype: int64" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['AML_RAW'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "370b0769", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False, True])" + ] + }, + "execution_count": 
60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['FUDS_ET'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "f8afb668", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 72056\n", + "True 2078\n", + "Name: FUDS_ET, dtype: int64" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['FUDS_ET'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "f2e3b78a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([None, '0', '1'], dtype=object)" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['FUDS_RAW'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "b722e802", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3170\n", + "1 2078\n", + "Name: FUDS_RAW, dtype: int64" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nation_new_ind['FUDS_RAW'].value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 3a721f60..fc68ebbb 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -10,7 +10,9 @@ STATE_FIELD = "State/Territory" COUNTY_FIELD = "County Name" # Definition Narwhal fields -SCORE_N = "Definition N (communities)" +FINAL_SCORE_N_BOOLEAN = ( + "Definition M community, including adjacency index tracts" +) SCORE_N_COMMUNITIES = "Definition N (communities)" N_CLIMATE = "Climate Factor (Definition N)" N_ENERGY = "Energy Factor (Definition N)" diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index 5fb2923c..66fb3251 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -14,20 +14,17 @@ logger = get_module_logger(__name__) class ScoreNarwhal(Score): """Very similar to Score M, at present.""" - def __init__(self, df: pd.DataFrame) -> None: - self.LOW_INCOME_THRESHOLD: float = 0.65 - self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20 - self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 - self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90 - self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10 + LOW_INCOME_THRESHOLD: float = 0.65 + MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20 + ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 + MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90 + LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10 - # We define a donut hole DAC as a tract that is entirely surrounded by - # DACs (score threshold = 1) and above median for low income, as a starting - # point. As we ground-truth, these thresholds might change. 
- self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50 - self.SCORE_THRESHOLD_DONUT: float = 1.00 - - super().__init__(df) + # We define a donut hole DAC as a tract that is entirely surrounded by + # DACs (score threshold = 1) and above median for low income, as a starting + # point. As we ground-truth, these thresholds might change. + LOW_INCOME_THRESHOLD_DONUT: float = 0.50 + SCORE_THRESHOLD_DONUT: float = 1.00 def _combine_island_areas_with_states_and_set_thresholds( self, diff --git a/data/data-pipeline/data_pipeline/tests/conftest.py b/data/data-pipeline/data_pipeline/tests/conftest.py index f1dc63ac..6fb3d138 100644 --- a/data/data-pipeline/data_pipeline/tests/conftest.py +++ b/data/data-pipeline/data_pipeline/tests/conftest.py @@ -52,3 +52,16 @@ def mock_etl(monkeypatch, mock_paths) -> None: data_path, tmp_path = mock_paths monkeypatch.setattr(ExtractTransformLoad, "DATA_PATH", data_path) monkeypatch.setattr(ExtractTransformLoad, "TMP_PATH", tmp_path) + + +def pytest_collection_modifyitems(config, items): + keywordexpr = config.option.keyword + markexpr = config.option.markexpr + if keywordexpr or markexpr: + return # let pytest handle this + + smoketest = "smoketest" + skip_mymarker = pytest.mark.skip(reason=f"{smoketest} not selected") + for item in items: + if smoketest in item.keywords: + item.add_marker(skip_mymarker) diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py new file mode 100644 index 00000000..5a819da0 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py @@ -0,0 +1,12 @@ +import pandas as pd +import pytest +from data_pipeline.config import settings +from data_pipeline.score import field_names + + +@pytest.fixture(scope="session") +def final_score_df(): + return pd.read_csv( + settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv", + dtype={field_names.GEOID_TRACT_FIELD: str}, + ) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_calculation.py b/data/data-pipeline/data_pipeline/tests/score/test_calculation.py new file mode 100644 index 00000000..783474e4 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/score/test_calculation.py @@ -0,0 +1,291 @@ +# flake8: noqa: W0613,W0611,F811 +from dataclasses import dataclass +import pytest +from data_pipeline.score import field_names +from data_pipeline.utils import get_module_logger +from data_pipeline.score.score_narwhal import ScoreNarwhal +from .fixtures import final_score_df # pylint: disable=unused-import + +logger = get_module_logger(__name__) + +pytestmark = pytest.mark.smoketest + + +@dataclass +class PercentileTestConfig: + percentile_column_name: str + threshold_column_name: str + threshold: float + percentile_column_need_suffix: bool = True + + @property + def full_percentile_column_name(self): + if self.percentile_column_need_suffix: + return ( + self.percentile_column_name + + field_names.PERCENTILE_FIELD_SUFFIX + ) + return self.percentile_column_name + + +### TODO: we need to blow this out for all eight categories +def _check_percentile_against_threshold(df, config: PercentileTestConfig): + """Note - for the purpose of testing, this fills with False""" + is_minimum_flagged_ok = ( + df[df[config.threshold_column_name].fillna(False)][ + config.full_percentile_column_name + ].min() + >= config.threshold + ) + + is_maximum_not_flagged_ok = ( + df[~df[config.threshold_column_name].fillna(False)][ + config.full_percentile_column_name + ].max() + < config.threshold + ) + errors = [] + if not 
is_minimum_flagged_ok: + errors.append( + f"For column {config.threshold_column_name}, there is someone flagged below {config.threshold} percentile!" + ) + if not is_maximum_not_flagged_ok: + errors.append( + f"For column {config.threshold_column_name}, there is someone not flagged above {config.threshold} percentile!" + ) + return errors + + +def test_percentile_columns(final_score_df): + low_income = PercentileTestConfig( + field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD, + field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED, + ScoreNarwhal.LOW_INCOME_THRESHOLD, + ) + population_loss = PercentileTestConfig( + field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD, + field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + agricultural_loss = PercentileTestConfig( + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD, + field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + building_loss = PercentileTestConfig( + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, + field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + flood = PercentileTestConfig( + field_names.FUTURE_FLOOD_RISK_FIELD, + field_names.HIGH_FUTURE_FLOOD_RISK_FIELD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + wildfire = PercentileTestConfig( + field_names.FUTURE_WILDFIRE_RISK_FIELD, + field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + low_high_school = PercentileTestConfig( + field_names.HIGH_SCHOOL_ED_FIELD, + field_names.LOW_HS_EDUCATION_FIELD, + ScoreNarwhal.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD, + percentile_column_need_suffix=False, + ) + donut_hole_income = PercentileTestConfig( + field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD, + field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS, + ScoreNarwhal.LOW_INCOME_THRESHOLD_DONUT, + ) + donut_hole_adjacency = PercentileTestConfig( + (field_names.SCORE_N_COMMUNITIES + field_names.ADJACENCY_INDEX_SUFFIX), + field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD, + ScoreNarwhal.SCORE_THRESHOLD_DONUT, + percentile_column_need_suffix=False, + ) + diesel = PercentileTestConfig( + field_names.DIESEL_FIELD, + field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + dot_burden = PercentileTestConfig( + field_names.DOT_TRAVEL_BURDEN_FIELD, + field_names.DOT_BURDEN_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + traffic_proximity = PercentileTestConfig( + field_names.TRAFFIC_FIELD, + field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + energy_burden = PercentileTestConfig( + field_names.ENERGY_BURDEN_FIELD, + field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + pm25 = PercentileTestConfig( + field_names.PM25_FIELD, + field_names.PM25_EXCEEDS_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + kitchen_plumbing = PercentileTestConfig( + field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD, + field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + # Leadpaint is handled below in a separate method + housing = PercentileTestConfig( + field_names.HOUSING_BURDEN_FIELD, + field_names.HOUSING_BURDEN_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + non_natural_space = PercentileTestConfig( + 
field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME, + field_names.NON_NATURAL_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + rmp = PercentileTestConfig( + field_names.RMP_FIELD, + field_names.RMP_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + npl = PercentileTestConfig( + field_names.NPL_FIELD, + field_names.NPL_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + tsdf = PercentileTestConfig( + field_names.TSDF_FIELD, + field_names.TSDF_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + wastewater = PercentileTestConfig( + field_names.WASTEWATER_FIELD, + field_names.WASTEWATER_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + ust = PercentileTestConfig( + field_names.UST_FIELD, + field_names.UST_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + diabetes = PercentileTestConfig( + field_names.DIABETES_FIELD, + field_names.DIABETES_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + asthma = PercentileTestConfig( + field_names.ASTHMA_FIELD, + field_names.ASTHMA_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + heart_disease = PercentileTestConfig( + field_names.HEART_DISEASE_FIELD, + field_names.HEART_DISEASE_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + low_life_expectancy = PercentileTestConfig( + field_names.LOW_LIFE_EXPECTANCY_FIELD, + field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + unemployment = PercentileTestConfig( + field_names.UNEMPLOYMENT_FIELD, + field_names.UNEMPLOYMENT_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + low_median_income = PercentileTestConfig( + field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD, + field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + linguist_isolation = PercentileTestConfig( + field_names.LINGUISTIC_ISO_FIELD, + field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + poverty = PercentileTestConfig( + field_names.POVERTY_LESS_THAN_100_FPL_FIELD, + field_names.POVERTY_PCTILE_THRESHOLD, + ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + errors = [] + for threshhold_config in ( + low_income, + population_loss, + agricultural_loss, + building_loss, + flood, + wildfire, + low_high_school, + donut_hole_income, + donut_hole_adjacency, + dot_burden, + diesel, + traffic_proximity, + energy_burden, + pm25, + kitchen_plumbing, + housing, + non_natural_space, + rmp, + npl, + tsdf, + wastewater, + ust, + diabetes, + asthma, + heart_disease, + low_life_expectancy, + unemployment, + low_median_income, + linguist_isolation, + poverty, + ): + errors.extend( + _check_percentile_against_threshold( + final_score_df, threshhold_config + ) + ) + error_text = "\n".join(errors) + assert not errors, error_text + + +def test_lead_paint_indicator( + final_score_df, +): + """We need special logic here because this is a combined threshold, so we need this test to have two parts. + + 1. We construct our own threshold columns + 2. 
We make sure it's the same as the threshold column in the dataframe + """ + lead_pfs = ( + field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX + ) + home_val_pfs = ( + field_names.MEDIAN_HOUSE_VALUE_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX + ) + combined_proxy_boolean = field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD + + tmp_lead_threshold = ( + final_score_df[lead_pfs] >= ScoreNarwhal.ENVIRONMENTAL_BURDEN_THRESHOLD + ) + tmp_mhv_threshold = ( + final_score_df[home_val_pfs] + <= ScoreNarwhal.MEDIAN_HOUSE_VALUE_THRESHOLD + ) + + true_combined_proxy = tmp_lead_threshold & tmp_mhv_threshold + + assert ( + tmp_mhv_threshold.sum() > 0 + ), "MHV threshold alone does not capture any homes" + + assert final_score_df[combined_proxy_boolean].equals( + true_combined_proxy + ), "Lead proxy calculated improperly" + assert ( + tmp_lead_threshold.sum() > true_combined_proxy.sum() + ), "House value is not further limiting this proxy" diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py new file mode 100644 index 00000000..70e95be4 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -0,0 +1,205 @@ +# flake8: noqa: W0613,W0611,F811 +from dataclasses import dataclass +from typing import List +import pytest +import pandas as pd +from data_pipeline.score import field_names +from .fixtures import final_score_df # pylint: disable=unused-import + +pytestmark = pytest.mark.smoketest + + +def _helper_test_count_exceeding_threshold(df, col, error_check=1000): + """Fills NA with False""" + return df[df[col].fillna(False)].shape[0] >= error_check + + +def _helper_single_threshold_test(df, col, socioeconomic_column, score_column): + """Note that this fills nulls in the threshold column where nulls exist""" + nulls_dont_exist = ( + df[df[col].fillna(False) & df[socioeconomic_column]][score_column] + .isna() + .sum() + == 0 + ) + only_trues = df[df[col].fillna(False) & df[socioeconomic_column]][ + score_column + ].min() + return nulls_dont_exist, only_trues + + +@dataclass +class ThresholdTestConfig: + name: str + threshhold_columns: List[str] + ses_column_name: str = field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED + score_column_name: str = field_names.SCORE_N_COMMUNITIES + + @property + def error_message(self): + return f"Eligibility columns have an error, {self.name}" + + +def check_for_threshhold_errors( + df: pd.DataFrame, config: ThresholdTestConfig +) -> List[str]: + errors = [] + for col in config.threshhold_columns: + nulls_dont_exist, only_trues = _helper_single_threshold_test( + df, + col, + config.ses_column_name, + config.score_column_name, + ) + proper_threshold_identification = ( + _helper_test_count_exceeding_threshold(df, col) + ) + if not nulls_dont_exist: + errors.append( + f"For {col}, threshold is not calculated right -- there are NaNs in Score" + ) + if not only_trues: + errors.append( + f"For {col} and {config.ses_column_name}, threshold is not calculated right " + f"-- there are Falses where there should only be Trues" + ) + if not proper_threshold_identification: + errors.append( + f"Threshold {col} returns too few tracts, are you sure it's nationally-representative?" 
+ ) + if errors: + errors.append(config.error_message) + return errors + + +def test_threshholds(final_score_df): + climate_thresholds = ThresholdTestConfig( + "climate", + [ + field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD, + field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD, + field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD, + field_names.HIGH_FUTURE_FLOOD_RISK_FIELD, + field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD, + ], + ) + energy_thresholds = ThresholdTestConfig( + "energy", + [ + field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD, + field_names.PM25_EXCEEDS_PCTILE_THRESHOLD, + ], + ) + transportation_thresholds = ThresholdTestConfig( + "transportation", + [ + field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD, + field_names.DOT_BURDEN_PCTILE_THRESHOLD, + field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD, + ], + ) + housing_thresholds = ThresholdTestConfig( + "housing", + [ + field_names.HISTORIC_REDLINING_SCORE_EXCEEDED, + field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD, + field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD, + field_names.HOUSING_BURDEN_PCTILE_THRESHOLD, + field_names.NON_NATURAL_PCTILE_THRESHOLD, + ], + ) + pollution_thresholds = ThresholdTestConfig( + "pollution", + [ + field_names.RMP_PCTILE_THRESHOLD, + field_names.NPL_PCTILE_THRESHOLD, + field_names.TSDF_PCTILE_THRESHOLD, + field_names.AML_BOOLEAN, + field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME, + ], + ) + water_thresholds = ThresholdTestConfig( + "water", + [ + field_names.WASTEWATER_PCTILE_THRESHOLD, + field_names.UST_PCTILE_THRESHOLD, + ], + ) + health_thresholds = ThresholdTestConfig( + "health", + [ + field_names.DIABETES_PCTILE_THRESHOLD, + field_names.ASTHMA_PCTILE_THRESHOLD, + field_names.HEART_DISEASE_PCTILE_THRESHOLD, + field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD, + ], + ) + workforce_base_thresholds = ThresholdTestConfig( + "workforce (not island areas)", + [ + field_names.UNEMPLOYMENT_PCTILE_THRESHOLD, + field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD, + field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD, + field_names.POVERTY_PCTILE_THRESHOLD, + ], + ses_column_name=field_names.LOW_HS_EDUCATION_FIELD, + ) + errors = [] + for threshhold_config in [ + climate_thresholds, + energy_thresholds, + transportation_thresholds, + housing_thresholds, + pollution_thresholds, + water_thresholds, + health_thresholds, + workforce_base_thresholds, + ]: + errors.extend( + check_for_threshhold_errors(final_score_df, threshhold_config) + ) + error_text = "\n".join(errors) + assert not errors, error_text + + +def test_max_40_percent_DAC(final_score_df): + score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN + total_population_col = field_names.TOTAL_POP_FIELD + assert ( + final_score_df[score_col_with_donuts].isna().sum() == 0 + ), f"Error: {score_col_with_donuts} contains NULLs" + assert ( + final_score_df[final_score_df[score_col_with_donuts]][ + total_population_col + ].sum() + / final_score_df[total_population_col].sum() + ) < 0.4, "Error: the scoring methodology identifies >40% of people in the US as disadvantaged" + assert ( + final_score_df[score_col_with_donuts].sum() > 0 + ), "FYI: You've identified no tracts at all!" 
+ + +def test_donut_hole_addition_to_score_n(final_score_df): + score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN + score_col = field_names.SCORE_N_COMMUNITIES + donut_hole_score_only = ( + field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX + ) + count_donuts = final_score_df[donut_hole_score_only].sum() + count_n = final_score_df[score_col].sum() + count_n_with_donuts = final_score_df[score_col_with_donuts].sum() + new_donuts = final_score_df[ + final_score_df[donut_hole_score_only] & ~final_score_df[score_col] + ].shape[0] + + assert ( + new_donuts + count_n == count_n_with_donuts + ), "The math doesn't work! The number of new donut hole tracts plus score tracts (base) does not equal the total number of tracts identified" + + assert ( + count_donuts < count_n + ), "There are more donut hole tracts than base tracts. How can it be?" + + assert ( + new_donuts > 0 + ), "FYI: The adjacency index is doing nothing. Consider removing it?" diff --git a/data/data-pipeline/data_pipeline/tile/generate.py b/data/data-pipeline/data_pipeline/tile/generate.py index d5676b79..82e9404e 100644 --- a/data/data-pipeline/data_pipeline/tile/generate.py +++ b/data/data-pipeline/data_pipeline/tile/generate.py @@ -87,6 +87,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: logger.info("Generating Tribal mbtiles file") cmd = "tippecanoe " cmd += "--layer=blocks " + cmd += "--base-zoom=3 " cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} " cmd += f"--output={tribal_tiles_path}/usa.mbtiles " cmd += str(tribal_geojson_dir / "usa.json") @@ -95,10 +96,12 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None: # generate mvts logger.info("Generating Tribal mvt folders and files") cmd = "tippecanoe " + cmd += "--layer=blocks " + cmd += "--base-zoom=3 " cmd += "--no-tile-compression " cmd += "--drop-densest-as-needed " cmd += f"--minimum-zoom={USA_TRIBAL_MIN_ZOOM} --maximum-zoom={USA_TRIBAL_MAX_ZOOM} " - cmd += f"--output-to-directory={tribal_tiles_path} --layer=blocks " + cmd += f"--output-to-directory={tribal_tiles_path} " cmd += str(tribal_geojson_dir / "usa.json") call(cmd, shell=True) diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py index 865e888b..063da627 100644 --- a/data/data-pipeline/data_pipeline/utils.py +++ b/data/data-pipeline/data_pipeline/utils.py @@ -149,7 +149,9 @@ def download_file_from_url( os.mkdir(download_file_name.parent) logger.info(f"Downloading {file_url}") - response = requests.get(file_url, verify=verify) + response = requests.get( + file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT + ) if response.status_code == 200: file_contents = response.content else: diff --git a/data/data-pipeline/pytest.ini b/data/data-pipeline/pytest.ini index 7022c5f7..17099dfd 100644 --- a/data/data-pipeline/pytest.ini +++ b/data/data-pipeline/pytest.ini @@ -1,2 +1,4 @@ [pytest] norecursedirs = .git data +markers = + smoketest: marks a test as depending on the full score output
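For anyone extending this suite, a hypothetical new smoketest module following the marker and fixture conventions above could look like the sketch below (the module name and the uniqueness assertion are illustrative, not part of this change):

```python
# Hypothetical data_pipeline/tests/score/test_example.py
import pytest

from data_pipeline.score import field_names
from .fixtures import final_score_df  # pylint: disable=unused-import

# Everything in this module depends on the full score output, so mark
# it all as a smoketest; the conftest.py hook above skips these unless
# a -m/-k expression such as `-m smoketest` selects them.
pytestmark = pytest.mark.smoketest


def test_geoid_tract_is_unique(final_score_df):
    # Illustrative assertion: each census tract should appear exactly
    # once in the final usa.csv score output.
    assert final_score_df[field_names.GEOID_TRACT_FIELD].is_unique
```

Locally, `poetry run pytest data_pipeline/ -m smoketest` selects these tests, matching the new Run Smoketests step in deploy_be_staging.yml.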