From d41153d89dc7d3de4753d831e08513c40fc0759a Mon Sep 17 00:00:00 2001
From: Matt Bowen <83967628+mattbowen-usds@users.noreply.github.com>
Date: Tue, 6 Sep 2022 15:10:19 -0400
Subject: [PATCH] Add tests to make sure each source makes it to the score
 correctly (#1878)

* Remove unused persistent poverty from score (#1835)

* Test a few datasets for overlap in the final score (#1835)

* Add remaining data sources (#1853)

* Apply code-review feedback (#1835)

* Rearrange a little for readability (#1835)

* Add tract test (#1835)

* Add test for score values (#1835)

* Check for unmatched source tracts (#1835)

* Cleanup numeric code to plaintext (#1835)

* Make import more obvious (#1835)
---
 .../data_pipeline/etl/score/etl_score.py      |  13 --
 .../data_pipeline/tests/score/fixtures.py     | 208 +++++++++++++++++-
 .../tests/score/test_calculation.py           |   1 -
 .../data_pipeline/tests/score/test_output.py  | 124 ++++++++++-
 4 files changed, 328 insertions(+), 18 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index ad6941d0..62a5006d 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -42,7 +42,6 @@ class ScoreETL(ExtractTransformLoad):
         self.doe_energy_burden_df: pd.DataFrame
         self.national_risk_index_df: pd.DataFrame
         self.geocorr_urban_rural_df: pd.DataFrame
-        self.persistent_poverty_df: pd.DataFrame
         self.census_decennial_df: pd.DataFrame
         self.census_2010_df: pd.DataFrame
         self.national_tract_df: pd.DataFrame
@@ -159,16 +158,6 @@ class ScoreETL(ExtractTransformLoad):
             low_memory=False,
         )
 
-        # Load persistent poverty
-        persistent_poverty_csv = (
-            constants.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv"
-        )
-        self.persistent_poverty_df = pd.read_csv(
-            persistent_poverty_csv,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-            low_memory=False,
-        )
-
         # Load decennial census data
         census_decennial_csv = (
             constants.DATA_PATH
@@ -359,7 +348,6 @@ class ScoreETL(ExtractTransformLoad):
             self.doe_energy_burden_df,
             self.ejscreen_df,
             self.geocorr_urban_rural_df,
-            self.persistent_poverty_df,
            self.national_risk_index_df,
             self.census_acs_median_incomes_df,
             self.census_decennial_df,
@@ -484,7 +472,6 @@ class ScoreETL(ExtractTransformLoad):
 
         non_numeric_columns = [
             self.GEOID_TRACT_FIELD_NAME,
-            field_names.PERSISTENT_POVERTY_FIELD,
             field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
         ]
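
For context on the hunk at old line 359 above: the edited list holds the per-source dataframes that the ETL later joins on the tract ID before computing the score. A minimal sketch of that join pattern, assuming a reduce-style outer merge (the helper name and the default column name are illustrative, not taken from this patch):

    import functools

    import pandas as pd

    def join_tract_dfs(dfs, geoid_col="GEOID10_TRACT"):
        # Hypothetical helper: outer-join each source on the tract ID so a
        # tract present in any one dataset is kept rather than dropped.
        return functools.reduce(
            lambda left, right: left.merge(right, on=geoid_col, how="outer"),
            dfs,
        )
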
diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
index 096a3e07..805c7726 100644
--- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py
+++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
@@ -1,13 +1,217 @@
 import pandas as pd
 import pytest
 from data_pipeline.config import settings
-from data_pipeline.score import field_names
+from data_pipeline.score.field_names import GEOID_TRACT_FIELD
+from data_pipeline.etl.score import constants
 
 
 @pytest.fixture(scope="session")
 def final_score_df():
     return pd.read_csv(
         settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
-        dtype={field_names.GEOID_TRACT_FIELD: str},
+        dtype={GEOID_TRACT_FIELD: str},
         low_memory=False,
     )
+
+
+@pytest.fixture()
+def census_df():
+    census_csv = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
+    return pd.read_csv(
+        census_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def ejscreen_df():
+    ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
+    return pd.read_csv(
+        ejscreen_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def hud_housing_df():
+    hud_housing_csv = (
+        constants.DATA_PATH / "dataset" / "hud_housing" / "usa.csv"
+    )
+    return pd.read_csv(
+        hud_housing_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def cdc_places_df():
+    cdc_places_csv = constants.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
+    return pd.read_csv(
+        cdc_places_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def census_acs_median_incomes_df():
+    census_acs_median_incomes_csv = (
+        constants.DATA_PATH
+        / "dataset"
+        / "census_acs_median_income_2019"
+        / "usa.csv"
+    )
+    return pd.read_csv(
+        census_acs_median_incomes_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def cdc_life_expectancy_df():
+    cdc_life_expectancy_csv = (
+        constants.DATA_PATH / "dataset" / "cdc_life_expectancy" / "usa.csv"
+    )
+    return pd.read_csv(
+        cdc_life_expectancy_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def doe_energy_burden_df():
+    doe_energy_burden_csv = (
+        constants.DATA_PATH / "dataset" / "doe_energy_burden" / "usa.csv"
+    )
+    return pd.read_csv(
+        doe_energy_burden_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def national_risk_index_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "national_risk_index" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def dot_travel_disadvantage_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "travel_composite" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def fsf_fire_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "fsf_wildfire_risk" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def fsf_flood_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "fsf_flood_risk" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def nature_deprived_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "nlcd_nature_deprived" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def eamlis_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "eamlis" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def fuds_df():
+    return pd.read_csv(
+        constants.DATA_PATH / "dataset" / "us_army_fuds" / "usa.csv",
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def geocorr_urban_rural_df():
+    geocorr_urban_rural_csv = (
+        constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
+    )
+    return pd.read_csv(
+        geocorr_urban_rural_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def census_decennial_df():
+    census_decennial_csv = (
+        constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
+    )
+    return pd.read_csv(
+        census_decennial_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def census_2010_df():
+    census_2010_csv = (
+        constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv"
+    )
+    return pd.read_csv(
+        census_2010_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def hrs_df():
+    hrs_csv = constants.DATA_PATH / "dataset" / "historic_redlining" / "usa.csv"
+
+    return pd.read_csv(
+        hrs_csv,
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+    )
+
+
+@pytest.fixture()
+def national_tract_df():
+    national_tract_csv = constants.DATA_CENSUS_CSV_FILE_PATH
+    return pd.read_csv(
+        national_tract_csv,
+        names=[GEOID_TRACT_FIELD],
+        dtype={GEOID_TRACT_FIELD: "string"},
+        low_memory=False,
+        header=None,
+    )
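
Every new fixture above repeats one read pattern: a dataset CSV under `constants.DATA_PATH / "dataset"`, with the tract ID read as a string so leading zeros survive. A possible factory expressing that shared shape (a sketch, not part of this patch; `make_dataset_fixture` is a hypothetical name):

    import pandas as pd
    import pytest

    from data_pipeline.etl.score import constants
    from data_pipeline.score.field_names import GEOID_TRACT_FIELD

    def make_dataset_fixture(name, *path_parts):
        # Hypothetical helper: build a pytest fixture that reads one dataset
        # CSV with the tract ID parsed as a string (preserving leading zeros).
        @pytest.fixture(name=name)
        def _fixture():
            return pd.read_csv(
                constants.DATA_PATH.joinpath(*path_parts),
                dtype={GEOID_TRACT_FIELD: "string"},
                low_memory=False,
            )
        return _fixture

    # e.g. ejscreen_df = make_dataset_fixture("ejscreen_df", "dataset", "ejscreen", "usa.csv")
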
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_calculation.py b/data/data-pipeline/data_pipeline/tests/score/test_calculation.py
index 783474e4..d241918c 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_calculation.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_calculation.py
@@ -28,7 +28,6 @@ class PercentileTestConfig:
         return self.percentile_column_name
 
 
-### TODO: we need to blow this out for all eight categories
 def _check_percentile_against_threshold(df, config: PercentileTestConfig):
     """Note - for the purpose of testing, this fills with False"""
     is_minimum_flagged_ok = (
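
The new tests in test_output.py below hinge on pandas' merge indicator to bucket tracts by where they appear. A toy illustration of that mechanism, with made-up tract IDs and an illustrative column name:

    import pandas as pd

    left = pd.DataFrame({"TRACT": ["01001020100", "01001020200"], "score": [1, 2]})
    right = pd.DataFrame({"TRACT": ["01001020200", "01001020300"], "value": [3, 4]})

    # indicator="MERGE" adds a column that is "left_only", "both", or
    # "right_only" per row; the tests assert on the sizes of those buckets.
    merged = left.merge(right, on="TRACT", how="outer", indicator="MERGE")
    print(merged["MERGE"].value_counts())
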
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 70e95be4..0945fb9e 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -1,12 +1,37 @@
-# flake8: noqa: W0613,W0611,F811
+# flake8: noqa: W0613,W0611,F811,
+# pylint: disable=unused-import,too-many-arguments
 from dataclasses import dataclass
 from typing import List
 import pytest
 import pandas as pd
+import numpy as np
 from data_pipeline.score import field_names
-from .fixtures import final_score_df  # pylint: disable=unused-import
+from data_pipeline.score.field_names import GEOID_TRACT_FIELD
+from .fixtures import (
+    final_score_df,
+    ejscreen_df,
+    hud_housing_df,
+    census_df,
+    cdc_places_df,
+    census_acs_median_incomes_df,
+    cdc_life_expectancy_df,
+    doe_energy_burden_df,
+    national_risk_index_df,
+    dot_travel_disadvantage_df,
+    fsf_fire_df,
+    nature_deprived_df,
+    eamlis_df,
+    fuds_df,
+    geocorr_urban_rural_df,
+    census_decennial_df,
+    census_2010_df,
+    hrs_df,
+    national_tract_df,
+)
+
 
 pytestmark = pytest.mark.smoketest
+UNMATCHED_TRACT_THRESHOLD = 1000
 
 
 def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
@@ -203,3 +228,98 @@ def test_donut_hole_addition_to_score_n(final_score_df):
     assert (
         new_donuts > 0
     ), "FYI: The adjacency index is doing nothing. Consider removing it?"
+
+
+def test_data_sources(
+    final_score_df,
+    hud_housing_df,
+    ejscreen_df,
+    census_df,
+    cdc_places_df,
+    census_acs_median_incomes_df,
+    cdc_life_expectancy_df,
+    doe_energy_burden_df,
+    national_risk_index_df,
+    dot_travel_disadvantage_df,
+    fsf_fire_df,
+    nature_deprived_df,
+    eamlis_df,
+    fuds_df,
+    geocorr_urban_rural_df,
+    census_decennial_df,
+    census_2010_df,
+    hrs_df,
+):
+    data_sources = {
+        key: value for key, value in locals().items() if key != "final_score_df"
+    }
+
+    for data_source_name, data_source in data_sources.items():
+        final = "final_"
+        df: pd.DataFrame = final_score_df.merge(
+            data_source,
+            on=GEOID_TRACT_FIELD,
+            indicator="MERGE",
+            suffixes=(final, f"_{data_source_name}"),
+            how="outer",
+        )
+
+        # Make our lists of columns for later comparison
+        core_cols = data_source.columns.intersection(
+            final_score_df.columns
+        ).drop(GEOID_TRACT_FIELD)
+        data_source_columns = [f"{col}_{data_source_name}" for col in core_cols]
+        final_columns = [f"{col}{final}" for col in core_cols]
+        assert (
+            final_columns
+        ), f"No columns from data source show up in final score in source {data_source_name}"
+
+        # Make sure we have NAs for any tracts in the final data that aren't
+        # covered by the data source
+        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
+
+        # Make sure the data source doesn't have a ton of unmatched tracts,
+        # which would imply it has moved to 2020 tracts
+        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACT_THRESHOLD
+
+        df = df[df.MERGE == "both"]
+
+        # Compare every column for equality, using close equality for numerics
+        # and `equals` equality for non-numeric columns
+        for final_column, data_source_column in zip(
+            final_columns, data_source_columns
+        ):
+            error_message = (
+                f"Column {final_column} not equal "
+                f"between {data_source_name} and final score"
+            )
+            if df[final_column].dtype in [
+                np.dtype(object),
+                np.dtype(bool),
+                np.dtype(str),
+            ]:
+                assert df[final_column].equals(
+                    df[data_source_column]
+                ), error_message
+            else:
+                assert np.allclose(
+                    df[final_column],
+                    df[data_source_column],
+                    equal_nan=True,
+                ), error_message
+
+
+def test_output_tracts(final_score_df, national_tract_df):
+    df = final_score_df.merge(
+        national_tract_df,
+        on=GEOID_TRACT_FIELD,
+        how="outer",
+        indicator="MERGE",
+    )
+    counts = df.value_counts("MERGE")
+    assert counts.loc["left_only"] == 0
+    assert counts.loc["right_only"] == 0
+
+
+def test_all_tracts_have_scores(final_score_df):
+    assert not final_score_df[field_names.SCORE_N_COMMUNITIES].isna().any()
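
A worked micro-example of the comparison rule `test_data_sources` applies (data invented for illustration): numeric columns go through `np.allclose(..., equal_nan=True)` so NaNs in matching positions count as equal, while object and boolean columns use `Series.equals`, which also treats aligned missing values as equal:

    import numpy as np
    import pandas as pd

    a = pd.Series([1.0, np.nan, 3.0])
    b = pd.Series([1.0, np.nan, 3.0])
    assert np.allclose(a, b, equal_nan=True)  # NaNs in the same slots pass

    s = pd.Series(["x", None, "z"])
    t = pd.Series(["x", None, "z"])
    assert s.equals(t)  # exact match, including missing values

Because the module is marked with `pytest.mark.smoketest`, these checks can presumably be selected by marker once the score pipeline has produced its CSVs, e.g. `pytest -m smoketest`.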