From 736797e3af26a56dba0e1a2011e602c1797339d3 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Mon, 26 Sep 2022 10:49:59 -0400 Subject: [PATCH] Update smoketest to account for backfills (#1882) As I wrote in the comment: We backfill island areas with data from the 2010 census, so if THOSE tracts have data beyond the data source, that's to be expected and is fine to pass. If some other state or territory does though, this should fail This ends up being a nice way of documenting that behavior I guess! --- .../data_pipeline/tests/score/test_output.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index f10e6f71..d6a5cb1a 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -7,6 +7,7 @@ import pandas as pd import numpy as np from data_pipeline.score import field_names from data_pipeline.score.field_names import GEOID_TRACT_FIELD +from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES from .fixtures import ( final_score_df, ejscreen_df, @@ -287,7 +288,24 @@ def test_data_sources( # Make sure we have NAs for any tracts in the final data that aren't # included in the data source - assert np.all(df[df.MERGE == "left_only"][final_columns].isna()) + has_additional_non_null_tracts = not np.all( + df[df.MERGE == "left_only"][final_columns].isna() + ) + if has_additional_non_null_tracts: + # We backfill island areas with data from the 2010 census, so if THOSE tracts + # have data beyond the data source, that's to be expected and is fine to pass. 
+ # If some other state or territory does though, this should fail + left_only = df.loc[(df.MERGE == "left_only")] + left_only_has_value = left_only.loc[ + ~df[final_columns].isna().all(axis=1) + ] + fips_with_values = set( + left_only_has_value[field_names.GEOID_TRACT_FIELD].str[0:2] + ) + non_island_fips_codes = fips_with_values.difference( + TILES_ISLAND_AREA_FIPS_CODES + ) + assert not non_island_fips_codes # Make sure the datasource doesn't have a ton of unmatched tracts, implying it # has moved to 2020 tracts