From 9a2193d1a45d27d6be3de6c47cebbd0b969328c1 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Thu, 25 Aug 2022 16:37:23 -0400 Subject: [PATCH 1/5] checking drop tracts works --- .../tests/score/test_score_narwhal_methods.py | 84 +++++++++++++++ .../data/test_drop_tracts_from_percentile.csv | 101 ++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv diff --git a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py new file mode 100644 index 00000000..cec98318 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py @@ -0,0 +1,84 @@ +import pandas as pd +import pytest +from data_pipeline.config import settings +import data_pipeline.score.field_names as field_names +from data_pipeline.etl.score.etl_score import ScoreETL +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +@pytest.fixture +def toy_score_df(scope="module"): + return pd.read_csv( + settings.APP_ROOT + / "tests" + / "score" + / "test_utils" + / "data" + / "test_drop_tracts_from_percentile.csv", + dtype={field_names.GEOID_TRACT_FIELD: str}, + ) + + +def _helper_test_dropping_tracts(toy_score_df, drop_tracts): + logger.info(drop_tracts) + test_frame = toy_score_df[ + ~toy_score_df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts) + ] + return_df = ScoreETL._add_percentiles_to_df( + df=toy_score_df, + input_column_name="to_rank", + output_column_name_root="to_rank_auto", + drop_tracts=drop_tracts, + ) + + test_frame = test_frame.assign( + true_rank=test_frame["to_rank"].rank(pct=True) + ) + + check_frame = test_frame.merge( + return_df[ + [ + field_names.GEOID_TRACT_FIELD, + "to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX, + ] + ], + on=[field_names.GEOID_TRACT_FIELD], + ) + + return check_frame["true_rank"].equals( + check_frame["to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX] + ) + + +def test_drop_0_tracts(toy_score_df): + assert _helper_test_dropping_tracts( + toy_score_df, drop_tracts=[] + ), "Percentile in score fails when we do not drop any tracts" + + +def test_drop_1_tract(toy_score_df): + assert _helper_test_dropping_tracts( + toy_score_df, drop_tracts=["1"] + ), "Percentile in score fails when we do drop a single tract" + + +def test_drop_2_tracts(toy_score_df): + assert _helper_test_dropping_tracts( + toy_score_df, drop_tracts=["1", "2"] + ), "Percentile in score fails when we drop two tracts" + + +def test_drop_many_tracts(toy_score_df): + assert _helper_test_dropping_tracts( + toy_score_df, + drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()[:5], + ), "Percentile in score fails when we drop many tracts" + + +def test_drop_all_tracts(toy_score_df): + assert _helper_test_dropping_tracts( + toy_score_df, + drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list(), + ), "Percentile in score fails when we drop all tracts" diff --git a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv new file mode 100644 index 00000000..5177546c --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv @@ -0,0 +1,101 @@ +GEOID10_TRACT,to_rank +1,1 +2,2 +3,3 +4,4 +5,5 +6,6 +7,7 +8,8 +9,9 +10,10 +11,11 +12,12 +13,13 +14,14 +15,15 +16,16 +17,17 +18,18 +19,19 +20,20 +21,21 +22,22 +23,23 +24,24 +25,25 +26,26 +27,27 +28,28 +29,29 +30,30 +31,31 +32,32 +33,33 +34,34 +35,35 +36,36 +37,37 +38,38 +39,39 +40,40 +41,41 +42,42 +43,43 +44,44 +45,45 +46,46 +47,47 +48,48 +49,49 +50,50 +51,51 +52,52 +53,53 +54,54 +55,55 +56,56 +57,57 +58,58 +59,59 +60,60 +61,61 +62,62 +63,63 +64,64 +65,65 +66,66 +67,67 +68,68 +69,69 +70,70 +71,71 +72,72 +73,73 +74,74 +75,75 +76,76 +77,77 +78,78 +79,79 +80,80 +81,81 +82,82 +83,83 +84,84 +85,85 +86,86 +87,87 +88,88 +89,89 +90,90 +91,91 +92,92 +93,93 +94,94 +95,95 +96,96 +97,97 +98,98 +99,99 +100,100 From d16d0109a4f71887af5dab43acc65b7afc6435e1 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com> Date: Thu, 25 Aug 2022 16:48:42 -0400 Subject: [PATCH 2/5] OOPS! Old changes persisted --- data/data-pipeline/data_pipeline/score/score_narwhal.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index 7e91a6c2..5fb2923c 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -496,13 +496,6 @@ class ScoreNarwhal(Score): field_names.AML_BOOLEAN ].fillna(False) - logger.info( - f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}" - ) - logger.info( - f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}" - ) - self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[ [ field_names.RMP_PCTILE_THRESHOLD, From b63c465885d203d6e02718fdc9a0c0dd87155c66 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Thu, 25 Aug 2022 17:15:33 -0400 Subject: [PATCH 3/5] adding a check to the agvalue calculation for nri --- .../etl/sources/national_risk_index/etl.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index 0b7ff12e..d1373602 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -153,6 +153,19 @@ class NationalRiskIndexETL(ExtractTransformLoad): lower=self.AGRIVALUE_LOWER_BOUND ) + ## Check that this clip worked -- that the only place the value has changed is when the clip took effect + base_expectation = ( + disaster_agriculture_sum_series + / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] + ) + assert ( + df_nri[ + df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] + != base_expectation + ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max() + < self.AGRIVALUE_LOWER_BOUND + ) + # This produces a boolean that is True in the case of non-zero agricultural value df_nri[self.CONTAINS_AGRIVALUE] = ( df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0 From c5244470ed250a1b92d55591c3a1c39e5c8535ed Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Thu, 25 Aug 2022 18:38:22 -0400 Subject: [PATCH 4/5] updated with error messages --- .../data_pipeline/etl/sources/national_risk_index/etl.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index d1373602..51ffcfa0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -164,7 +164,12 @@ class NationalRiskIndexETL(ExtractTransformLoad): != base_expectation ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max() < self.AGRIVALUE_LOWER_BOUND - ) + ), "Clipping the agrivalue did not work!" + + assert ( + df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] + != base_expectation + ).sum() > 0, "Clipping the agrivalue did nothing!" # This produces a boolean that is True in the case of non-zero agricultural value df_nri[self.CONTAINS_AGRIVALUE] = ( From 15b4f5b61730a546baa77bddec4f00d40dc359f5 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Fri, 26 Aug 2022 10:12:45 -0400 Subject: [PATCH 5/5] updated error message --- .../data_pipeline/etl/sources/national_risk_index/etl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index 51ffcfa0..c6a312c0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -163,8 +163,11 @@ class NationalRiskIndexETL(ExtractTransformLoad): df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] != base_expectation ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max() - < self.AGRIVALUE_LOWER_BOUND - ), "Clipping the agrivalue did not work!" + <= self.AGRIVALUE_LOWER_BOUND + ), ( + "Clipping the agrivalue did not work. There are places where the value doesn't " + + "match an unclipped ratio, even where the agrivalue is above the lower bound!" + ) assert ( df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]