diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 0b7ff12e..c6a312c0 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -153,6 +153,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
             lower=self.AGRIVALUE_LOWER_BOUND
         )
 
+        ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
+        base_expectation = (
+            disaster_agriculture_sum_series
+            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+        )
+        assert (
+            df_nri[
+                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+                != base_expectation
+            ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
+            <= self.AGRIVALUE_LOWER_BOUND
+        ), (
+            "Clipping the agrivalue did not work. There are places where the value doesn't "
+            + "match an unclipped ratio, even where the agrivalue is above the lower bound!"
+        )
+
+        assert (
+            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            != base_expectation
+        ).sum() > 0, "Clipping the agrivalue did nothing!"
+
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (
             df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
index 68ba9b23..66fb3251 100644
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -493,13 +493,6 @@ class ScoreNarwhal(Score):
             field_names.AML_BOOLEAN
         ].fillna(False)
 
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
-        )
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
-        )
-
         self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
             [
                 field_names.RMP_PCTILE_THRESHOLD,
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
new file mode 100644
index 00000000..7df3c2e8
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
@@ -0,0 +1,100 @@
+# pylint: disable=protected-access
+
+import pandas as pd
+import pytest
+from data_pipeline.config import settings
+from data_pipeline.score import field_names
+from data_pipeline.etl.score.etl_score import ScoreETL
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+@pytest.fixture
+def toy_score_df():
+    """Load the toy CSV, reading field_names.GEOID_TRACT_FIELD as strings.
+
+    Kept function-scoped on purpose: the frame is passed straight into
+    ScoreETL._add_percentiles_to_df, which may add columns to it in place
+    (TODO confirm), so every test starts from a freshly read frame.
+    """
+    return pd.read_csv(
+        settings.APP_ROOT
+        / "tests"
+        / "score"
+        / "test_utils"
+        / "data"
+        / "test_drop_tracts_from_percentile.csv",
+        dtype={field_names.GEOID_TRACT_FIELD: str},
+    )
+
+
+def _helper_test_dropping_tracts(toy_score_df, drop_tracts):
+    """Return True if the ETL percentile equals a rank over the kept tracts.
+
+    The expected value ranks ``to_rank`` using only tracts that are not in
+    ``drop_tracts``; it is compared, tract by tract, with the percentile
+    column returned by ``ScoreETL._add_percentiles_to_df``.
+    """
+    logger.info(drop_tracts)
+    test_frame = toy_score_df[
+        ~toy_score_df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)
+    ]
+    return_df = ScoreETL._add_percentiles_to_df(
+        df=toy_score_df,
+        input_column_name="to_rank",
+        output_column_name_root="to_rank_auto",
+        drop_tracts=drop_tracts,
+    )
+
+    test_frame = test_frame.assign(
+        true_rank=test_frame["to_rank"].rank(pct=True)
+    )
+
+    check_frame = test_frame.merge(
+        return_df[
+            [
+                field_names.GEOID_TRACT_FIELD,
+                "to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX,
+            ]
+        ],
+        on=[field_names.GEOID_TRACT_FIELD],
+    )
+
+    return check_frame["true_rank"].equals(
+        check_frame["to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX]
+    )
+
+
+def test_drop_0_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=[]
+    ), "Percentile in score fails when we do not drop any tracts"
+
+
+def test_drop_1_tract(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1"]
+    ), "Percentile in score fails when we do drop a single tract"
+
+
+def test_drop_2_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1", "2"]
+    ), "Percentile in score fails when we drop two tracts"
+
+
+def test_drop_many_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df,
+        drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()[:5],
+    ), "Percentile in score fails when we drop many tracts"
+
+
+def test_drop_all_tracts(toy_score_df):
+    # NOTE(review): with every tract dropped the comparison frame is
+    # presumably empty, so this mainly checks the call does not raise.
+    assert _helper_test_dropping_tracts(
+        toy_score_df,
+        drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list(),
+    ), "Percentile in score fails when we drop all tracts"
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
new file mode 100644
index 00000000..5177546c
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
@@ -0,0 +1,101 @@
+GEOID10_TRACT,to_rank
+1,1
+2,2
+3,3
+4,4
+5,5
+6,6
+7,7
+8,8
+9,9
+10,10
+11,11
+12,12
+13,13
+14,14
+15,15
+16,16
+17,17
+18,18
+19,19
+20,20
+21,21
+22,22
+23,23
+24,24
+25,25
+26,26
+27,27
+28,28
+29,29
+30,30
+31,31
+32,32
+33,33
+34,34
+35,35
+36,36
+37,37
+38,38
+39,39
+40,40
+41,41
+42,42
+43,43
+44,44
+45,45
+46,46
+47,47
+48,48
+49,49
+50,50
+51,51
+52,52
+53,53
+54,54
+55,55
+56,56
+57,57
+58,58
+59,59
+60,60
+61,61
+62,62
+63,63
+64,64
+65,65
+66,66
+67,67
+68,68
+69,69
+70,70
+71,71
+72,72
+73,73
+74,74
+75,75
+76,76
+77,77
+78,78
+79,79
+80,80
+81,81
+82,82
+83,83
+84,84
+85,85
+86,86
+87,87
+88,88
+89,89
+90,90
+91,91
+92,92
+93,93
+94,94
+95,95
+96,96
+97,97
+98,98
+99,99
+100,100