Merge branch 'emma-nechamkin/1849-calculation-tests' of github.com:usds/justice40-tool into emma-nechamkin/release/score-narwhal

2025-10-21 15:53:52 -07:00 · 2022-08-31 10:25:55 -04:00 · 2022-08-31 10:25:55 -04:00 · 7c6a9078e3
commit 7c6a9078e3
parent 6e575c6110 15b4f5b617
4 changed files with 208 additions and 7 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@ -153,6 +153,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
            lower=self.AGRIVALUE_LOWER_BOUND
        )

+        ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
+        base_expectation = (
+            disaster_agriculture_sum_series
+            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+        )
+        assert (
+            df_nri[
+                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+                != base_expectation
+            ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
+            <= self.AGRIVALUE_LOWER_BOUND
+        ), (
+            "Clipping the agrivalue did not work. There are places where the value doesn't "
+            + "match an unclipped ratio, even where the agrivalue is above the lower bound!"
+        )
+
+        assert (
+            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            != base_expectation
+        ).sum() > 0, "Clipping the agrivalue did nothing!"
+
        # This produces a boolean that is True in the case of non-zero agricultural value
        df_nri[self.CONTAINS_AGRIVALUE] = (
            df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@ -493,13 +493,6 @@ class ScoreNarwhal(Score):
            field_names.AML_BOOLEAN
        ].fillna(False)

-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
-        )
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
-        )
-
        self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
            [
                field_names.RMP_PCTILE_THRESHOLD,
--- a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
@ -0,0 +1,86 @@
+# pylint: disable=protected-access
+
+import pandas as pd
+import pytest
+from data_pipeline.config import settings
+from data_pipeline.score import field_names
+from data_pipeline.etl.score.etl_score import ScoreETL
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+@pytest.fixture
+def toy_score_df():
+    """Load the toy tract CSV used by the percentile tests.
+
+    The fixture uses pytest's default per-test (function) scope, so every
+    test receives a freshly read frame. The earlier signature declared
+    ``scope="module"`` as a *function parameter*; pytest only takes a
+    fixture's scope from the ``@pytest.fixture(...)`` decorator, so that
+    argument never had any effect and has been removed.
+
+    NOTE(review): a fresh frame per test also avoids sharing state between
+    tests if the code under test modifies the frame it is given -- that is
+    not visible from this file, so confirm before widening the scope.
+
+    Returns:
+        pd.DataFrame: contents of ``test_drop_tracts_from_percentile.csv``
+        with the ``field_names.GEOID_TRACT_FIELD`` column read as ``str``.
+    """
+    return pd.read_csv(
+        settings.APP_ROOT
+        / "tests"
+        / "score"
+        / "test_utils"
+        / "data"
+        / "test_drop_tracts_from_percentile.csv",
+        dtype={field_names.GEOID_TRACT_FIELD: str},
+    )
+
+
+def _helper_test_dropping_tracts(toy_score_df, drop_tracts):
+    """Compare ``ScoreETL._add_percentiles_to_df`` with a hand-computed rank.
+
+    Args:
+        toy_score_df: frame holding ``field_names.GEOID_TRACT_FIELD`` and a
+            ``to_rank`` column.
+        drop_tracts: tract ids handed to ``_add_percentiles_to_df`` as
+            ``drop_tracts`` and removed from the hand-computed reference.
+
+    Returns:
+        bool: ``True`` when, for the tracts that were not dropped, the
+        percentile column produced by ``ScoreETL`` equals
+        ``to_rank.rank(pct=True)`` computed on those tracts alone.
+    """
+    logger.info(drop_tracts)
+    tract_col = field_names.GEOID_TRACT_FIELD
+
+    # Take the kept-tract subset BEFORE calling the code under test, so the
+    # reference frame is built from the input exactly as it was passed in.
+    is_dropped = toy_score_df[tract_col].isin(drop_tracts)
+    kept_tracts = toy_score_df[~is_dropped]
+
+    scored = ScoreETL._add_percentiles_to_df(
+        df=toy_score_df,
+        input_column_name="to_rank",
+        output_column_name_root="to_rank_auto",
+        drop_tracts=drop_tracts,
+    )
+
+    # Reference percentile: a plain percentage rank over the kept tracts.
+    expected = kept_tracts.assign(
+        true_rank=kept_tracts["to_rank"].rank(pct=True)
+    )
+
+    # Line the two results up on tract id, then compare them column to column.
+    auto_pctile_col = "to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX
+    side_by_side = expected.merge(
+        scored[[tract_col, auto_pctile_col]],
+        on=[tract_col],
+    )
+    return side_by_side["true_rank"].equals(side_by_side[auto_pctile_col])
+
+
+def test_drop_0_tracts(toy_score_df):
+    """With an empty drop list, percentiles must equal a plain pct rank."""
+    ranks_match = _helper_test_dropping_tracts(toy_score_df, drop_tracts=[])
+    assert (
+        ranks_match
+    ), "Percentile in score fails when we do not drop any tracts"
+
+
+def test_drop_1_tract(toy_score_df):
+    """With tract "1" dropped, the remaining percentiles must still match."""
+    ranks_match = _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1"]
+    )
+    assert (
+        ranks_match
+    ), "Percentile in score fails when we do drop a single tract"
+
+
+def test_drop_2_tracts(toy_score_df):
+    """With tracts "1" and "2" dropped, the remaining percentiles must match."""
+    tracts_to_drop = ["1", "2"]
+    ranks_match = _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=tracts_to_drop
+    )
+    assert ranks_match, "Percentile in score fails when we drop two tracts"
+
+
+def test_drop_many_tracts(toy_score_df):
+    """With the first five listed tracts dropped, percentiles must match."""
+    all_tracts = toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()
+    ranks_match = _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=all_tracts[:5]
+    )
+    assert ranks_match, "Percentile in score fails when we drop many tracts"
+
+
+def test_drop_all_tracts(toy_score_df):
+    """Run the percentile comparison with every tract in the frame dropped.
+
+    NOTE(review): with every tract dropped, the helper's reference frame has
+    no rows, so the comparison is presumably between two empty columns --
+    confirm this test asserts what is intended.
+    """
+    every_tract = toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()
+    ranks_match = _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=every_tract
+    )
+    assert ranks_match, "Percentile in score fails when we drop all tracts"
--- a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
@ -0,0 +1,101 @@
+GEOID10_TRACT,to_rank
+1,1
+2,2
+3,3
+4,4
+5,5
+6,6
+7,7
+8,8
+9,9
+10,10
+11,11
+12,12
+13,13
+14,14
+15,15
+16,16
+17,17
+18,18
+19,19
+20,20
+21,21
+22,22
+23,23
+24,24
+25,25
+26,26
+27,27
+28,28
+29,29
+30,30
+31,31
+32,32
+33,33
+34,34
+35,35
+36,36
+37,37
+38,38
+39,39
+40,40
+41,41
+42,42
+43,43
+44,44
+45,45
+46,46
+47,47
+48,48
+49,49
+50,50
+51,51
+52,52
+53,53
+54,54
+55,55
+56,56
+57,57
+58,58
+59,59
+60,60
+61,61
+62,62
+63,63
+64,64
+65,65
+66,66
+67,67
+68,68
+69,69
+70,70
+71,71
+72,72
+73,73
+74,74
+75,75
+76,76
+77,77
+78,78
+79,79
+80,80
+81,81
+82,82
+83,83
+84,84
+85,85
+86,86
+87,87
+88,88
+89,89
+90,90
+91,91
+92,92
+93,93
+94,94
+95,95
+96,96
+97,97
+98,98
+99,99
+100,100