Merge branch 'emma-nechamkin/1849-calculation-tests' of github.com:usds/justice40-tool into emma-nechamkin/release/score-narwhal

2025-10-22 10:53:52 -07:00 · 2022-08-31 10:25:55 -04:00 · 2022-08-31 10:25:55 -04:00 · 7c6a9078e3
commit 7c6a9078e3
parent 6e575c6110 15b4f5b617
4 changed files with 208 additions and 7 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@ -153,6 +153,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
            lower=self.AGRIVALUE_LOWER_BOUND
        )
        ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
        base_expectation = (
            disaster_agriculture_sum_series
            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
        )
        assert (
            df_nri[
                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
                != base_expectation
            ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
            <= self.AGRIVALUE_LOWER_BOUND
        ), (
            "Clipping the agrivalue did not work. There are places where the value doesn't "
            + "match an unclipped ratio, even where the agrivalue is above the lower bound!"
        )
        assert (
            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
            != base_expectation
        ).sum() > 0, "Clipping the agrivalue did nothing!"
        # This produces a boolean that is True in the case of non-zero agricultural value
        df_nri[self.CONTAINS_AGRIVALUE] = (
            df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@ -493,13 +493,6 @@ class ScoreNarwhal(Score):
            field_names.AML_BOOLEAN
        ].fillna(False)
        logger.info(
            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
        )
        logger.info(
            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
        )
        self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
            [
                field_names.RMP_PCTILE_THRESHOLD,
--- a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
@ -0,0 +1,86 @@
 # pylint: disable=protected-access
 import pandas as pd
 import pytest
 from data_pipeline.config import settings
 from data_pipeline.score import field_names
 from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)
@pytest.fixture
def toy_score_df():
    """Load the toy tract table used by the percentile-dropping tests.

    Reads ``test_drop_tracts_from_percentile.csv`` from
    ``tests/score/test_utils/data`` under ``settings.APP_ROOT``, forcing the
    tract ID column (``field_names.GEOID_TRACT_FIELD``) to be read as ``str``.

    The fixture runs with pytest's default per-test scope, so each test reads
    its own frame. ``scope`` is intentionally not a parameter of this
    function: pytest only honors it as an argument to ``pytest.fixture(...)``,
    and a defaulted function parameter is silently ignored.

    NOTE(review): switching to ``@pytest.fixture(scope="module")`` would share
    one frame across all tests; that is only safe if
    ``ScoreETL._add_percentiles_to_df`` does not modify the frame it is
    given -- confirm before changing the scope.

    Returns:
        pd.DataFrame: the parsed CSV.
    """
    csv_path = (
        settings.APP_ROOT
        / "tests"
        / "score"
        / "test_utils"
        / "data"
        / "test_drop_tracts_from_percentile.csv"
    )
    return pd.read_csv(
        csv_path,
        dtype={field_names.GEOID_TRACT_FIELD: str},
    )
 def _helper_test_dropping_tracts(toy_score_df, drop_tracts):
    """Compare ScoreETL's percentile column against a direct pandas ranking.

    The rows whose tract ID is not in ``drop_tracts`` are ranked with
    ``Series.rank(pct=True)`` and joined, on tract ID, to the percentile
    column produced by ``ScoreETL._add_percentiles_to_df`` when it is handed
    the full frame together with the same ``drop_tracts`` list.

    Args:
        toy_score_df: frame with a tract ID column and a ``to_rank`` column.
        drop_tracts: tract IDs to leave out of the expected ranking; also
            forwarded to ``ScoreETL._add_percentiles_to_df``.

    Returns:
        bool: result of ``Series.equals`` between the expected ranks and the
        ETL-produced percentile column for the joined rows.
    """
    logger.info(drop_tracts)
    tract_col = field_names.GEOID_TRACT_FIELD
    # Take the kept rows before the ETL call, exactly as the comparison
    # baseline; the ETL method receives the unfiltered frame.
    is_dropped = toy_score_df[tract_col].isin(drop_tracts)
    kept_rows = toy_score_df[~is_dropped]
    scored = ScoreETL._add_percentiles_to_df(
        df=toy_score_df,
        input_column_name="to_rank",
        output_column_name_root="to_rank_auto",
        drop_tracts=drop_tracts,
    )
    expected = kept_rows.assign(true_rank=kept_rows["to_rank"].rank(pct=True))
    pctile_col = "to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX
    comparison = pd.merge(
        expected,
        scored[[tract_col, pctile_col]],
        on=[tract_col],
    )
    return comparison["true_rank"].equals(comparison[pctile_col])
 def test_drop_0_tracts(toy_score_df):
    """With an empty drop list, the helper's rank comparison must hold."""
    ranks_match = _helper_test_dropping_tracts(toy_score_df, drop_tracts=[])
    assert (
        ranks_match
    ), "Percentile in score fails when we do not drop any tracts"
 def test_drop_1_tract(toy_score_df):
    """Dropping only tract "1" must still satisfy the rank comparison."""
    single_tract = ["1"]
    ranks_match = _helper_test_dropping_tracts(
        toy_score_df, drop_tracts=single_tract
    )
    assert (
        ranks_match
    ), "Percentile in score fails when we do drop a single tract"
 def test_drop_2_tracts(toy_score_df):
    """Dropping tracts "1" and "2" must still satisfy the rank comparison."""
    two_tracts = ["1", "2"]
    ranks_match = _helper_test_dropping_tracts(
        toy_score_df, drop_tracts=two_tracts
    )
    assert ranks_match, "Percentile in score fails when we drop two tracts"
 def test_drop_many_tracts(toy_score_df):
    """Dropping the first five tract IDs in the fixture must still pass."""
    all_tract_ids = toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()
    first_five = all_tract_ids[:5]
    ranks_match = _helper_test_dropping_tracts(
        toy_score_df, drop_tracts=first_five
    )
    assert ranks_match, "Percentile in score fails when we drop many tracts"
 def test_drop_all_tracts(toy_score_df):
    """Dropping every tract ID in the fixture must still pass the comparison."""
    every_tract = toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()
    ranks_match = _helper_test_dropping_tracts(
        toy_score_df, drop_tracts=every_tract
    )
    assert ranks_match, "Percentile in score fails when we drop all tracts"
--- a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
@ -0,0 +1,101 @@
 GEOID10_TRACT,to_rank
 1,1
 2,2
 3,3
 4,4
 5,5
 6,6
 7,7
 8,8
 9,9
 10,10
 11,11
 12,12
 13,13
 14,14
 15,15
 16,16
 17,17
 18,18
 19,19
 20,20
 21,21
 22,22
 23,23
 24,24
 25,25
 26,26
 27,27
 28,28
 29,29
 30,30
 31,31
 32,32
 33,33
 34,34
 35,35
 36,36
 37,37
 38,38
 39,39
 40,40
 41,41
 42,42
 43,43
 44,44
 45,45
 46,46
 47,47
 48,48
 49,49
 50,50
 51,51
 52,52
 53,53
 54,54
 55,55
 56,56
 57,57
 58,58
 59,59
 60,60
 61,61
 62,62
 63,63
 64,64
 65,65
 66,66
 67,67
 68,68
 69,69
 70,70
 71,71
 72,72
 73,73
 74,74
 75,75
 76,76
 77,77
 78,78
 79,79
 80,80
 81,81
 82,82
 83,83
 84,84
 85,85
 86,86
 87,87
 88,88
 89,89
 90,90
 91,91
 92,92
 93,93
 94,94
 95,95
 96,96
 97,97
 98,98
 99,99
 100,100