updated to show T/F/null vs T/F for AML and FUDS (#1866)

2025-07-25 07:20:18 -07:00 · 2022-08-24 20:22:59 -04:00 · 2022-08-24 20:22:59 -04:00 · 637b8c305c
commit 637b8c305c
parent 6418335219
11 changed files with 40 additions and 10 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -309,8 +309,10 @@ TILES_SCORE_COLUMNS = {
    field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
    + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
    field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
-    field_names.AML_BOOLEAN: "AML_ET",
-    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
+    field_names.AML_BOOLEAN: "AML_RAW",
+    field_names.AML_BOOLEAN_FILLED_IN: "AML_ET",
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_RAW",
+    field_names.ELIGIBLE_FUDS_FILLED_IN_FIELD_NAME: "FUDS_ET",
    field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG",
    ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
    ## FPL_200 (there is no higher ed in narwhal)
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -322,6 +322,8 @@ class ScoreETL(ExtractTransformLoad):
        # which are now deprecated.
        if not drop_tracts:
            # Create the "basic" percentile.
+            ## NOTE: This is likely less performant than building all of the PFS columns
+            ## first and concatenating them in a single step. Revisit during the refactor.
            df[
                f"{output_column_name_root}"
                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
@@ -538,9 +540,12 @@ class ScoreETL(ExtractTransformLoad):

        df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

-        # coerce all booleans to bools
+        # Coerce all boolean columns to bools while preserving missing values.
+        # Because these columns hold booleans, missing values are stored as `None`.
        for col in boolean_columns:
-            df_copy[col] = df_copy[col].astype(bool)
+            tmp = df_copy[col].copy()
+            df_copy[col] = np.where(tmp.notna(), tmp.astype(bool), None)
+            logger.info(f"{col} contains {df_copy[col].isna().sum()} nulls.")

        # Convert all columns to numeric and do math
        # Note that we have a few special conditions here and we handle them explicitly.
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl