Merge branch 'emma-nechamkin/release/score-narwhal' of github.com:usds/justice40-tool into emma-nechamkin/release/score-narwhal

2025-08-27 06:21:41 -07:00 · 2022-08-31 14:29:45 -04:00 · 2022-08-31 14:29:45 -04:00 · b7af13b2a6
commit b7af13b2a6
parent 7c6a9078e3 5201f9e457
5 changed files with 15 additions and 4802 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -394,4 +394,10 @@ TILES_SCORE_FLOAT_COLUMNS = [
    field_names.PERCENT_AGE_UNDER_10,
    field_names.PERCENT_AGE_10_TO_64,
    field_names.PERCENT_AGE_OVER_64,
+    # GeoJSON cannot support nulls in a boolean column when we create tiles;
+    # to preserve null values, we coerce to floats all boolean fields
+    # that use null to signify missing information.
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME, 
+    field_names.AML_BOOLEAN, 
+    field_names.HISTORIC_REDLINING_SCORE_EXCEEDED
 ]
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -494,6 +494,7 @@ class ScoreETL(ExtractTransformLoad):
        # For some columns, high values are "good", so we want to reverse the percentile
        # so that high values are "bad" and any scoring logic can still check if it's
        # >= some threshold.
+        # Note that we must use a dataclass here instead of a namedtuple on account of pylint.
        # TODO: Add more fields here.
        #  https://github.com/usds/justice40-tool/issues/970
        @dataclass
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@ -96,7 +96,9 @@ class GeoScoreETL(ExtractTransformLoad):
        logger.info("Reading score CSV")
        self.score_usa_df = pd.read_csv(
            self.TILE_SCORE_CSV,
-            dtype={self.TRACT_SHORT_FIELD: "string"},
+            dtype={
+                self.TRACT_SHORT_FIELD: str,
+            },
            low_memory=False,
        )

@ -136,7 +138,7 @@ class GeoScoreETL(ExtractTransformLoad):
            columns={self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO}
        )

-        logger.info("Converting to geojson into tracts")
+        logger.info("Converting geojson into geodf with tracts")
        usa_tracts = gpd.GeoDataFrame(
            usa_tracts,
            columns=[
@ -272,8 +274,10 @@ class GeoScoreETL(ExtractTransformLoad):
        # Create separate threads to run each write to disk.
        def write_high_to_file():
            logger.info("Writing usa-high (~9 minutes)")
+
            self.geojson_score_usa_high.to_file(
-                filename=self.SCORE_HIGH_GEOJSON, driver="GeoJSON"
+                filename=self.SCORE_HIGH_GEOJSON,
+                driver="GeoJSON",
            )
            logger.info("Completed writing usa-high")

@ -375,7 +379,7 @@ class GeoScoreETL(ExtractTransformLoad):
                for task in [
                    write_high_to_file,
                    write_low_to_file,
-                    # write_esri_shapefile,
+                    write_esri_shapefile,
                ]
            }

--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb