Merge branch 'emma-nechamkin/release/score-narwhal' of github.com:usds/justice40-tool into emma-nechamkin/release/score-narwhal

2025-09-30 02:13:18 -07:00 · 2022-09-08 15:48:01 -04:00 · 2022-09-08 15:48:01 -04:00 · e78c6d0fef
commit e78c6d0fef
parent 31eac4101e fb4c484e5c
2 changed files with 28 additions and 3 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -380,7 +380,8 @@ class ScoreETL(ExtractTransformLoad):
        ), "Join against national tract list ADDED rows"
        logger.info(
            "Dropped %s tracts not in the 2010 tract data",
-            pre_join_len - census_tract_df[field_names.GEOID_TRACT_FIELD].nunique()
+            pre_join_len
            - census_tract_df[field_names.GEOID_TRACT_FIELD].nunique(),
        )
        # Now sanity-check the merged df.
@ -551,6 +552,9 @@ class ScoreETL(ExtractTransformLoad):
        #     For *Non-Natural Space*, we may only want to include tracts that have at least 35 acres, I think. This will
        #     get rid of tracts that we think are aberrations statistically. Right now, we have left this out
        #     pending ground-truthing.
        #
        #     For *Traffic Barriers*, we want to exclude low population tracts, which may show a high burden solely
        #     because of their low population. The low population threshold is set in the if statement below.
        for numeric_column in numeric_columns:
            drop_tracts = []
@ -575,6 +579,17 @@ class ScoreETL(ExtractTransformLoad):
                    f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
                )
            elif numeric_column == field_names.DOT_TRAVEL_BURDEN_FIELD:
                # Having few or no people appears to be correlated with transit burden, but doesn't represent
                # on-the-ground need. For now, we remove these tracts from the percentile calculation. (To be QAed live)
                low_population = 20
                drop_tracts = df_copy[
                    df_copy[field_names.TOTAL_POP_FIELD] <= low_population
                ][field_names.GEOID_TRACT_FIELD].to_list()
                logger.info(
                    f"Dropping {len(drop_tracts)} tracts from DOT traffic burden"
                )
            df_copy = self._add_percentiles_to_df(
                df=df_copy,
                input_column_name=numeric_column,
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@ -60,6 +60,7 @@ class GeoScoreETL(ExtractTransformLoad):
            field_names.GEOID_TRACT_FIELD
        ]
        self.GEOMETRY_FIELD_NAME = "geometry"
        self.LAND_FIELD_NAME = "ALAND10"
        # We will adjust this upwards while there is some fractional value
        # in the score. This is a starting value.
@ -86,13 +87,22 @@ class GeoScoreETL(ExtractTransformLoad):
        )
        logger.info("Reading US GeoJSON (~6 minutes)")
-        self.geojson_usa_df = gpd.read_file(
+        full_geojson_usa_df = gpd.read_file(
            self.CENSUS_USA_GEOJSON,
            dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME],
+            usecols=[
                self.GEOID_FIELD_NAME,
                self.GEOMETRY_FIELD_NAME,
                self.LAND_FIELD_NAME,
            ],
            low_memory=False,
        )
        # We only want to visualize tracts that have non-zero land area
        self.geojson_usa_df = full_geojson_usa_df[
            full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
        ]
        logger.info("Reading score CSV")
        self.score_usa_df = pd.read_csv(
            self.TILE_SCORE_CSV,