Issue 954: Add various data sources from Child Opportunity Index (#986)

* Adds four fields: * Summer days above 90F * Percent low access to healthy food * Percent impenetrable surface areas * Low third grade reading proficiency * Each of these four gets added into Definition L in various factors. * Additionally, I add college attendance fields to the ETL for Census ACS. * This PR also introduces the notion of "reverse percentiles", relevant to ticket #970.
2025-07-25 19:30:17 -07:00 · 2021-12-07 11:33:49 -05:00 · 2021-12-07 11:33:49 -05:00 · 5a6d6d8557
commit 5a6d6d8557
parent df564658a5
8 changed files with 357 additions and 40 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -1,4 +1,6 @@
 import functools
+from collections import namedtuple
+
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
@ -29,6 +31,7 @@ class ScoreETL(ExtractTransformLoad):
        self.persistent_poverty_df: pd.DataFrame
        self.census_decennial_df: pd.DataFrame
        self.census_2010_df: pd.DataFrame
+        self.child_opportunity_index_df: pd.DataFrame

    def extract(self) -> None:
        logger.info("Loading data sets from disk.")
@ -162,6 +165,19 @@ class ScoreETL(ExtractTransformLoad):
            low_memory=False,
        )

+        # Load COI data
+        child_opportunity_index_csv = (
+            constants.DATA_PATH
+            / "dataset"
+            / "child_opportunity_index"
+            / "usa.csv"
+        )
+        self.child_opportunity_index_df = pd.read_csv(
+            child_opportunity_index_csv,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            low_memory=False,
+        )
+
    def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
        logger.info("Joining Census Tract dataframes")

@ -255,6 +271,7 @@ class ScoreETL(ExtractTransformLoad):
            self.census_acs_median_incomes_df,
            self.census_decennial_df,
            self.census_2010_df,
+            self.child_opportunity_index_df,
        ]

        # Sanity check each data frame before merging.
@ -323,6 +340,7 @@ class ScoreETL(ExtractTransformLoad):
            field_names.HIGH_SCHOOL_ED_FIELD,
            field_names.UNEMPLOYMENT_FIELD,
            field_names.MEDIAN_HOUSE_VALUE_FIELD,
+            field_names.COLLEGE_ATTENDANCE_FIELD,
            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
@ -333,6 +351,9 @@ class ScoreETL(ExtractTransformLoad):
            field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
            field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
            field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
+            field_names.EXTREME_HEAT_FIELD,
+            field_names.HEALTHY_FOOD_FIELD,
+            field_names.IMPENETRABLE_SURFACES_FIELD,
        ]

        non_numeric_columns = [
@ -340,7 +361,32 @@ class ScoreETL(ExtractTransformLoad):
            field_names.PERSISTENT_POVERTY_FIELD,
        ]

-        columns_to_keep = non_numeric_columns + numeric_columns
+        # For some columns, high values are "good", so we want to reverse the percentile
+        # so that high values are "bad" and any scoring logic can still check if it's
+        # >= some threshold.
+        # TODO: Add more fields here.
+        #  https://github.com/usds/justice40-tool/issues/970
+        ReversePercentile = namedtuple(
+            typename="ReversePercentile",
+            field_names=["field_name", "low_field_name"],
+        )
+        reverse_percentiles = [
+            # This dictionary follows the format:
+            # <field name> : <field name for low values>
+            # for instance, 3rd grade reading level : Low 3rd grade reading level.
+            # This low field will not exist yet, it is only calculated for the
+            # percentile.
+            ReversePercentile(
+                field_name=field_names.READING_FIELD,
+                low_field_name=field_names.LOW_READING_FIELD,
+            )
+        ]
+
+        columns_to_keep = (
+            non_numeric_columns
+            + numeric_columns
+            + [rp.field_name for rp in reverse_percentiles]
+        )

        df_copy = df[columns_to_keep].copy()

@ -375,6 +421,19 @@ class ScoreETL(ExtractTransformLoad):
                df_copy[col] - min_value
            ) / (max_value - min_value)

+        # Create reversed percentiles for these fields
+        for reverse_percentile in reverse_percentiles:
+            # Calculate reverse percentiles
+            # For instance, for 3rd grade reading level (score from 0-500),
+            # calculate reversed percentiles and give the result the name
+            # `Low 3rd grade reading level (percentile)`.
+            df_copy[
+                f"{reverse_percentile.low_field_name}"
+                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
+            ] = df_copy[reverse_percentile.field_name].rank(
+                pct=True, ascending=False
+            )
+
        # Special logic: create a combined population field.
        # We sometimes run analytics on "population", and this makes a single field
        # that is either the island area's population in 2009 or the state's