Updating higher education to be reversed (#1387)

Summary: This PR adds a new variable that expresses the percent of college students as its complement, the percent of residents who are not college students, so that the front end can display "% not college students". The old variables are retained so that the front end does not break.
2025-10-19 08:33:52 -07:00 · 2022-03-15 16:43:32 -04:00 · 2022-03-15 16:43:32 -04:00 · e7c7c0abeb
commit e7c7c0abeb
parent 2279a04c94
12 changed files with 30 additions and 5 deletions
--- a/data/data-pipeline/data_pipeline/content/config/csv.yml
+++ b/data/data-pipeline/data_pipeline/content/config/csv.yml
@ -248,3 +248,6 @@ fields:
  - score_name: Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?
    label: Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?
    format: bool
+  - score_name: Percent of population not currently enrolled in college or graduate school
+    label: Percent of residents who are not currently enrolled in higher ed
+    format: percentage
--- a/data/data-pipeline/data_pipeline/content/config/excel.yml
+++ b/data/data-pipeline/data_pipeline/content/config/excel.yml
@ -238,6 +238,9 @@ sheets:
      - score_name: Percent individuals age 25 or over with less than high school degree
        label: Percent individuals age 25 or over with less than high school degree
        format: percentage
+      - score_name: Percent of population not currently enrolled in college or graduate school
+        label: Percent of residents who are not currently enrolled in higher ed
+        format: percentage
      - score_name: Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)
        label: Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)
        format: percentage
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -1,6 +1,5 @@
 from pathlib import Path
 import datetime
-
 from data_pipeline.config import settings

 from data_pipeline.score import field_names
@ -205,6 +204,8 @@ TILES_SCORE_COLUMNS = {
    # Percentage of HS Degree completion for Islands
    field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009: "IAHSEF",
    field_names.COLLEGE_ATTENDANCE_FIELD: "CA",
+    field_names.COLLEGE_NON_ATTENDANCE_FIELD: "NCA",
+    # This is logically equivalent to "non-college greater than 80%"
    field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD: "CA_LT20",
    field_names.LOW_INCOME_THRESHOLD: "FPL200S",
    # Booleans for the front end about the types of thresholds exceeded
@ -270,5 +271,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
    field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.COLLEGE_NON_ATTENDANCE_FIELD,
    field_names.COLLEGE_ATTENDANCE_FIELD,
 ]
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -443,6 +443,7 @@ class ScoreETL(ExtractTransformLoad):
            field_names.UNEMPLOYMENT_FIELD,
            field_names.MEDIAN_HOUSE_VALUE_FIELD,
            field_names.COLLEGE_ATTENDANCE_FIELD,
+            field_names.COLLEGE_NON_ATTENDANCE_FIELD,
            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@ -136,6 +136,8 @@ class CensusACSETL(ExtractTransformLoad):
            "Percent enrollment in college or graduate school"
        )

+        self.COLLEGE_NON_ATTENDANCE_FIELD = "Percent of population not currently enrolled in college or graduate school"
+
        self.RE_FIELDS = [
            "B02001_001E",
            "B02001_002E",
@ -190,6 +192,7 @@ class CensusACSETL(ExtractTransformLoad):
                self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
                self.HIGH_SCHOOL_ED_FIELD,
                self.COLLEGE_ATTENDANCE_FIELD,
+                self.COLLEGE_NON_ATTENDANCE_FIELD,
            ]
            + self.RE_OUTPUT_FIELDS
            + [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
@ -354,6 +357,10 @@ class CensusACSETL(ExtractTransformLoad):
            + df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE]
        ) / df[self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED]

+        df[self.COLLEGE_NON_ATTENDANCE_FIELD] = (
+            1 - df[self.COLLEGE_ATTENDANCE_FIELD]
+        )
+
        # strip columns
        df = df[self.COLUMNS_TO_KEEP]

--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -112,6 +112,9 @@ MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD = (
 PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
 AMI_FIELD = "Area Median Income (State or metropolitan)"
 COLLEGE_ATTENDANCE_FIELD = "Percent enrollment in college or graduate school"
+COLLEGE_NON_ATTENDANCE_FIELD = (
+    "Percent of population not currently enrolled in college or graduate school"
+)
 MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD = (
    "Median household income as a percent of area median income"
 )
--- a/data/data-pipeline/data_pipeline/score/score_m.py
+++ b/data/data-pipeline/data_pipeline/score/score_m.py
@ -797,9 +797,15 @@ class ScoreM(Score):
            >= self.LOW_INCOME_THRESHOLD
        )

+        # Because we are moving this variable to be in the same direction as all
+        # other variables, we change this to be < rather than <=. This translates
+        # to "strictly greater than 80% of residents are not college students",
+        # rather than "80% or more of residents are not college students."
+        # There are two tracts that are impacted by this (that is, they have exactly
+        # 20% college students) -- neither of these has been a DAC under any score.
        self.df[field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD] = (
            self.df[field_names.COLLEGE_ATTENDANCE_FIELD]
-            <= self.MAX_COLLEGE_ATTENDANCE_THRESHOLD
+            < self.MAX_COLLEGE_ATTENDANCE_THRESHOLD
        )

        self.df[