Issue 844: Add island areas to Definition L (#957)

This ended up being a pretty large task. Here's what this PR does:

1. Pulls Vincent's island areas data into the score ETL. This data comes from the 2010 decennial census, the most recent census of any kind conducted in the island areas.
2. Grabs a few new fields from the 2010 island areas decennial census.
3. Calculates area median income for the island areas.
4. Stops using EJSCREEN as the source of our high school education data and pulls it directly from the census instead (this was related to this project, so I went ahead and fixed it).
5. Grabs data from the 2010 ACS for the states, Puerto Rico, and DC, so that we can create percentiles that compare (roughly) apples-to-apples between 2010 island areas decennial census data and 2010 ACS data. This required a new class, because the ACS fields differ between 2010 and 2019, so it wasn't as simple as looping over a year parameter.
6. Creates a combined population field covering both the island areas and the mainland so we can use those stats in our comparison tool, and updates the comparison tool accordingly (see the sketch after this list).
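
To make items 5 and 6 a bit more concrete, here is a minimal sketch of the combine-then-rank approach; the frame and column names (`df_island_2010`, `df_acs_2010`, the poverty column) are illustrative placeholders, not the pipeline's actual field names:

```python
import pandas as pd

# Illustrative inputs: one row from the 2010 island areas decennial census and
# one from the 2010 ACS (states/Puerto Rico/DC). Column names are placeholders.
df_island_2010 = pd.DataFrame(
    {"GEOID10_TRACT": ["69120950100"], "Poverty rate": [0.32]}
)
df_acs_2010 = pd.DataFrame(
    {"GEOID10_TRACT": ["11001000100"], "Poverty rate": [0.12]}
)

# Stack the two sources so island-area and mainland tracts share the same
# columns, then compute percentiles across the combined set.
combined = pd.concat([df_island_2010, df_acs_2010], ignore_index=True)
combined["Poverty rate (percentile)"] = combined["Poverty rate"].rank(pct=True)
```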
Commit 1d101c93d2 by Lucas Merrill Brown, 2021-12-03 15:46:10 -05:00, committed by GitHub.
15 changed files with 882 additions and 153 deletions

New file (186 lines added):
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class CensusACS2010ETL(ExtractTransformLoad):
"""Extract ACS data from 2010 or approximately that year.
Note: Census ACS 2010 uses different fields than those captured in CensusACSETL.
To support this, we created a separate class.
"""
def __init__(self):
self.ACS_YEAR = 2010
self.ACS_TYPE = "acs5"
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
)
# Employment fields
self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED = (
"B23006_007E"
# Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian!!Unemployed
)
self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED = (
"B23006_014E"
# Estimate!!Total!!High school graduate!!In labor force!!Civilian!!Unemployed
)
self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED = (
"B23006_021E"
# Estimate!!Total!!Some college or associate's degree!!In labor force!!Civilian!!Unemployed
)
self.EMPLOYMENT_COLLEGE_UNEMPLOYED = (
"B23006_028E"
# Estimate!!Total!!Bachelor's degree or higher!!In labor force!!Civilian!!Unemployed
)
self.UNEMPLOYED_FIELDS = [
self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED,
self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED,
self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED,
self.EMPLOYMENT_COLLEGE_UNEMPLOYED,
]
self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE = (
# TODO: FIX — note that B23006_005E is the *civilian* labor force subset,
# while the other denominators below (B23006_010E, _017E, _024E) are the
# broader "In labor force" totals, so the groups are not fully consistent.
"B23006_005E"
# Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian
)
self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE = (
"B23006_010E"
# Estimate!!Total!!High school graduate!!In labor force
)
self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE = (
"B23006_017E"
# Estimate!!Total!!Some college or associate's degree!!In labor force
)
self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE = (
"B23006_024E"
# Estimate!!Total!!Bachelor's degree or higher!!In labor force
)
self.IN_LABOR_FORCE_FIELDS = [
self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE,
self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE,
self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE,
self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
]
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.POVERTY_FIELDS = [
"C17002_001E", # Estimate!!Total,
"C17002_002E", # Estimate!!Total!!Under .50
"C17002_003E", # Estimate!!Total!!.50 to .99
"C17002_004E", # Estimate!!Total!!1.00 to 1.24
"C17002_005E", # Estimate!!Total!!1.25 to 1.49
"C17002_006E", # Estimate!!Total!!1.50 to 1.84
"C17002_007E", # Estimate!!Total!!1.85 to 1.99
]
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME = (
"Percent of individuals < 100% Federal Poverty Line"
)
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME = (
"Percent of individuals < 150% Federal Poverty Line"
)
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
"Percent of individuals < 200% Federal Poverty Line"
)
self.STATE_GEOID_FIELD_NAME = "GEOID2"
self.df: pd.DataFrame
def extract(self) -> None:
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS
+ self.IN_LABOR_FORCE_FIELDS
+ self.POVERTY_FIELDS
)
# Use the shared retrieve_census_acs_data helper (also used by CensusACSETL) to reduce duplication.
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
raise_errors=False,
)
def transform(self) -> None:
logger.info("Starting Census ACS Transform")
df = self.df
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
unemployed_totals = df[self.UNEMPLOYED_FIELDS].sum(axis=1)
labor_force_totals = df[self.IN_LABOR_FORCE_FIELDS].sum(axis=1)
df[self.UNEMPLOYED_FIELD_NAME] = unemployed_totals / labor_force_totals
# Calculate percent at different poverty thresholds
df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
df["C17002_002E"] + df["C17002_003E"]
) / df["C17002_001E"]
df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
df["C17002_002E"]
+ df["C17002_003E"]
+ df["C17002_004E"]
+ df["C17002_005E"]
) / df["C17002_001E"]
df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
df["C17002_002E"]
+ df["C17002_003E"]
+ df["C17002_004E"]
+ df["C17002_005E"]
+ df["C17002_006E"]
+ df["C17002_007E"]
) / df["C17002_001E"]
# Save results to self.
self.df = df
def load(self) -> None:
logger.info("Saving Census ACS Data")
# Create the output directory if it does not already exist.
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = [
self.GEOID_TRACT_FIELD_NAME,
self.UNEMPLOYED_FIELD_NAME,
self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
]
output_df = self.df[columns_to_include]
# Add the year to the end of every data column (except the tract GEOID), so
# that when it's all joined into the score df, it's obvious which year this
# data is from.
for column in columns_to_include:
if column != self.GEOID_TRACT_FIELD_NAME:
output_df = output_df.rename(
columns={
column: f"{column} in {self.ACS_YEAR}",
}
)
output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
logger.info("Validating Census ACS Data")
pass
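
For reference, here is a minimal sketch of driving this ETL class end to end; the module path in the import is an assumption (the diff above does not show the file's location), and the real pipeline runner may invoke these steps differently:

```python
# Sketch only: the module path below is assumed, not taken from the diff.
from data_pipeline.etl.sources.census_acs_2010.etl import CensusACS2010ETL

etl = CensusACS2010ETL()
etl.extract()    # pull the 2010 ACS variables via retrieve_census_acs_data
etl.transform()  # compute unemployment and poverty percentages
etl.load()       # write usa.csv with year-suffixed columns
etl.validate()
```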