diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index 46adab52..410d194c 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -381,8 +381,6 @@ TILES_SCORE_COLUMNS = {
     field_names.PERCENT_AGE_OVER_64: "AGE_OLD",
     field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT: "TA_COUNT",
     field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC",
-
-
 }
 
 # columns to round floats to 2 decimals
@@ -456,5 +454,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
     field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
     field_names.AML_BOOLEAN,
     field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
-    field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
+    field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
 ]
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index b0fc3b4d..5a865d5f 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -1,4 +1,6 @@
 import functools
+from typing import List
+
 from dataclasses import dataclass
 
 import numpy as np
@@ -56,6 +58,8 @@ class ScoreETL(ExtractTransformLoad):
         self.fuds_df: pd.DataFrame
         self.tribal_overlap_df: pd.DataFrame
 
+        self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
+
     def extract(self) -> None:
         logger.info("Loading data sets from disk.")
 
@@ -402,6 +406,25 @@ class ScoreETL(ExtractTransformLoad):
             df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD]
         )
 
+        self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [
+            field_names.PERCENT_BLACK_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_ASIAN_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_HAWAIIAN_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_HISPANIC_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+            field_names.PERCENT_OTHER_RACE_FIELD_NAME
+            + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        ]
+
         # Donut columns get added later
         numeric_columns = [
             field_names.HOUSING_BURDEN_FIELD,
@@ -471,7 +494,7 @@ class ScoreETL(ExtractTransformLoad):
             field_names.PERCENT_AGE_OVER_64,
             field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
             field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
-        ]
+        ] + self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS
 
         non_numeric_columns = [
             self.GEOID_TRACT_FIELD_NAME,
@@ -636,6 +659,32 @@ class ScoreETL(ExtractTransformLoad):
 
         return df_copy
 
+    @staticmethod
+    def _get_island_areas(df: pd.DataFrame) -> pd.Series:
+        return (
+            df[field_names.GEOID_TRACT_FIELD]
+            .str[:2]
+            .isin(constants.TILES_ISLAND_AREA_FIPS_CODES)
+        )
+
+    def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame:
+        logger.info("Backfilling island demographic data")
+        island_index = self._get_island_areas(df)
+        for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS:
+            actual_field_name = backfill_field_name.replace(
+                field_names.ISLAND_AREA_BACKFILL_SUFFIX, ""
+            )
+            df.loc[island_index, actual_field_name] = df.loc[
+                island_index, backfill_field_name
+            ]
+        df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS)
+
+        df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[
+            island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010
+        ]
+
+        return df
+
     def transform(self) -> None:
         logger.info("Transforming Score Data")
 
@@ -645,6 +694,9 @@ class ScoreETL(ExtractTransformLoad):
         # calculate scores
         self.df = ScoreRunner(df=self.df).calculate_scores()
 
+        # We add island demographic data since it doesn't matter to the score anyway
+        self.df = self._backfill_island_demographics(self.df)
+
     def load(self) -> None:
         logger.info("Saving Score CSV")
         constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
index ea503f62..0954f8e8 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
@@ -1,4 +1,5 @@
 import json
+from typing import List
 import requests
 
 import numpy as np
@@ -147,6 +148,65 @@ class CensusDecennialETL(ExtractTransformLoad):
             field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
         )
 
+        # Race/Ethnicity fields
+        self.TOTAL_RACE_POPULATION_FIELD = "PCT086001"  # Total
+        self.ASIAN_FIELD = "PCT086002"  # Total!!Asian
+        self.BLACK_FIELD = "PCT086003"  # Total!!Black or African American
+        self.HAWAIIAN_FIELD = (
+            "PCT086004"  # Total!!Native Hawaiian and Other Pacific Islander
+        )
+        # Note that the 2010 census for island areas does not break out
+        # Hispanic and non-Hispanic white, so this is slightly different from
+        # our other demographic data
+        self.NON_HISPANIC_WHITE_FIELD = "PCT086005"  # Total!!White
+        self.HISPANIC_FIELD = "PCT086006"  # Total!!Hispanic or Latino
+        self.OTHER_RACE_FIELD = (
+            "PCT086007"  # Total!!Other Ethnic Origin or Ra
+        )
+
+        self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001"  # Total
+        self.BLACK_VI_FIELD = (
+            "P003003"  # Total!!One race!!Black or African American alone
+        )
+        self.AMERICAN_INDIAN_VI_FIELD = "P003005"  # Total!!One race!!American Indian and Alaska Native alone
+        self.ASIAN_VI_FIELD = "P003006"  # Total!!One race!!Asian alone
+        self.HAWAIIAN_VI_FIELD = "P003007"  # Total!!One race!!Native Hawaiian and Other Pacific Islander alone
+        self.TWO_OR_MORE_RACES_VI_FIELD = "P003009"  # Total!!Two or More Races
+        self.NON_HISPANIC_WHITE_VI_FIELD = (
+            "P005006"  # Total!!Not Hispanic or Latino!!One race!!White alone
+        )
+        self.HISPANIC_VI_FIELD = "P005002"  # Total!!Hispanic or Latino
+        self.OTHER_RACE_VI_FIELD = (
+            "P003008"  # Total!!One race!!Some Other Race alone
+        )
+        self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001"  # Total
+
+        self.TOTAL_RACE_POPULATION_FIELD_NAME = (
+            "Total population surveyed on racial data"
+        )
+        self.BLACK_FIELD_NAME = "Black or African American"
+        self.AMERICAN_INDIAN_FIELD_NAME = "American Indian / Alaska Native"
+        self.ASIAN_FIELD_NAME = "Asian"
+        self.HAWAIIAN_FIELD_NAME = "Native Hawaiian or Pacific"
+        self.TWO_OR_MORE_RACES_FIELD_NAME = "two or more races"
+        self.NON_HISPANIC_WHITE_FIELD_NAME = "White"
+        self.HISPANIC_FIELD_NAME = "Hispanic or Latino"
+        # Note that `other` is lowercase because the whole field will show up in the download
+        # file as "Percent other races"
+        self.OTHER_RACE_FIELD_NAME = "other races"
+
+        # Name output demographics fields.
+        self.RE_OUTPUT_FIELDS = [
+            self.BLACK_FIELD_NAME,
+            self.AMERICAN_INDIAN_FIELD_NAME,
+            self.ASIAN_FIELD_NAME,
+            self.HAWAIIAN_FIELD_NAME,
+            self.TWO_OR_MORE_RACES_FIELD_NAME,
+            self.NON_HISPANIC_WHITE_FIELD_NAME,
+            self.HISPANIC_FIELD_NAME,
+            self.OTHER_RACE_FIELD_NAME,
+        ]
+
         var_list = [
             self.MEDIAN_INCOME_FIELD,
             self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD,
@@ -162,6 +222,13 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
             self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
             self.TOTAL_POP_FIELD,
+            self.TOTAL_RACE_POPULATION_FIELD,
+            self.ASIAN_FIELD,
+            self.BLACK_FIELD,
+            self.HAWAIIAN_FIELD,
+            self.NON_HISPANIC_WHITE_FIELD,
+            self.HISPANIC_FIELD,
+            self.OTHER_RACE_FIELD,
         ]
         var_list = ",".join(var_list)
 
@@ -180,6 +247,15 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
             self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
             self.TOTAL_POP_VI_FIELD,
+            self.BLACK_VI_FIELD,
+            self.AMERICAN_INDIAN_VI_FIELD,
+            self.ASIAN_VI_FIELD,
+            self.HAWAIIAN_VI_FIELD,
+            self.TWO_OR_MORE_RACES_VI_FIELD,
+            self.NON_HISPANIC_WHITE_VI_FIELD,
+            self.HISPANIC_VI_FIELD,
+            self.OTHER_RACE_VI_FIELD,
+            self.TOTAL_RACE_POPULATION_VI_FIELD,
         ]
         var_list_vi = ",".join(var_list_vi)
 
@@ -210,6 +286,23 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
             self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
             self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
+            self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
+            self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
+            # Note there is no American Indian data for AS/GU/MI
+            self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME,
+            self.ASIAN_FIELD: self.ASIAN_FIELD_NAME,
+            self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME,
+            self.BLACK_FIELD: self.BLACK_FIELD_NAME,
+            self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME,
+            self.HAWAIIAN_FIELD: self.HAWAIIAN_FIELD_NAME,
+            self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME,
+            self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME,
+            self.NON_HISPANIC_WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
+            self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
+            self.HISPANIC_FIELD: self.HISPANIC_FIELD_NAME,
+            self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME,
+            self.OTHER_RACE_FIELD: self.OTHER_RACE_FIELD_NAME,
+            self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME,
         }
 
         # To do: Ask Census Slack Group about whether you need to hardcode the county fips
@@ -252,6 +345,8 @@ class CensusDecennialETL(ExtractTransformLoad):
             + "&for=tract:*&in=state:{}%20county:{}"
         )
 
+        self.final_race_fields: List[str] = []
+
         self.df: pd.DataFrame
         self.df_vi: pd.DataFrame
         self.df_all: pd.DataFrame
@@ -264,14 +359,16 @@ class CensusDecennialETL(ExtractTransformLoad):
                 f"Downloading data for state/territory {island['state_abbreviation']}"
             )
             for county in island["county_fips"]:
+                api_url = self.API_URL.format(
+                    self.DECENNIAL_YEAR,
+                    island["state_abbreviation"],
+                    island["var_list"],
+                    island["fips"],
+                    county,
+                )
+                logger.debug(f"CENSUS: Requesting {api_url}")
                 download = requests.get(
-                    self.API_URL.format(
-                        self.DECENNIAL_YEAR,
-                        island["state_abbreviation"],
-                        island["var_list"],
-                        island["fips"],
-                        county,
-                    ),
+                    api_url,
                     timeout=settings.REQUESTS_DEFAULT_TIMOUT,
                 )
 
@@ -379,6 +476,19 @@ class CensusDecennialETL(ExtractTransformLoad):
self.df_all["county"] + self.df_all["tract"] ) + # Calculate stats by race + for race_field_name in self.RE_OUTPUT_FIELDS: + output_field_name = ( + field_names.PERCENT_PREFIX + + race_field_name + + field_names.ISLAND_AREA_BACKFILL_SUFFIX + ) + self.final_race_fields.append(output_field_name) + self.df_all[output_field_name] = ( + self.df_all[race_field_name] + / self.df_all[self.TOTAL_RACE_POPULATION_FIELD_NAME] + ) + # Reporting Missing Values for col in self.df_all.columns: missing_value_count = self.df_all[col].isnull().sum() @@ -402,7 +512,7 @@ class CensusDecennialETL(ExtractTransformLoad): self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME, self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME, self.UNEMPLOYMENT_FIELD_NAME, - ] + ] + self.final_race_fields self.df_all[columns_to_include].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 570dae88..80115a41 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -3,6 +3,7 @@ PERCENTILE_FIELD_SUFFIX = " (percentile)" ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas" ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)" ADJACENCY_INDEX_SUFFIX = " (average of neighbors)" +ISLAND_AREA_BACKFILL_SUFFIX = " in 2009" # Geographic field names GEOID_TRACT_FIELD = "GEOID10_TRACT" diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index fd7129ff..7e92654b 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -999,7 +999,6 @@ class ScoreNarwhal(Score): def add_columns(self) -> pd.DataFrame: logger.info("Adding Score Narhwal") - self.df[field_names.THRESHOLD_COUNT] = 0 self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = ( diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index f10e6f71..75fb144a 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -5,8 +5,10 @@ from typing import List import pytest import pandas as pd import numpy as np +from data_pipeline.etl.score import constants from data_pipeline.score import field_names from data_pipeline.score.field_names import GEOID_TRACT_FIELD +from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES from .fixtures import ( final_score_df, ejscreen_df, @@ -266,7 +268,7 @@ def test_data_sources( # is the "equal" to the data from the ETL, allowing for the minor # differences that come from floating point comparisons for data_source_name, data_source in data_sources.items(): - final = "final_" + final = "_final" df: pd.DataFrame = final_score_df.merge( data_source, on=GEOID_TRACT_FIELD, @@ -287,7 +289,24 @@ def test_data_sources( # Make sure we have NAs for any tracts in the final data that aren't # included in the data source - assert np.all(df[df.MERGE == "left_only"][final_columns].isna()) + has_additional_non_null_tracts = not np.all( + df[df.MERGE == "left_only"][final_columns].isna() + ) + if has_additional_non_null_tracts: + # We backfill island areas with data from the 2010 census, so if THOSE tracts + # have data beyond the data source, that's to be expected and is fine to pass. 
+            # If some other state or territory does though, this should fail
+            left_only = df.loc[(df.MERGE == "left_only")]
+            left_only_has_value = left_only.loc[
+                ~df[final_columns].isna().all(axis=1)
+            ]
+            fips_with_values = set(
+                left_only_has_value[field_names.GEOID_TRACT_FIELD].str[0:2]
+            )
+            non_island_fips_codes = fips_with_values.difference(
+                TILES_ISLAND_AREA_FIPS_CODES
+            )
+            assert not non_island_fips_codes
 
         # Make sure the datasource doesn't have a ton of unmatched tracts, implying it
         # has moved to 2020 tracts
@@ -323,6 +342,77 @@ def test_data_sources(
         ), error_message
 
 
+def test_island_demographic_backfill(final_score_df, census_decennial_df):
+    # Copied from score_etl because there's no better source of truth for it
+    ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [
+        field_names.PERCENT_BLACK_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_ASIAN_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_HAWAIIAN_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_HISPANIC_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.PERCENT_OTHER_RACE_FIELD_NAME
+        + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+        field_names.TOTAL_POP_FIELD + field_names.ISLAND_AREA_BACKFILL_SUFFIX,
+    ]
+
+    # rename the columns from the decennial census to be their final score names
+    decennial_cols = {
+        col_name: col_name.replace(field_names.ISLAND_AREA_BACKFILL_SUFFIX, "")
+        for col_name in ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS
+    }
+    census_decennial_df: pd.DataFrame = census_decennial_df.rename(
+        columns=decennial_cols
+    )
+
+    # Merge decennial data with the final score
+    df: pd.DataFrame = final_score_df.merge(
+        census_decennial_df,
+        on=GEOID_TRACT_FIELD,
+        indicator="MERGE",
+        suffixes=("_final", "_decennial"),
+        how="outer",
+    )
+
+    # Make sure columns from both the decennial census and final score overlap
+    core_cols = census_decennial_df.columns.intersection(
+        final_score_df.columns
+    ).drop(GEOID_TRACT_FIELD)
+    final_columns = [f"{col}_final" for col in core_cols]
+    assert (
+        final_columns
+    ), "No columns from decennial census show up in final score, extremely weird"
+
+    # Make sure we're only grabbing island tracts for the decennial data
+    assert (
+        sorted(
+            df[df.MERGE == "both"][field_names.GEOID_TRACT_FIELD]
+            .str[:2]
+            .unique()
+        )
+        == constants.TILES_ISLAND_AREA_FIPS_CODES
+    ), "2010 Decennial census contributed unexpected tracts"
+
+    df = df[df.MERGE == "both"]
+
+    # Make sure for all the backfill tracts, the data made it into the
+    # final score. This can be simple since it's all percentages and an int
+    for col in final_columns:
+        assert np.allclose(
+            df[col],
+            df[col.replace("_final", "_decennial")],
+            equal_nan=True,
+        ), f"Data mismatch in decennial census backfill for {col}"
+
+
 def test_output_tracts(final_score_df, national_tract_df):
     df = final_score_df.merge(
         national_tract_df,
@@ -365,8 +455,15 @@ def test_imputed_tracts(final_score_df):
     )
 
     # Make sure that no tracts with population have null imputed income
+    # We DO NOT impute income for island areas, so remove those from the test
+    is_island_area = (
+        final_score_df[field_names.GEOID_TRACT_FIELD]
+        .str[:2]
+        .isin(constants.TILES_ISLAND_AREA_FIPS_CODES)
+    )
+
     tracts_with_some_population_df = final_score_df[
-        final_score_df[field_names.TOTAL_POP_FIELD] > 0
+        (final_score_df[field_names.TOTAL_POP_FIELD] > 0) & ~is_island_area
     ]
     assert (
         not tracts_with_some_population_df[
diff --git a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py
index 37b15f65..09275de2 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py
@@ -156,4 +156,4 @@ class TestAbandondedLandMineETL(TestETL):
             "data_pipeline.etl.sources.eamlis.etl.add_tracts_for_geometries",
             new=_fake_add_tracts_for_geometries,
        ):
-            super().test_tract_id_lengths(mock_etl, mock_paths)
\ No newline at end of file
+            super().test_tract_id_lengths(mock_etl, mock_paths)
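Illustrative sketch (not part of the patch): the backfill added above copies each " in 2009" suffixed demographic column into its base column for tracts whose GEOID starts with an island-area state FIPS code, then drops the suffixed columns. A minimal pandas example of that pattern, using toy data, a hypothetical "Percent Asian" column name, and the assumption that the island-area FIPS codes are AS=60, GU=66, MP=69, VI=78 (the pipeline reads them from constants.TILES_ISLAND_AREA_FIPS_CODES):

import pandas as pd

ISLAND_FIPS = {"60", "66", "69", "78"}  # assumed island-area state FIPS codes
SUFFIX = " in 2009"  # mirrors field_names.ISLAND_AREA_BACKFILL_SUFFIX

df = pd.DataFrame(
    {
        "GEOID10_TRACT": ["66010950100", "06001400100"],  # Guam tract, CA tract
        "Percent Asian": [None, 0.15],
        "Percent Asian" + SUFFIX: [0.33, None],
    }
)

# Select island tracts by the first two digits of the tract GEOID
island_index = df["GEOID10_TRACT"].str[:2].isin(ISLAND_FIPS)

# Copy the 2010 decennial value into the base column for island tracts only,
# then drop the suffixed column so only one demographic field remains
df.loc[island_index, "Percent Asian"] = df.loc[island_index, "Percent Asian" + SUFFIX]
df = df.drop(columns=["Percent Asian" + SUFFIX])

print(df)  # the Guam tract now carries 0.33; the CA tract keeps its 0.15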