diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 044fe0c8..ff7a87e2 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -32,10 +32,15 @@ class ExtractTransformLoad: FILES_PATH: Path = settings.APP_ROOT / "files" GEOID_FIELD_NAME: str = "GEOID10" GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT" - # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods. + + # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might + # be from CBGs at different time periods. EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000 - # TODO: investigate. Census says there are only 73,057 tracts in the US. This might be from tracts at different time periods. - EXPECTED_MAX_CENSUS_TRACTS: int = 74027 + + # TODO: investigate. Census says there are only 74,134 tracts in the US, + # Puerto Rico, and island areas. This might be from tracts at different time + # periods. https://github.com/usds/justice40-tool/issues/964 + EXPECTED_MAX_CENSUS_TRACTS: int = 74160 def __init__(self, config_path: Path) -> None: """Inits the class with instance specific variables""" diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index 831d1e18..2c8b1141 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -4,6 +4,11 @@ DATASET_LIST = [ "module_dir": "census_acs", "class_name": "CensusACSETL", }, + { + "name": "census_acs_2010", + "module_dir": "census_acs_2010", + "class_name": "CensusACS2010ETL", + }, { "name": "ejscreen", "module_dir": "ejscreen", @@ -14,16 +19,6 @@ DATASET_LIST = [ "module_dir": "hud_housing", "class_name": "HudHousingETL", }, - { - "name": "calenviroscreen", - "module_dir": "calenviroscreen", - "class_name": "CalEnviroScreenETL", - }, - { - "name": "hud_recap", - "module_dir": "hud_recap", - "class_name": "HudRecapETL", - }, { "name": "cdc_places", "module_dir": "cdc_places", @@ -74,6 +69,16 @@ DATASET_LIST = [ "module_dir": "housing_and_transportation", "class_name": "HousingTransportationETL", }, + { + "name": "calenviroscreen", + "module_dir": "calenviroscreen", + "class_name": "CalEnviroScreenETL", + }, + { + "name": "hud_recap", + "module_dir": "hud_recap", + "class_name": "HudRecapETL", + }, { "name": "tree_equity_score", "module_dir": "tree_equity_score", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 086697b1..f49681d8 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -27,6 +27,8 @@ class ScoreETL(ExtractTransformLoad): self.national_risk_index_df: pd.DataFrame self.geocorr_urban_rural_df: pd.DataFrame self.persistent_poverty_df: pd.DataFrame + self.census_decennial_df: pd.DataFrame + self.census_2010_df: pd.DataFrame def extract(self) -> None: logger.info("Loading data sets from disk.") @@ -137,6 +139,29 @@ class ScoreETL(ExtractTransformLoad): low_memory=False, ) + # Load decennial census data + census_decennial_csv = ( + constants.DATA_PATH + / "dataset" + / "census_decennial_2010" + / "usa.csv" + ) + self.census_decennial_df = pd.read_csv( + census_decennial_csv, + dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, + low_memory=False, + ) + + # Load 2010 ACS data from states + census_2010_csv = ( + 
constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv" + ) + self.census_2010_df = pd.read_csv( + census_2010_csv, + dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, + low_memory=False, + ) + def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame: logger.info("Joining Census Tract dataframes") @@ -228,6 +253,8 @@ class ScoreETL(ExtractTransformLoad): self.persistent_poverty_df, self.national_risk_index_df, self.census_acs_median_incomes_df, + self.census_decennial_df, + self.census_2010_df, ] # Sanity check each data frame before merging. @@ -296,9 +323,16 @@ class ScoreETL(ExtractTransformLoad): field_names.HIGH_SCHOOL_ED_FIELD, field_names.UNEMPLOYMENT_FIELD, field_names.MEDIAN_HOUSE_VALUE_FIELD, - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME, - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME, + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD, + field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD, + field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009, + field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009, + field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009, + field_names.CENSUS_UNEMPLOYMENT_FIELD_2010, + field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010, + field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009, + field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009, ] non_numeric_columns = [ @@ -315,9 +349,9 @@ class ScoreETL(ExtractTransformLoad): # Convert all columns to numeric and do math for col in numeric_columns: # Calculate percentiles - df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[col].rank( - pct=True - ) + df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[ + col + ].rank(pct=True) # Min-max normalization: # ( @@ -341,6 +375,20 @@ class ScoreETL(ExtractTransformLoad): df_copy[col] - min_value ) / (max_value - min_value) + # Special logic: create a combined population field. + # We sometimes run analytics on "population", and this makes a single field + # that is either the island area's population in 2009 or the state's + # population in 2019. + # There should only be one entry in either 2009 or 2019, not one in both. + # But just to be safe, we take the mean and ignore null values so if there + # *were* entries in both fields, this result would make sense. 
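+        # For example (hypothetical numbers): a stateside tract might have
+        # 4,000 people in the 2019 field and a null 2009 field, yielding a
+        # combined value of 4,000; an island-area tract with 3,200 people in
+        # the 2009 field and a null 2019 field yields 3,200. The mean with
+        # skipna=True is NaN only if both fields are null.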
+ df_copy[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010] = df_copy[ + [ + field_names.TOTAL_POP_FIELD, + field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009, + ] + ].mean(axis=1, skipna=True) + return df_copy def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index b22fdf3a..8e4f8c8d 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -1,8 +1,7 @@ import pandas as pd -import censusdata from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes +from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -14,7 +13,15 @@ class CensusACSETL(ExtractTransformLoad): self.OUTPUT_PATH = ( self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}" ) + + self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E" + self.TOTAL_IN_LABOR_FORCE = "B23025_003E" + self.EMPLOYMENT_FIELDS = [ + self.TOTAL_UNEMPLOYED_FIELD, + self.TOTAL_IN_LABOR_FORCE, + ] self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)" + self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)" self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = ( "Linguistic isolation (total)" @@ -55,59 +62,89 @@ class CensusACSETL(ExtractTransformLoad): "Median value ($) of owner-occupied housing units" ) + # Educational attainment figures + self.EDUCATION_POPULATION_OVER_25 = "B15003_001E" # Estimate!!Total + self.EDUCATION_NO_SCHOOLING = ( + "B15003_002E" # Estimate!!Total!!No schooling completed + ) + self.EDUCATION_NURSERY = ( + "B15003_003E" # Estimate!!Total!!Nursery school + ) + self.EDUCATION_KINDERGARTEN = ( + "B15003_004E" # Estimate!!Total!!Kindergarten + ) + self.EDUCATION_FIRST = "B15003_005E" # Estimate!!Total!!1st grade + self.EDUCATION_SECOND = "B15003_006E" # Estimate!!Total!!2nd grade + self.EDUCATION_THIRD = "B15003_007E" # Estimate!!Total!!3rd grade + self.EDUCATION_FOURTH = "B15003_008E" # Estimate!!Total!!4th grade + self.EDUCATION_FIFTH = "B15003_009E" # Estimate!!Total!!5th grade + self.EDUCATION_SIXTH = "B15003_010E" # Estimate!!Total!!6th grade + self.EDUCATION_SEVENTH = "B15003_011E" # Estimate!!Total!!7th grade + self.EDUCATION_EIGHTH = "B15003_012E" # Estimate!!Total!!8th grade + self.EDUCATION_NINTH = "B15003_013E" # Estimate!!Total!!9th grade + self.EDUCATION_TENTH = "B15003_014E" # Estimate!!Total!!10th grade + self.EDUCATION_ELEVENTH = "B15003_015E" # Estimate!!Total!!11th grade + self.EDUCATION_TWELFTH_NO_DIPLOMA = ( + "B15003_016E" # Estimate!!Total!!12th grade, no diploma + ) + + self.EDUCATIONAL_FIELDS = [ + self.EDUCATION_POPULATION_OVER_25, + self.EDUCATION_NO_SCHOOLING, + self.EDUCATION_NURSERY, + self.EDUCATION_KINDERGARTEN, + self.EDUCATION_FIRST, + self.EDUCATION_SECOND, + self.EDUCATION_THIRD, + self.EDUCATION_FOURTH, + self.EDUCATION_FIFTH, + self.EDUCATION_SIXTH, + self.EDUCATION_SEVENTH, + self.EDUCATION_EIGHTH, + self.EDUCATION_NINTH, + self.EDUCATION_TENTH, + self.EDUCATION_ELEVENTH, + self.EDUCATION_TWELFTH_NO_DIPLOMA, + ] + + self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD = ( + "Individuals age 25 or over with less than high school degree" + ) + self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree" + self.STATE_GEOID_FIELD_NAME = "GEOID2" + self.df: pd.DataFrame 
- def _fips_from_censusdata_censusgeo( - self, censusgeo: censusdata.censusgeo - ) -> str: - """Create a FIPS code from the proprietary censusgeo index.""" - fips = "".join([value for (key, value) in censusgeo.params()]) - return fips - def extract(self) -> None: - dfs = [] - for fips in get_state_fips_codes(self.DATA_PATH): - logger.info( - f"Downloading data for state/territory with FIPS code {fips}" - ) + # Define the variables to retrieve + variables = ( + [ + # Income field + self.MEDIAN_INCOME_FIELD, + # House value + self.MEDIAN_HOUSE_VALUE_FIELD, + ] + + self.EMPLOYMENT_FIELDS + + self.LINGUISTIC_ISOLATION_FIELDS + + self.POVERTY_FIELDS + + self.EDUCATIONAL_FIELDS + ) - try: - response = censusdata.download( - src="acs5", - year=self.ACS_YEAR, - geo=censusdata.censusgeo( - [("state", fips), ("county", "*"), ("tract", "*")] - ), - var=[ - # Emploment fields - "B23025_005E", - "B23025_003E", - # Income field - self.MEDIAN_INCOME_FIELD, - # House value - self.MEDIAN_HOUSE_VALUE_FIELD, - ] - + self.LINGUISTIC_ISOLATION_FIELDS - + self.POVERTY_FIELDS, - ) - dfs.append(response) - except ValueError: - logger.error( - f"Could not download data for state/territory with FIPS code {fips}" - ) - - self.df = pd.concat(dfs) - - self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply( - func=self._fips_from_censusdata_censusgeo + self.df = retrieve_census_acs_data( + acs_year=self.ACS_YEAR, + variables=variables, + tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, + data_path_for_fips_codes=self.DATA_PATH, ) def transform(self) -> None: logger.info("Starting Census ACS Transform") + df = self.df + # Rename two fields. - self.df = self.df.rename( + df = df.rename( columns={ self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME, self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME, @@ -119,19 +156,17 @@ class CensusACSETL(ExtractTransformLoad): self.MEDIAN_INCOME_FIELD_NAME, self.MEDIAN_HOUSE_VALUE_FIELD_NAME, ]: - missing_value_count = sum(self.df[field] == -666666666) + missing_value_count = sum(df[field] == -666666666) logger.info( - f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of " + f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of " + f"`{field}` being marked as null values." ) - self.df[field] = self.df[field].replace( - to_replace=-666666666, value=None - ) + df[field] = df[field].replace(to_replace=-666666666, value=None) # Calculate percent unemployment. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction. - self.df[self.UNEMPLOYED_FIELD_NAME] = ( - self.df.B23025_005E / self.df.B23025_003E + df[self.UNEMPLOYED_FIELD_NAME] = ( + df[self.TOTAL_UNEMPLOYED_FIELD] / df[self.TOTAL_IN_LABOR_FORCE] ) # Calculate linguistic isolation. 
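
For context, here is a minimal usage sketch of the new `retrieve_census_acs_data` helper that `extract()` now delegates to. The variable codes and the `Path("data")` location are illustrative, not taken from the PR:

```python
from pathlib import Path

from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data

# Illustrative: download unemployed civilians and civilian labor force counts
# for every tract, as the ACS ETL does for its employment fields.
df = retrieve_census_acs_data(
    acs_year=2019,
    variables=["B23025_005E", "B23025_003E"],
    tract_output_field_name="GEOID10_TRACT",
    data_path_for_fips_codes=Path("data"),  # hypothetical data directory
)
```
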
@@ -142,34 +177,64 @@ class CensusACSETL(ExtractTransformLoad): "C16002_013E", ] - self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[ + df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[ individual_limited_english_fields ].sum(axis=1, skipna=True) - self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = ( - self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float) - / self.df["C16002_001E"] + df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = ( + df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float) + / df["C16002_001E"] ) # Calculate percent at different poverty thresholds - self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = ( - self.df["C17002_002E"] + self.df["C17002_003E"] - ) / self.df["C17002_001E"] + df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = ( + df["C17002_002E"] + df["C17002_003E"] + ) / df["C17002_001E"] - self.df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = ( - self.df["C17002_002E"] - + self.df["C17002_003E"] - + self.df["C17002_004E"] - + self.df["C17002_005E"] - ) / self.df["C17002_001E"] + df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = ( + df["C17002_002E"] + + df["C17002_003E"] + + df["C17002_004E"] + + df["C17002_005E"] + ) / df["C17002_001E"] - self.df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = ( - self.df["C17002_002E"] - + self.df["C17002_003E"] - + self.df["C17002_004E"] - + self.df["C17002_005E"] - + self.df["C17002_006E"] - + self.df["C17002_007E"] - ) / self.df["C17002_001E"] + df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = ( + df["C17002_002E"] + + df["C17002_003E"] + + df["C17002_004E"] + + df["C17002_005E"] + + df["C17002_006E"] + + df["C17002_007E"] + ) / df["C17002_001E"] + + # Calculate educational attainment + educational_numerator_fields = [ + self.EDUCATION_NO_SCHOOLING, + self.EDUCATION_NURSERY, + self.EDUCATION_KINDERGARTEN, + self.EDUCATION_FIRST, + self.EDUCATION_SECOND, + self.EDUCATION_THIRD, + self.EDUCATION_FOURTH, + self.EDUCATION_FIFTH, + self.EDUCATION_SIXTH, + self.EDUCATION_SEVENTH, + self.EDUCATION_EIGHTH, + self.EDUCATION_NINTH, + self.EDUCATION_TENTH, + self.EDUCATION_ELEVENTH, + self.EDUCATION_TWELFTH_NO_DIPLOMA, + ] + + df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] = df[ + educational_numerator_fields + ].sum(axis=1) + df[self.HIGH_SCHOOL_ED_FIELD] = ( + df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] + / df[self.EDUCATION_POPULATION_OVER_25] + ) + + # Save results to self. 
+ self.df = df def load(self) -> None: logger.info("Saving Census ACS Data") @@ -186,6 +251,7 @@ class CensusACSETL(ExtractTransformLoad): self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME, self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, self.MEDIAN_HOUSE_VALUE_FIELD_NAME, + self.HIGH_SCHOOL_ED_FIELD, ] self.df[columns_to_include].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py new file mode 100644 index 00000000..ce3c901c --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py @@ -0,0 +1,61 @@ +from pathlib import Path +from typing import List +import censusdata +import pandas as pd + +from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +def _fips_from_censusdata_censusgeo( + censusgeo: censusdata.censusgeo +) -> str: + """Create a FIPS code from the proprietary censusgeo index.""" + fips = "".join([value for (key, value) in censusgeo.params()]) + return fips + + +# pylint: disable=too-many-arguments +def retrieve_census_acs_data( + acs_year: int, + variables: List[str], + tract_output_field_name: str, + data_path_for_fips_codes: Path, + acs_type="acs5", + raise_errors: bool = False, +) -> pd.DataFrame: + """Retrieves and combines census ACS data for a given year.""" + dfs = [] + for fips in get_state_fips_codes(data_path_for_fips_codes): + logger.info( + f"Downloading data for state/territory with FIPS code {fips}" + ) + + try: + response = censusdata.download( + src=acs_type, + year=acs_year, + geo=censusdata.censusgeo( + [("state", fips), ("county", "*"), ("tract", "*")] + ), + var=variables, + ) + dfs.append(response) + + except ValueError as e: + logger.error( + f"Could not download data for state/territory with FIPS code {fips}" + ) + + if raise_errors: + raise e + + df = pd.concat(dfs) + + df[tract_output_field_name] = df.index.to_series().apply( + func=_fips_from_censusdata_censusgeo + ) + + return df diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/README.md b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/README.md new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py new file mode 100644 index 00000000..870b8a7f --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -0,0 +1,186 @@ +import pandas as pd + +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +class CensusACS2010ETL(ExtractTransformLoad): + """Extract ACS data from 2010 or approximately that year. + + Note: Census ACS 2010 uses different fields than those captured in CensusACSETL. + + To support this, we created a separate class. 
+ """ + + def __init__(self): + self.ACS_YEAR = 2010 + self.ACS_TYPE = "acs5" + self.OUTPUT_PATH = ( + self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}" + ) + + # Employment fields + self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED = ( + "B23006_007E" + # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian!!Unemployed + ) + self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED = ( + "B23006_014E" + # Estimate!!Total!!High school graduate!!In labor force!!Civilian!!Unemployed + ) + self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED = ( + "B23006_021E" + # Estimate!!Total!!Some college or associate's degree!!In labor force!!Civilian!!Unemployed + ) + self.EMPLOYMENT_COLLEGE_UNEMPLOYED = ( + "B23006_028E" + # Estimate!!Total!!Bachelor's degree or higher!!In labor force!!Civilian!!Unemployed + ) + + self.UNEMPLOYED_FIELDS = [ + self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED, + self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED, + self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED, + self.EMPLOYMENT_COLLEGE_UNEMPLOYED, + ] + + self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE = ( + # TODO: FIX!!!!!! + "B23006_005E" + # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian + ) + self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE = ( + "B23006_010E" + # Estimate!!Total!!High school graduate!!In labor force + ) + self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE = ( + "B23006_017E" + # Estimate!!Total!!Some college or associate's degree!!In labor force + ) + self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE = ( + "B23006_024E" + # Estimate!!Total!!Bachelor's degree or higher!!In labor force + ) + + self.IN_LABOR_FORCE_FIELDS = [ + self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE, + self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE, + self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE, + self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE, + ] + + self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)" + + self.POVERTY_FIELDS = [ + "C17002_001E", # Estimate!!Total, + "C17002_002E", # Estimate!!Total!!Under .50 + "C17002_003E", # Estimate!!Total!!.50 to .99 + "C17002_004E", # Estimate!!Total!!1.00 to 1.24 + "C17002_005E", # Estimate!!Total!!1.25 to 1.49 + "C17002_006E", # Estimate!!Total!!1.50 to 1.84 + "C17002_007E", # Estimate!!Total!!1.85 to 1.99 + ] + + self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME = ( + "Percent of individuals < 100% Federal Poverty Line" + ) + self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME = ( + "Percent of individuals < 150% Federal Poverty Line" + ) + self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = ( + "Percent of individuals < 200% Federal Poverty Line" + ) + + self.STATE_GEOID_FIELD_NAME = "GEOID2" + + self.df: pd.DataFrame + + def extract(self) -> None: + # Define the variables to retrieve + variables = ( + self.UNEMPLOYED_FIELDS + + self.IN_LABOR_FORCE_FIELDS + + self.POVERTY_FIELDS + ) + + # Use the method defined on CensusACSETL to reduce coding redundancy. + self.df = retrieve_census_acs_data( + acs_year=self.ACS_YEAR, + variables=variables, + tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, + data_path_for_fips_codes=self.DATA_PATH, + acs_type=self.ACS_TYPE, + raise_errors=False, + ) + + def transform(self) -> None: + logger.info("Starting Census ACS Transform") + + df = self.df + + # Calculate percent unemployment. + # TODO: remove small-sample data that should be `None` instead of a high-variance fraction. 
+ unemployed_totals = df[self.UNEMPLOYED_FIELDS].sum(axis=1) + labor_force_totals = df[self.IN_LABOR_FORCE_FIELDS].sum(axis=1) + + df[self.UNEMPLOYED_FIELD_NAME] = unemployed_totals / labor_force_totals + + # Calculate percent at different poverty thresholds + df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = ( + df["C17002_002E"] + df["C17002_003E"] + ) / df["C17002_001E"] + + df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = ( + df["C17002_002E"] + + df["C17002_003E"] + + df["C17002_004E"] + + df["C17002_005E"] + ) / df["C17002_001E"] + + df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = ( + df["C17002_002E"] + + df["C17002_003E"] + + df["C17002_004E"] + + df["C17002_005E"] + + df["C17002_006E"] + + df["C17002_007E"] + ) / df["C17002_001E"] + + # Save results to self. + self.df = df + + def load(self) -> None: + logger.info("Saving Census ACS Data") + + # mkdir census + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + + columns_to_include = [ + self.GEOID_TRACT_FIELD_NAME, + self.UNEMPLOYED_FIELD_NAME, + self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME, + self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME, + self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, + ] + + output_df = self.df[columns_to_include] + + # Add the year to the end of every column, so when it's all joined in the + # score df, it's obvious which year this data is from. + for column in columns_to_include: + if column != self.GEOID_TRACT_FIELD_NAME: + output_df = output_df.rename( + columns={ + column: f"{column} in {self.ACS_YEAR}", + } + ) + + output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) + + def validate(self) -> None: + logger.info("Validating Census ACS Data") + + pass diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 6190beea..cffb28d2 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -27,12 +27,21 @@ class CensusDecennialETL(ExtractTransformLoad): # https://api.census.gov/data/2010/dec/gu/variables.html # https://api.census.gov/data/2010/dec/mp/variables.html # https://api.census.gov/data/2010/dec/vi/variables.html + + # Total population field is the same in all island areas + self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001" + self.TOTAL_POP_FIELD_NAME = "Total population in 2009" + self.MEDIAN_INCOME_FIELD = "PBG049001" self.MEDIAN_INCOME_VI_FIELD = "PBG047001" - self.MEDIAN_INCOME_FIELD_NAME = ( - "MEDIAN HOUSEHOLD INCOME IN 2009 (DOLLARS)" + self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)" + self.AREA_MEDIAN_INCOME_FIELD_NAME = ( + "Median household income as a percent of " + "territory median income in 2009" ) + self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income" + self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001" self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = ( "PBG077001" @@ -48,7 +57,39 @@ class CensusDecennialETL(ExtractTransformLoad): ) self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = ( - "PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL" + "Percentage households below 200% of federal poverty line in 2009" + ) + + # We will combine three fields to get households < 100% FPL. 
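+        # The decennial table reports the ratio of household income to the
+        # poverty level in buckets, so the under-100% count is the sum of the
+        # `Under .50`, `.50 to .74`, and `.75 to .99` buckets.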
+ self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = ( + "PBG083002" # Total!!Under .50 + ) + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = ( + "PBG083003" # Total!!.50 to .74 + ) + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = ( + "PBG083004" # Total!!.75 to .99 + ) + + # Same fields, for Virgin Islands. + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = ( + "PBG077002" # Total!!Under .50 + ) + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = ( + "PBG077003" # Total!!.50 to .74 + ) + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = ( + "PBG077004" # Total!!.75 to .99 + ) + + self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010" + self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010" + self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = ( + "Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009" + ) + + self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = ( + "Percentage households below 100% of federal poverty line in 2009" ) # High School Education Fields @@ -70,9 +111,37 @@ class CensusDecennialETL(ExtractTransformLoad): "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER" ) - self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = ( - "PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME" + self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009" + + # Employment fields + self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = ( + "PBG038003" # Total!!Male!!In labor force ) + self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = ( + "PBG038007" # Total!!Male!!In labor force!!Civilian!!Unemployed + ) + self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = ( + "PBG038010" # Total!!Female!!In labor force + ) + self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = ( + "PBG038014" # Total!!Female!!In labor force!!Civilian!!Unemployed + ) + + # Same fields, Virgin Islands. 
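+        # (The Virgin Islands data product numbers these variables under
+        # PBG036 rather than PBG038; the suffixes are otherwise parallel.)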
+ self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = ( + "PBG036003" # Total!!Male!!In labor force + ) + self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = ( + "PBG036007" # Total!!Male!!In labor force!!Civilian!!Unemployed + ) + self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = ( + "PBG036010" # Total!!Female!!In labor force + ) + self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = ( + "PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed + ) + + self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009" var_list = [ self.MEDIAN_INCOME_FIELD, @@ -81,6 +150,14 @@ class CensusDecennialETL(ExtractTransformLoad): self.TOTAL_POPULATION_FIELD, self.MALE_HIGH_SCHOOL_ED_FIELD, self.FEMALE_HIGH_SCHOOL_ED_FIELD, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE, + self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD, + self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD, + self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD, + self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD, + self.TOTAL_POP_FIELD, ] var_list = ",".join(var_list) @@ -91,6 +168,14 @@ class CensusDecennialETL(ExtractTransformLoad): self.TOTAL_POPULATION_VI_FIELD, self.MALE_HIGH_SCHOOL_ED_VI_FIELD, self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE, + self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD, + self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD, + self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD, + self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD, + self.TOTAL_POP_VI_FIELD, ] var_list_vi = ",".join(var_list_vi) @@ -107,6 +192,20 @@ class CensusDecennialETL(ExtractTransformLoad): self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME, self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME, self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE, + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE, + self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD, + self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD, + self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD, + self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD, + self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD, + self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD, + self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD, + self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD, } # To do: Ask Census Slack Group about whether you 
need to hardcode the county fips @@ -117,24 +216,30 @@ class CensusDecennialETL(ExtractTransformLoad): "fips": "60", "county_fips": ["010", "020", "030", "040", "050"], "var_list": var_list, + # Note: we hardcode the median income for each territory in this dict, + # because that data is hard to programmatically access. + self.TERRITORY_MEDIAN_INCOME_FIELD: 23892, }, { "state_abbreviation": "gu", "fips": "66", "county_fips": ["010"], "var_list": var_list, + self.TERRITORY_MEDIAN_INCOME_FIELD: 48274, }, { "state_abbreviation": "mp", "fips": "69", "county_fips": ["085", "100", "110", "120"], "var_list": var_list, + self.TERRITORY_MEDIAN_INCOME_FIELD: 19958, }, { "state_abbreviation": "vi", "fips": "78", "county_fips": ["010", "020", "030"], "var_list": var_list_vi, + self.TERRITORY_MEDIAN_INCOME_FIELD: 37254, }, ] @@ -198,6 +303,11 @@ class CensusDecennialETL(ExtractTransformLoad): # Combine the dfs after renaming self.df_all = pd.concat([self.df, self.df_vi]) + # Rename total population: + self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[ + self.TOTAL_POP_FIELD + ] + # Percentage of households below 200% which is # [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total) self.df_all[ @@ -211,6 +321,25 @@ class CensusDecennialETL(ExtractTransformLoad): self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME ] + # Percentage of households below 100% FPL + # which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, ` Total!!.75 to .99`, + # and then dividing by PBG083001 (total) + self.df_all[ + self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME + ] = ( + self.df_all[ + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE + ] + + self.df_all[ + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO + ] + + self.df_all[ + self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE + ] + ) / self.df_all[ + self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME + ] + # Percentage High School Achievement is # Percentage = (Male + Female) / (Total) self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = ( @@ -218,6 +347,28 @@ class CensusDecennialETL(ExtractTransformLoad): + self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME] ) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME] + # Calculate employment. 
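+        # The decennial tables report unemployment and labor force separately
+        # by sex, so the rate is (male + female unemployed) divided by
+        # (male + female in the labor force).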
+ self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = ( + self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD] + + self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD] + ) / ( + self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD] + + self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD] + ) + + # Calculate area median income + median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES) + median_income_df = median_income_df[ + ["fips", self.TERRITORY_MEDIAN_INCOME_FIELD] + ] + self.df_all = self.df_all.merge( + right=median_income_df, left_on="state", right_on="fips", how="left" + ) + self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = ( + self.df_all[self.MEDIAN_INCOME_FIELD_NAME] + / self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD] + ) + # Creating Geo ID (Census Block Group) Field Name self.df_all[self.GEOID_TRACT_FIELD_NAME] = ( self.df_all["state"] + self.df_all["county"] + self.df_all["tract"] @@ -238,9 +389,14 @@ class CensusDecennialETL(ExtractTransformLoad): columns_to_include = [ self.GEOID_TRACT_FIELD_NAME, + self.TOTAL_POP_FIELD_NAME, self.MEDIAN_INCOME_FIELD_NAME, + self.TERRITORY_MEDIAN_INCOME_FIELD, + self.AREA_MEDIAN_INCOME_FIELD_NAME, + self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME, self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME, self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME, + self.UNEMPLOYMENT_FIELD_NAME, ] self.df_all[columns_to_include].to_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index 2626cbd5..c5724d7d 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -14,6 +14,27 @@ class EJSCREENETL(ExtractTransformLoad): self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019" self.df: pd.DataFrame + self.COLUMNS_TO_KEEP = [ + self.GEOID_TRACT_FIELD_NAME, + field_names.TOTAL_POP_FIELD, + # pylint: disable=duplicate-code + field_names.AIR_TOXICS_CANCER_RISK_FIELD, + field_names.RESPITORY_HAZARD_FIELD, + field_names.DIESEL_FIELD, + field_names.PM25_FIELD, + field_names.OZONE_FIELD, + field_names.TRAFFIC_FIELD, + field_names.RMP_FIELD, + field_names.TSDF_FIELD, + field_names.NPL_FIELD, + field_names.WASTEWATER_FIELD, + field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD, + field_names.POVERTY_FIELD, + field_names.OVER_64_FIELD, + field_names.UNDER_5_FIELD, + field_names.LEAD_PAINT_FIELD, + ] + def extract(self) -> None: logger.info("Downloading EJScreen Data") super().extract( @@ -51,7 +72,6 @@ class EJSCREENETL(ExtractTransformLoad): "PWDIS": field_names.WASTEWATER_FIELD, "LINGISOPCT": field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD, "LOWINCPCT": field_names.POVERTY_FIELD, - "LESSHSPCT": field_names.HIGH_SCHOOL_ED_FIELD, "OVER64PCT": field_names.OVER_64_FIELD, "UNDER5PCT": field_names.UNDER_5_FIELD, "PRE1960PCT": field_names.LEAD_PAINT_FIELD, @@ -63,4 +83,6 @@ class EJSCREENETL(ExtractTransformLoad): logger.info("Saving EJScreen CSV") # write nationwide csv self.CSV_PATH.mkdir(parents=True, exist_ok=True) - self.df.to_csv(self.CSV_PATH / "usa.csv", index=False) + self.df[self.COLUMNS_TO_KEEP].to_csv( + self.CSV_PATH / "usa.csv", index=False + ) diff --git a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb index d29076d2..5e9b9360 100644 --- a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb @@ -21,7 +21,7 @@ "from 
data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n", "\n", "\n", - "ACS_YEAR = 2019\n", + "ACS_YEAR = 2010\n", "\n", "DATA_PATH = Path.cwd().parent / \"data\"\n", "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n", @@ -45,11 +45,13 @@ "source": [ "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", - "censusdata.printtable(\n", - " censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n", - ")\n", + "# censusdata.printtable(\n", + "# censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n", + "# )\n", "\n", - "# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')" + "censusdata.search(\n", + " src=\"acs5\", year=ACS_YEAR, field=\"label\", criterion=\"employment status\"\n", + ")" ] }, { diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index 12c4d518..6930a8cb 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -3,7 +3,6 @@ { "cell_type": "code", "execution_count": null, - "id": "51412a14", "metadata": { "scrolled": true }, @@ -49,7 +48,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e3234c61", "metadata": { "scrolled": true }, @@ -81,7 +79,6 @@ { "cell_type": "code", "execution_count": null, - "id": "3b1b5ccf", "metadata": { "scrolled": true }, @@ -108,7 +105,6 @@ { "cell_type": "code", "execution_count": null, - "id": "1b1083e8", "metadata": {}, "outputs": [], "source": [ @@ -142,7 +138,6 @@ { "cell_type": "code", "execution_count": null, - "id": "fec0ed63", "metadata": {}, "outputs": [], "source": [ @@ -165,7 +160,6 @@ { "cell_type": "code", "execution_count": null, - "id": "d9968187", "metadata": { "scrolled": false }, @@ -192,7 +186,6 @@ { "cell_type": "code", "execution_count": null, - "id": "a7cfeb3c", "metadata": { "scrolled": false }, @@ -222,7 +215,6 @@ { "cell_type": "code", "execution_count": null, - "id": "df458f08", "metadata": {}, "outputs": [], "source": [ @@ -255,7 +247,6 @@ { "cell_type": "code", "execution_count": null, - "id": "a6c85d87", "metadata": { "scrolled": false }, @@ -282,7 +273,7 @@ " raise ValueError(\"Some of the census tract data has the wrong length.\")\n", "\n", "if len(merged_df) > ExtractTransformLoad.EXPECTED_MAX_CENSUS_TRACTS:\n", - " raise ValueError(\"Too many rows in the join.\")\n", + " raise ValueError(f\"Too many rows in the join: {len(merged_df)}.\")\n", "\n", "merged_df.head()" ] @@ -290,7 +281,6 @@ { "cell_type": "code", "execution_count": null, - "id": "274f6bc6", "metadata": { "scrolled": true }, @@ -393,17 +383,17 @@ "ejscreen_areas_of_concern_census_block_group_indices = [\n", " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n", - " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n", + " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n", - " 
priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n", + " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", " method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n", - " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n", + " priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", "]\n", @@ -439,7 +429,6 @@ { "cell_type": "code", "execution_count": null, - "id": "bfae9cf5", "metadata": { "scrolled": true }, @@ -457,7 +446,8 @@ "\n", " # Calculate the population included as priority communities per tract. Will either be 0 or the population.\n", " df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n", - " df[priority_communities_field] * df[field_names.TOTAL_POP_FIELD]\n", + " df[priority_communities_field]\n", + " * df[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010]\n", " )\n", "\n", " def calculate_state_comparison(\n", @@ -496,7 +486,9 @@ " summary_dict[\"Geography name\"] = division_id\n", "\n", " total_tracts_in_geography = len(frame)\n", - " total_population_in_geography = frame[field_names.TOTAL_POP_FIELD].sum()\n", + " total_population_in_geography = frame[\n", + " field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010\n", + " ].sum()\n", "\n", " if geography_field == field_names.URBAN_HEURISTIC_FIELD:\n", " urban_flag = frame[field_names.URBAN_HEURISTIC_FIELD].unique()[0]\n", @@ -719,7 +711,6 @@ { "cell_type": "code", "execution_count": null, - "id": "c4d0e783", "metadata": {}, "outputs": [], "source": [ @@ -825,7 +816,6 @@ { "cell_type": "code", "execution_count": null, - "id": "8790cd64", "metadata": { "scrolled": true }, @@ -1024,7 +1014,6 @@ { "cell_type": "code", "execution_count": null, - "id": "eeb9699d", "metadata": { "scrolled": true }, @@ -1201,7 +1190,6 @@ { "cell_type": "code", "execution_count": null, - "id": "983abcea", "metadata": {}, "outputs": [], "source": [ diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index caacaa6e..1c845c49 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -57,13 +57,13 @@ AMI_FIELD = "Area Median Income (State or metropolitan)" # Climate FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score" -EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = ( +EXPECTED_BUILDING_LOSS_RATE_FIELD = ( "Expected building loss rate (Natural Hazards Risk Index)" ) -EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = ( +EXPECTED_AGRICULTURE_LOSS_RATE_FIELD = ( "Expected agricultural loss rate (Natural Hazards Risk Index)" ) -EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = ( +EXPECTED_POPULATION_LOSS_RATE_FIELD = ( "Expected population loss rate (Natural Hazards Risk Index)" ) @@ -117,6 +117,34 @@ AGGREGATION_POPULATION_FIELD = "Population Characteristics" UNDER_5_FIELD = "Individuals under 5 years old" OVER_64_FIELD = "Individuals over 64 years old" +# Fields from 2010 decennial census (generally only loaded for the territories) +CENSUS_DECENNIAL_MEDIAN_INCOME_2009 = "Median household income in 2009 ($)" +CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = ( + "Median household income as a percent of territory median income in 2009" 
+) +CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = ( + "Percentage households below 100% of federal poverty line in 2009" +) +CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009" +CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009" +CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009" + +# Fields from 2010 ACS (loaded for comparison with the territories) +CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010" +CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = ( + "Percent of individuals < 100% Federal Poverty Line in 2010" +) + +# Combined fields that merge island areas and states data +COMBINED_CENSUS_TOTAL_POPULATION_2010 = ( + "Total population in 2009 (island areas) and 2019 (states and PR)" +) +COMBINED_UNEMPLOYMENT_2010 = "Unemployed civilians (percent) in 2009 (island areas) and 2010 (states and PR)" +COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = ( + "Percentage households below 100% of federal poverty line in 2009 (island areas) " + "and 2010 (states and PR)" +) + # Urban Rural Map URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag" @@ -124,39 +152,39 @@ URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag" MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units" # EJSCREEN Areas of Concern -EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, National, 70th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, National, 75th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, National, 80th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, National, 85th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, National, 90th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, National, 95th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 70th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 75th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 80th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 85th 
percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 90th percentile (communities)" ) -EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( +EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = ( "EJSCREEN Areas of Concern, State, 95th percentile (communities)" ) diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py index 6904e6e3..85c6b5d9 100644 --- a/data/data-pipeline/data_pipeline/score/score_l.py +++ b/data/data-pipeline/data_pipeline/score/score_l.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd from data_pipeline.score.score import Score @@ -12,8 +13,86 @@ class ScoreL(Score): self.LOW_INCOME_THRESHOLD: float = 0.65 self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90 + self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10 + super().__init__(df) + def _combine_island_areas_with_states_and_set_thresholds( + self, + df: pd.DataFrame, + column_from_island_areas: str, + column_from_decennial_census: str, + combined_column_name: str, + threshold_cutoff_for_island_areas: float, + ) -> (pd.DataFrame, str): + """Steps to set thresholds for island areas. + + This function is fairly logically complicated. It takes the following steps: + + 1. Combine the two different fields into a single field. + 2. Calculate the 90th percentile cutoff raw value for the combined field. + 3. Create a boolean series that is true for any census tract in the island + areas (and only the island areas) that exceeds this cutoff. + + For step one, it combines data that is either the island area's Decennial Census + value in 2009 or the state's value in 5-year ACS ending in 2010. + + This will be used to generate the percentile cutoff for the 90th percentile. + + The stateside decennial census stopped asking economic comparisons, + so this is as close to apples-to-apples as we get. We use 5-year ACS for data + robustness over 1-year ACS. + """ + # Create the combined field. + # There should only be one entry in either 2009 or 2019 fields, not one in both. + # But just to be safe, we take the mean and ignore null values so if there + # *were* entries in both, this result would make sense. + df[combined_column_name] = df[ + [column_from_island_areas, column_from_decennial_census] + ].mean(axis=1, skipna=True) + + logger.info( + f"Combined field `{combined_column_name}` has " + f"{df[combined_column_name].isnull().sum()} " + f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) " + f"missing values for census tracts. " + ) + + # Calculate the percentile threshold raw value. + raw_threshold = np.nanquantile( + a=df[combined_column_name], q=threshold_cutoff_for_island_areas + ) + + logger.info( + f"For combined field `{combined_column_name}`, " + f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a " + f"raw value of {raw_threshold:.3f}." 
+ ) + + threshold_column_name = ( + f"{column_from_island_areas} exceeds " + f"{threshold_cutoff_for_island_areas*100:.0f}th percentile" + ) + + df[threshold_column_name] = ( + df[column_from_island_areas] >= raw_threshold + ) + + percent_of_tracts_highlighted = ( + 100 + * df[threshold_column_name].sum() + / df[column_from_island_areas].notnull().sum() + ) + + logger.info( + f"For `{threshold_column_name}`, " + f"{df[threshold_column_name].sum()} (" + f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data " + f"in the column) have a value of TRUE." + ) + + return df, threshold_column_name + def add_columns(self) -> pd.DataFrame: logger.info("Adding Score L") @@ -67,21 +146,21 @@ class ScoreL(Score): climate_criteria = ( ( self.df[ - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX ] >= self.ENVIRONMENTAL_BURDEN_THRESHOLD ) | ( self.df[ - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX ] >= self.ENVIRONMENTAL_BURDEN_THRESHOLD ) | ( self.df[ - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME + field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX ] >= self.ENVIRONMENTAL_BURDEN_THRESHOLD @@ -204,14 +283,24 @@ class ScoreL(Score): # poverty level. Source: Census's American Community Survey] pollution_criteria = ( - self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) | ( - self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) | ( - self.df[field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD + ( + self.df[ + field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX + ] + >= self.ENVIRONMENTAL_BURDEN_THRESHOLD + ) + | ( + self.df[ + field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX + ] + >= self.ENVIRONMENTAL_BURDEN_THRESHOLD + ) + | ( + self.df[ + field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX + ] + >= self.ENVIRONMENTAL_BURDEN_THRESHOLD + ) ) return pollution_criteria & ( @@ -306,7 +395,7 @@ class ScoreL(Score): # AND # Where the high school degree achievement rates for adults 25 years and older is less than 95% # (necessary to screen out university block groups) - workforce_criteria = ( + workforce_criteria_for_states = ( ( self.df[ field_names.UNEMPLOYMENT_FIELD @@ -338,6 +427,76 @@ class ScoreL(Score): >= self.ENVIRONMENTAL_BURDEN_THRESHOLD ) ) + workforce_combined_criteria_for_states = ( + self.df[field_names.HIGH_SCHOOL_ED_FIELD] + >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD + ) & workforce_criteria_for_states + + # Now, calculate workforce criteria for island territories. + + # F a couple of values, create a combined field and criteria field. + # First, combine unemployment. + ( + self.df, + unemployment_island_areas_criteria_field_name, + ) = self._combine_island_areas_with_states_and_set_thresholds( + df=self.df, + column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009, + column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010, + combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010, + threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + + # Next, combine poverty. 
+ ( + self.df, + poverty_island_areas_criteria_field_name, + ) = self._combine_island_areas_with_states_and_set_thresholds( + df=self.df, + column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009, + column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010, + combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010, + threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD, + ) + + workforce_combined_criteria_for_island_areas = ( + self.df[unemployment_island_areas_criteria_field_name] + | self.df[poverty_island_areas_criteria_field_name] + # Also check whether area median income is 10th percentile or lower + # within the islands. + | ( + self.df[ + field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 + + field_names.PERCENTILE_FIELD_SUFFIX + ] + # Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it. + # and then look for median income lower than that (not greater than). + < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD + ) + ) & ( + self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009] + > self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD + ) + + percent_of_island_tracts_highlighted = ( + 100 + * workforce_combined_criteria_for_island_areas.sum() + # Choosing a random column from island areas to calculate the denominator. + / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009] + .notnull() + .sum() + ) + + logger.info( + f"For workforce criteria in island areas, " + f"{workforce_combined_criteria_for_island_areas.sum()} (" + f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data " + f"in the column) have a value of TRUE." + ) + + # A tract is included if it meets either the states tract criteria or the + # island areas tract criteria. return ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] >= 0.10 - ) & workforce_criteria + workforce_combined_criteria_for_states + | workforce_combined_criteria_for_island_areas + ) diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml index 8d805216..18a45ede 100644 --- a/data/data-pipeline/pyproject.toml +++ b/data/data-pipeline/pyproject.toml @@ -67,6 +67,9 @@ disable = [ "C0115", # Disables missing class docstring "R0915", # Disables too many statements (score generation transform) "W0231", # Disables super init not called + "R0801", # Disables duplicate code. There are a couple places we have similar code and + # unfortunately you can't disable this rule for individual lines or files, it's a + # known bug. https://github.com/PyCQA/pylint/issues/214# ] [tool.pylint.FORMAT]
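
For reference, a self-contained sketch (with toy numbers, not data from the PR) of the combine-and-threshold logic that `_combine_island_areas_with_states_and_set_thresholds` implements:

```python
import numpy as np
import pandas as pd

# Toy frame: island-area tracts carry 2009 decennial values, stateside tracts
# carry 2010 ACS values, and each tract has exactly one of the two.
df = pd.DataFrame(
    {
        "islands_2009": [0.45, 0.25, None, None],
        "states_2010": [None, None, 0.10, 0.40],
    }
)

# Step 1: combine into one series; mean with skipna=True picks whichever
# column is populated, and is NaN only if both are null.
combined = df[["islands_2009", "states_2010"]].mean(axis=1, skipna=True)

# Step 2: raw value at the 90th percentile of the combined distribution.
raw_threshold = np.nanquantile(a=combined, q=0.90)  # 0.435 for these numbers

# Step 3: boolean flag that can only be True for island-area tracts, since
# NaN comparisons evaluate to False for stateside rows.
flag = df["islands_2009"] >= raw_threshold  # [True, False, False, False]
```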