Merge branch 'usds:main' into main

Saran Ahluwalia 2021-12-03 15:51:43 -05:00 committed by GitHub
commit 12456c8dc5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 885 additions and 161 deletions


@@ -32,10 +32,15 @@ class ExtractTransformLoad:
     FILES_PATH: Path = settings.APP_ROOT / "files"
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
-    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
+    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
+    # be from CBGs at different time periods.
     EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
-    # TODO: investigate. Census says there are only 73,057 tracts in the US. This might be from tracts at different time periods.
-    EXPECTED_MAX_CENSUS_TRACTS: int = 74027
+    # TODO: investigate. Census says there are only 74,134 tracts in the US,
+    # Puerto Rico, and island areas. This might be from tracts at different time
+    # periods. https://github.com/usds/justice40-tool/issues/964
+    EXPECTED_MAX_CENSUS_TRACTS: int = 74160

     def __init__(self, config_path: Path) -> None:
         """Inits the class with instance specific variables"""


@@ -4,6 +4,11 @@ DATASET_LIST = [
         "module_dir": "census_acs",
         "class_name": "CensusACSETL",
     },
+    {
+        "name": "census_acs_2010",
+        "module_dir": "census_acs_2010",
+        "class_name": "CensusACS2010ETL",
+    },
     {
         "name": "ejscreen",
         "module_dir": "ejscreen",
@@ -14,16 +19,6 @@ DATASET_LIST = [
         "module_dir": "hud_housing",
         "class_name": "HudHousingETL",
     },
-    {
-        "name": "calenviroscreen",
-        "module_dir": "calenviroscreen",
-        "class_name": "CalEnviroScreenETL",
-    },
-    {
-        "name": "hud_recap",
-        "module_dir": "hud_recap",
-        "class_name": "HudRecapETL",
-    },
     {
         "name": "cdc_places",
         "module_dir": "cdc_places",
@@ -74,6 +69,16 @@ DATASET_LIST = [
         "module_dir": "housing_and_transportation",
         "class_name": "HousingTransportationETL",
     },
+    {
+        "name": "calenviroscreen",
+        "module_dir": "calenviroscreen",
+        "class_name": "CalEnviroScreenETL",
+    },
+    {
+        "name": "hud_recap",
+        "module_dir": "hud_recap",
+        "class_name": "HudRecapETL",
+    },
     {
         "name": "tree_equity_score",
         "module_dir": "tree_equity_score",


@@ -27,6 +27,8 @@ class ScoreETL(ExtractTransformLoad):
         self.national_risk_index_df: pd.DataFrame
         self.geocorr_urban_rural_df: pd.DataFrame
         self.persistent_poverty_df: pd.DataFrame
+        self.census_decennial_df: pd.DataFrame
+        self.census_2010_df: pd.DataFrame

     def extract(self) -> None:
         logger.info("Loading data sets from disk.")
@@ -137,6 +139,29 @@ class ScoreETL(ExtractTransformLoad):
             low_memory=False,
         )

+        # Load decennial census data
+        census_decennial_csv = (
+            constants.DATA_PATH
+            / "dataset"
+            / "census_decennial_2010"
+            / "usa.csv"
+        )
+        self.census_decennial_df = pd.read_csv(
+            census_decennial_csv,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            low_memory=False,
+        )
+
+        # Load 2010 ACS data from states
+        census_2010_csv = (
+            constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv"
+        )
+        self.census_2010_df = pd.read_csv(
+            census_2010_csv,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            low_memory=False,
+        )
+
     def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
         logger.info("Joining Census Tract dataframes")
@@ -228,6 +253,8 @@ class ScoreETL(ExtractTransformLoad):
             self.persistent_poverty_df,
             self.national_risk_index_df,
             self.census_acs_median_incomes_df,
+            self.census_decennial_df,
+            self.census_2010_df,
         ]

         # Sanity check each data frame before merging.
@@ -296,9 +323,16 @@ class ScoreETL(ExtractTransformLoad):
             field_names.HIGH_SCHOOL_ED_FIELD,
             field_names.UNEMPLOYMENT_FIELD,
             field_names.MEDIAN_HOUSE_VALUE_FIELD,
-            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
-            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
-            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
+            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
+            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
+            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
+            field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
+            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
+            field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
+            field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
+            field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+            field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
+            field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
         ]

         non_numeric_columns = [
@@ -315,9 +349,9 @@ class ScoreETL(ExtractTransformLoad):
         # Convert all columns to numeric and do math
         for col in numeric_columns:
             # Calculate percentiles
-            df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[col].rank(
-                pct=True
-            )
+            df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
+                col
+            ].rank(pct=True)

             # Min-max normalization:
             # (
@@ -341,6 +375,20 @@ class ScoreETL(ExtractTransformLoad):
                 df_copy[col] - min_value
             ) / (max_value - min_value)

+        # Special logic: create a combined population field.
+        # We sometimes run analytics on "population", and this makes a single field
+        # that is either the island area's population in 2009 or the state's
+        # population in 2019.
+        # There should only be one entry in either 2009 or 2019, not one in both.
+        # But just to be safe, we take the mean and ignore null values so if there
+        # *were* entries in both fields, this result would make sense.
+        df_copy[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010] = df_copy[
+            [
+                field_names.TOTAL_POP_FIELD,
+                field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
+            ]
+        ].mean(axis=1, skipna=True)
+
         return df_copy

     def transform(self) -> None:
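A small self-contained illustration (made-up numbers, shorthand column names) of what the row-wise `.mean(axis=1, skipna=True)` above does: for each tract it picks whichever population field is populated, and only averages in the unexpected both-populated case:

    import pandas as pd

    df = pd.DataFrame(
        {
            "pop_2019_states": [1000.0, None, 800.0],
            "pop_2009_islands": [None, 250.0, 600.0],
        }
    )
    combined = df[["pop_2019_states", "pop_2009_islands"]].mean(
        axis=1, skipna=True
    )
    # Row-wise result: [1000.0, 250.0, 700.0] -- nulls are ignored.
    print(combined.tolist())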


@@ -1,8 +1,7 @@
 import pandas as pd
-import censusdata

 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
+from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -14,7 +13,15 @@ class CensusACSETL(ExtractTransformLoad):
         self.OUTPUT_PATH = (
             self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
         )
+        self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
+        self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
+        self.EMPLOYMENT_FIELDS = [
+            self.TOTAL_UNEMPLOYED_FIELD,
+            self.TOTAL_IN_LABOR_FORCE,
+        ]
+
         self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
         self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
         self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
             "Linguistic isolation (total)"
@@ -55,59 +62,89 @@ class CensusACSETL(ExtractTransformLoad):
             "Median value ($) of owner-occupied housing units"
         )

+        # Educational attainment figures
+        self.EDUCATION_POPULATION_OVER_25 = "B15003_001E"  # Estimate!!Total
+        self.EDUCATION_NO_SCHOOLING = (
+            "B15003_002E"  # Estimate!!Total!!No schooling completed
+        )
+        self.EDUCATION_NURSERY = (
+            "B15003_003E"  # Estimate!!Total!!Nursery school
+        )
+        self.EDUCATION_KINDERGARTEN = (
+            "B15003_004E"  # Estimate!!Total!!Kindergarten
+        )
+        self.EDUCATION_FIRST = "B15003_005E"  # Estimate!!Total!!1st grade
+        self.EDUCATION_SECOND = "B15003_006E"  # Estimate!!Total!!2nd grade
+        self.EDUCATION_THIRD = "B15003_007E"  # Estimate!!Total!!3rd grade
+        self.EDUCATION_FOURTH = "B15003_008E"  # Estimate!!Total!!4th grade
+        self.EDUCATION_FIFTH = "B15003_009E"  # Estimate!!Total!!5th grade
+        self.EDUCATION_SIXTH = "B15003_010E"  # Estimate!!Total!!6th grade
+        self.EDUCATION_SEVENTH = "B15003_011E"  # Estimate!!Total!!7th grade
+        self.EDUCATION_EIGHTH = "B15003_012E"  # Estimate!!Total!!8th grade
+        self.EDUCATION_NINTH = "B15003_013E"  # Estimate!!Total!!9th grade
+        self.EDUCATION_TENTH = "B15003_014E"  # Estimate!!Total!!10th grade
+        self.EDUCATION_ELEVENTH = "B15003_015E"  # Estimate!!Total!!11th grade
+        self.EDUCATION_TWELFTH_NO_DIPLOMA = (
+            "B15003_016E"  # Estimate!!Total!!12th grade, no diploma
+        )
+
+        self.EDUCATIONAL_FIELDS = [
+            self.EDUCATION_POPULATION_OVER_25,
+            self.EDUCATION_NO_SCHOOLING,
+            self.EDUCATION_NURSERY,
+            self.EDUCATION_KINDERGARTEN,
+            self.EDUCATION_FIRST,
+            self.EDUCATION_SECOND,
+            self.EDUCATION_THIRD,
+            self.EDUCATION_FOURTH,
+            self.EDUCATION_FIFTH,
+            self.EDUCATION_SIXTH,
+            self.EDUCATION_SEVENTH,
+            self.EDUCATION_EIGHTH,
+            self.EDUCATION_NINTH,
+            self.EDUCATION_TENTH,
+            self.EDUCATION_ELEVENTH,
+            self.EDUCATION_TWELFTH_NO_DIPLOMA,
+        ]
+
+        self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD = (
+            "Individuals age 25 or over with less than high school degree"
+        )
+        self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree"
+
         self.STATE_GEOID_FIELD_NAME = "GEOID2"

         self.df: pd.DataFrame

-    def _fips_from_censusdata_censusgeo(
-        self, censusgeo: censusdata.censusgeo
-    ) -> str:
-        """Create a FIPS code from the proprietary censusgeo index."""
-        fips = "".join([value for (key, value) in censusgeo.params()])
-        return fips
-
     def extract(self) -> None:
-        dfs = []
-        for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.info(
-                f"Downloading data for state/territory with FIPS code {fips}"
-            )
-
-            try:
-                response = censusdata.download(
-                    src="acs5",
-                    year=self.ACS_YEAR,
-                    geo=censusdata.censusgeo(
-                        [("state", fips), ("county", "*"), ("tract", "*")]
-                    ),
-                    var=[
-                        # Emploment fields
-                        "B23025_005E",
-                        "B23025_003E",
-                        # Income field
-                        self.MEDIAN_INCOME_FIELD,
-                        # House value
-                        self.MEDIAN_HOUSE_VALUE_FIELD,
-                    ]
-                    + self.LINGUISTIC_ISOLATION_FIELDS
-                    + self.POVERTY_FIELDS,
-                )
-                dfs.append(response)
-            except ValueError:
-                logger.error(
-                    f"Could not download data for state/territory with FIPS code {fips}"
-                )
-
-        self.df = pd.concat(dfs)
-
-        self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply(
-            func=self._fips_from_censusdata_censusgeo
-        )
+        # Define the variables to retrieve
+        variables = (
+            [
+                # Income field
+                self.MEDIAN_INCOME_FIELD,
+                # House value
+                self.MEDIAN_HOUSE_VALUE_FIELD,
+            ]
+            + self.EMPLOYMENT_FIELDS
+            + self.LINGUISTIC_ISOLATION_FIELDS
+            + self.POVERTY_FIELDS
+            + self.EDUCATIONAL_FIELDS
+        )
+
+        self.df = retrieve_census_acs_data(
+            acs_year=self.ACS_YEAR,
+            variables=variables,
+            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
+            data_path_for_fips_codes=self.DATA_PATH,
+        )

     def transform(self) -> None:
         logger.info("Starting Census ACS Transform")

+        df = self.df
+
         # Rename two fields.
-        self.df = self.df.rename(
+        df = df.rename(
             columns={
                 self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
                 self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
@@ -119,19 +156,17 @@ class CensusACSETL(ExtractTransformLoad):
             self.MEDIAN_INCOME_FIELD_NAME,
             self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
         ]:
-            missing_value_count = sum(self.df[field] == -666666666)
+            missing_value_count = sum(df[field] == -666666666)
             logger.info(
-                f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
+                f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
                 + f"`{field}` being marked as null values."
             )
-            self.df[field] = self.df[field].replace(
-                to_replace=-666666666, value=None
-            )
+            df[field] = df[field].replace(to_replace=-666666666, value=None)

         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
-        self.df[self.UNEMPLOYED_FIELD_NAME] = (
-            self.df.B23025_005E / self.df.B23025_003E
-        )
+        df[self.UNEMPLOYED_FIELD_NAME] = (
+            df[self.TOTAL_UNEMPLOYED_FIELD] / df[self.TOTAL_IN_LABOR_FORCE]
+        )

         # Calculate linguistic isolation.
@@ -142,34 +177,64 @@ class CensusACSETL(ExtractTransformLoad):
             "C16002_013E",
         ]

-        self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[
+        df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[
             individual_limited_english_fields
         ].sum(axis=1, skipna=True)

-        self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
-            self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
-            / self.df["C16002_001E"]
+        df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
+            df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
+            / df["C16002_001E"]
         )

         # Calculate percent at different poverty thresholds
-        self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
-            self.df["C17002_002E"] + self.df["C17002_003E"]
-        ) / self.df["C17002_001E"]
+        df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
+            df["C17002_002E"] + df["C17002_003E"]
+        ) / df["C17002_001E"]

-        self.df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
-            self.df["C17002_002E"]
-            + self.df["C17002_003E"]
-            + self.df["C17002_004E"]
-            + self.df["C17002_005E"]
-        ) / self.df["C17002_001E"]
+        df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
+            df["C17002_002E"]
+            + df["C17002_003E"]
+            + df["C17002_004E"]
+            + df["C17002_005E"]
+        ) / df["C17002_001E"]

-        self.df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
-            self.df["C17002_002E"]
-            + self.df["C17002_003E"]
-            + self.df["C17002_004E"]
-            + self.df["C17002_005E"]
-            + self.df["C17002_006E"]
-            + self.df["C17002_007E"]
-        ) / self.df["C17002_001E"]
+        df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
+            df["C17002_002E"]
+            + df["C17002_003E"]
+            + df["C17002_004E"]
+            + df["C17002_005E"]
+            + df["C17002_006E"]
+            + df["C17002_007E"]
+        ) / df["C17002_001E"]

+        # Calculate educational attainment
+        educational_numerator_fields = [
+            self.EDUCATION_NO_SCHOOLING,
+            self.EDUCATION_NURSERY,
+            self.EDUCATION_KINDERGARTEN,
+            self.EDUCATION_FIRST,
+            self.EDUCATION_SECOND,
+            self.EDUCATION_THIRD,
+            self.EDUCATION_FOURTH,
+            self.EDUCATION_FIFTH,
+            self.EDUCATION_SIXTH,
+            self.EDUCATION_SEVENTH,
+            self.EDUCATION_EIGHTH,
+            self.EDUCATION_NINTH,
+            self.EDUCATION_TENTH,
+            self.EDUCATION_ELEVENTH,
+            self.EDUCATION_TWELFTH_NO_DIPLOMA,
+        ]
+
+        df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] = df[
+            educational_numerator_fields
+        ].sum(axis=1)
+
+        df[self.HIGH_SCHOOL_ED_FIELD] = (
+            df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD]
+            / df[self.EDUCATION_POPULATION_OVER_25]
+        )
+
+        # Save results to self.
+        self.df = df

     def load(self) -> None:
         logger.info("Saving Census ACS Data")
@@ -186,6 +251,7 @@ class CensusACSETL(ExtractTransformLoad):
             self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
             self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
             self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
+            self.HIGH_SCHOOL_ED_FIELD,
         ]

         self.df[columns_to_include].to_csv(


@@ -0,0 +1,61 @@
from pathlib import Path
from typing import List

import censusdata
import pandas as pd

from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)


def _fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:
    """Create a FIPS code from the proprietary censusgeo index."""
    fips = "".join([value for (key, value) in censusgeo.params()])
    return fips


# pylint: disable=too-many-arguments
def retrieve_census_acs_data(
    acs_year: int,
    variables: List[str],
    tract_output_field_name: str,
    data_path_for_fips_codes: Path,
    acs_type="acs5",
    raise_errors: bool = False,
) -> pd.DataFrame:
    """Retrieves and combines census ACS data for a given year."""
    dfs = []
    for fips in get_state_fips_codes(data_path_for_fips_codes):
        logger.info(
            f"Downloading data for state/territory with FIPS code {fips}"
        )

        try:
            response = censusdata.download(
                src=acs_type,
                year=acs_year,
                geo=censusdata.censusgeo(
                    [("state", fips), ("county", "*"), ("tract", "*")]
                ),
                var=variables,
            )
            dfs.append(response)
        except ValueError as e:
            logger.error(
                f"Could not download data for state/territory with FIPS code {fips}"
            )
            if raise_errors:
                raise e

    df = pd.concat(dfs)

    df[tract_output_field_name] = df.index.to_series().apply(
        func=_fips_from_censusdata_censusgeo
    )

    return df
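A usage sketch for this new helper, matching how the 2010 class below calls it (the variables and path here are illustrative examples, not pipeline defaults):

    from pathlib import Path

    from data_pipeline.etl.sources.census_acs.etl_utils import (
        retrieve_census_acs_data,
    )

    # Example path; in the pipeline this is ExtractTransformLoad.DATA_PATH.
    data_path = Path("data")

    # Returns one dataframe covering all states/territories, with the tract
    # GEOID derived from the proprietary censusdata index.
    df = retrieve_census_acs_data(
        acs_year=2010,
        variables=["C17002_001E", "C17002_002E"],  # poverty table examples
        tract_output_field_name="GEOID10_TRACT",
        data_path_for_fips_codes=data_path,
    )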


@@ -0,0 +1,186 @@
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)


class CensusACS2010ETL(ExtractTransformLoad):
    """Extract ACS data from 2010 or approximately that year.

    Note: Census ACS 2010 uses different fields than those captured in CensusACSETL.
    To support this, we created a separate class.
    """

    def __init__(self):
        self.ACS_YEAR = 2010
        self.ACS_TYPE = "acs5"
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
        )

        # Employment fields
        self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED = (
            "B23006_007E"
            # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED = (
            "B23006_014E"
            # Estimate!!Total!!High school graduate!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED = (
            "B23006_021E"
            # Estimate!!Total!!Some college or associate's degree!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_COLLEGE_UNEMPLOYED = (
            "B23006_028E"
            # Estimate!!Total!!Bachelor's degree or higher!!In labor force!!Civilian!!Unemployed
        )

        self.UNEMPLOYED_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED,
            self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED,
            self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED,
            self.EMPLOYMENT_COLLEGE_UNEMPLOYED,
        ]

        self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE = (
            # TODO: FIX!!!!!!
            "B23006_005E"
            # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian
        )
        self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE = (
            "B23006_010E"
            # Estimate!!Total!!High school graduate!!In labor force
        )
        self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE = (
            "B23006_017E"
            # Estimate!!Total!!Some college or associate's degree!!In labor force
        )
        self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE = (
            "B23006_024E"
            # Estimate!!Total!!Bachelor's degree or higher!!In labor force
        )

        self.IN_LABOR_FORCE_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE,
            self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE,
            self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE,
            self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
        ]

        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"

        self.POVERTY_FIELDS = [
            "C17002_001E",  # Estimate!!Total
            "C17002_002E",  # Estimate!!Total!!Under .50
            "C17002_003E",  # Estimate!!Total!!.50 to .99
            "C17002_004E",  # Estimate!!Total!!1.00 to 1.24
            "C17002_005E",  # Estimate!!Total!!1.25 to 1.49
            "C17002_006E",  # Estimate!!Total!!1.50 to 1.84
            "C17002_007E",  # Estimate!!Total!!1.85 to 1.99
        ]

        self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 100% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 150% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 200% Federal Poverty Line"
        )

        self.STATE_GEOID_FIELD_NAME = "GEOID2"

        self.df: pd.DataFrame

    def extract(self) -> None:
        # Define the variables to retrieve
        variables = (
            self.UNEMPLOYED_FIELDS
            + self.IN_LABOR_FORCE_FIELDS
            + self.POVERTY_FIELDS
        )

        # Use the method defined on CensusACSETL to reduce coding redundancy.
        self.df = retrieve_census_acs_data(
            acs_year=self.ACS_YEAR,
            variables=variables,
            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
            data_path_for_fips_codes=self.DATA_PATH,
            acs_type=self.ACS_TYPE,
            raise_errors=False,
        )

    def transform(self) -> None:
        logger.info("Starting Census ACS Transform")

        df = self.df

        # Calculate percent unemployment.
        # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
        unemployed_totals = df[self.UNEMPLOYED_FIELDS].sum(axis=1)
        labor_force_totals = df[self.IN_LABOR_FORCE_FIELDS].sum(axis=1)

        df[self.UNEMPLOYED_FIELD_NAME] = unemployed_totals / labor_force_totals

        # Calculate percent at different poverty thresholds
        df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"] + df["C17002_003E"]
        ) / df["C17002_001E"]

        df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"]
            + df["C17002_003E"]
            + df["C17002_004E"]
            + df["C17002_005E"]
        ) / df["C17002_001E"]

        df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"]
            + df["C17002_003E"]
            + df["C17002_004E"]
            + df["C17002_005E"]
            + df["C17002_006E"]
            + df["C17002_007E"]
        ) / df["C17002_001E"]

        # Save results to self.
        self.df = df

    def load(self) -> None:
        logger.info("Saving Census ACS Data")

        # mkdir census
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        columns_to_include = [
            self.GEOID_TRACT_FIELD_NAME,
            self.UNEMPLOYED_FIELD_NAME,
            self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
        ]

        output_df = self.df[columns_to_include]

        # Add the year to the end of every column, so when it's all joined in the
        # score df, it's obvious which year this data is from.
        for column in columns_to_include:
            if column != self.GEOID_TRACT_FIELD_NAME:
                output_df = output_df.rename(
                    columns={
                        column: f"{column} in {self.ACS_YEAR}",
                    }
                )

        output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating Census ACS Data")

        pass
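To make the load() renaming concrete: with ACS_YEAR = 2010, a column such as "Unemployed civilians (percent)" is written out as "Unemployed civilians (percent) in 2010", so these columns cannot collide with the current-year ACS columns when the score ETL joins everything. A tiny illustration of the same rename loop (made-up data):

    import pandas as pd

    acs_year = 2010
    output_df = pd.DataFrame(
        {
            "GEOID10_TRACT": ["01001020100"],
            "Unemployed civilians (percent)": [0.05],
        }
    )
    for column in output_df.columns:
        if column != "GEOID10_TRACT":
            output_df = output_df.rename(
                columns={column: f"{column} in {acs_year}"}
            )
    # Columns: ['GEOID10_TRACT', 'Unemployed civilians (percent) in 2010']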


@@ -27,12 +27,21 @@ class CensusDecennialETL(ExtractTransformLoad):
         # https://api.census.gov/data/2010/dec/gu/variables.html
         # https://api.census.gov/data/2010/dec/mp/variables.html
         # https://api.census.gov/data/2010/dec/vi/variables.html
+        # Total population field is the same in all island areas
+        self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001"
+        self.TOTAL_POP_FIELD_NAME = "Total population in 2009"
+
         self.MEDIAN_INCOME_FIELD = "PBG049001"
         self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
-        self.MEDIAN_INCOME_FIELD_NAME = (
-            "MEDIAN HOUSEHOLD INCOME IN 2009 (DOLLARS)"
-        )
+        self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)"
+        self.AREA_MEDIAN_INCOME_FIELD_NAME = (
+            "Median household income as a percent of "
+            "territory median income in 2009"
+        )
+        self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income"
+
         self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
         self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
             "PBG077001"
@@ -48,7 +57,39 @@ class CensusDecennialETL(ExtractTransformLoad):
         )

         self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
-            "PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL"
+            "Percentage households below 200% of federal poverty line in 2009"
+        )
+
+        # We will combine three fields to get households < 100% FPL.
+        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = (
+            "PBG083002"  # Total!!Under .50
+        )
+        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = (
+            "PBG083003"  # Total!!.50 to .74
+        )
+        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = (
+            "PBG083004"  # Total!!.75 to .99
+        )
+
+        # Same fields, for Virgin Islands.
+        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = (
+            "PBG077002"  # Total!!Under .50
+        )
+        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = (
+            "PBG077003"  # Total!!.50 to .74
+        )
+        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = (
+            "PBG077004"  # Total!!.75 to .99
+        )
+
+        self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
+        self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
+        self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
+            "Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
+        )
+
+        self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = (
+            "Percentage households below 100% of federal poverty line in 2009"
         )

         # High School Education Fields
@@ -70,9 +111,37 @@ class CensusDecennialETL(ExtractTransformLoad):
             "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
         )

-        self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
-            "PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
-        )
+        self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009"
+
+        # Employment fields
+        self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = (
+            "PBG038003"  # Total!!Male!!In labor force
+        )
+        self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = (
+            "PBG038007"  # Total!!Male!!In labor force!!Civilian!!Unemployed
+        )
+        self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = (
+            "PBG038010"  # Total!!Female!!In labor force
+        )
+        self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = (
+            "PBG038014"  # Total!!Female!!In labor force!!Civilian!!Unemployed
+        )
+
+        # Same fields, Virgin Islands.
+        self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = (
+            "PBG036003"  # Total!!Male!!In labor force
+        )
+        self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = (
+            "PBG036007"  # Total!!Male!!In labor force!!Civilian!!Unemployed
+        )
+        self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = (
+            "PBG036010"  # Total!!Female!!In labor force
+        )
+        self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = (
+            "PBG036014"  # Total!!Female!!In labor force!!Civilian!!Unemployed
+        )
+
+        self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"

         var_list = [
             self.MEDIAN_INCOME_FIELD,
@@ -81,6 +150,14 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.TOTAL_POPULATION_FIELD,
             self.MALE_HIGH_SCHOOL_ED_FIELD,
             self.FEMALE_HIGH_SCHOOL_ED_FIELD,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
+            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
+            self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
+            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
+            self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
+            self.TOTAL_POP_FIELD,
         ]
         var_list = ",".join(var_list)
@@ -91,6 +168,14 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.TOTAL_POPULATION_VI_FIELD,
             self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
             self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE,
+            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD,
+            self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD,
+            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
+            self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
+            self.TOTAL_POP_VI_FIELD,
         ]
         var_list_vi = ",".join(var_list_vi)
@@ -107,6 +192,20 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
             self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
             self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
+            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
+            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
+            self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
+            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
+            self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
+            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
+            self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
+            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
+            self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
         }

         # To do: Ask Census Slack Group about whether you need to hardcode the county fips
@@ -117,24 +216,30 @@ class CensusDecennialETL(ExtractTransformLoad):
                 "fips": "60",
                 "county_fips": ["010", "020", "030", "040", "050"],
                 "var_list": var_list,
+                # Note: we hardcode the median income for each territory in this dict,
+                # because that data is hard to programmatically access.
+                self.TERRITORY_MEDIAN_INCOME_FIELD: 23892,
             },
             {
                 "state_abbreviation": "gu",
                 "fips": "66",
                 "county_fips": ["010"],
                 "var_list": var_list,
+                self.TERRITORY_MEDIAN_INCOME_FIELD: 48274,
             },
             {
                 "state_abbreviation": "mp",
                 "fips": "69",
                 "county_fips": ["085", "100", "110", "120"],
                 "var_list": var_list,
+                self.TERRITORY_MEDIAN_INCOME_FIELD: 19958,
             },
             {
                 "state_abbreviation": "vi",
                 "fips": "78",
                 "county_fips": ["010", "020", "030"],
                 "var_list": var_list_vi,
+                self.TERRITORY_MEDIAN_INCOME_FIELD: 37254,
             },
         ]
@@ -198,6 +303,11 @@ class CensusDecennialETL(ExtractTransformLoad):
         # Combine the dfs after renaming
         self.df_all = pd.concat([self.df, self.df_vi])

+        # Rename total population:
+        self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[
+            self.TOTAL_POP_FIELD
+        ]
+
         # Percentage of households below 200% which is
         # [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
         self.df_all[
@@ -211,6 +321,25 @@ class CensusDecennialETL(ExtractTransformLoad):
             self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
         ]

+        # Percentage of households below 100% FPL
+        # which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, `Total!!.75 to .99`,
+        # and then dividing by PBG083001 (total)
+        self.df_all[
+            self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME
+        ] = (
+            self.df_all[
+                self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE
+            ]
+            + self.df_all[
+                self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO
+            ]
+            + self.df_all[
+                self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE
+            ]
+        ) / self.df_all[
+            self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
+        ]
+
         # Percentage High School Achievement is
         # Percentage = (Male + Female) / (Total)
         self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
@@ -218,6 +347,28 @@ class CensusDecennialETL(ExtractTransformLoad):
             + self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
         ) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]

+        # Calculate employment.
+        self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = (
+            self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD]
+            + self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD]
+        ) / (
+            self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD]
+            + self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD]
+        )
+
+        # Calculate area median income
+        median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES)
+        median_income_df = median_income_df[
+            ["fips", self.TERRITORY_MEDIAN_INCOME_FIELD]
+        ]
+        self.df_all = self.df_all.merge(
+            right=median_income_df, left_on="state", right_on="fips", how="left"
+        )
+        self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = (
+            self.df_all[self.MEDIAN_INCOME_FIELD_NAME]
+            / self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD]
+        )
+
         # Creating Geo ID (Census Block Group) Field Name
         self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
             self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
@@ -238,9 +389,14 @@ class CensusDecennialETL(ExtractTransformLoad):
         columns_to_include = [
             self.GEOID_TRACT_FIELD_NAME,
+            self.TOTAL_POP_FIELD_NAME,
             self.MEDIAN_INCOME_FIELD_NAME,
+            self.TERRITORY_MEDIAN_INCOME_FIELD,
+            self.AREA_MEDIAN_INCOME_FIELD_NAME,
+            self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME,
             self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
             self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
+            self.UNEMPLOYMENT_FIELD_NAME,
         ]

         self.df_all[columns_to_include].to_csv(
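A worked example of the ratios above, with made-up counts for one tract: if PBG083001 (total households) = 200, PBG083010 (2.00 and over) = 120, and the three under-1.00 bins sum to 30, then the share below 200% FPL is (200 - 120) / 200 = 0.40 and the share below 100% FPL is 30 / 200 = 0.15. Likewise, a tract median income of $20,000 in American Samoa (territory median $23,892, hardcoded above) yields an area-median-income percent of 20000 / 23892 ≈ 0.837.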


@@ -12,15 +12,15 @@ class DOEEnergyBurden(ExtractTransformLoad):
     def __init__(self):
         self.DOE_FILE_URL = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/DOE_LEAD_with_EJSCREEN.csv.zip"
+            + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
         )

         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "doe_energy_burden"
         )

-        self.TRACT_INPUT_COLUMN_NAME = "GEOID"
-        self.ENERGY_BURDEN_FIELD_NAME = "Energy burden"
+        self.TRACT_INPUT_COLUMN_NAME = "FIP"
+        self.ENERGY_BURDEN_FIELD_NAME = "BURDEN"

         # Constants for output
         self.COLUMNS_TO_KEEP = [
@@ -61,11 +61,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
             }
         )

-        # Convert energy burden to a fraction, since we represent all other percentages as fractions.
-        output_df[self.ENERGY_BURDEN_FIELD_NAME] = (
-            output_df[self.ENERGY_BURDEN_FIELD_NAME] / 100
-        )
-
         # Left-pad the tracts with 0s
         expected_length_of_census_tract_field = 11
         output_df[self.GEOID_TRACT_FIELD_NAME] = (


@@ -14,6 +14,27 @@ class EJSCREENETL(ExtractTransformLoad):
         self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
         self.df: pd.DataFrame

+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.TOTAL_POP_FIELD,
+            # pylint: disable=duplicate-code
+            field_names.AIR_TOXICS_CANCER_RISK_FIELD,
+            field_names.RESPITORY_HAZARD_FIELD,
+            field_names.DIESEL_FIELD,
+            field_names.PM25_FIELD,
+            field_names.OZONE_FIELD,
+            field_names.TRAFFIC_FIELD,
+            field_names.RMP_FIELD,
+            field_names.TSDF_FIELD,
+            field_names.NPL_FIELD,
+            field_names.WASTEWATER_FIELD,
+            field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
+            field_names.POVERTY_FIELD,
+            field_names.OVER_64_FIELD,
+            field_names.UNDER_5_FIELD,
+            field_names.LEAD_PAINT_FIELD,
+        ]
+
     def extract(self) -> None:
         logger.info("Downloading EJScreen Data")
         super().extract(
@@ -51,7 +72,6 @@ class EJSCREENETL(ExtractTransformLoad):
                 "PWDIS": field_names.WASTEWATER_FIELD,
                 "LINGISOPCT": field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
                 "LOWINCPCT": field_names.POVERTY_FIELD,
-                "LESSHSPCT": field_names.HIGH_SCHOOL_ED_FIELD,
                 "OVER64PCT": field_names.OVER_64_FIELD,
                 "UNDER5PCT": field_names.UNDER_5_FIELD,
                 "PRE1960PCT": field_names.LEAD_PAINT_FIELD,
@@ -63,4 +83,6 @@ class EJSCREENETL(ExtractTransformLoad):
         logger.info("Saving EJScreen CSV")
         # write nationwide csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
-        self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            self.CSV_PATH / "usa.csv", index=False
+        )


@@ -21,7 +21,7 @@
     "from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
     "\n",
     "\n",
-    "ACS_YEAR = 2019\n",
+    "ACS_YEAR = 2010\n",
     "\n",
     "DATA_PATH = Path.cwd().parent / \"data\"\n",
     "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
@@ -45,11 +45,13 @@
     "source": [
     "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
     "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
-    "censusdata.printtable(\n",
-    "    censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
-    ")\n",
+    "# censusdata.printtable(\n",
+    "#     censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
+    "# )\n",
     "\n",
-    "# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
+    "censusdata.search(\n",
+    "    src=\"acs5\", year=ACS_YEAR, field=\"label\", criterion=\"employment status\"\n",
+    ")"
     ]
   },
   {


@@ -3,7 +3,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "51412a14",
     "metadata": {
       "scrolled": true
     },
@@ -49,7 +48,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "e3234c61",
     "metadata": {
       "scrolled": true
     },
@@ -81,7 +79,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "3b1b5ccf",
     "metadata": {
       "scrolled": true
     },
@@ -108,7 +105,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "1b1083e8",
     "metadata": {},
     "outputs": [],
     "source": [
@@ -142,7 +138,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "fec0ed63",
     "metadata": {},
     "outputs": [],
     "source": [
@@ -165,7 +160,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "d9968187",
     "metadata": {
       "scrolled": false
     },
@@ -192,7 +186,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "a7cfeb3c",
     "metadata": {
       "scrolled": false
     },
@@ -222,7 +215,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "df458f08",
     "metadata": {},
     "outputs": [],
     "source": [
@@ -255,7 +247,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "a6c85d87",
     "metadata": {
       "scrolled": false
     },
@@ -282,7 +273,7 @@
     "    raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
     "\n",
     "if len(merged_df) > ExtractTransformLoad.EXPECTED_MAX_CENSUS_TRACTS:\n",
-    "    raise ValueError(\"Too many rows in the join.\")\n",
+    "    raise ValueError(f\"Too many rows in the join: {len(merged_df)}.\")\n",
     "\n",
     "merged_df.head()"
     ]
@@ -290,7 +281,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "274f6bc6",
     "metadata": {
       "scrolled": true
     },
@@ -393,17 +383,17 @@
     "ejscreen_areas_of_concern_census_block_group_indices = [\n",
     "    Index(\n",
     "        method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
-    "        priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
+    "        priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD,\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
     "    Index(\n",
     "        method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
-    "        priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
+    "        priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD,\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
     "    Index(\n",
     "        method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
-    "        priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
+    "        priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD,\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
     "]\n",
@@ -439,7 +429,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "bfae9cf5",
     "metadata": {
       "scrolled": true
     },
@@ -457,7 +446,8 @@
     "\n",
     "    # Calculate the population included as priority communities per tract. Will either be 0 or the population.\n",
     "    df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
-    "        df[priority_communities_field] * df[field_names.TOTAL_POP_FIELD]\n",
+    "        df[priority_communities_field]\n",
+    "        * df[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010]\n",
     "    )\n",
     "\n",
     "    def calculate_state_comparison(\n",
@@ -496,7 +486,9 @@
     "        summary_dict[\"Geography name\"] = division_id\n",
     "\n",
     "        total_tracts_in_geography = len(frame)\n",
-    "        total_population_in_geography = frame[field_names.TOTAL_POP_FIELD].sum()\n",
+    "        total_population_in_geography = frame[\n",
+    "            field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010\n",
+    "        ].sum()\n",
     "\n",
     "        if geography_field == field_names.URBAN_HEURISTIC_FIELD:\n",
     "            urban_flag = frame[field_names.URBAN_HEURISTIC_FIELD].unique()[0]\n",
@@ -719,7 +711,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "c4d0e783",
     "metadata": {},
     "outputs": [],
     "source": [
@@ -825,7 +816,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "8790cd64",
     "metadata": {
       "scrolled": true
     },
@@ -1024,7 +1014,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "eeb9699d",
     "metadata": {
       "scrolled": true
     },
@@ -1201,7 +1190,6 @@
   {
     "cell_type": "code",
     "execution_count": null,
-    "id": "983abcea",
     "metadata": {},
     "outputs": [],
     "source": [


@@ -57,13 +57,13 @@ AMI_FIELD = "Area Median Income (State or metropolitan)"

 # Climate
 FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
-EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
+EXPECTED_BUILDING_LOSS_RATE_FIELD = (
     "Expected building loss rate (Natural Hazards Risk Index)"
 )
-EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
+EXPECTED_AGRICULTURE_LOSS_RATE_FIELD = (
     "Expected agricultural loss rate (Natural Hazards Risk Index)"
 )
-EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
+EXPECTED_POPULATION_LOSS_RATE_FIELD = (
     "Expected population loss rate (Natural Hazards Risk Index)"
 )
@@ -117,6 +117,34 @@ AGGREGATION_POPULATION_FIELD = "Population Characteristics"
 UNDER_5_FIELD = "Individuals under 5 years old"
 OVER_64_FIELD = "Individuals over 64 years old"

+# Fields from 2010 decennial census (generally only loaded for the territories)
+CENSUS_DECENNIAL_MEDIAN_INCOME_2009 = "Median household income in 2009 ($)"
+CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = (
+    "Median household income as a percent of territory median income in 2009"
+)
+CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
+    "Percentage households below 100% of federal poverty line in 2009"
+)
+CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009"
+CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009"
+CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"
+
+# Fields from 2010 ACS (loaded for comparison with the territories)
+CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010"
+CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
+    "Percent of individuals < 100% Federal Poverty Line in 2010"
+)
+
+# Combined fields that merge island areas and states data
+COMBINED_CENSUS_TOTAL_POPULATION_2010 = (
+    "Total population in 2009 (island areas) and 2019 (states and PR)"
+)
+COMBINED_UNEMPLOYMENT_2010 = "Unemployed civilians (percent) in 2009 (island areas) and 2010 (states and PR)"
+COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
+    "Percentage households below 100% of federal poverty line in 2009 (island areas) "
+    "and 2010 (states and PR)"
+)
+
 # Urban Rural Map
 URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
@@ -124,39 +152,39 @@ URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
 MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"

 # EJSCREEN Areas of Concern
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, National, 70th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, National, 75th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, National, 80th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, National, 85th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, National, 90th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, National, 95th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, State, 70th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, State, 75th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, State, 80th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, State, 85th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, State, 90th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = (
     "EJSCREEN Areas of Concern, State, 95th percentile (communities)"
 )


@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd

 from data_pipeline.score.score import Score
@@ -12,8 +13,86 @@ class ScoreL(Score):
         self.LOW_INCOME_THRESHOLD: float = 0.65
         self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
         self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
+        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
         super().__init__(df)

+    def _combine_island_areas_with_states_and_set_thresholds(
+        self,
+        df: pd.DataFrame,
+        column_from_island_areas: str,
+        column_from_decennial_census: str,
+        combined_column_name: str,
+        threshold_cutoff_for_island_areas: float,
+    ) -> (pd.DataFrame, str):
+        """Steps to set thresholds for island areas.
+
+        This function is fairly logically complicated. It takes the following steps:
+
+        1. Combine the two different fields into a single field.
+        2. Calculate the 90th percentile cutoff raw value for the combined field.
+        3. Create a boolean series that is true for any census tract in the island
+           areas (and only the island areas) that exceeds this cutoff.
+
+        For step one, it combines data that is either the island area's Decennial Census
+        value in 2009 or the state's value in 5-year ACS ending in 2010.
+
+        This will be used to generate the percentile cutoff for the 90th percentile.
+
+        The stateside decennial census stopped asking economic comparisons,
+        so this is as close to apples-to-apples as we get. We use 5-year ACS for data
+        robustness over 1-year ACS.
+        """
+        # Create the combined field.
+        # There should only be one entry in either 2009 or 2019 fields, not one in both.
+        # But just to be safe, we take the mean and ignore null values so if there
+        # *were* entries in both, this result would make sense.
+        df[combined_column_name] = df[
+            [column_from_island_areas, column_from_decennial_census]
+        ].mean(axis=1, skipna=True)
+
+        logger.info(
+            f"Combined field `{combined_column_name}` has "
+            f"{df[combined_column_name].isnull().sum()} "
+            f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) "
+            f"missing values for census tracts. "
+        )
+
+        # Calculate the percentile threshold raw value.
+        raw_threshold = np.nanquantile(
+            a=df[combined_column_name], q=threshold_cutoff_for_island_areas
+        )
+
+        logger.info(
+            f"For combined field `{combined_column_name}`, "
+            f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
+            f"raw value of {raw_threshold:.3f}."
+        )
+
+        threshold_column_name = (
+            f"{column_from_island_areas} exceeds "
+            f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
+        )
+        df[threshold_column_name] = (
+            df[column_from_island_areas] >= raw_threshold
+        )
+
+        percent_of_tracts_highlighted = (
+            100
+            * df[threshold_column_name].sum()
+            / df[column_from_island_areas].notnull().sum()
+        )
+
+        logger.info(
+            f"For `{threshold_column_name}`, "
+            f"{df[threshold_column_name].sum()} ("
+            f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data "
+            f"in the column) have a value of TRUE."
+        )
+
+        return df, threshold_column_name
+
     def add_columns(self) -> pd.DataFrame:
         logger.info("Adding Score L")
@@ -67,21 +146,21 @@ class ScoreL(Score):
         climate_criteria = (
             (
                 self.df[
-                    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME
+                    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
                     + field_names.PERCENTILE_FIELD_SUFFIX
                 ]
                 >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
             )
             | (
                 self.df[
-                    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
+                    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
                     + field_names.PERCENTILE_FIELD_SUFFIX
                 ]
                 >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
             )
             | (
                 self.df[
-                    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME
+                    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
                     + field_names.PERCENTILE_FIELD_SUFFIX
                 ]
                 >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
@@ -204,14 +283,24 @@ class ScoreL(Score):
         # poverty level. Source: Census's American Community Survey]

         pollution_criteria = (
-            self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
-            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
-        ) | (
-            self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
-            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
-        ) | (
-            self.df[field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
-            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            (
+                self.df[
+                    field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[
+                    field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[
+                    field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
         )

         return pollution_criteria & (
@@ -306,7 +395,7 @@ class ScoreL(Score):
         # AND
         # Where the high school degree achievement rates for adults 25 years and older is less than 95%
         # (necessary to screen out university block groups)
-        workforce_criteria = (
+        workforce_criteria_for_states = (
             (
                 self.df[
                     field_names.UNEMPLOYMENT_FIELD
@@ -338,6 +427,76 @@ class ScoreL(Score):
                 >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
             )
         )
+        workforce_combined_criteria_for_states = (
+            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
+            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
+        ) & workforce_criteria_for_states
+
+        # Now, calculate workforce criteria for island territories.
+        # For a couple of values, create a combined field and criteria field.
+        # First, combine unemployment.
+        (
+            self.df,
+            unemployment_island_areas_criteria_field_name,
+        ) = self._combine_island_areas_with_states_and_set_thresholds(
+            df=self.df,
+            column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
+            column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
+            combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
+            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
+        )
+
+        # Next, combine poverty.
+        (
+            self.df,
+            poverty_island_areas_criteria_field_name,
+        ) = self._combine_island_areas_with_states_and_set_thresholds(
+            df=self.df,
+            column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
+            column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+            combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
+        )
+
+        workforce_combined_criteria_for_island_areas = (
+            self.df[unemployment_island_areas_criteria_field_name]
+            | self.df[poverty_island_areas_criteria_field_name]
+            # Also check whether area median income is 10th percentile or lower
+            # within the islands.
+            | (
+                self.df[
+                    field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
+                    + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                # Note: a high median income as a % of AMI is good, so take 1 minus
+                # the threshold to invert it, and then look for median income lower
+                # than that (not greater than).
+                < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+        ) & (
+            self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
+            > self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
+        )
+
+        percent_of_island_tracts_highlighted = (
+            100
+            * workforce_combined_criteria_for_island_areas.sum()
+            # Choosing a random column from island areas to calculate the denominator.
+            / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
+            .notnull()
+            .sum()
+        )
+
+        logger.info(
+            f"For workforce criteria in island areas, "
+            f"{workforce_combined_criteria_for_island_areas.sum()} ("
+            f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
+            f"in the column) have a value of TRUE."
+        )
+
+        # A tract is included if it meets either the states tract criteria or the
+        # island areas tract criteria.
         return (
-            self.df[field_names.HIGH_SCHOOL_ED_FIELD] >= 0.10
-        ) & workforce_criteria
+            workforce_combined_criteria_for_states
+            | workforce_combined_criteria_for_island_areas
+        )
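As a sanity check on the threshold helper above, a minimal sketch (made-up values) of the np.nanquantile step and the comparison it feeds:

    import numpy as np
    import pandas as pd

    combined = pd.Series([0.02, 0.05, np.nan, 0.10, 0.40, np.nan, 0.15])
    # 90th percentile of the non-null values; NaNs are ignored.
    raw_threshold = np.nanquantile(a=combined, q=0.90)
    exceeds = combined >= raw_threshold
    # A NaN comparison yields False, so null tracts are never flagged.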


@@ -67,6 +67,9 @@ disable = [
   "C0115",  # Disables missing class docstring
   "R0915",  # Disables too many statements (score generation transform)
   "W0231",  # Disables super init not called
+  "R0801",  # Disables duplicate code. There are a couple places we have similar code and
+            # unfortunately you can't disable this rule for individual lines or files, it's a
+            # known bug. https://github.com/PyCQA/pylint/issues/214#
 ]

 [tool.pylint.FORMAT]
[tool.pylint.FORMAT] [tool.pylint.FORMAT]