mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
Merge branch 'usds:main' into main
This commit is contained in:
commit
12456c8dc5
16 changed files with 885 additions and 161 deletions
|
@ -32,10 +32,15 @@ class ExtractTransformLoad:
|
|||
FILES_PATH: Path = settings.APP_ROOT / "files"
|
||||
GEOID_FIELD_NAME: str = "GEOID10"
|
||||
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
|
||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
|
||||
|
||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
|
||||
# be from CBGs at different time periods.
|
||||
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
|
||||
# TODO: investigate. Census says there are only 73,057 tracts in the US. This might be from tracts at different time periods.
|
||||
EXPECTED_MAX_CENSUS_TRACTS: int = 74027
|
||||
|
||||
# TODO: investigate. Census says there are only 74,134 tracts in the US,
|
||||
# Puerto Rico, and island areas. This might be from tracts at different time
|
||||
# periods. https://github.com/usds/justice40-tool/issues/964
|
||||
EXPECTED_MAX_CENSUS_TRACTS: int = 74160
|
||||
|
||||
def __init__(self, config_path: Path) -> None:
|
||||
"""Inits the class with instance specific variables"""
|
||||
|
|
|
@ -4,6 +4,11 @@ DATASET_LIST = [
|
|||
"module_dir": "census_acs",
|
||||
"class_name": "CensusACSETL",
|
||||
},
|
||||
{
|
||||
"name": "census_acs_2010",
|
||||
"module_dir": "census_acs_2010",
|
||||
"class_name": "CensusACS2010ETL",
|
||||
},
|
||||
{
|
||||
"name": "ejscreen",
|
||||
"module_dir": "ejscreen",
|
||||
|
@ -14,16 +19,6 @@ DATASET_LIST = [
|
|||
"module_dir": "hud_housing",
|
||||
"class_name": "HudHousingETL",
|
||||
},
|
||||
{
|
||||
"name": "calenviroscreen",
|
||||
"module_dir": "calenviroscreen",
|
||||
"class_name": "CalEnviroScreenETL",
|
||||
},
|
||||
{
|
||||
"name": "hud_recap",
|
||||
"module_dir": "hud_recap",
|
||||
"class_name": "HudRecapETL",
|
||||
},
|
||||
{
|
||||
"name": "cdc_places",
|
||||
"module_dir": "cdc_places",
|
||||
|
@ -74,6 +69,16 @@ DATASET_LIST = [
|
|||
"module_dir": "housing_and_transportation",
|
||||
"class_name": "HousingTransportationETL",
|
||||
},
|
||||
{
|
||||
"name": "calenviroscreen",
|
||||
"module_dir": "calenviroscreen",
|
||||
"class_name": "CalEnviroScreenETL",
|
||||
},
|
||||
{
|
||||
"name": "hud_recap",
|
||||
"module_dir": "hud_recap",
|
||||
"class_name": "HudRecapETL",
|
||||
},
|
||||
{
|
||||
"name": "tree_equity_score",
|
||||
"module_dir": "tree_equity_score",
|
||||
|
|
|
@ -27,6 +27,8 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.national_risk_index_df: pd.DataFrame
|
||||
self.geocorr_urban_rural_df: pd.DataFrame
|
||||
self.persistent_poverty_df: pd.DataFrame
|
||||
self.census_decennial_df: pd.DataFrame
|
||||
self.census_2010_df: pd.DataFrame
|
||||
|
||||
def extract(self) -> None:
|
||||
logger.info("Loading data sets from disk.")
|
||||
|
@ -137,6 +139,29 @@ class ScoreETL(ExtractTransformLoad):
|
|||
low_memory=False,
|
||||
)
|
||||
|
||||
# Load decennial census data
|
||||
census_decennial_csv = (
|
||||
constants.DATA_PATH
|
||||
/ "dataset"
|
||||
/ "census_decennial_2010"
|
||||
/ "usa.csv"
|
||||
)
|
||||
self.census_decennial_df = pd.read_csv(
|
||||
census_decennial_csv,
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
)
|
||||
|
||||
# Load 2010 ACS data from states
|
||||
census_2010_csv = (
|
||||
constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv"
|
||||
)
|
||||
self.census_2010_df = pd.read_csv(
|
||||
census_2010_csv,
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
)
|
||||
|
||||
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
|
||||
logger.info("Joining Census Tract dataframes")
|
||||
|
||||
|
@ -228,6 +253,8 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.persistent_poverty_df,
|
||||
self.national_risk_index_df,
|
||||
self.census_acs_median_incomes_df,
|
||||
self.census_decennial_df,
|
||||
self.census_2010_df,
|
||||
]
|
||||
|
||||
# Sanity check each data frame before merging.
|
||||
|
@ -296,9 +323,16 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||
field_names.UNEMPLOYMENT_FIELD,
|
||||
field_names.MEDIAN_HOUSE_VALUE_FIELD,
|
||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
|
||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
|
||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
|
||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
|
||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
|
||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
|
||||
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
|
||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
|
||||
field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
|
||||
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
|
||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
|
||||
]
|
||||
|
||||
non_numeric_columns = [
|
||||
|
@ -315,9 +349,9 @@ class ScoreETL(ExtractTransformLoad):
|
|||
# Convert all columns to numeric and do math
|
||||
for col in numeric_columns:
|
||||
# Calculate percentiles
|
||||
df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[col].rank(
|
||||
pct=True
|
||||
)
|
||||
df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
|
||||
col
|
||||
].rank(pct=True)
|
||||
|
||||
# Min-max normalization:
|
||||
# (
|
||||
|
@ -341,6 +375,20 @@ class ScoreETL(ExtractTransformLoad):
|
|||
df_copy[col] - min_value
|
||||
) / (max_value - min_value)
|
||||
|
||||
# Special logic: create a combined population field.
|
||||
# We sometimes run analytics on "population", and this makes a single field
|
||||
# that is either the island area's population in 2009 or the state's
|
||||
# population in 2019.
|
||||
# There should only be one entry in either 2009 or 2019, not one in both.
|
||||
# But just to be safe, we take the mean and ignore null values so if there
|
||||
# *were* entries in both fields, this result would make sense.
|
||||
df_copy[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010] = df_copy[
|
||||
[
|
||||
field_names.TOTAL_POP_FIELD,
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
|
||||
]
|
||||
].mean(axis=1, skipna=True)
|
||||
|
||||
return df_copy
|
||||
|
||||
def transform(self) -> None:
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
import pandas as pd
|
||||
import censusdata
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
@ -14,7 +13,15 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
self.OUTPUT_PATH = (
|
||||
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
|
||||
)
|
||||
|
||||
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
|
||||
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
|
||||
self.EMPLOYMENT_FIELDS = [
|
||||
self.TOTAL_UNEMPLOYED_FIELD,
|
||||
self.TOTAL_IN_LABOR_FORCE,
|
||||
]
|
||||
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
|
||||
|
||||
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
|
||||
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
|
||||
"Linguistic isolation (total)"
|
||||
|
@ -55,59 +62,89 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
"Median value ($) of owner-occupied housing units"
|
||||
)
|
||||
|
||||
# Educational attainment figures
|
||||
self.EDUCATION_POPULATION_OVER_25 = "B15003_001E" # Estimate!!Total
|
||||
self.EDUCATION_NO_SCHOOLING = (
|
||||
"B15003_002E" # Estimate!!Total!!No schooling completed
|
||||
)
|
||||
self.EDUCATION_NURSERY = (
|
||||
"B15003_003E" # Estimate!!Total!!Nursery school
|
||||
)
|
||||
self.EDUCATION_KINDERGARTEN = (
|
||||
"B15003_004E" # Estimate!!Total!!Kindergarten
|
||||
)
|
||||
self.EDUCATION_FIRST = "B15003_005E" # Estimate!!Total!!1st grade
|
||||
self.EDUCATION_SECOND = "B15003_006E" # Estimate!!Total!!2nd grade
|
||||
self.EDUCATION_THIRD = "B15003_007E" # Estimate!!Total!!3rd grade
|
||||
self.EDUCATION_FOURTH = "B15003_008E" # Estimate!!Total!!4th grade
|
||||
self.EDUCATION_FIFTH = "B15003_009E" # Estimate!!Total!!5th grade
|
||||
self.EDUCATION_SIXTH = "B15003_010E" # Estimate!!Total!!6th grade
|
||||
self.EDUCATION_SEVENTH = "B15003_011E" # Estimate!!Total!!7th grade
|
||||
self.EDUCATION_EIGHTH = "B15003_012E" # Estimate!!Total!!8th grade
|
||||
self.EDUCATION_NINTH = "B15003_013E" # Estimate!!Total!!9th grade
|
||||
self.EDUCATION_TENTH = "B15003_014E" # Estimate!!Total!!10th grade
|
||||
self.EDUCATION_ELEVENTH = "B15003_015E" # Estimate!!Total!!11th grade
|
||||
self.EDUCATION_TWELFTH_NO_DIPLOMA = (
|
||||
"B15003_016E" # Estimate!!Total!!12th grade, no diploma
|
||||
)
|
||||
|
||||
self.EDUCATIONAL_FIELDS = [
|
||||
self.EDUCATION_POPULATION_OVER_25,
|
||||
self.EDUCATION_NO_SCHOOLING,
|
||||
self.EDUCATION_NURSERY,
|
||||
self.EDUCATION_KINDERGARTEN,
|
||||
self.EDUCATION_FIRST,
|
||||
self.EDUCATION_SECOND,
|
||||
self.EDUCATION_THIRD,
|
||||
self.EDUCATION_FOURTH,
|
||||
self.EDUCATION_FIFTH,
|
||||
self.EDUCATION_SIXTH,
|
||||
self.EDUCATION_SEVENTH,
|
||||
self.EDUCATION_EIGHTH,
|
||||
self.EDUCATION_NINTH,
|
||||
self.EDUCATION_TENTH,
|
||||
self.EDUCATION_ELEVENTH,
|
||||
self.EDUCATION_TWELFTH_NO_DIPLOMA,
|
||||
]
|
||||
|
||||
self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD = (
|
||||
"Individuals age 25 or over with less than high school degree"
|
||||
)
|
||||
self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree"
|
||||
|
||||
self.STATE_GEOID_FIELD_NAME = "GEOID2"
|
||||
|
||||
self.df: pd.DataFrame
|
||||
|
||||
def _fips_from_censusdata_censusgeo(
|
||||
self, censusgeo: censusdata.censusgeo
|
||||
) -> str:
|
||||
"""Create a FIPS code from the proprietary censusgeo index."""
|
||||
fips = "".join([value for (key, value) in censusgeo.params()])
|
||||
return fips
|
||||
|
||||
def extract(self) -> None:
|
||||
dfs = []
|
||||
for fips in get_state_fips_codes(self.DATA_PATH):
|
||||
logger.info(
|
||||
f"Downloading data for state/territory with FIPS code {fips}"
|
||||
)
|
||||
# Define the variables to retrieve
|
||||
variables = (
|
||||
[
|
||||
# Income field
|
||||
self.MEDIAN_INCOME_FIELD,
|
||||
# House value
|
||||
self.MEDIAN_HOUSE_VALUE_FIELD,
|
||||
]
|
||||
+ self.EMPLOYMENT_FIELDS
|
||||
+ self.LINGUISTIC_ISOLATION_FIELDS
|
||||
+ self.POVERTY_FIELDS
|
||||
+ self.EDUCATIONAL_FIELDS
|
||||
)
|
||||
|
||||
try:
|
||||
response = censusdata.download(
|
||||
src="acs5",
|
||||
year=self.ACS_YEAR,
|
||||
geo=censusdata.censusgeo(
|
||||
[("state", fips), ("county", "*"), ("tract", "*")]
|
||||
),
|
||||
var=[
|
||||
# Employment fields
|
||||
"B23025_005E",
|
||||
"B23025_003E",
|
||||
# Income field
|
||||
self.MEDIAN_INCOME_FIELD,
|
||||
# House value
|
||||
self.MEDIAN_HOUSE_VALUE_FIELD,
|
||||
]
|
||||
+ self.LINGUISTIC_ISOLATION_FIELDS
|
||||
+ self.POVERTY_FIELDS,
|
||||
)
|
||||
dfs.append(response)
|
||||
except ValueError:
|
||||
logger.error(
|
||||
f"Could not download data for state/territory with FIPS code {fips}"
|
||||
)
|
||||
|
||||
self.df = pd.concat(dfs)
|
||||
|
||||
self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply(
|
||||
func=self._fips_from_censusdata_censusgeo
|
||||
self.df = retrieve_census_acs_data(
|
||||
acs_year=self.ACS_YEAR,
|
||||
variables=variables,
|
||||
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
|
||||
data_path_for_fips_codes=self.DATA_PATH,
|
||||
)
|
||||
|
||||
def transform(self) -> None:
|
||||
logger.info("Starting Census ACS Transform")
|
||||
|
||||
df = self.df
|
||||
|
||||
# Rename two fields.
|
||||
self.df = self.df.rename(
|
||||
df = df.rename(
|
||||
columns={
|
||||
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
||||
|
@ -119,19 +156,17 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
self.MEDIAN_INCOME_FIELD_NAME,
|
||||
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||
]:
|
||||
missing_value_count = sum(self.df[field] == -666666666)
|
||||
missing_value_count = sum(df[field] == -666666666)
|
||||
logger.info(
|
||||
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
|
||||
f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
|
||||
+ f"`{field}` being marked as null values."
|
||||
)
|
||||
self.df[field] = self.df[field].replace(
|
||||
to_replace=-666666666, value=None
|
||||
)
|
||||
df[field] = df[field].replace(to_replace=-666666666, value=None)
|
||||
|
||||
# Calculate percent unemployment.
|
||||
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
||||
self.df[self.UNEMPLOYED_FIELD_NAME] = (
|
||||
self.df.B23025_005E / self.df.B23025_003E
|
||||
df[self.UNEMPLOYED_FIELD_NAME] = (
|
||||
df[self.TOTAL_UNEMPLOYED_FIELD] / df[self.TOTAL_IN_LABOR_FORCE]
|
||||
)
|
||||
|
||||
# Calculate linguistic isolation.
|
||||
|
@ -142,34 +177,64 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
"C16002_013E",
|
||||
]
|
||||
|
||||
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[
|
||||
df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[
|
||||
individual_limited_english_fields
|
||||
].sum(axis=1, skipna=True)
|
||||
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
|
||||
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
|
||||
/ self.df["C16002_001E"]
|
||||
df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
|
||||
df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
|
||||
/ df["C16002_001E"]
|
||||
)
|
||||
|
||||
# Calculate percent at different poverty thresholds
|
||||
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
|
||||
self.df["C17002_002E"] + self.df["C17002_003E"]
|
||||
) / self.df["C17002_001E"]
|
||||
df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
|
||||
df["C17002_002E"] + df["C17002_003E"]
|
||||
) / df["C17002_001E"]
|
||||
|
||||
self.df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
|
||||
self.df["C17002_002E"]
|
||||
+ self.df["C17002_003E"]
|
||||
+ self.df["C17002_004E"]
|
||||
+ self.df["C17002_005E"]
|
||||
) / self.df["C17002_001E"]
|
||||
df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
|
||||
df["C17002_002E"]
|
||||
+ df["C17002_003E"]
|
||||
+ df["C17002_004E"]
|
||||
+ df["C17002_005E"]
|
||||
) / df["C17002_001E"]
|
||||
|
||||
self.df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
|
||||
self.df["C17002_002E"]
|
||||
+ self.df["C17002_003E"]
|
||||
+ self.df["C17002_004E"]
|
||||
+ self.df["C17002_005E"]
|
||||
+ self.df["C17002_006E"]
|
||||
+ self.df["C17002_007E"]
|
||||
) / self.df["C17002_001E"]
|
||||
df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
|
||||
df["C17002_002E"]
|
||||
+ df["C17002_003E"]
|
||||
+ df["C17002_004E"]
|
||||
+ df["C17002_005E"]
|
||||
+ df["C17002_006E"]
|
||||
+ df["C17002_007E"]
|
||||
) / df["C17002_001E"]
|
||||
|
||||
# Calculate educational attainment
|
||||
educational_numerator_fields = [
|
||||
self.EDUCATION_NO_SCHOOLING,
|
||||
self.EDUCATION_NURSERY,
|
||||
self.EDUCATION_KINDERGARTEN,
|
||||
self.EDUCATION_FIRST,
|
||||
self.EDUCATION_SECOND,
|
||||
self.EDUCATION_THIRD,
|
||||
self.EDUCATION_FOURTH,
|
||||
self.EDUCATION_FIFTH,
|
||||
self.EDUCATION_SIXTH,
|
||||
self.EDUCATION_SEVENTH,
|
||||
self.EDUCATION_EIGHTH,
|
||||
self.EDUCATION_NINTH,
|
||||
self.EDUCATION_TENTH,
|
||||
self.EDUCATION_ELEVENTH,
|
||||
self.EDUCATION_TWELFTH_NO_DIPLOMA,
|
||||
]
|
||||
|
||||
df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] = df[
|
||||
educational_numerator_fields
|
||||
].sum(axis=1)
|
||||
df[self.HIGH_SCHOOL_ED_FIELD] = (
|
||||
df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD]
|
||||
/ df[self.EDUCATION_POPULATION_OVER_25]
|
||||
)
|
||||
|
||||
# Save results to self.
|
||||
self.df = df
|
||||
|
||||
def load(self) -> None:
|
||||
logger.info("Saving Census ACS Data")
|
||||
|
@ -186,6 +251,7 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
|
||||
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
||||
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||
self.HIGH_SCHOOL_ED_FIELD,
|
||||
]
|
||||
|
||||
self.df[columns_to_include].to_csv(
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
from pathlib import Path
|
||||
from typing import List
|
||||
import censusdata
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
def _fips_from_censusdata_censusgeo(
    censusgeo: censusdata.censusgeo
) -> str:
    """Build a FIPS code string from a `censusdata.censusgeo` index entry.

    The values of every `(key, value)` pair returned by `censusgeo.params()`
    are concatenated in the order given; the keys themselves are ignored.

    Args:
        censusgeo: a geography object as found in the index of a dataframe
            returned by `censusdata.download`.

    Returns:
        The concatenated geography values as a single string.
    """
    geo_values = (component for _, component in censusgeo.params())
    return "".join(geo_values)
|
||||
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def retrieve_census_acs_data(
    acs_year: int,
    variables: List[str],
    tract_output_field_name: str,
    data_path_for_fips_codes: Path,
    acs_type: str = "acs5",
    raise_errors: bool = False,
) -> pd.DataFrame:
    """Retrieve and combine tract-level census ACS data for a given year.

    For every FIPS code produced by `get_state_fips_codes`, downloads the
    requested variables for all tracts in all counties, then concatenates the
    per-state results into one dataframe and adds a column holding the FIPS
    code derived from each row's `censusgeo` index entry.

    Args:
        acs_year: survey year passed to `censusdata.download`.
        variables: census variable codes to download.
        tract_output_field_name: name of the column that receives the FIPS
            code built from the dataframe index.
        data_path_for_fips_codes: path handed to `get_state_fips_codes`.
        acs_type: survey source passed as `src` to `censusdata.download`.
        raise_errors: when True, a failed download aborts the retrieval by
            re-raising; when False, the failure is logged and that
            state/territory is skipped.

    Returns:
        The concatenation of all successfully downloaded dataframes, with
        the extra `tract_output_field_name` column.

    Raises:
        ValueError: if a download fails and `raise_errors` is True, or if no
            state/territory could be downloaded at all.
    """
    dfs = []
    for fips in get_state_fips_codes(data_path_for_fips_codes):
        logger.info(
            f"Downloading data for state/territory with FIPS code {fips}"
        )

        try:
            response = censusdata.download(
                src=acs_type,
                year=acs_year,
                geo=censusdata.censusgeo(
                    [("state", fips), ("county", "*"), ("tract", "*")]
                ),
                var=variables,
            )
        except ValueError as e:
            # Keep the underlying reason in the log; without it a skipped
            # state/territory is impossible to diagnose afterwards.
            logger.error(
                f"Could not download data for state/territory with FIPS code {fips}: {e}"
            )

            if raise_errors:
                # Bare `raise` re-raises the active exception unchanged.
                raise
        else:
            dfs.append(response)

    if not dfs:
        # `pd.concat([])` would raise a generic "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError(
            f"No census ACS data could be downloaded for year {acs_year} "
            f"(source {acs_type})."
        )

    df = pd.concat(dfs)

    # The downloaded frames are indexed by censusgeo objects; flatten each
    # one into a plain FIPS string column.
    df[tract_output_field_name] = df.index.to_series().apply(
        func=_fips_from_censusdata_censusgeo
    )

    return df
|
|
@ -0,0 +1,186 @@
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
class CensusACS2010ETL(ExtractTransformLoad):
    """ETL for American Community Survey data from (approximately) 2010.

    The 2010 ACS uses different fields than those captured in CensusACSETL,
    so this extraction lives in its own class. It downloads unemployment and
    poverty variables, derives percentage columns from them, and writes the
    result to `usa.csv` with the survey year appended to each value column.
    """

    def __init__(self):
        self.ACS_YEAR = 2010
        self.ACS_TYPE = "acs5"
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
        )

        # --- Employment: unemployed civilians, by educational attainment ---
        # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED = "B23006_007E"
        # Estimate!!Total!!High school graduate!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED = "B23006_014E"
        # Estimate!!Total!!Some college or associate's degree!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED = "B23006_021E"
        # Estimate!!Total!!Bachelor's degree or higher!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_COLLEGE_UNEMPLOYED = "B23006_028E"

        # Numerator components of the unemployment percentage.
        self.UNEMPLOYED_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED,
            self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED,
            self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED,
            self.EMPLOYMENT_COLLEGE_UNEMPLOYED,
        ]

        # --- Employment: labor force size, by educational attainment ---
        # TODO: FIX!!!!!!
        # NOTE(review): per the descriptions here, this code is the
        # "In labor force!!Civilian" line while the three codes below are the
        # broader "In labor force" line -- confirm which level is intended
        # before changing any of them.
        # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian
        self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE = "B23006_005E"
        # Estimate!!Total!!High school graduate!!In labor force
        self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE = "B23006_010E"
        # Estimate!!Total!!Some college or associate's degree!!In labor force
        self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE = "B23006_017E"
        # Estimate!!Total!!Bachelor's degree or higher!!In labor force
        self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE = "B23006_024E"

        # Denominator components of the unemployment percentage.
        self.IN_LABOR_FORCE_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE,
            self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE,
            self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE,
            self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
        ]

        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"

        # --- Poverty: ratio of income to poverty level ---
        self.POVERTY_FIELDS = [
            "C17002_001E",  # Estimate!!Total,
            "C17002_002E",  # Estimate!!Total!!Under .50
            "C17002_003E",  # Estimate!!Total!!.50 to .99
            "C17002_004E",  # Estimate!!Total!!1.00 to 1.24
            "C17002_005E",  # Estimate!!Total!!1.25 to 1.49
            "C17002_006E",  # Estimate!!Total!!1.50 to 1.84
            "C17002_007E",  # Estimate!!Total!!1.85 to 1.99
        ]

        self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 100% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 150% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 200% Federal Poverty Line"
        )

        self.STATE_GEOID_FIELD_NAME = "GEOID2"

        # Populated by `extract` and extended in place by `transform`.
        self.df: pd.DataFrame

    def extract(self) -> None:
        """Download the unemployment, labor-force and poverty variables."""
        requested_variables = (
            self.UNEMPLOYED_FIELDS
            + self.IN_LABOR_FORCE_FIELDS
            + self.POVERTY_FIELDS
        )

        # Delegate to the shared module-level helper imported from
        # census_acs.etl_utils rather than duplicating the download loop.
        self.df = retrieve_census_acs_data(
            acs_year=self.ACS_YEAR,
            variables=requested_variables,
            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
            data_path_for_fips_codes=self.DATA_PATH,
            acs_type=self.ACS_TYPE,
            raise_errors=False,
        )

    def transform(self) -> None:
        """Add the unemployment and poverty percentage columns to `self.df`."""
        logger.info("Starting Census ACS Transform")

        frame = self.df

        # Calculate percent unemployment.
        # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
        unemployed_count = frame[self.UNEMPLOYED_FIELDS].sum(axis=1)
        labor_force_count = frame[self.IN_LABOR_FORCE_FIELDS].sum(axis=1)
        frame[self.UNEMPLOYED_FIELD_NAME] = unemployed_count / labor_force_count

        # Calculate percent at different poverty thresholds. Each threshold's
        # numerator is the previous one plus the next two income-ratio bands.
        poverty_universe = frame["C17002_001E"]
        below_100 = frame["C17002_002E"] + frame["C17002_003E"]
        below_150 = below_100 + frame["C17002_004E"] + frame["C17002_005E"]
        below_200 = below_150 + frame["C17002_006E"] + frame["C17002_007E"]

        frame[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
            below_100 / poverty_universe
        )
        frame[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
            below_150 / poverty_universe
        )
        frame[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
            below_200 / poverty_universe
        )

        # Save results to self.
        self.df = frame

    def load(self) -> None:
        """Write the tract ID and derived columns to `OUTPUT_PATH / "usa.csv"`."""
        logger.info("Saving Census ACS Data")

        # mkdir census
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        columns_to_include = [
            self.GEOID_TRACT_FIELD_NAME,
            self.UNEMPLOYED_FIELD_NAME,
            self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
        ]

        # Add the year to the end of every column except the tract ID, so when
        # it's all joined in the score df, it's obvious which year this data
        # is from.
        year_suffixed_names = {
            column: f"{column} in {self.ACS_YEAR}"
            for column in columns_to_include
            if column != self.GEOID_TRACT_FIELD_NAME
        }
        output_df = self.df[columns_to_include].rename(
            columns=year_suffixed_names
        )

        output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        """Log that validation ran; no checks are implemented."""
        logger.info("Validating Census ACS Data")
|
|
@ -27,12 +27,21 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
# https://api.census.gov/data/2010/dec/gu/variables.html
|
||||
# https://api.census.gov/data/2010/dec/mp/variables.html
|
||||
# https://api.census.gov/data/2010/dec/vi/variables.html
|
||||
|
||||
# Total population field is the same in all island areas
|
||||
self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001"
|
||||
self.TOTAL_POP_FIELD_NAME = "Total population in 2009"
|
||||
|
||||
self.MEDIAN_INCOME_FIELD = "PBG049001"
|
||||
self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
|
||||
self.MEDIAN_INCOME_FIELD_NAME = (
|
||||
"MEDIAN HOUSEHOLD INCOME IN 2009 (DOLLARS)"
|
||||
self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)"
|
||||
self.AREA_MEDIAN_INCOME_FIELD_NAME = (
|
||||
"Median household income as a percent of "
|
||||
"territory median income in 2009"
|
||||
)
|
||||
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income"
|
||||
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
|
||||
"PBG077001"
|
||||
|
@ -48,7 +57,39 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
)
|
||||
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL"
|
||||
"Percentage households below 200% of federal poverty line in 2009"
|
||||
)
|
||||
|
||||
# We will combine three fields to get households < 100% FPL.
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = (
|
||||
"PBG083002" # Total!!Under .50
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = (
|
||||
"PBG083003" # Total!!.50 to .74
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = (
|
||||
"PBG083004" # Total!!.75 to .99
|
||||
)
|
||||
|
||||
# Same fields, for Virgin Islands.
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = (
|
||||
"PBG077002" # Total!!Under .50
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = (
|
||||
"PBG077003" # Total!!.50 to .74
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = (
|
||||
"PBG077004" # Total!!.75 to .99
|
||||
)
|
||||
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
||||
)
|
||||
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"Percentage households below 100% of federal poverty line in 2009"
|
||||
)
|
||||
|
||||
# High School Education Fields
|
||||
|
@ -70,9 +111,37 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||
)
|
||||
|
||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
||||
"PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
|
||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009"
|
||||
|
||||
# Employment fields
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = (
|
||||
"PBG038003" # Total!!Male!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = (
|
||||
"PBG038007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = (
|
||||
"PBG038010" # Total!!Female!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = (
|
||||
"PBG038014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
|
||||
# Same fields, Virgin Islands.
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = (
|
||||
"PBG036003" # Total!!Male!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = (
|
||||
"PBG036007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = (
|
||||
"PBG036010" # Total!!Female!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = (
|
||||
"PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
|
||||
self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"
|
||||
|
||||
var_list = [
|
||||
self.MEDIAN_INCOME_FIELD,
|
||||
|
@ -81,6 +150,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
self.TOTAL_POPULATION_FIELD,
|
||||
self.MALE_HIGH_SCHOOL_ED_FIELD,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||
self.TOTAL_POP_FIELD,
|
||||
]
|
||||
var_list = ",".join(var_list)
|
||||
|
||||
|
@ -91,6 +168,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
self.TOTAL_POPULATION_VI_FIELD,
|
||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
|
||||
self.TOTAL_POP_VI_FIELD,
|
||||
]
|
||||
var_list_vi = ",".join(var_list_vi)
|
||||
|
||||
|
@ -107,6 +192,20 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||
}
|
||||
|
||||
# To do: Ask Census Slack Group about whether you need to hardcode the county fips
|
||||
|
@ -117,24 +216,30 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
"fips": "60",
|
||||
"county_fips": ["010", "020", "030", "040", "050"],
|
||||
"var_list": var_list,
|
||||
# Note: we hardcode the median income for each territory in this dict,
|
||||
# because that data is hard to programmatically access.
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 23892,
|
||||
},
|
||||
{
|
||||
"state_abbreviation": "gu",
|
||||
"fips": "66",
|
||||
"county_fips": ["010"],
|
||||
"var_list": var_list,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 48274,
|
||||
},
|
||||
{
|
||||
"state_abbreviation": "mp",
|
||||
"fips": "69",
|
||||
"county_fips": ["085", "100", "110", "120"],
|
||||
"var_list": var_list,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 19958,
|
||||
},
|
||||
{
|
||||
"state_abbreviation": "vi",
|
||||
"fips": "78",
|
||||
"county_fips": ["010", "020", "030"],
|
||||
"var_list": var_list_vi,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 37254,
|
||||
},
|
||||
]
|
||||
|
||||
|
@ -198,6 +303,11 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
# Combine the dfs after renaming
|
||||
self.df_all = pd.concat([self.df, self.df_vi])
|
||||
|
||||
# Rename total population:
|
||||
self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[
|
||||
self.TOTAL_POP_FIELD
|
||||
]
|
||||
|
||||
# Percentage of households below 200% which is
|
||||
# [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
|
||||
self.df_all[
|
||||
|
@ -211,6 +321,25 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||
]
|
||||
|
||||
# Percentage of households below 100% FPL
|
||||
# which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, ` Total!!.75 to .99`,
|
||||
# and then dividing by PBG083001 (total)
|
||||
self.df_all[
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME
|
||||
] = (
|
||||
self.df_all[
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE
|
||||
]
|
||||
+ self.df_all[
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO
|
||||
]
|
||||
+ self.df_all[
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE
|
||||
]
|
||||
) / self.df_all[
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||
]
|
||||
|
||||
# Percentage High School Achievement is
|
||||
# Percentage = (Male + Female) / (Total)
|
||||
self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
|
||||
|
@ -218,6 +347,28 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
+ self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
||||
) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]
|
||||
|
||||
# Calculate the unemployment rate (unemployed civilians / people in the labor force).
|
||||
self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = (
|
||||
self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD]
|
||||
+ self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD]
|
||||
) / (
|
||||
self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD]
|
||||
+ self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD]
|
||||
)
|
||||
|
||||
# Calculate area median income
|
||||
median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES)
|
||||
median_income_df = median_income_df[
|
||||
["fips", self.TERRITORY_MEDIAN_INCOME_FIELD]
|
||||
]
|
||||
self.df_all = self.df_all.merge(
|
||||
right=median_income_df, left_on="state", right_on="fips", how="left"
|
||||
)
|
||||
self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = (
|
||||
self.df_all[self.MEDIAN_INCOME_FIELD_NAME]
|
||||
/ self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD]
|
||||
)
|
||||
|
||||
# Creating Geo ID (Census Tract) Field Name
|
||||
self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
|
||||
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
||||
|
@ -238,9 +389,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
|
||||
columns_to_include = [
|
||||
self.GEOID_TRACT_FIELD_NAME,
|
||||
self.TOTAL_POP_FIELD_NAME,
|
||||
self.MEDIAN_INCOME_FIELD_NAME,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD,
|
||||
self.AREA_MEDIAN_INCOME_FIELD_NAME,
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.UNEMPLOYMENT_FIELD_NAME,
|
||||
]
|
||||
|
||||
self.df_all[columns_to_include].to_csv(
|
||||
|
|
|
@ -12,15 +12,15 @@ class DOEEnergyBurden(ExtractTransformLoad):
|
|||
def __init__(self):
|
||||
self.DOE_FILE_URL = (
|
||||
settings.AWS_JUSTICE40_DATASOURCES_URL
|
||||
+ "/DOE_LEAD_with_EJSCREEN.csv.zip"
|
||||
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
|
||||
)
|
||||
|
||||
self.OUTPUT_PATH: Path = (
|
||||
self.DATA_PATH / "dataset" / "doe_energy_burden"
|
||||
)
|
||||
|
||||
self.TRACT_INPUT_COLUMN_NAME = "GEOID"
|
||||
self.ENERGY_BURDEN_FIELD_NAME = "Energy burden"
|
||||
self.TRACT_INPUT_COLUMN_NAME = "FIP"
|
||||
self.ENERGY_BURDEN_FIELD_NAME = "BURDEN"
|
||||
|
||||
# Constants for output
|
||||
self.COLUMNS_TO_KEEP = [
|
||||
|
@ -61,11 +61,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
|
|||
}
|
||||
)
|
||||
|
||||
# Convert energy burden to a fraction, since we represent all other percentages as fractions.
|
||||
output_df[self.ENERGY_BURDEN_FIELD_NAME] = (
|
||||
output_df[self.ENERGY_BURDEN_FIELD_NAME] / 100
|
||||
)
|
||||
|
||||
# Left-pad the tracts with 0s
|
||||
expected_length_of_census_tract_field = 11
|
||||
output_df[self.GEOID_TRACT_FIELD_NAME] = (
|
||||
|
|
|
@ -14,6 +14,27 @@ class EJSCREENETL(ExtractTransformLoad):
|
|||
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
|
||||
self.df: pd.DataFrame
|
||||
|
||||
self.COLUMNS_TO_KEEP = [
|
||||
self.GEOID_TRACT_FIELD_NAME,
|
||||
field_names.TOTAL_POP_FIELD,
|
||||
# pylint: disable=duplicate-code
|
||||
field_names.AIR_TOXICS_CANCER_RISK_FIELD,
|
||||
field_names.RESPITORY_HAZARD_FIELD,
|
||||
field_names.DIESEL_FIELD,
|
||||
field_names.PM25_FIELD,
|
||||
field_names.OZONE_FIELD,
|
||||
field_names.TRAFFIC_FIELD,
|
||||
field_names.RMP_FIELD,
|
||||
field_names.TSDF_FIELD,
|
||||
field_names.NPL_FIELD,
|
||||
field_names.WASTEWATER_FIELD,
|
||||
field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
|
||||
field_names.POVERTY_FIELD,
|
||||
field_names.OVER_64_FIELD,
|
||||
field_names.UNDER_5_FIELD,
|
||||
field_names.LEAD_PAINT_FIELD,
|
||||
]
|
||||
|
||||
def extract(self) -> None:
|
||||
logger.info("Downloading EJScreen Data")
|
||||
super().extract(
|
||||
|
@ -51,7 +72,6 @@ class EJSCREENETL(ExtractTransformLoad):
|
|||
"PWDIS": field_names.WASTEWATER_FIELD,
|
||||
"LINGISOPCT": field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
|
||||
"LOWINCPCT": field_names.POVERTY_FIELD,
|
||||
"LESSHSPCT": field_names.HIGH_SCHOOL_ED_FIELD,
|
||||
"OVER64PCT": field_names.OVER_64_FIELD,
|
||||
"UNDER5PCT": field_names.UNDER_5_FIELD,
|
||||
"PRE1960PCT": field_names.LEAD_PAINT_FIELD,
|
||||
|
@ -63,4 +83,6 @@ class EJSCREENETL(ExtractTransformLoad):
|
|||
logger.info("Saving EJScreen CSV")
|
||||
# write nationwide csv
|
||||
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
|
||||
self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)
|
||||
self.df[self.COLUMNS_TO_KEEP].to_csv(
|
||||
self.CSV_PATH / "usa.csv", index=False
|
||||
)
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ACS_YEAR = 2019\n",
|
||||
"ACS_YEAR = 2010\n",
|
||||
"\n",
|
||||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||||
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
|
||||
|
@ -45,11 +45,13 @@
|
|||
"source": [
|
||||
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
|
||||
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
|
||||
"censusdata.printtable(\n",
|
||||
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
|
||||
")\n",
|
||||
"# censusdata.printtable(\n",
|
||||
"# censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
|
||||
"# )\n",
|
||||
"\n",
|
||||
"# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
|
||||
"censusdata.search(\n",
|
||||
" src=\"acs5\", year=ACS_YEAR, field=\"label\", criterion=\"employment status\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51412a14",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -49,7 +48,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e3234c61",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -81,7 +79,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b1b5ccf",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -108,7 +105,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1b1083e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -142,7 +138,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fec0ed63",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -165,7 +160,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d9968187",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
|
@ -192,7 +186,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a7cfeb3c",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
|
@ -222,7 +215,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "df458f08",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -255,7 +247,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a6c85d87",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
|
@ -282,7 +273,7 @@
|
|||
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
||||
"\n",
|
||||
"if len(merged_df) > ExtractTransformLoad.EXPECTED_MAX_CENSUS_TRACTS:\n",
|
||||
" raise ValueError(\"Too many rows in the join.\")\n",
|
||||
" raise ValueError(f\"Too many rows in the join: {len(merged_df)}.\")\n",
|
||||
"\n",
|
||||
"merged_df.head()"
|
||||
]
|
||||
|
@ -290,7 +281,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "274f6bc6",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -393,17 +383,17 @@
|
|||
"ejscreen_areas_of_concern_census_block_group_indices = [\n",
|
||||
" Index(\n",
|
||||
" method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
|
||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
|
||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD,\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
" Index(\n",
|
||||
" method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
|
||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
|
||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD,\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
" Index(\n",
|
||||
" method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
|
||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
|
||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD,\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
|
@ -439,7 +429,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bfae9cf5",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -457,7 +446,8 @@
|
|||
"\n",
|
||||
" # Calculate the population included as priority communities per tract. Will either be 0 or the population.\n",
|
||||
" df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
|
||||
" df[priority_communities_field] * df[field_names.TOTAL_POP_FIELD]\n",
|
||||
" df[priority_communities_field]\n",
|
||||
" * df[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def calculate_state_comparison(\n",
|
||||
|
@ -496,7 +486,9 @@
|
|||
" summary_dict[\"Geography name\"] = division_id\n",
|
||||
"\n",
|
||||
" total_tracts_in_geography = len(frame)\n",
|
||||
" total_population_in_geography = frame[field_names.TOTAL_POP_FIELD].sum()\n",
|
||||
" total_population_in_geography = frame[\n",
|
||||
" field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010\n",
|
||||
" ].sum()\n",
|
||||
"\n",
|
||||
" if geography_field == field_names.URBAN_HEURISTIC_FIELD:\n",
|
||||
" urban_flag = frame[field_names.URBAN_HEURISTIC_FIELD].unique()[0]\n",
|
||||
|
@ -719,7 +711,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c4d0e783",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -825,7 +816,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8790cd64",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -1024,7 +1014,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eeb9699d",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -1201,7 +1190,6 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "983abcea",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
|
@ -57,13 +57,13 @@ AMI_FIELD = "Area Median Income (State or metropolitan)"
|
|||
|
||||
# Climate
|
||||
FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
|
||||
EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
|
||||
EXPECTED_BUILDING_LOSS_RATE_FIELD = (
|
||||
"Expected building loss rate (Natural Hazards Risk Index)"
|
||||
)
|
||||
EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
|
||||
EXPECTED_AGRICULTURE_LOSS_RATE_FIELD = (
|
||||
"Expected agricultural loss rate (Natural Hazards Risk Index)"
|
||||
)
|
||||
EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
|
||||
EXPECTED_POPULATION_LOSS_RATE_FIELD = (
|
||||
"Expected population loss rate (Natural Hazards Risk Index)"
|
||||
)
|
||||
|
||||
|
@ -117,6 +117,34 @@ AGGREGATION_POPULATION_FIELD = "Population Characteristics"
|
|||
UNDER_5_FIELD = "Individuals under 5 years old"
|
||||
OVER_64_FIELD = "Individuals over 64 years old"
|
||||
|
||||
# Fields from 2010 decennial census (generally only loaded for the territories)
|
||||
CENSUS_DECENNIAL_MEDIAN_INCOME_2009 = "Median household income in 2009 ($)"
|
||||
CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = (
|
||||
"Median household income as a percent of territory median income in 2009"
|
||||
)
|
||||
CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
|
||||
"Percentage households below 100% of federal poverty line in 2009"
|
||||
)
|
||||
CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009"
|
||||
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009"
|
||||
CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"
|
||||
|
||||
# Fields from 2010 ACS (loaded for comparison with the territories)
|
||||
CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010"
|
||||
CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
|
||||
"Percent of individuals < 100% Federal Poverty Line in 2010"
|
||||
)
|
||||
|
||||
# Combined fields that merge island areas and states data
|
||||
COMBINED_CENSUS_TOTAL_POPULATION_2010 = (
|
||||
"Total population in 2009 (island areas) and 2019 (states and PR)"
|
||||
)
|
||||
COMBINED_UNEMPLOYMENT_2010 = "Unemployed civilians (percent) in 2009 (island areas) and 2010 (states and PR)"
|
||||
COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
|
||||
"Percentage households below 100% of federal poverty line in 2009 (island areas) "
|
||||
"and 2010 (states and PR)"
|
||||
)
|
||||
|
||||
# Urban Rural Map
|
||||
URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
|
||||
|
||||
|
@ -124,39 +152,39 @@ URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
|
|||
MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
|
||||
|
||||
# EJSCREEN Areas of Concern
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, National, 70th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, National, 75th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, National, 80th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, National, 85th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, National, 90th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, National, 95th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, State, 70th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, State, 75th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, State, 80th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, State, 85th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, State, 90th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||
"EJSCREEN Areas of Concern, State, 95th percentile (communities)"
|
||||
)
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.score.score import Score
|
||||
|
@ -12,8 +13,86 @@ class ScoreL(Score):
|
|||
self.LOW_INCOME_THRESHOLD: float = 0.65
|
||||
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
||||
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
||||
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
|
||||
|
||||
super().__init__(df)
|
||||
|
||||
def _combine_island_areas_with_states_and_set_thresholds(
    self,
    df: pd.DataFrame,
    column_from_island_areas: str,
    column_from_decennial_census: str,
    combined_column_name: str,
    threshold_cutoff_for_island_areas: float,
) -> (pd.DataFrame, str):
    """Flag island-area tracts that exceed a cutoff computed on combined data.

    The work happens in three steps:

    1. Merge the two source columns into `combined_column_name` by taking
       the row-wise mean while ignoring nulls.
    2. Find the raw value sitting at the `threshold_cutoff_for_island_areas`
       quantile of that combined column (nulls ignored).
    3. Add a boolean column that is true where the island-areas column is
       greater than or equal to that raw value. Rows with no island-areas
       value compare as false, so only island-area tracts can be flagged.

    Args:
        df: Frame to work on. It is modified in place (two columns are
            added) and also returned.
        column_from_island_areas: Name of the column holding the island
            areas' values.
        column_from_decennial_census: Name of the column holding the values
            for the remaining tracts. NOTE(review): despite the name, the
            callers visible in this file pass the states' 2010 ACS column
            here -- confirm before relying on the parameter name.
        combined_column_name: Name of the merged column to create.
        threshold_cutoff_for_island_areas: Quantile in [0, 1] used as the
            cutoff (e.g. 0.90 for the 90th percentile).

    Returns:
        A tuple of the frame and the name of the new boolean column.
    """
    # A tract is expected to have a value in only one of the two source
    # columns. Taking the null-skipping mean still yields a sensible number
    # if a tract unexpectedly has a value in both.
    source_columns = [column_from_island_areas, column_from_decennial_census]
    df[combined_column_name] = df[source_columns].mean(axis=1, skipna=True)

    missing_count = df[combined_column_name].isnull().sum()
    logger.info(
        f"Combined field `{combined_column_name}` has "
        f"{missing_count} "
        f"({missing_count * 100 / len(df):.2f}%) "
        f"missing values for census tracts. "
    )

    # Raw value at the requested quantile of the combined data.
    cutoff_value = np.nanquantile(
        a=df[combined_column_name], q=threshold_cutoff_for_island_areas
    )
    logger.info(
        f"For combined field `{combined_column_name}`, "
        f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
        f"raw value of {cutoff_value:.3f}."
    )

    # The flag column is named after the island-areas column and the cutoff.
    threshold_column_name = (
        f"{column_from_island_areas} exceeds "
        f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
    )
    df[threshold_column_name] = df[column_from_island_areas] >= cutoff_value

    # Report the share of flagged tracts among tracts that have island data.
    flagged_count = df[threshold_column_name].sum()
    tracts_with_island_data = df[column_from_island_areas].notnull().sum()
    percent_of_tracts_highlighted = (
        100 * flagged_count / tracts_with_island_data
    )
    logger.info(
        f"For `{threshold_column_name}`, "
        f"{flagged_count} ("
        f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data "
        f"in the column) have a value of TRUE."
    )

    return df, threshold_column_name
|
||||
|
||||
def add_columns(self) -> pd.DataFrame:
|
||||
logger.info("Adding Score L")
|
||||
|
||||
|
@ -67,21 +146,21 @@ class ScoreL(Score):
|
|||
climate_criteria = (
|
||||
(
|
||||
self.df[
|
||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME
|
||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
| (
|
||||
self.df[
|
||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
|
||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
| (
|
||||
self.df[
|
||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME
|
||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
|
@ -204,14 +283,24 @@ class ScoreL(Score):
|
|||
# poverty level. Source: Census's American Community Survey]
|
||||
|
||||
pollution_criteria = (
|
||||
self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
) | (
|
||||
self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
) | (
|
||||
self.df[field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
(
|
||||
self.df[
|
||||
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
| (
|
||||
self.df[
|
||||
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
| (
|
||||
self.df[
|
||||
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
)
|
||||
|
||||
return pollution_criteria & (
|
||||
|
@ -306,7 +395,7 @@ class ScoreL(Score):
|
|||
# AND
|
||||
# Where the high school degree achievement rate for adults 25 years and older is 90% or lower
|
||||
# (necessary to screen out university block groups)
|
||||
workforce_criteria = (
|
||||
workforce_criteria_for_states = (
|
||||
(
|
||||
self.df[
|
||||
field_names.UNEMPLOYMENT_FIELD
|
||||
|
@ -338,6 +427,76 @@ class ScoreL(Score):
|
|||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
)
|
||||
workforce_combined_criteria_for_states = (
|
||||
self.df[field_names.HIGH_SCHOOL_ED_FIELD]
|
||||
>= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
|
||||
) & workforce_criteria_for_states
|
||||
|
||||
# Now, calculate workforce criteria for island territories.
|
||||
|
||||
# For a couple of values, create a combined field and criteria field.
|
||||
# First, combine unemployment.
|
||||
(
|
||||
self.df,
|
||||
unemployment_island_areas_criteria_field_name,
|
||||
) = self._combine_island_areas_with_states_and_set_thresholds(
|
||||
df=self.df,
|
||||
column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
|
||||
column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
|
||||
combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
|
||||
threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||
)
|
||||
|
||||
# Next, combine poverty.
|
||||
(
|
||||
self.df,
|
||||
poverty_island_areas_criteria_field_name,
|
||||
) = self._combine_island_areas_with_states_and_set_thresholds(
|
||||
df=self.df,
|
||||
column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
|
||||
column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||
combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||
threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||
)
|
||||
|
||||
workforce_combined_criteria_for_island_areas = (
|
||||
self.df[unemployment_island_areas_criteria_field_name]
|
||||
| self.df[poverty_island_areas_criteria_field_name]
|
||||
# Also check whether area median income is 10th percentile or lower
|
||||
# within the islands.
|
||||
| (
|
||||
self.df[
|
||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
# Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it,
|
||||
# and then look for median income lower than that (not greater than).
|
||||
< 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||
)
|
||||
) & (
|
||||
self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
|
||||
> self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
|
||||
)
|
||||
|
||||
percent_of_island_tracts_highlighted = (
|
||||
100
|
||||
* workforce_combined_criteria_for_island_areas.sum()
|
||||
# Choosing a random column from island areas to calculate the denominator.
|
||||
/ self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
|
||||
.notnull()
|
||||
.sum()
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"For workforce criteria in island areas, "
|
||||
f"{workforce_combined_criteria_for_island_areas.sum()} ("
|
||||
f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
|
||||
f"in the column) have a value of TRUE."
|
||||
)
|
||||
|
||||
# A tract is included if it meets either the states tract criteria or the
|
||||
# island areas tract criteria.
|
||||
return (
|
||||
self.df[field_names.HIGH_SCHOOL_ED_FIELD] >= 0.10
|
||||
) & workforce_criteria
|
||||
workforce_combined_criteria_for_states
|
||||
| workforce_combined_criteria_for_island_areas
|
||||
)
|
||||
|
|
|
@ -67,6 +67,9 @@ disable = [
|
|||
"C0115", # Disables missing class docstring
|
||||
"R0915", # Disables too many statements (score generation transform)
|
||||
"W0231", # Disables super init not called
|
||||
"R0801", # Disables duplicate code. There are a couple places we have similar code and
|
||||
# unfortunately you can't disable this rule for individual lines or files, it's a
|
||||
# known bug. https://github.com/PyCQA/pylint/issues/214#
|
||||
]
|
||||
|
||||
[tool.pylint.FORMAT]
|
||||
|
|
Loading…
Add table
Reference in a new issue