mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 18:14:19 -08:00
Merge branch 'usds:main' into main
This commit is contained in:
commit
12456c8dc5
16 changed files with 885 additions and 161 deletions
|
@ -32,10 +32,15 @@ class ExtractTransformLoad:
|
||||||
FILES_PATH: Path = settings.APP_ROOT / "files"
|
FILES_PATH: Path = settings.APP_ROOT / "files"
|
||||||
GEOID_FIELD_NAME: str = "GEOID10"
|
GEOID_FIELD_NAME: str = "GEOID10"
|
||||||
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
|
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
|
||||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
|
|
||||||
|
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
|
||||||
|
# be from CBGs at different time periods.
|
||||||
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
|
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
|
||||||
# TODO: investigate. Census says there are only 73,057 tracts in the US. This might be from tracts at different time periods.
|
|
||||||
EXPECTED_MAX_CENSUS_TRACTS: int = 74027
|
# TODO: investigate. Census says there are only 74,134 tracts in the US,
|
||||||
|
# Puerto Rico, and island areas. This might be from tracts at different time
|
||||||
|
# periods. https://github.com/usds/justice40-tool/issues/964
|
||||||
|
EXPECTED_MAX_CENSUS_TRACTS: int = 74160
|
||||||
|
|
||||||
def __init__(self, config_path: Path) -> None:
|
def __init__(self, config_path: Path) -> None:
|
||||||
"""Inits the class with instance specific variables"""
|
"""Inits the class with instance specific variables"""
|
||||||
|
|
|
@ -4,6 +4,11 @@ DATASET_LIST = [
|
||||||
"module_dir": "census_acs",
|
"module_dir": "census_acs",
|
||||||
"class_name": "CensusACSETL",
|
"class_name": "CensusACSETL",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "census_acs_2010",
|
||||||
|
"module_dir": "census_acs_2010",
|
||||||
|
"class_name": "CensusACS2010ETL",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "ejscreen",
|
"name": "ejscreen",
|
||||||
"module_dir": "ejscreen",
|
"module_dir": "ejscreen",
|
||||||
|
@ -14,16 +19,6 @@ DATASET_LIST = [
|
||||||
"module_dir": "hud_housing",
|
"module_dir": "hud_housing",
|
||||||
"class_name": "HudHousingETL",
|
"class_name": "HudHousingETL",
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "calenviroscreen",
|
|
||||||
"module_dir": "calenviroscreen",
|
|
||||||
"class_name": "CalEnviroScreenETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "hud_recap",
|
|
||||||
"module_dir": "hud_recap",
|
|
||||||
"class_name": "HudRecapETL",
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "cdc_places",
|
"name": "cdc_places",
|
||||||
"module_dir": "cdc_places",
|
"module_dir": "cdc_places",
|
||||||
|
@ -74,6 +69,16 @@ DATASET_LIST = [
|
||||||
"module_dir": "housing_and_transportation",
|
"module_dir": "housing_and_transportation",
|
||||||
"class_name": "HousingTransportationETL",
|
"class_name": "HousingTransportationETL",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "calenviroscreen",
|
||||||
|
"module_dir": "calenviroscreen",
|
||||||
|
"class_name": "CalEnviroScreenETL",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "hud_recap",
|
||||||
|
"module_dir": "hud_recap",
|
||||||
|
"class_name": "HudRecapETL",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "tree_equity_score",
|
"name": "tree_equity_score",
|
||||||
"module_dir": "tree_equity_score",
|
"module_dir": "tree_equity_score",
|
||||||
|
|
|
@ -27,6 +27,8 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
self.national_risk_index_df: pd.DataFrame
|
self.national_risk_index_df: pd.DataFrame
|
||||||
self.geocorr_urban_rural_df: pd.DataFrame
|
self.geocorr_urban_rural_df: pd.DataFrame
|
||||||
self.persistent_poverty_df: pd.DataFrame
|
self.persistent_poverty_df: pd.DataFrame
|
||||||
|
self.census_decennial_df: pd.DataFrame
|
||||||
|
self.census_2010_df: pd.DataFrame
|
||||||
|
|
||||||
def extract(self) -> None:
|
def extract(self) -> None:
|
||||||
logger.info("Loading data sets from disk.")
|
logger.info("Loading data sets from disk.")
|
||||||
|
@ -137,6 +139,29 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
low_memory=False,
|
low_memory=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Load decennial census data
|
||||||
|
census_decennial_csv = (
|
||||||
|
constants.DATA_PATH
|
||||||
|
/ "dataset"
|
||||||
|
/ "census_decennial_2010"
|
||||||
|
/ "usa.csv"
|
||||||
|
)
|
||||||
|
self.census_decennial_df = pd.read_csv(
|
||||||
|
census_decennial_csv,
|
||||||
|
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||||
|
low_memory=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Load 2010 ACS data from states
|
||||||
|
census_2010_csv = (
|
||||||
|
constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv"
|
||||||
|
)
|
||||||
|
self.census_2010_df = pd.read_csv(
|
||||||
|
census_2010_csv,
|
||||||
|
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||||
|
low_memory=False,
|
||||||
|
)
|
||||||
|
|
||||||
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
|
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
|
||||||
logger.info("Joining Census Tract dataframes")
|
logger.info("Joining Census Tract dataframes")
|
||||||
|
|
||||||
|
@ -228,6 +253,8 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
self.persistent_poverty_df,
|
self.persistent_poverty_df,
|
||||||
self.national_risk_index_df,
|
self.national_risk_index_df,
|
||||||
self.census_acs_median_incomes_df,
|
self.census_acs_median_incomes_df,
|
||||||
|
self.census_decennial_df,
|
||||||
|
self.census_2010_df,
|
||||||
]
|
]
|
||||||
|
|
||||||
# Sanity check each data frame before merging.
|
# Sanity check each data frame before merging.
|
||||||
|
@ -296,9 +323,16 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
field_names.HIGH_SCHOOL_ED_FIELD,
|
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||||
field_names.UNEMPLOYMENT_FIELD,
|
field_names.UNEMPLOYMENT_FIELD,
|
||||||
field_names.MEDIAN_HOUSE_VALUE_FIELD,
|
field_names.MEDIAN_HOUSE_VALUE_FIELD,
|
||||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
|
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
|
||||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
|
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
|
||||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
|
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
|
||||||
|
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
|
||||||
|
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
|
||||||
|
field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
|
||||||
|
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||||
|
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
|
||||||
|
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
|
||||||
]
|
]
|
||||||
|
|
||||||
non_numeric_columns = [
|
non_numeric_columns = [
|
||||||
|
@ -315,9 +349,9 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
# Convert all columns to numeric and do math
|
# Convert all columns to numeric and do math
|
||||||
for col in numeric_columns:
|
for col in numeric_columns:
|
||||||
# Calculate percentiles
|
# Calculate percentiles
|
||||||
df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[col].rank(
|
df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
|
||||||
pct=True
|
col
|
||||||
)
|
].rank(pct=True)
|
||||||
|
|
||||||
# Min-max normalization:
|
# Min-max normalization:
|
||||||
# (
|
# (
|
||||||
|
@ -341,6 +375,20 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
df_copy[col] - min_value
|
df_copy[col] - min_value
|
||||||
) / (max_value - min_value)
|
) / (max_value - min_value)
|
||||||
|
|
||||||
|
# Special logic: create a combined population field.
|
||||||
|
# We sometimes run analytics on "population", and this makes a single field
|
||||||
|
# that is either the island area's population in 2009 or the state's
|
||||||
|
# population in 2019.
|
||||||
|
# There should only be one entry in either 2009 or 2019, not one in both.
|
||||||
|
# But just to be safe, we take the mean and ignore null values so if there
|
||||||
|
# *were* entries in both fields, this result would make sense.
|
||||||
|
df_copy[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010] = df_copy[
|
||||||
|
[
|
||||||
|
field_names.TOTAL_POP_FIELD,
|
||||||
|
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
|
||||||
|
]
|
||||||
|
].mean(axis=1, skipna=True)
|
||||||
|
|
||||||
return df_copy
|
return df_copy
|
||||||
|
|
||||||
def transform(self) -> None:
|
def transform(self) -> None:
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import censusdata
|
|
||||||
|
|
||||||
from data_pipeline.etl.base import ExtractTransformLoad
|
from data_pipeline.etl.base import ExtractTransformLoad
|
||||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
|
||||||
from data_pipeline.utils import get_module_logger
|
from data_pipeline.utils import get_module_logger
|
||||||
|
|
||||||
logger = get_module_logger(__name__)
|
logger = get_module_logger(__name__)
|
||||||
|
@ -14,7 +13,15 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
self.OUTPUT_PATH = (
|
self.OUTPUT_PATH = (
|
||||||
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
|
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
|
||||||
|
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
|
||||||
|
self.EMPLOYMENT_FIELDS = [
|
||||||
|
self.TOTAL_UNEMPLOYED_FIELD,
|
||||||
|
self.TOTAL_IN_LABOR_FORCE,
|
||||||
|
]
|
||||||
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
|
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
|
||||||
|
|
||||||
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
|
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
|
||||||
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
|
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
|
||||||
"Linguistic isolation (total)"
|
"Linguistic isolation (total)"
|
||||||
|
@ -55,59 +62,89 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
"Median value ($) of owner-occupied housing units"
|
"Median value ($) of owner-occupied housing units"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Educational attainment figures
|
||||||
|
self.EDUCATION_POPULATION_OVER_25 = "B15003_001E" # Estimate!!Total
|
||||||
|
self.EDUCATION_NO_SCHOOLING = (
|
||||||
|
"B15003_002E" # Estimate!!Total!!No schooling completed
|
||||||
|
)
|
||||||
|
self.EDUCATION_NURSERY = (
|
||||||
|
"B15003_003E" # Estimate!!Total!!Nursery school
|
||||||
|
)
|
||||||
|
self.EDUCATION_KINDERGARTEN = (
|
||||||
|
"B15003_004E" # Estimate!!Total!!Kindergarten
|
||||||
|
)
|
||||||
|
self.EDUCATION_FIRST = "B15003_005E" # Estimate!!Total!!1st grade
|
||||||
|
self.EDUCATION_SECOND = "B15003_006E" # Estimate!!Total!!2nd grade
|
||||||
|
self.EDUCATION_THIRD = "B15003_007E" # Estimate!!Total!!3rd grade
|
||||||
|
self.EDUCATION_FOURTH = "B15003_008E" # Estimate!!Total!!4th grade
|
||||||
|
self.EDUCATION_FIFTH = "B15003_009E" # Estimate!!Total!!5th grade
|
||||||
|
self.EDUCATION_SIXTH = "B15003_010E" # Estimate!!Total!!6th grade
|
||||||
|
self.EDUCATION_SEVENTH = "B15003_011E" # Estimate!!Total!!7th grade
|
||||||
|
self.EDUCATION_EIGHTH = "B15003_012E" # Estimate!!Total!!8th grade
|
||||||
|
self.EDUCATION_NINTH = "B15003_013E" # Estimate!!Total!!9th grade
|
||||||
|
self.EDUCATION_TENTH = "B15003_014E" # Estimate!!Total!!10th grade
|
||||||
|
self.EDUCATION_ELEVENTH = "B15003_015E" # Estimate!!Total!!11th grade
|
||||||
|
self.EDUCATION_TWELFTH_NO_DIPLOMA = (
|
||||||
|
"B15003_016E" # Estimate!!Total!!12th grade, no diploma
|
||||||
|
)
|
||||||
|
|
||||||
|
self.EDUCATIONAL_FIELDS = [
|
||||||
|
self.EDUCATION_POPULATION_OVER_25,
|
||||||
|
self.EDUCATION_NO_SCHOOLING,
|
||||||
|
self.EDUCATION_NURSERY,
|
||||||
|
self.EDUCATION_KINDERGARTEN,
|
||||||
|
self.EDUCATION_FIRST,
|
||||||
|
self.EDUCATION_SECOND,
|
||||||
|
self.EDUCATION_THIRD,
|
||||||
|
self.EDUCATION_FOURTH,
|
||||||
|
self.EDUCATION_FIFTH,
|
||||||
|
self.EDUCATION_SIXTH,
|
||||||
|
self.EDUCATION_SEVENTH,
|
||||||
|
self.EDUCATION_EIGHTH,
|
||||||
|
self.EDUCATION_NINTH,
|
||||||
|
self.EDUCATION_TENTH,
|
||||||
|
self.EDUCATION_ELEVENTH,
|
||||||
|
self.EDUCATION_TWELFTH_NO_DIPLOMA,
|
||||||
|
]
|
||||||
|
|
||||||
|
self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD = (
|
||||||
|
"Individuals age 25 or over with less than high school degree"
|
||||||
|
)
|
||||||
|
self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree"
|
||||||
|
|
||||||
self.STATE_GEOID_FIELD_NAME = "GEOID2"
|
self.STATE_GEOID_FIELD_NAME = "GEOID2"
|
||||||
|
|
||||||
self.df: pd.DataFrame
|
self.df: pd.DataFrame
|
||||||
|
|
||||||
def _fips_from_censusdata_censusgeo(
|
|
||||||
self, censusgeo: censusdata.censusgeo
|
|
||||||
) -> str:
|
|
||||||
"""Create a FIPS code from the proprietary censusgeo index."""
|
|
||||||
fips = "".join([value for (key, value) in censusgeo.params()])
|
|
||||||
return fips
|
|
||||||
|
|
||||||
def extract(self) -> None:
|
def extract(self) -> None:
|
||||||
dfs = []
|
# Define the variables to retrieve
|
||||||
for fips in get_state_fips_codes(self.DATA_PATH):
|
variables = (
|
||||||
logger.info(
|
[
|
||||||
f"Downloading data for state/territory with FIPS code {fips}"
|
# Income field
|
||||||
)
|
self.MEDIAN_INCOME_FIELD,
|
||||||
|
# House value
|
||||||
|
self.MEDIAN_HOUSE_VALUE_FIELD,
|
||||||
|
]
|
||||||
|
+ self.EMPLOYMENT_FIELDS
|
||||||
|
+ self.LINGUISTIC_ISOLATION_FIELDS
|
||||||
|
+ self.POVERTY_FIELDS
|
||||||
|
+ self.EDUCATIONAL_FIELDS
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
self.df = retrieve_census_acs_data(
|
||||||
response = censusdata.download(
|
acs_year=self.ACS_YEAR,
|
||||||
src="acs5",
|
variables=variables,
|
||||||
year=self.ACS_YEAR,
|
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
|
||||||
geo=censusdata.censusgeo(
|
data_path_for_fips_codes=self.DATA_PATH,
|
||||||
[("state", fips), ("county", "*"), ("tract", "*")]
|
|
||||||
),
|
|
||||||
var=[
|
|
||||||
# Employment fields
|
|
||||||
"B23025_005E",
|
|
||||||
"B23025_003E",
|
|
||||||
# Income field
|
|
||||||
self.MEDIAN_INCOME_FIELD,
|
|
||||||
# House value
|
|
||||||
self.MEDIAN_HOUSE_VALUE_FIELD,
|
|
||||||
]
|
|
||||||
+ self.LINGUISTIC_ISOLATION_FIELDS
|
|
||||||
+ self.POVERTY_FIELDS,
|
|
||||||
)
|
|
||||||
dfs.append(response)
|
|
||||||
except ValueError:
|
|
||||||
logger.error(
|
|
||||||
f"Could not download data for state/territory with FIPS code {fips}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.df = pd.concat(dfs)
|
|
||||||
|
|
||||||
self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply(
|
|
||||||
func=self._fips_from_censusdata_censusgeo
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def transform(self) -> None:
|
def transform(self) -> None:
|
||||||
logger.info("Starting Census ACS Transform")
|
logger.info("Starting Census ACS Transform")
|
||||||
|
|
||||||
|
df = self.df
|
||||||
|
|
||||||
# Rename two fields.
|
# Rename two fields.
|
||||||
self.df = self.df.rename(
|
df = df.rename(
|
||||||
columns={
|
columns={
|
||||||
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
||||||
|
@ -119,19 +156,17 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
self.MEDIAN_INCOME_FIELD_NAME,
|
self.MEDIAN_INCOME_FIELD_NAME,
|
||||||
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
]:
|
]:
|
||||||
missing_value_count = sum(self.df[field] == -666666666)
|
missing_value_count = sum(df[field] == -666666666)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
|
f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
|
||||||
+ f"`{field}` being marked as null values."
|
+ f"`{field}` being marked as null values."
|
||||||
)
|
)
|
||||||
self.df[field] = self.df[field].replace(
|
df[field] = df[field].replace(to_replace=-666666666, value=None)
|
||||||
to_replace=-666666666, value=None
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate percent unemployment.
|
# Calculate percent unemployment.
|
||||||
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
|
||||||
self.df[self.UNEMPLOYED_FIELD_NAME] = (
|
df[self.UNEMPLOYED_FIELD_NAME] = (
|
||||||
self.df.B23025_005E / self.df.B23025_003E
|
df[self.TOTAL_UNEMPLOYED_FIELD] / df[self.TOTAL_IN_LABOR_FORCE]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate linguistic isolation.
|
# Calculate linguistic isolation.
|
||||||
|
@ -142,34 +177,64 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
"C16002_013E",
|
"C16002_013E",
|
||||||
]
|
]
|
||||||
|
|
||||||
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[
|
df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[
|
||||||
individual_limited_english_fields
|
individual_limited_english_fields
|
||||||
].sum(axis=1, skipna=True)
|
].sum(axis=1, skipna=True)
|
||||||
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
|
df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
|
||||||
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
|
df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
|
||||||
/ self.df["C16002_001E"]
|
/ df["C16002_001E"]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate percent at different poverty thresholds
|
# Calculate percent at different poverty thresholds
|
||||||
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
|
df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
|
||||||
self.df["C17002_002E"] + self.df["C17002_003E"]
|
df["C17002_002E"] + df["C17002_003E"]
|
||||||
) / self.df["C17002_001E"]
|
) / df["C17002_001E"]
|
||||||
|
|
||||||
self.df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
|
df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
|
||||||
self.df["C17002_002E"]
|
df["C17002_002E"]
|
||||||
+ self.df["C17002_003E"]
|
+ df["C17002_003E"]
|
||||||
+ self.df["C17002_004E"]
|
+ df["C17002_004E"]
|
||||||
+ self.df["C17002_005E"]
|
+ df["C17002_005E"]
|
||||||
) / self.df["C17002_001E"]
|
) / df["C17002_001E"]
|
||||||
|
|
||||||
self.df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
|
df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
|
||||||
self.df["C17002_002E"]
|
df["C17002_002E"]
|
||||||
+ self.df["C17002_003E"]
|
+ df["C17002_003E"]
|
||||||
+ self.df["C17002_004E"]
|
+ df["C17002_004E"]
|
||||||
+ self.df["C17002_005E"]
|
+ df["C17002_005E"]
|
||||||
+ self.df["C17002_006E"]
|
+ df["C17002_006E"]
|
||||||
+ self.df["C17002_007E"]
|
+ df["C17002_007E"]
|
||||||
) / self.df["C17002_001E"]
|
) / df["C17002_001E"]
|
||||||
|
|
||||||
|
# Calculate educational attainment
|
||||||
|
educational_numerator_fields = [
|
||||||
|
self.EDUCATION_NO_SCHOOLING,
|
||||||
|
self.EDUCATION_NURSERY,
|
||||||
|
self.EDUCATION_KINDERGARTEN,
|
||||||
|
self.EDUCATION_FIRST,
|
||||||
|
self.EDUCATION_SECOND,
|
||||||
|
self.EDUCATION_THIRD,
|
||||||
|
self.EDUCATION_FOURTH,
|
||||||
|
self.EDUCATION_FIFTH,
|
||||||
|
self.EDUCATION_SIXTH,
|
||||||
|
self.EDUCATION_SEVENTH,
|
||||||
|
self.EDUCATION_EIGHTH,
|
||||||
|
self.EDUCATION_NINTH,
|
||||||
|
self.EDUCATION_TENTH,
|
||||||
|
self.EDUCATION_ELEVENTH,
|
||||||
|
self.EDUCATION_TWELFTH_NO_DIPLOMA,
|
||||||
|
]
|
||||||
|
|
||||||
|
df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] = df[
|
||||||
|
educational_numerator_fields
|
||||||
|
].sum(axis=1)
|
||||||
|
df[self.HIGH_SCHOOL_ED_FIELD] = (
|
||||||
|
df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD]
|
||||||
|
/ df[self.EDUCATION_POPULATION_OVER_25]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save results to self.
|
||||||
|
self.df = df
|
||||||
|
|
||||||
def load(self) -> None:
|
def load(self) -> None:
|
||||||
logger.info("Saving Census ACS Data")
|
logger.info("Saving Census ACS Data")
|
||||||
|
@ -186,6 +251,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
|
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
|
||||||
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
||||||
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
|
self.HIGH_SCHOOL_ED_FIELD,
|
||||||
]
|
]
|
||||||
|
|
||||||
self.df[columns_to_include].to_csv(
|
self.df[columns_to_include].to_csv(
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
import censusdata
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||||
|
from data_pipeline.utils import get_module_logger
|
||||||
|
|
||||||
|
logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _fips_from_censusdata_censusgeo(
    censusgeo: censusdata.censusgeo
) -> str:
    """Build a FIPS code string from a `censusdata.censusgeo` index entry.

    The values of the (key, value) pairs returned by `censusgeo.params()`
    are concatenated in the order they are returned; the keys are ignored.
    """
    # Only the value half of each pair contributes to the code.
    return "".join(part for _, part in censusgeo.params())
|
||||||
|
|
||||||
|
|
||||||
|
# pylint: disable=too-many-arguments
|
||||||
|
def retrieve_census_acs_data(
    acs_year: int,
    variables: List[str],
    tract_output_field_name: str,
    data_path_for_fips_codes: Path,
    acs_type: str = "acs5",
    raise_errors: bool = False,
) -> pd.DataFrame:
    """Retrieve and combine tract-level census ACS data for a given year.

    One download is issued for every FIPS code returned by
    `get_state_fips_codes(data_path_for_fips_codes)`, requesting all tracts
    in all counties of that state/territory. The per-state results are
    concatenated into a single data frame.

    Args:
        acs_year: Year passed to `censusdata.download` as `year`.
        variables: Census variable names passed to `censusdata.download`
            as `var`.
        tract_output_field_name: Name of the column added to the result that
            holds the FIPS code derived from each row's index entry.
        data_path_for_fips_codes: Path handed to `get_state_fips_codes`.
        acs_type: Survey source passed to `censusdata.download` as `src`.
        raise_errors: When true, a `ValueError` from a download is re-raised
            after being logged; when false, that state/territory is skipped.

    Returns:
        The concatenated downloads, with `tract_output_field_name` added.

    Raises:
        ValueError: If a download fails and `raise_errors` is true, or if no
            download succeeded at all (there is nothing to concatenate).
    """
    dfs = []
    for fips in get_state_fips_codes(data_path_for_fips_codes):
        logger.info(
            f"Downloading data for state/territory with FIPS code {fips}"
        )

        try:
            response = censusdata.download(
                src=acs_type,
                year=acs_year,
                geo=censusdata.censusgeo(
                    [("state", fips), ("county", "*"), ("tract", "*")]
                ),
                var=variables,
            )
        except ValueError:
            logger.error(
                f"Could not download data for state/territory with FIPS code {fips}"
            )

            if raise_errors:
                # Bare `raise` keeps the original traceback intact.
                raise
        else:
            dfs.append(response)

    # `pd.concat` on an empty list raises an opaque "No objects to
    # concatenate" ValueError; fail with an explicit message instead.
    if not dfs:
        raise ValueError(
            f"No census ACS data could be downloaded for {acs_type} {acs_year}"
        )

    df = pd.concat(dfs)

    # The downloaded frames are indexed by censusgeo objects; expose the
    # FIPS code derived from each one as a regular column.
    df[tract_output_field_name] = df.index.to_series().apply(
        func=_fips_from_censusdata_censusgeo
    )

    return df
|
|
@ -0,0 +1,186 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from data_pipeline.etl.base import ExtractTransformLoad
|
||||||
|
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
|
||||||
|
from data_pipeline.utils import get_module_logger
|
||||||
|
|
||||||
|
logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CensusACS2010ETL(ExtractTransformLoad):
    """ETL for ACS data from 2010 (or approximately that year).

    Census ACS 2010 uses different fields than those captured in
    CensusACSETL, which is why this lives in a separate class.

    The output holds, per census tract, an unemployment fraction and the
    fraction of individuals below 100%, 150% and 200% of the federal
    poverty line, with the ACS year appended to every column name.
    """

    def __init__(self):
        self.ACS_YEAR = 2010
        self.ACS_TYPE = "acs5"
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
        )

        # Unemployed civilians in the labor force, by educational attainment.
        # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED = "B23006_007E"
        # Estimate!!Total!!High school graduate!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED = "B23006_014E"
        # Estimate!!Total!!Some college or associate's degree!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED = "B23006_021E"
        # Estimate!!Total!!Bachelor's degree or higher!!In labor force!!Civilian!!Unemployed
        self.EMPLOYMENT_COLLEGE_UNEMPLOYED = "B23006_028E"

        self.UNEMPLOYED_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED,
            self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED,
            self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED,
            self.EMPLOYMENT_COLLEGE_UNEMPLOYED,
        ]

        # Labor force totals, by educational attainment.
        # TODO: FIX!!!!!!
        # NOTE(review): this one is the *civilian* labor force variable,
        # while the other three below are the overall "In labor force"
        # variables -- presumably the inconsistency this TODO is about;
        # confirm against the B23006 variable list before changing.
        # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian
        self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE = "B23006_005E"
        # Estimate!!Total!!High school graduate!!In labor force
        self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE = "B23006_010E"
        # Estimate!!Total!!Some college or associate's degree!!In labor force
        self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE = "B23006_017E"
        # Estimate!!Total!!Bachelor's degree or higher!!In labor force
        self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE = "B23006_024E"

        self.IN_LABOR_FORCE_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE,
            self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE,
            self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE,
            self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
        ]

        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"

        # Ratio of income to poverty level: the total, then one variable
        # per ratio band up to 1.99.
        self.POVERTY_FIELDS = [
            "C17002_001E",  # Estimate!!Total,
            "C17002_002E",  # Estimate!!Total!!Under .50
            "C17002_003E",  # Estimate!!Total!!.50 to .99
            "C17002_004E",  # Estimate!!Total!!1.00 to 1.24
            "C17002_005E",  # Estimate!!Total!!1.25 to 1.49
            "C17002_006E",  # Estimate!!Total!!1.50 to 1.84
            "C17002_007E",  # Estimate!!Total!!1.85 to 1.99
        ]

        self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 100% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 150% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 200% Federal Poverty Line"
        )

        self.STATE_GEOID_FIELD_NAME = "GEOID2"

        self.df: pd.DataFrame

    def extract(self) -> None:
        """Download every variable this ETL needs into `self.df`."""
        requested_variables = (
            self.UNEMPLOYED_FIELDS
            + self.IN_LABOR_FORCE_FIELDS
            + self.POVERTY_FIELDS
        )

        # The download logic lives in the shared `retrieve_census_acs_data`
        # helper to reduce coding redundancy.
        self.df = retrieve_census_acs_data(
            acs_year=self.ACS_YEAR,
            variables=requested_variables,
            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
            data_path_for_fips_codes=self.DATA_PATH,
            acs_type=self.ACS_TYPE,
            raise_errors=False,
        )

    def transform(self) -> None:
        """Derive the unemployment and poverty fractions on `self.df`."""
        logger.info("Starting Census ACS Transform")

        frame = self.df

        # Calculate percent unemployment.
        # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
        unemployed_count = frame[self.UNEMPLOYED_FIELDS].sum(axis=1)
        labor_force_count = frame[self.IN_LABOR_FORCE_FIELDS].sum(axis=1)
        frame[self.UNEMPLOYED_FIELD_NAME] = unemployed_count / labor_force_count

        # Calculate percent at different poverty thresholds. Each numerator
        # extends the previous one with the next two income-ratio bands.
        poverty_universe = frame["C17002_001E"]

        below_100 = frame["C17002_002E"] + frame["C17002_003E"]
        frame[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
            below_100 / poverty_universe
        )

        below_150 = below_100 + frame["C17002_004E"] + frame["C17002_005E"]
        frame[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
            below_150 / poverty_universe
        )

        below_200 = below_150 + frame["C17002_006E"] + frame["C17002_007E"]
        frame[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
            below_200 / poverty_universe
        )

        # Save results to self.
        self.df = frame

    def load(self) -> None:
        """Write the derived columns to `usa.csv` under `OUTPUT_PATH`."""
        logger.info("Saving Census ACS Data")

        # Make sure the output directory exists.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        output_columns = [
            self.GEOID_TRACT_FIELD_NAME,
            self.UNEMPLOYED_FIELD_NAME,
            self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
        ]

        # Add the year to the end of every column except the tract ID, so
        # when it's all joined in the score df, it's obvious which year this
        # data is from.
        year_suffixed = {
            name: f"{name} in {self.ACS_YEAR}"
            for name in output_columns
            if name != self.GEOID_TRACT_FIELD_NAME
        }
        output_df = self.df[output_columns].rename(columns=year_suffixed)

        output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        """Log that validation ran; no checks are performed."""
        logger.info("Validating Census ACS Data")
|
|
@ -27,12 +27,21 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
# https://api.census.gov/data/2010/dec/gu/variables.html
|
# https://api.census.gov/data/2010/dec/gu/variables.html
|
||||||
# https://api.census.gov/data/2010/dec/mp/variables.html
|
# https://api.census.gov/data/2010/dec/mp/variables.html
|
||||||
# https://api.census.gov/data/2010/dec/vi/variables.html
|
# https://api.census.gov/data/2010/dec/vi/variables.html
|
||||||
|
|
||||||
|
# Total population field is the same in all island areas
|
||||||
|
self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001"
|
||||||
|
self.TOTAL_POP_FIELD_NAME = "Total population in 2009"
|
||||||
|
|
||||||
self.MEDIAN_INCOME_FIELD = "PBG049001"
|
self.MEDIAN_INCOME_FIELD = "PBG049001"
|
||||||
self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
|
self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
|
||||||
self.MEDIAN_INCOME_FIELD_NAME = (
|
self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)"
|
||||||
"MEDIAN HOUSEHOLD INCOME IN 2009 (DOLLARS)"
|
self.AREA_MEDIAN_INCOME_FIELD_NAME = (
|
||||||
|
"Median household income as a percent of "
|
||||||
|
"territory median income in 2009"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income"
|
||||||
|
|
||||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
|
||||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
|
||||||
"PBG077001"
|
"PBG077001"
|
||||||
|
@ -48,7 +57,39 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
)
|
)
|
||||||
|
|
||||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||||
"PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL"
|
"Percentage households below 200% of federal poverty line in 2009"
|
||||||
|
)
|
||||||
|
|
||||||
|
# We will combine three fields to get households < 100% FPL.
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = (
|
||||||
|
"PBG083002" # Total!!Under .50
|
||||||
|
)
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = (
|
||||||
|
"PBG083003" # Total!!.50 to .74
|
||||||
|
)
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = (
|
||||||
|
"PBG083004" # Total!!.75 to .99
|
||||||
|
)
|
||||||
|
|
||||||
|
# Same fields, for Virgin Islands.
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = (
|
||||||
|
"PBG077002" # Total!!Under .50
|
||||||
|
)
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = (
|
||||||
|
"PBG077003" # Total!!.50 to .74
|
||||||
|
)
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = (
|
||||||
|
"PBG077004" # Total!!.75 to .99
|
||||||
|
)
|
||||||
|
|
||||||
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
|
||||||
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
|
||||||
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||||
|
"Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||||
|
"Percentage households below 100% of federal poverty line in 2009"
|
||||||
)
|
)
|
||||||
|
|
||||||
# High School Education Fields
|
# High School Education Fields
|
||||||
|
@ -70,9 +111,37 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009"
|
||||||
"PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
|
|
||||||
|
# Employment fields
|
||||||
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = (
|
||||||
|
"PBG038003" # Total!!Male!!In labor force
|
||||||
)
|
)
|
||||||
|
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = (
|
||||||
|
"PBG038007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
||||||
|
)
|
||||||
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = (
|
||||||
|
"PBG038010" # Total!!Female!!In labor force
|
||||||
|
)
|
||||||
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = (
|
||||||
|
"PBG038014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
||||||
|
)
|
||||||
|
|
||||||
|
# Same fields, Virgin Islands.
|
||||||
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = (
|
||||||
|
"PBG036003" # Total!!Male!!In labor force
|
||||||
|
)
|
||||||
|
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = (
|
||||||
|
"PBG036007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
||||||
|
)
|
||||||
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = (
|
||||||
|
"PBG036010" # Total!!Female!!In labor force
|
||||||
|
)
|
||||||
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = (
|
||||||
|
"PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
||||||
|
)
|
||||||
|
|
||||||
|
self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"
|
||||||
|
|
||||||
var_list = [
|
var_list = [
|
||||||
self.MEDIAN_INCOME_FIELD,
|
self.MEDIAN_INCOME_FIELD,
|
||||||
|
@ -81,6 +150,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
self.TOTAL_POPULATION_FIELD,
|
self.TOTAL_POPULATION_FIELD,
|
||||||
self.MALE_HIGH_SCHOOL_ED_FIELD,
|
self.MALE_HIGH_SCHOOL_ED_FIELD,
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD,
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||||
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||||
|
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||||
|
self.TOTAL_POP_FIELD,
|
||||||
]
|
]
|
||||||
var_list = ",".join(var_list)
|
var_list = ",".join(var_list)
|
||||||
|
|
||||||
|
@ -91,6 +168,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
self.TOTAL_POPULATION_VI_FIELD,
|
self.TOTAL_POPULATION_VI_FIELD,
|
||||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
|
self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
|
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE,
|
||||||
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD,
|
||||||
|
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
|
||||||
|
self.TOTAL_POP_VI_FIELD,
|
||||||
]
|
]
|
||||||
var_list_vi = ",".join(var_list_vi)
|
var_list_vi = ",".join(var_list_vi)
|
||||||
|
|
||||||
|
@ -107,6 +192,20 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||||
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||||
|
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||||
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||||
|
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||||
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||||
}
|
}
|
||||||
|
|
||||||
# To do: Ask Census Slack Group about whether you need to hardcode the county fips
|
# To do: Ask Census Slack Group about whether you need to hardcode the county fips
|
||||||
|
@ -117,24 +216,30 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
"fips": "60",
|
"fips": "60",
|
||||||
"county_fips": ["010", "020", "030", "040", "050"],
|
"county_fips": ["010", "020", "030", "040", "050"],
|
||||||
"var_list": var_list,
|
"var_list": var_list,
|
||||||
|
# Note: we hardcode the median income for each territory in this dict,
|
||||||
|
# because that data is hard to programmatically access.
|
||||||
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 23892,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"state_abbreviation": "gu",
|
"state_abbreviation": "gu",
|
||||||
"fips": "66",
|
"fips": "66",
|
||||||
"county_fips": ["010"],
|
"county_fips": ["010"],
|
||||||
"var_list": var_list,
|
"var_list": var_list,
|
||||||
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 48274,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"state_abbreviation": "mp",
|
"state_abbreviation": "mp",
|
||||||
"fips": "69",
|
"fips": "69",
|
||||||
"county_fips": ["085", "100", "110", "120"],
|
"county_fips": ["085", "100", "110", "120"],
|
||||||
"var_list": var_list,
|
"var_list": var_list,
|
||||||
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 19958,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"state_abbreviation": "vi",
|
"state_abbreviation": "vi",
|
||||||
"fips": "78",
|
"fips": "78",
|
||||||
"county_fips": ["010", "020", "030"],
|
"county_fips": ["010", "020", "030"],
|
||||||
"var_list": var_list_vi,
|
"var_list": var_list_vi,
|
||||||
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 37254,
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -198,6 +303,11 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
# Combine the dfs after renaming
|
# Combine the dfs after renaming
|
||||||
self.df_all = pd.concat([self.df, self.df_vi])
|
self.df_all = pd.concat([self.df, self.df_vi])
|
||||||
|
|
||||||
|
# Rename total population:
|
||||||
|
self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[
|
||||||
|
self.TOTAL_POP_FIELD
|
||||||
|
]
|
||||||
|
|
||||||
# Percentage of households below 200% which is
|
# Percentage of households below 200% which is
|
||||||
# [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
|
# [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
|
||||||
self.df_all[
|
self.df_all[
|
||||||
|
@ -211,6 +321,25 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Percentage of households below 100% FPL
|
||||||
|
# which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, ` Total!!.75 to .99`,
|
||||||
|
# and then dividing by PBG083001 (total)
|
||||||
|
self.df_all[
|
||||||
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME
|
||||||
|
] = (
|
||||||
|
self.df_all[
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE
|
||||||
|
]
|
||||||
|
+ self.df_all[
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO
|
||||||
|
]
|
||||||
|
+ self.df_all[
|
||||||
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE
|
||||||
|
]
|
||||||
|
) / self.df_all[
|
||||||
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||||
|
]
|
||||||
|
|
||||||
# Percentage High School Achievement is
|
# Percentage High School Achievement is
|
||||||
# Percentage = (Male + Female) / (Total)
|
# Percentage = (Male + Female) / (Total)
|
||||||
self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
|
self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
|
||||||
|
@ -218,6 +347,28 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
+ self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
+ self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
||||||
) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]
|
) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]
|
||||||
|
|
||||||
|
# Calculate employment.
|
||||||
|
self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = (
|
||||||
|
self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD]
|
||||||
|
+ self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD]
|
||||||
|
) / (
|
||||||
|
self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD]
|
||||||
|
+ self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate area median income
|
||||||
|
median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES)
|
||||||
|
median_income_df = median_income_df[
|
||||||
|
["fips", self.TERRITORY_MEDIAN_INCOME_FIELD]
|
||||||
|
]
|
||||||
|
self.df_all = self.df_all.merge(
|
||||||
|
right=median_income_df, left_on="state", right_on="fips", how="left"
|
||||||
|
)
|
||||||
|
self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = (
|
||||||
|
self.df_all[self.MEDIAN_INCOME_FIELD_NAME]
|
||||||
|
/ self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD]
|
||||||
|
)
|
||||||
|
|
||||||
# Creating Geo ID (Census Block Group) Field Name
|
# Creating Geo ID (Census Block Group) Field Name
|
||||||
self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
|
self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
|
||||||
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
||||||
|
@ -238,9 +389,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
|
|
||||||
columns_to_include = [
|
columns_to_include = [
|
||||||
self.GEOID_TRACT_FIELD_NAME,
|
self.GEOID_TRACT_FIELD_NAME,
|
||||||
|
self.TOTAL_POP_FIELD_NAME,
|
||||||
self.MEDIAN_INCOME_FIELD_NAME,
|
self.MEDIAN_INCOME_FIELD_NAME,
|
||||||
|
self.TERRITORY_MEDIAN_INCOME_FIELD,
|
||||||
|
self.AREA_MEDIAN_INCOME_FIELD_NAME,
|
||||||
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
|
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||||
|
self.UNEMPLOYMENT_FIELD_NAME,
|
||||||
]
|
]
|
||||||
|
|
||||||
self.df_all[columns_to_include].to_csv(
|
self.df_all[columns_to_include].to_csv(
|
||||||
|
|
|
@ -12,15 +12,15 @@ class DOEEnergyBurden(ExtractTransformLoad):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.DOE_FILE_URL = (
|
self.DOE_FILE_URL = (
|
||||||
settings.AWS_JUSTICE40_DATASOURCES_URL
|
settings.AWS_JUSTICE40_DATASOURCES_URL
|
||||||
+ "/DOE_LEAD_with_EJSCREEN.csv.zip"
|
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.OUTPUT_PATH: Path = (
|
self.OUTPUT_PATH: Path = (
|
||||||
self.DATA_PATH / "dataset" / "doe_energy_burden"
|
self.DATA_PATH / "dataset" / "doe_energy_burden"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.TRACT_INPUT_COLUMN_NAME = "GEOID"
|
self.TRACT_INPUT_COLUMN_NAME = "FIP"
|
||||||
self.ENERGY_BURDEN_FIELD_NAME = "Energy burden"
|
self.ENERGY_BURDEN_FIELD_NAME = "BURDEN"
|
||||||
|
|
||||||
# Constants for output
|
# Constants for output
|
||||||
self.COLUMNS_TO_KEEP = [
|
self.COLUMNS_TO_KEEP = [
|
||||||
|
@ -61,11 +61,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert energy burden to a fraction, since we represent all other percentages as fractions.
|
|
||||||
output_df[self.ENERGY_BURDEN_FIELD_NAME] = (
|
|
||||||
output_df[self.ENERGY_BURDEN_FIELD_NAME] / 100
|
|
||||||
)
|
|
||||||
|
|
||||||
# Left-pad the tracts with 0s
|
# Left-pad the tracts with 0s
|
||||||
expected_length_of_census_tract_field = 11
|
expected_length_of_census_tract_field = 11
|
||||||
output_df[self.GEOID_TRACT_FIELD_NAME] = (
|
output_df[self.GEOID_TRACT_FIELD_NAME] = (
|
||||||
|
|
|
@ -14,6 +14,27 @@ class EJSCREENETL(ExtractTransformLoad):
|
||||||
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
|
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
|
||||||
self.df: pd.DataFrame
|
self.df: pd.DataFrame
|
||||||
|
|
||||||
|
self.COLUMNS_TO_KEEP = [
|
||||||
|
self.GEOID_TRACT_FIELD_NAME,
|
||||||
|
field_names.TOTAL_POP_FIELD,
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
|
field_names.AIR_TOXICS_CANCER_RISK_FIELD,
|
||||||
|
field_names.RESPITORY_HAZARD_FIELD,
|
||||||
|
field_names.DIESEL_FIELD,
|
||||||
|
field_names.PM25_FIELD,
|
||||||
|
field_names.OZONE_FIELD,
|
||||||
|
field_names.TRAFFIC_FIELD,
|
||||||
|
field_names.RMP_FIELD,
|
||||||
|
field_names.TSDF_FIELD,
|
||||||
|
field_names.NPL_FIELD,
|
||||||
|
field_names.WASTEWATER_FIELD,
|
||||||
|
field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
|
||||||
|
field_names.POVERTY_FIELD,
|
||||||
|
field_names.OVER_64_FIELD,
|
||||||
|
field_names.UNDER_5_FIELD,
|
||||||
|
field_names.LEAD_PAINT_FIELD,
|
||||||
|
]
|
||||||
|
|
||||||
def extract(self) -> None:
|
def extract(self) -> None:
|
||||||
logger.info("Downloading EJScreen Data")
|
logger.info("Downloading EJScreen Data")
|
||||||
super().extract(
|
super().extract(
|
||||||
|
@ -51,7 +72,6 @@ class EJSCREENETL(ExtractTransformLoad):
|
||||||
"PWDIS": field_names.WASTEWATER_FIELD,
|
"PWDIS": field_names.WASTEWATER_FIELD,
|
||||||
"LINGISOPCT": field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
|
"LINGISOPCT": field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
|
||||||
"LOWINCPCT": field_names.POVERTY_FIELD,
|
"LOWINCPCT": field_names.POVERTY_FIELD,
|
||||||
"LESSHSPCT": field_names.HIGH_SCHOOL_ED_FIELD,
|
|
||||||
"OVER64PCT": field_names.OVER_64_FIELD,
|
"OVER64PCT": field_names.OVER_64_FIELD,
|
||||||
"UNDER5PCT": field_names.UNDER_5_FIELD,
|
"UNDER5PCT": field_names.UNDER_5_FIELD,
|
||||||
"PRE1960PCT": field_names.LEAD_PAINT_FIELD,
|
"PRE1960PCT": field_names.LEAD_PAINT_FIELD,
|
||||||
|
@ -63,4 +83,6 @@ class EJSCREENETL(ExtractTransformLoad):
|
||||||
logger.info("Saving EJScreen CSV")
|
logger.info("Saving EJScreen CSV")
|
||||||
# write nationwide csv
|
# write nationwide csv
|
||||||
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
|
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
|
||||||
self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)
|
self.df[self.COLUMNS_TO_KEEP].to_csv(
|
||||||
|
self.CSV_PATH / "usa.csv", index=False
|
||||||
|
)
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
|
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"ACS_YEAR = 2019\n",
|
"ACS_YEAR = 2010\n",
|
||||||
"\n",
|
"\n",
|
||||||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||||||
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
|
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
|
||||||
|
@ -45,11 +45,13 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
|
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
|
||||||
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
|
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
|
||||||
"censusdata.printtable(\n",
|
"# censusdata.printtable(\n",
|
||||||
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
|
"# censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
|
||||||
")\n",
|
"# )\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
|
"censusdata.search(\n",
|
||||||
|
" src=\"acs5\", year=ACS_YEAR, field=\"label\", criterion=\"employment status\"\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -3,7 +3,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "51412a14",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -49,7 +48,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "e3234c61",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -81,7 +79,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "3b1b5ccf",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -108,7 +105,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "1b1083e8",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -142,7 +138,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "fec0ed63",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -165,7 +160,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "d9968187",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": false
|
"scrolled": false
|
||||||
},
|
},
|
||||||
|
@ -192,7 +186,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "a7cfeb3c",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": false
|
"scrolled": false
|
||||||
},
|
},
|
||||||
|
@ -222,7 +215,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "df458f08",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -255,7 +247,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "a6c85d87",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": false
|
"scrolled": false
|
||||||
},
|
},
|
||||||
|
@ -282,7 +273,7 @@
|
||||||
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if len(merged_df) > ExtractTransformLoad.EXPECTED_MAX_CENSUS_TRACTS:\n",
|
"if len(merged_df) > ExtractTransformLoad.EXPECTED_MAX_CENSUS_TRACTS:\n",
|
||||||
" raise ValueError(\"Too many rows in the join.\")\n",
|
" raise ValueError(f\"Too many rows in the join: {len(merged_df)}.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"merged_df.head()"
|
"merged_df.head()"
|
||||||
]
|
]
|
||||||
|
@ -290,7 +281,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "274f6bc6",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -393,17 +383,17 @@
|
||||||
"ejscreen_areas_of_concern_census_block_group_indices = [\n",
|
"ejscreen_areas_of_concern_census_block_group_indices = [\n",
|
||||||
" Index(\n",
|
" Index(\n",
|
||||||
" method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
|
" method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
|
||||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
|
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD,\n",
|
||||||
" other_census_tract_fields_to_keep=[],\n",
|
" other_census_tract_fields_to_keep=[],\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
" Index(\n",
|
" Index(\n",
|
||||||
" method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
|
" method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
|
||||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
|
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD,\n",
|
||||||
" other_census_tract_fields_to_keep=[],\n",
|
" other_census_tract_fields_to_keep=[],\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
" Index(\n",
|
" Index(\n",
|
||||||
" method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
|
" method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
|
||||||
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,\n",
|
" priority_communities_field=field_names.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD,\n",
|
||||||
" other_census_tract_fields_to_keep=[],\n",
|
" other_census_tract_fields_to_keep=[],\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
|
@ -439,7 +429,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "bfae9cf5",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -457,7 +446,8 @@
|
||||||
"\n",
|
"\n",
|
||||||
" # Calculate the population included as priority communities per tract. Will either be 0 or the population.\n",
|
" # Calculate the population included as priority communities per tract. Will either be 0 or the population.\n",
|
||||||
" df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
|
" df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
|
||||||
" df[priority_communities_field] * df[field_names.TOTAL_POP_FIELD]\n",
|
" df[priority_communities_field]\n",
|
||||||
|
" * df[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010]\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" def calculate_state_comparison(\n",
|
" def calculate_state_comparison(\n",
|
||||||
|
@ -496,7 +486,9 @@
|
||||||
" summary_dict[\"Geography name\"] = division_id\n",
|
" summary_dict[\"Geography name\"] = division_id\n",
|
||||||
"\n",
|
"\n",
|
||||||
" total_tracts_in_geography = len(frame)\n",
|
" total_tracts_in_geography = len(frame)\n",
|
||||||
" total_population_in_geography = frame[field_names.TOTAL_POP_FIELD].sum()\n",
|
" total_population_in_geography = frame[\n",
|
||||||
|
" field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010\n",
|
||||||
|
" ].sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" if geography_field == field_names.URBAN_HEURISTIC_FIELD:\n",
|
" if geography_field == field_names.URBAN_HEURISTIC_FIELD:\n",
|
||||||
" urban_flag = frame[field_names.URBAN_HEURISTIC_FIELD].unique()[0]\n",
|
" urban_flag = frame[field_names.URBAN_HEURISTIC_FIELD].unique()[0]\n",
|
||||||
|
@ -719,7 +711,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "c4d0e783",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -825,7 +816,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "8790cd64",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -1024,7 +1014,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "eeb9699d",
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
|
@ -1201,7 +1190,6 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "983abcea",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
|
|
@ -57,13 +57,13 @@ AMI_FIELD = "Area Median Income (State or metropolitan)"
|
||||||
|
|
||||||
# Climate
|
# Climate
|
||||||
FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
|
FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
|
||||||
EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
|
EXPECTED_BUILDING_LOSS_RATE_FIELD = (
|
||||||
"Expected building loss rate (Natural Hazards Risk Index)"
|
"Expected building loss rate (Natural Hazards Risk Index)"
|
||||||
)
|
)
|
||||||
EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
|
EXPECTED_AGRICULTURE_LOSS_RATE_FIELD = (
|
||||||
"Expected agricultural loss rate (Natural Hazards Risk Index)"
|
"Expected agricultural loss rate (Natural Hazards Risk Index)"
|
||||||
)
|
)
|
||||||
EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
|
EXPECTED_POPULATION_LOSS_RATE_FIELD = (
|
||||||
"Expected population loss rate (Natural Hazards Risk Index)"
|
"Expected population loss rate (Natural Hazards Risk Index)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -117,6 +117,34 @@ AGGREGATION_POPULATION_FIELD = "Population Characteristics"
|
||||||
UNDER_5_FIELD = "Individuals under 5 years old"
|
UNDER_5_FIELD = "Individuals under 5 years old"
|
||||||
OVER_64_FIELD = "Individuals over 64 years old"
|
OVER_64_FIELD = "Individuals over 64 years old"
|
||||||
|
|
||||||
|
# Fields from 2010 decennial census (generally only loaded for the territories)
|
||||||
|
CENSUS_DECENNIAL_MEDIAN_INCOME_2009 = "Median household income in 2009 ($)"
|
||||||
|
CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = (
|
||||||
|
"Median household income as a percent of territory median income in 2009"
|
||||||
|
)
|
||||||
|
CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
|
||||||
|
"Percentage households below 100% of federal poverty line in 2009"
|
||||||
|
)
|
||||||
|
CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009"
|
||||||
|
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009"
|
||||||
|
CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"
|
||||||
|
|
||||||
|
# Fields from 2010 ACS (loaded for comparison with the territories)
|
||||||
|
CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010"
|
||||||
|
CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
|
||||||
|
"Percent of individuals < 100% Federal Poverty Line in 2010"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Combined fields that merge island areas and states data
|
||||||
|
COMBINED_CENSUS_TOTAL_POPULATION_2010 = (
|
||||||
|
"Total population in 2009 (island areas) and 2019 (states and PR)"
|
||||||
|
)
|
||||||
|
COMBINED_UNEMPLOYMENT_2010 = "Unemployed civilians (percent) in 2009 (island areas) and 2010 (states and PR)"
|
||||||
|
COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
|
||||||
|
"Percentage households below 100% of federal poverty line in 2009 (island areas) "
|
||||||
|
"and 2010 (states and PR)"
|
||||||
|
)
|
||||||
|
|
||||||
# Urban Rural Map
|
# Urban Rural Map
|
||||||
URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
|
URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
|
||||||
|
|
||||||
|
@ -124,39 +152,39 @@ URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
|
||||||
MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
|
MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
|
||||||
|
|
||||||
# EJSCREEN Areas of Concern
|
# EJSCREEN Areas of Concern
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, National, 70th percentile (communities)"
|
"EJSCREEN Areas of Concern, National, 70th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, National, 75th percentile (communities)"
|
"EJSCREEN Areas of Concern, National, 75th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, National, 80th percentile (communities)"
|
"EJSCREEN Areas of Concern, National, 80th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, National, 85th percentile (communities)"
|
"EJSCREEN Areas of Concern, National, 85th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, National, 90th percentile (communities)"
|
"EJSCREEN Areas of Concern, National, 90th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, National, 95th percentile (communities)"
|
"EJSCREEN Areas of Concern, National, 95th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, State, 70th percentile (communities)"
|
"EJSCREEN Areas of Concern, State, 70th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, State, 75th percentile (communities)"
|
"EJSCREEN Areas of Concern, State, 75th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, State, 80th percentile (communities)"
|
"EJSCREEN Areas of Concern, State, 80th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, State, 85th percentile (communities)"
|
"EJSCREEN Areas of Concern, State, 85th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, State, 90th percentile (communities)"
|
"EJSCREEN Areas of Concern, State, 90th percentile (communities)"
|
||||||
)
|
)
|
||||||
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = (
|
||||||
"EJSCREEN Areas of Concern, State, 95th percentile (communities)"
|
"EJSCREEN Areas of Concern, State, 95th percentile (communities)"
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from data_pipeline.score.score import Score
|
from data_pipeline.score.score import Score
|
||||||
|
@ -12,8 +13,86 @@ class ScoreL(Score):
|
||||||
self.LOW_INCOME_THRESHOLD: float = 0.65
|
self.LOW_INCOME_THRESHOLD: float = 0.65
|
||||||
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
|
||||||
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
|
||||||
|
self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
|
||||||
|
|
||||||
super().__init__(df)
|
super().__init__(df)
|
||||||
|
|
||||||
|
def _combine_island_areas_with_states_and_set_thresholds(
    self,
    df: pd.DataFrame,
    column_from_island_areas: str,
    column_from_decennial_census: str,
    combined_column_name: str,
    threshold_cutoff_for_island_areas: float,
) -> (pd.DataFrame, str):
    """Flag island-area tracts that exceed a combined percentile cutoff.

    The function takes the following steps:

    1. Combine the island-area field and the stateside field into a single
       field named `combined_column_name`.
    2. Calculate the raw value of the combined field at the
       `threshold_cutoff_for_island_areas` quantile.
    3. Create a boolean column that is true for any census tract in the
       island areas (and only the island areas) whose value meets or exceeds
       that raw cutoff.

    For step one, the combined data is either the island area's Decennial
    Census value in 2009 or the state's value in the 5-year ACS ending in 2010.

    The stateside decennial census stopped asking economic questions, so this
    is as close to apples-to-apples as we get. We use 5-year ACS for data
    robustness over 1-year ACS.

    Args:
        df: Frame holding both source columns. It is modified in place (two
            columns are added) and also returned.
        column_from_island_areas: Name of the column populated for island
            areas (the callers in this class pass 2009 Decennial Census
            fields).
        column_from_decennial_census: Name of the column populated for the
            states. NOTE: despite the parameter name, the callers in this
            class pass the 2010 ACS fields here; the name is kept for
            backward compatibility.
        combined_column_name: Name of the combined column to create.
        threshold_cutoff_for_island_areas: Quantile in [0, 1] (e.g. 0.90)
            used to derive the raw cutoff from the combined column.

    Returns:
        A two-item tuple of the updated data frame and the name of the newly
        added boolean threshold column.
    """
    # Create the combined field.
    # There should only be one entry in either the 2009 or the 2010 field, not
    # one in both. But just to be safe, we take the mean and ignore null values
    # so that if there *were* entries in both, this result would make sense.
    df[combined_column_name] = df[
        [column_from_island_areas, column_from_decennial_census]
    ].mean(axis=1, skipna=True)

    # Count the nulls once and guard the percentage against an empty frame.
    total_tracts = len(df)
    missing_count = df[combined_column_name].isnull().sum()
    missing_percent = (
        missing_count * 100 / total_tracts if total_tracts else 0.0
    )

    logger.info(
        f"Combined field `{combined_column_name}` has "
        f"{missing_count} "
        f"({missing_percent:.2f}%) "
        f"missing values for census tracts. "
    )

    # Calculate the percentile threshold raw value, ignoring nulls.
    raw_threshold = np.nanquantile(
        a=df[combined_column_name], q=threshold_cutoff_for_island_areas
    )

    logger.info(
        f"For combined field `{combined_column_name}`, "
        f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
        f"raw value of {raw_threshold:.3f}."
    )

    threshold_column_name = (
        f"{column_from_island_areas} exceeds "
        f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
    )

    # Only the island-area column is compared, so tracts without an island
    # value (null) evaluate to False and are never flagged here.
    df[threshold_column_name] = (
        df[column_from_island_areas] >= raw_threshold
    )

    # Guard against a frame with no island-area data, where the denominator
    # would be zero.
    tracts_highlighted = df[threshold_column_name].sum()
    island_tracts_with_data = df[column_from_island_areas].notnull().sum()
    percent_of_tracts_highlighted = (
        100 * tracts_highlighted / island_tracts_with_data
        if island_tracts_with_data
        else 0.0
    )

    logger.info(
        f"For `{threshold_column_name}`, "
        f"{tracts_highlighted} ("
        f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data "
        f"in the column) have a value of TRUE."
    )

    return df, threshold_column_name
|
||||||
|
|
||||||
def add_columns(self) -> pd.DataFrame:
|
def add_columns(self) -> pd.DataFrame:
|
||||||
logger.info("Adding Score L")
|
logger.info("Adding Score L")
|
||||||
|
|
||||||
|
@ -67,21 +146,21 @@ class ScoreL(Score):
|
||||||
climate_criteria = (
|
climate_criteria = (
|
||||||
(
|
(
|
||||||
self.df[
|
self.df[
|
||||||
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME
|
field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
]
|
]
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
)
|
)
|
||||||
| (
|
| (
|
||||||
self.df[
|
self.df[
|
||||||
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
|
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
]
|
]
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
)
|
)
|
||||||
| (
|
| (
|
||||||
self.df[
|
self.df[
|
||||||
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME
|
field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
|
||||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
]
|
]
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
|
@ -204,14 +283,24 @@ class ScoreL(Score):
|
||||||
# poverty level. Source: Census's American Community Survey]
|
# poverty level. Source: Census's American Community Survey]
|
||||||
|
|
||||||
pollution_criteria = (
|
pollution_criteria = (
|
||||||
self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
|
(
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
self.df[
|
||||||
) | (
|
field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
|
]
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
) | (
|
)
|
||||||
self.df[field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
|
| (
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
self.df[
|
||||||
|
field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
|
]
|
||||||
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
|
)
|
||||||
|
| (
|
||||||
|
self.df[
|
||||||
|
field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
|
]
|
||||||
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return pollution_criteria & (
|
return pollution_criteria & (
|
||||||
|
@ -306,7 +395,7 @@ class ScoreL(Score):
|
||||||
# AND
|
# AND
|
||||||
# Where the high school degree achievement rates for adults 25 years and older is less than 95%
|
# Where the high school degree achievement rates for adults 25 years and older is less than 95%
|
||||||
# (necessary to screen out university block groups)
|
# (necessary to screen out university block groups)
|
||||||
workforce_criteria = (
|
workforce_criteria_for_states = (
|
||||||
(
|
(
|
||||||
self.df[
|
self.df[
|
||||||
field_names.UNEMPLOYMENT_FIELD
|
field_names.UNEMPLOYMENT_FIELD
|
||||||
|
@ -338,6 +427,76 @@ class ScoreL(Score):
|
||||||
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
workforce_combined_criteria_for_states = (
|
||||||
|
self.df[field_names.HIGH_SCHOOL_ED_FIELD]
|
||||||
|
>= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
|
||||||
|
) & workforce_criteria_for_states
|
||||||
|
|
||||||
|
# Now, calculate workforce criteria for island territories.
|
||||||
|
|
||||||
|
# For a couple of values, create a combined field and criteria field.
|
||||||
|
# First, combine unemployment.
|
||||||
|
(
|
||||||
|
self.df,
|
||||||
|
unemployment_island_areas_criteria_field_name,
|
||||||
|
) = self._combine_island_areas_with_states_and_set_thresholds(
|
||||||
|
df=self.df,
|
||||||
|
column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
|
||||||
|
column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
|
||||||
|
combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
|
||||||
|
threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Next, combine poverty.
|
||||||
|
(
|
||||||
|
self.df,
|
||||||
|
poverty_island_areas_criteria_field_name,
|
||||||
|
) = self._combine_island_areas_with_states_and_set_thresholds(
|
||||||
|
df=self.df,
|
||||||
|
column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
|
||||||
|
column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||||
|
combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||||
|
threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
|
||||||
|
)
|
||||||
|
|
||||||
|
workforce_combined_criteria_for_island_areas = (
|
||||||
|
self.df[unemployment_island_areas_criteria_field_name]
|
||||||
|
| self.df[poverty_island_areas_criteria_field_name]
|
||||||
|
# Also check whether area median income is 10th percentile or lower
|
||||||
|
# within the islands.
|
||||||
|
| (
|
||||||
|
self.df[
|
||||||
|
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
|
||||||
|
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||||
|
]
|
||||||
|
# Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.
|
||||||
|
# and then look for median income lower than that (not greater than).
|
||||||
|
< 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
|
||||||
|
)
|
||||||
|
) & (
|
||||||
|
self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
|
||||||
|
> self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
|
||||||
|
)
|
||||||
|
|
||||||
|
percent_of_island_tracts_highlighted = (
|
||||||
|
100
|
||||||
|
* workforce_combined_criteria_for_island_areas.sum()
|
||||||
|
# Choosing a random column from island areas to calculate the denominator.
|
||||||
|
/ self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
|
||||||
|
.notnull()
|
||||||
|
.sum()
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"For workforce criteria in island areas, "
|
||||||
|
f"{workforce_combined_criteria_for_island_areas.sum()} ("
|
||||||
|
f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
|
||||||
|
f"in the column) have a value of TRUE."
|
||||||
|
)
|
||||||
|
|
||||||
|
# A tract is included if it meets either the states tract criteria or the
|
||||||
|
# island areas tract criteria.
|
||||||
return (
|
return (
|
||||||
self.df[field_names.HIGH_SCHOOL_ED_FIELD] >= 0.10
|
workforce_combined_criteria_for_states
|
||||||
) & workforce_criteria
|
| workforce_combined_criteria_for_island_areas
|
||||||
|
)
|
||||||
|
|
|
@ -67,6 +67,9 @@ disable = [
|
||||||
"C0115", # Disables missing class docstring
|
"C0115", # Disables missing class docstring
|
||||||
"R0915", # Disables too many statements (score generation transform)
|
"R0915", # Disables too many statements (score generation transform)
|
||||||
"W0231", # Disables super init not called
|
"W0231", # Disables super init not called
|
||||||
|
"R0801", # Disables duplicate code. There are a couple places we have similar code and
|
||||||
|
# unfortunately you can't disable this rule for individual lines or files, it's a
|
||||||
|
# known bug. https://github.com/PyCQA/pylint/issues/214#
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.pylint.FORMAT]
|
[tool.pylint.FORMAT]
|
||||||
|
|
Loading…
Add table
Reference in a new issue