Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-07-28 10:51:16 -07:00
Issue 844: Add island areas to Definition L (#957)
This ended up being a pretty large task. Here's what this PR does:

1. Pulls in Vincent's data from island areas into the score ETL. This is from the 2010 decennial census, the last census of any kind in the island areas.
2. Grabs a few new fields from the 2010 island areas decennial census.
3. Calculates area median income for island areas.
4. Stops using EJSCREEN as the source of our high school education data and directly pulls that from census (this was related to this project, so I went ahead and fixed it).
5. Grabs a bunch of data from the 2010 ACS in the states/Puerto Rico/DC, so that we can create percentiles comparing apples-to-apples (ish) between 2010 island areas decennial census data and 2010 ACS data. This required creating a new class, because all the ACS fields are different between 2010 and 2019, so it wasn't as simple as looping over a year parameter.
6. Creates a combined population field of island areas and mainland so we can use those stats in our comparison tool, and updates the comparison tool accordingly.
This commit is contained in:
parent 8cb9d197df
commit 1d101c93d2

15 changed files with 882 additions and 153 deletions
@@ -32,10 +32,15 @@ class ExtractTransformLoad:

    FILES_PATH: Path = settings.APP_ROOT / "files"
    GEOID_FIELD_NAME: str = "GEOID10"
    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.

    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
    # be from CBGs at different time periods.
    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
    # TODO: investigate. Census says there are only 73,057 tracts in the US. This might be from tracts at different time periods.
    EXPECTED_MAX_CENSUS_TRACTS: int = 74027

    # TODO: investigate. Census says there are only 74,134 tracts in the US,
    # Puerto Rico, and island areas. This might be from tracts at different time
    # periods. https://github.com/usds/justice40-tool/issues/964
    EXPECTED_MAX_CENSUS_TRACTS: int = 74160

    def __init__(self, config_path: Path) -> None:
        """Inits the class with instance specific variables"""
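The EXPECTED_MAX_* constants above are upper bounds meant for sanity checks elsewhere in the pipeline. A minimal sketch of how such a bound could be applied; the validate_tract_count helper below is hypothetical and not part of this diff:

import pandas as pd

EXPECTED_MAX_CENSUS_TRACTS: int = 74160  # bound introduced in the hunk above

def validate_tract_count(df: pd.DataFrame, geoid_col: str = "GEOID10_TRACT") -> None:
    """Hypothetical sanity check: fail loudly if a merged dataframe contains
    more unique tracts than the states, DC, Puerto Rico, and island areas should."""
    tract_count = df[geoid_col].nunique()
    if tract_count > EXPECTED_MAX_CENSUS_TRACTS:
        raise ValueError(
            f"Found {tract_count} tracts, above the expected maximum of "
            f"{EXPECTED_MAX_CENSUS_TRACTS}."
        )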
@@ -4,6 +4,11 @@ DATASET_LIST = [
        "module_dir": "census_acs",
        "class_name": "CensusACSETL",
    },
    {
        "name": "census_acs_2010",
        "module_dir": "census_acs_2010",
        "class_name": "CensusACS2010ETL",
    },
    {
        "name": "ejscreen",
        "module_dir": "ejscreen",
@@ -14,16 +19,6 @@ DATASET_LIST = [
        "module_dir": "hud_housing",
        "class_name": "HudHousingETL",
    },
    {
        "name": "calenviroscreen",
        "module_dir": "calenviroscreen",
        "class_name": "CalEnviroScreenETL",
    },
    {
        "name": "hud_recap",
        "module_dir": "hud_recap",
        "class_name": "HudRecapETL",
    },
    {
        "name": "cdc_places",
        "module_dir": "cdc_places",
@@ -74,6 +69,16 @@ DATASET_LIST = [
        "module_dir": "housing_and_transportation",
        "class_name": "HousingTransportationETL",
    },
    {
        "name": "calenviroscreen",
        "module_dir": "calenviroscreen",
        "class_name": "CalEnviroScreenETL",
    },
    {
        "name": "hud_recap",
        "module_dir": "hud_recap",
        "class_name": "HudRecapETL",
    },
    {
        "name": "tree_equity_score",
        "module_dir": "tree_equity_score",
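DATASET_LIST is a registry telling the ETL runner which module to import and which class to instantiate; the new census_acs_2010 entry is what makes the 2010 ACS comparison data part of the normal pipeline run. The runner itself is not shown in this diff; a rough sketch of how such a registry is typically consumed, where the package path data_pipeline.etl.sources and the run_etl helper are assumptions for illustration:

import importlib

DATASET_LIST = [
    {
        "name": "census_acs_2010",
        "module_dir": "census_acs_2010",
        "class_name": "CensusACS2010ETL",
    },
]

def run_etl(dataset: dict) -> None:
    # Hypothetical runner: import the module named in the registry entry,
    # look up the class, and run the standard ETL steps.
    module = importlib.import_module(
        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"  # assumed package layout
    )
    etl_class = getattr(module, dataset["class_name"])
    etl = etl_class()
    etl.extract()
    etl.transform()
    etl.load()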
@@ -27,6 +27,8 @@ class ScoreETL(ExtractTransformLoad):
        self.national_risk_index_df: pd.DataFrame
        self.geocorr_urban_rural_df: pd.DataFrame
        self.persistent_poverty_df: pd.DataFrame
        self.census_decennial_df: pd.DataFrame
        self.census_2010_df: pd.DataFrame

    def extract(self) -> None:
        logger.info("Loading data sets from disk.")
@@ -137,6 +139,29 @@ class ScoreETL(ExtractTransformLoad):
            low_memory=False,
        )

        # Load decennial census data
        census_decennial_csv = (
            constants.DATA_PATH
            / "dataset"
            / "census_decennial_2010"
            / "usa.csv"
        )
        self.census_decennial_df = pd.read_csv(
            census_decennial_csv,
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )

        # Load 2010 ACS data from states
        census_2010_csv = (
            constants.DATA_PATH / "dataset" / "census_acs_2010" / "usa.csv"
        )
        self.census_2010_df = pd.read_csv(
            census_2010_csv,
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )

    def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
        logger.info("Joining Census Tract dataframes")

@@ -228,6 +253,8 @@ class ScoreETL(ExtractTransformLoad):
            self.persistent_poverty_df,
            self.national_risk_index_df,
            self.census_acs_median_incomes_df,
            self.census_decennial_df,
            self.census_2010_df,
        ]

        # Sanity check each data frame before merging.
@@ -296,9 +323,16 @@ class ScoreETL(ExtractTransformLoad):
            field_names.HIGH_SCHOOL_ED_FIELD,
            field_names.UNEMPLOYMENT_FIELD,
            field_names.MEDIAN_HOUSE_VALUE_FIELD,
            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD,
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD,
            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD,
            field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
            field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
            field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
            field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
            field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
            field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
        ]

        non_numeric_columns = [
@@ -315,9 +349,9 @@ class ScoreETL(ExtractTransformLoad):
        # Convert all columns to numeric and do math
        for col in numeric_columns:
            # Calculate percentiles
            df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[col].rank(
                pct=True
            )
            df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
                col
            ].rank(pct=True)

            # Min-max normalization:
            # (
@@ -341,6 +375,20 @@ class ScoreETL(ExtractTransformLoad):
                df_copy[col] - min_value
            ) / (max_value - min_value)

        # Special logic: create a combined population field.
        # We sometimes run analytics on "population", and this makes a single field
        # that is either the island area's population in 2009 or the state's
        # population in 2019.
        # There should only be one entry in either 2009 or 2019, not one in both.
        # But just to be safe, we take the mean and ignore null values so if there
        # *were* entries in both fields, this result would make sense.
        df_copy[field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010] = df_copy[
            [
                field_names.TOTAL_POP_FIELD,
                field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
            ]
        ].mean(axis=1, skipna=True)

        return df_copy

    def transform(self) -> None:
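Two of the transforms above are easy to check on toy data: rank(pct=True) turns each raw column into a within-column percentile, and mean(axis=1, skipna=True) coalesces the 2019 state population and the 2009 island-area population into one combined field, since each tract should have a value in only one of the two columns. An illustrative example with made-up numbers:

import pandas as pd

df = pd.DataFrame(
    {
        # Made-up values: two mainland tracts and one island-area tract.
        "Unemployed civilians (percent)": [0.05, 0.12, 0.30],
        "Total population": [1200.0, 950.0, None],          # 2019 ACS (states)
        "Total population in 2009": [None, None, 830.0],    # 2009 island areas
    }
)

# Percentile rank within the column, like df_copy[col].rank(pct=True) above.
df["Unemployed civilians (percent) (percentile)"] = df[
    "Unemployed civilians (percent)"
].rank(pct=True)

# Combined population: mean over the two columns, skipping nulls, so each row
# keeps whichever population figure it actually has.
df["Combined population"] = df[
    ["Total population", "Total population in 2009"]
].mean(axis=1, skipna=True)

print(df)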
@@ -1,8 +1,7 @@
import pandas as pd
import censusdata

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)

@@ -14,7 +13,15 @@ class CensusACSETL(ExtractTransformLoad):
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
        )

        self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
        self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
        self.EMPLOYMENT_FIELDS = [
            self.TOTAL_UNEMPLOYED_FIELD,
            self.TOTAL_IN_LABOR_FORCE,
        ]
        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"

        self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
        self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
            "Linguistic isolation (total)"
@@ -55,59 +62,89 @@ class CensusACSETL(ExtractTransformLoad):
            "Median value ($) of owner-occupied housing units"
        )

        # Educational attainment figures
        self.EDUCATION_POPULATION_OVER_25 = "B15003_001E"  # Estimate!!Total
        self.EDUCATION_NO_SCHOOLING = (
            "B15003_002E"  # Estimate!!Total!!No schooling completed
        )
        self.EDUCATION_NURSERY = (
            "B15003_003E"  # Estimate!!Total!!Nursery school
        )
        self.EDUCATION_KINDERGARTEN = (
            "B15003_004E"  # Estimate!!Total!!Kindergarten
        )
        self.EDUCATION_FIRST = "B15003_005E"  # Estimate!!Total!!1st grade
        self.EDUCATION_SECOND = "B15003_006E"  # Estimate!!Total!!2nd grade
        self.EDUCATION_THIRD = "B15003_007E"  # Estimate!!Total!!3rd grade
        self.EDUCATION_FOURTH = "B15003_008E"  # Estimate!!Total!!4th grade
        self.EDUCATION_FIFTH = "B15003_009E"  # Estimate!!Total!!5th grade
        self.EDUCATION_SIXTH = "B15003_010E"  # Estimate!!Total!!6th grade
        self.EDUCATION_SEVENTH = "B15003_011E"  # Estimate!!Total!!7th grade
        self.EDUCATION_EIGHTH = "B15003_012E"  # Estimate!!Total!!8th grade
        self.EDUCATION_NINTH = "B15003_013E"  # Estimate!!Total!!9th grade
        self.EDUCATION_TENTH = "B15003_014E"  # Estimate!!Total!!10th grade
        self.EDUCATION_ELEVENTH = "B15003_015E"  # Estimate!!Total!!11th grade
        self.EDUCATION_TWELFTH_NO_DIPLOMA = (
            "B15003_016E"  # Estimate!!Total!!12th grade, no diploma
        )

        self.EDUCATIONAL_FIELDS = [
            self.EDUCATION_POPULATION_OVER_25,
            self.EDUCATION_NO_SCHOOLING,
            self.EDUCATION_NURSERY,
            self.EDUCATION_KINDERGARTEN,
            self.EDUCATION_FIRST,
            self.EDUCATION_SECOND,
            self.EDUCATION_THIRD,
            self.EDUCATION_FOURTH,
            self.EDUCATION_FIFTH,
            self.EDUCATION_SIXTH,
            self.EDUCATION_SEVENTH,
            self.EDUCATION_EIGHTH,
            self.EDUCATION_NINTH,
            self.EDUCATION_TENTH,
            self.EDUCATION_ELEVENTH,
            self.EDUCATION_TWELFTH_NO_DIPLOMA,
        ]

        self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD = (
            "Individuals age 25 or over with less than high school degree"
        )
        self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree"

        self.STATE_GEOID_FIELD_NAME = "GEOID2"

        self.df: pd.DataFrame

    def _fips_from_censusdata_censusgeo(
        self, censusgeo: censusdata.censusgeo
    ) -> str:
        """Create a FIPS code from the proprietary censusgeo index."""
        fips = "".join([value for (key, value) in censusgeo.params()])
        return fips

    def extract(self) -> None:
        dfs = []
        for fips in get_state_fips_codes(self.DATA_PATH):
            logger.info(
                f"Downloading data for state/territory with FIPS code {fips}"
            )
        # Define the variables to retrieve
        variables = (
            [
                # Income field
                self.MEDIAN_INCOME_FIELD,
                # House value
                self.MEDIAN_HOUSE_VALUE_FIELD,
            ]
            + self.EMPLOYMENT_FIELDS
            + self.LINGUISTIC_ISOLATION_FIELDS
            + self.POVERTY_FIELDS
            + self.EDUCATIONAL_FIELDS
        )

            try:
                response = censusdata.download(
                    src="acs5",
                    year=self.ACS_YEAR,
                    geo=censusdata.censusgeo(
                        [("state", fips), ("county", "*"), ("tract", "*")]
                    ),
                    var=[
                        # Emploment fields
                        "B23025_005E",
                        "B23025_003E",
                        # Income field
                        self.MEDIAN_INCOME_FIELD,
                        # House value
                        self.MEDIAN_HOUSE_VALUE_FIELD,
                    ]
                    + self.LINGUISTIC_ISOLATION_FIELDS
                    + self.POVERTY_FIELDS,
                )
                dfs.append(response)
            except ValueError:
                logger.error(
                    f"Could not download data for state/territory with FIPS code {fips}"
                )

        self.df = pd.concat(dfs)

        self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply(
            func=self._fips_from_censusdata_censusgeo
        self.df = retrieve_census_acs_data(
            acs_year=self.ACS_YEAR,
            variables=variables,
            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
            data_path_for_fips_codes=self.DATA_PATH,
        )

    def transform(self) -> None:
        logger.info("Starting Census ACS Transform")

        df = self.df

        # Rename two fields.
        self.df = self.df.rename(
        df = df.rename(
            columns={
                self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
                self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
@@ -119,19 +156,17 @@ class CensusACSETL(ExtractTransformLoad):
            self.MEDIAN_INCOME_FIELD_NAME,
            self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
        ]:
            missing_value_count = sum(self.df[field] == -666666666)
            missing_value_count = sum(df[field] == -666666666)
            logger.info(
                f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
                f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
                + f"`{field}` being marked as null values."
            )
            self.df[field] = self.df[field].replace(
                to_replace=-666666666, value=None
            )
            df[field] = df[field].replace(to_replace=-666666666, value=None)

        # Calculate percent unemployment.
        # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
        self.df[self.UNEMPLOYED_FIELD_NAME] = (
            self.df.B23025_005E / self.df.B23025_003E
        df[self.UNEMPLOYED_FIELD_NAME] = (
            df[self.TOTAL_UNEMPLOYED_FIELD] / df[self.TOTAL_IN_LABOR_FORCE]
        )

        # Calculate linguistic isolation.
@@ -142,34 +177,64 @@ class CensusACSETL(ExtractTransformLoad):
            "C16002_013E",
        ]

        self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[
        df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[
            individual_limited_english_fields
        ].sum(axis=1, skipna=True)
        self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
            self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
            / self.df["C16002_001E"]
        df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
            df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
            / df["C16002_001E"]
        )

        # Calculate percent at different poverty thresholds
        self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
            self.df["C17002_002E"] + self.df["C17002_003E"]
        ) / self.df["C17002_001E"]
        df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"] + df["C17002_003E"]
        ) / df["C17002_001E"]

        self.df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
            self.df["C17002_002E"]
            + self.df["C17002_003E"]
            + self.df["C17002_004E"]
            + self.df["C17002_005E"]
        ) / self.df["C17002_001E"]
        df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"]
            + df["C17002_003E"]
            + df["C17002_004E"]
            + df["C17002_005E"]
        ) / df["C17002_001E"]

        self.df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
            self.df["C17002_002E"]
            + self.df["C17002_003E"]
            + self.df["C17002_004E"]
            + self.df["C17002_005E"]
            + self.df["C17002_006E"]
            + self.df["C17002_007E"]
        ) / self.df["C17002_001E"]
        df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"]
            + df["C17002_003E"]
            + df["C17002_004E"]
            + df["C17002_005E"]
            + df["C17002_006E"]
            + df["C17002_007E"]
        ) / df["C17002_001E"]

        # Calculate educational attainment
        educational_numerator_fields = [
            self.EDUCATION_NO_SCHOOLING,
            self.EDUCATION_NURSERY,
            self.EDUCATION_KINDERGARTEN,
            self.EDUCATION_FIRST,
            self.EDUCATION_SECOND,
            self.EDUCATION_THIRD,
            self.EDUCATION_FOURTH,
            self.EDUCATION_FIFTH,
            self.EDUCATION_SIXTH,
            self.EDUCATION_SEVENTH,
            self.EDUCATION_EIGHTH,
            self.EDUCATION_NINTH,
            self.EDUCATION_TENTH,
            self.EDUCATION_ELEVENTH,
            self.EDUCATION_TWELFTH_NO_DIPLOMA,
        ]

        df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] = df[
            educational_numerator_fields
        ].sum(axis=1)
        df[self.HIGH_SCHOOL_ED_FIELD] = (
            df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD]
            / df[self.EDUCATION_POPULATION_OVER_25]
        )

        # Save results to self.
        self.df = df

    def load(self) -> None:
        logger.info("Saving Census ACS Data")
@@ -186,6 +251,7 @@ class CensusACSETL(ExtractTransformLoad):
            self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
            self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
            self.HIGH_SCHOOL_ED_FIELD,
        ]

        self.df[columns_to_include].to_csv(
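The new educational attainment logic above sums the B15003 buckets from "no schooling completed" through "12th grade, no diploma" and divides by the population age 25 and over (B15003_001E), replacing the EJSCREEN LESSHSPCT field. A toy illustration of the calculation, using made-up counts and only a subset of the buckets for brevity:

import pandas as pd

# Made-up counts for one tract: population 25+, plus a few of the
# less-than-high-school buckets that the transform above sums together.
df = pd.DataFrame(
    {
        "B15003_001E": [1000],  # population age 25 or over
        "B15003_002E": [40],    # no schooling completed
        "B15003_010E": [60],    # 6th grade
        "B15003_016E": [100],   # 12th grade, no diploma
    }
)

less_than_hs_columns = ["B15003_002E", "B15003_010E", "B15003_016E"]  # subset for illustration
df["Less than high school (count)"] = df[less_than_hs_columns].sum(axis=1)
df["Percent less than high school"] = (
    df["Less than high school (count)"] / df["B15003_001E"]
)
print(df["Percent less than high school"])  # 0.2 for this made-up row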
@@ -0,0 +1,61 @@
from pathlib import Path
from typing import List

import censusdata
import pandas as pd

from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)


def _fips_from_censusdata_censusgeo(
    censusgeo: censusdata.censusgeo
) -> str:
    """Create a FIPS code from the proprietary censusgeo index."""
    fips = "".join([value for (key, value) in censusgeo.params()])
    return fips


# pylint: disable=too-many-arguments
def retrieve_census_acs_data(
    acs_year: int,
    variables: List[str],
    tract_output_field_name: str,
    data_path_for_fips_codes: Path,
    acs_type="acs5",
    raise_errors: bool = False,
) -> pd.DataFrame:
    """Retrieves and combines census ACS data for a given year."""
    dfs = []
    for fips in get_state_fips_codes(data_path_for_fips_codes):
        logger.info(
            f"Downloading data for state/territory with FIPS code {fips}"
        )

        try:
            response = censusdata.download(
                src=acs_type,
                year=acs_year,
                geo=censusdata.censusgeo(
                    [("state", fips), ("county", "*"), ("tract", "*")]
                ),
                var=variables,
            )
            dfs.append(response)

        except ValueError as e:
            logger.error(
                f"Could not download data for state/territory with FIPS code {fips}"
            )

            if raise_errors:
                raise e

    df = pd.concat(dfs)

    df[tract_output_field_name] = df.index.to_series().apply(
        func=_fips_from_censusdata_censusgeo
    )

    return df
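With this shared helper in place, both ACS ETLs only need to supply a year, a variable list, and the name of the output tract-ID column. A minimal usage sketch; the variable list and data path below are illustrative, not values from this diff:

from pathlib import Path

from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data

# Illustrative call: download two poverty variables from the 2010 5-year ACS
# for every state/territory FIPS code found under the given data path.
df = retrieve_census_acs_data(
    acs_year=2010,
    variables=["C17002_001E", "C17002_002E"],
    tract_output_field_name="GEOID10_TRACT",
    data_path_for_fips_codes=Path("data_pipeline/data"),  # assumed local data path
    acs_type="acs5",
    raise_errors=False,
)
print(df.head())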
@@ -0,0 +1,186 @@
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)


class CensusACS2010ETL(ExtractTransformLoad):
    """Extract ACS data from 2010 or approximately that year.

    Note: Census ACS 2010 uses different fields than those captured in CensusACSETL.

    To support this, we created a separate class.
    """

    def __init__(self):
        self.ACS_YEAR = 2010
        self.ACS_TYPE = "acs5"
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
        )

        # Employment fields
        self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED = (
            "B23006_007E"
            # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED = (
            "B23006_014E"
            # Estimate!!Total!!High school graduate!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED = (
            "B23006_021E"
            # Estimate!!Total!!Some college or associate's degree!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_COLLEGE_UNEMPLOYED = (
            "B23006_028E"
            # Estimate!!Total!!Bachelor's degree or higher!!In labor force!!Civilian!!Unemployed
        )

        self.UNEMPLOYED_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_UNEMPLOYED,
            self.EMPLOYMENT_HS_GRADUATE_UNEMPLOYED,
            self.EMPLOYMENT_SOME_COLLEGE_UNEMPLOYED,
            self.EMPLOYMENT_COLLEGE_UNEMPLOYED,
        ]

        self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE = (
            # TODO: FIX!!!!!!
            "B23006_005E"
            # Estimate!!Total!!Less than high school graduate!!In labor force!!Civilian
        )
        self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE = (
            "B23006_010E"
            # Estimate!!Total!!High school graduate!!In labor force
        )
        self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE = (
            "B23006_017E"
            # Estimate!!Total!!Some college or associate's degree!!In labor force
        )
        self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE = (
            "B23006_024E"
            # Estimate!!Total!!Bachelor's degree or higher!!In labor force
        )

        self.IN_LABOR_FORCE_FIELDS = [
            self.EMPLOYMENT_LESS_THAN_HS_IN_LABOR_FORCE,
            self.EMPLOYMENT_HS_GRADUATE_IN_LABOR_FORCE,
            self.EMPLOYMENT_SOME_COLLEGE_IN_LABOR_FORCE,
            self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE,
        ]

        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"

        self.POVERTY_FIELDS = [
            "C17002_001E",  # Estimate!!Total,
            "C17002_002E",  # Estimate!!Total!!Under .50
            "C17002_003E",  # Estimate!!Total!!.50 to .99
            "C17002_004E",  # Estimate!!Total!!1.00 to 1.24
            "C17002_005E",  # Estimate!!Total!!1.25 to 1.49
            "C17002_006E",  # Estimate!!Total!!1.50 to 1.84
            "C17002_007E",  # Estimate!!Total!!1.85 to 1.99
        ]

        self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 100% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 150% Federal Poverty Line"
        )
        self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (
            "Percent of individuals < 200% Federal Poverty Line"
        )

        self.STATE_GEOID_FIELD_NAME = "GEOID2"

        self.df: pd.DataFrame

    def extract(self) -> None:
        # Define the variables to retrieve
        variables = (
            self.UNEMPLOYED_FIELDS
            + self.IN_LABOR_FORCE_FIELDS
            + self.POVERTY_FIELDS
        )

        # Use the method defined on CensusACSETL to reduce coding redundancy.
        self.df = retrieve_census_acs_data(
            acs_year=self.ACS_YEAR,
            variables=variables,
            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
            data_path_for_fips_codes=self.DATA_PATH,
            acs_type=self.ACS_TYPE,
            raise_errors=False,
        )

    def transform(self) -> None:
        logger.info("Starting Census ACS Transform")

        df = self.df

        # Calculate percent unemployment.
        # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
        unemployed_totals = df[self.UNEMPLOYED_FIELDS].sum(axis=1)
        labor_force_totals = df[self.IN_LABOR_FORCE_FIELDS].sum(axis=1)

        df[self.UNEMPLOYED_FIELD_NAME] = unemployed_totals / labor_force_totals

        # Calculate percent at different poverty thresholds
        df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"] + df["C17002_003E"]
        ) / df["C17002_001E"]

        df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"]
            + df["C17002_003E"]
            + df["C17002_004E"]
            + df["C17002_005E"]
        ) / df["C17002_001E"]

        df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
            df["C17002_002E"]
            + df["C17002_003E"]
            + df["C17002_004E"]
            + df["C17002_005E"]
            + df["C17002_006E"]
            + df["C17002_007E"]
        ) / df["C17002_001E"]

        # Save results to self.
        self.df = df

    def load(self) -> None:
        logger.info("Saving Census ACS Data")

        # mkdir census
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        columns_to_include = [
            self.GEOID_TRACT_FIELD_NAME,
            self.UNEMPLOYED_FIELD_NAME,
            self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
            self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
        ]

        output_df = self.df[columns_to_include]

        # Add the year to the end of every column, so when it's all joined in the
        # score df, it's obvious which year this data is from.
        for column in columns_to_include:
            if column != self.GEOID_TRACT_FIELD_NAME:
                output_df = output_df.rename(
                    columns={
                        column: f"{column} in {self.ACS_YEAR}",
                    }
                )

        output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating Census ACS Data")

        pass
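The load step above suffixes every non-ID column with the year so the 2010 ACS fields cannot collide with the 2019 ACS fields when everything is joined into the score dataframe. A small illustration of that rename, with made-up values:

import pandas as pd

ACS_YEAR = 2010
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

output_df = pd.DataFrame(
    {
        GEOID_TRACT_FIELD_NAME: ["01001020100"],
        "Unemployed civilians (percent)": [0.08],
    }
)

# Same idea as the load step above: suffix every non-ID column with the year so
# the 2010 fields stay distinguishable from the 2019 fields after the score join.
columns_to_include = [GEOID_TRACT_FIELD_NAME, "Unemployed civilians (percent)"]
for column in columns_to_include:
    if column != GEOID_TRACT_FIELD_NAME:
        output_df = output_df.rename(columns={column: f"{column} in {ACS_YEAR}"})

print(output_df.columns.tolist())
# ['GEOID10_TRACT', 'Unemployed civilians (percent) in 2010']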
@@ -27,12 +27,21 @@ class CensusDecennialETL(ExtractTransformLoad):
        # https://api.census.gov/data/2010/dec/gu/variables.html
        # https://api.census.gov/data/2010/dec/mp/variables.html
        # https://api.census.gov/data/2010/dec/vi/variables.html

        # Total population field is the same in all island areas
        self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001"
        self.TOTAL_POP_FIELD_NAME = "Total population in 2009"

        self.MEDIAN_INCOME_FIELD = "PBG049001"
        self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
        self.MEDIAN_INCOME_FIELD_NAME = (
            "MEDIAN HOUSEHOLD INCOME IN 2009 (DOLLARS)"
        self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)"
        self.AREA_MEDIAN_INCOME_FIELD_NAME = (
            "Median household income as a percent of "
            "territory median income in 2009"
        )

        self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income"

        self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
        self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
            "PBG077001"
@@ -48,7 +57,39 @@ class CensusDecennialETL(ExtractTransformLoad):
        )

        self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
            "PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL"
            "Percentage households below 200% of federal poverty line in 2009"
        )

        # We will combine three fields to get households < 100% FPL.
        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = (
            "PBG083002"  # Total!!Under .50
        )
        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = (
            "PBG083003"  # Total!!.50 to .74
        )
        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = (
            "PBG083004"  # Total!!.75 to .99
        )

        # Same fields, for Virgin Islands.
        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = (
            "PBG077002"  # Total!!Under .50
        )
        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = (
            "PBG077003"  # Total!!.50 to .74
        )
        self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = (
            "PBG077004"  # Total!!.75 to .99
        )

        self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
        self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
        self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
            "Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
        )

        self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = (
            "Percentage households below 100% of federal poverty line in 2009"
        )

        # High School Education Fields
@@ -70,9 +111,37 @@ class CensusDecennialETL(ExtractTransformLoad):
            "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
        )

        self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
            "PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
        self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009"

        # Employment fields
        self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = (
            "PBG038003"  # Total!!Male!!In labor force
        )
        self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = (
            "PBG038007"  # Total!!Male!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = (
            "PBG038010"  # Total!!Female!!In labor force
        )
        self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = (
            "PBG038014"  # Total!!Female!!In labor force!!Civilian!!Unemployed
        )

        # Same fields, Virgin Islands.
        self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = (
            "PBG036003"  # Total!!Male!!In labor force
        )
        self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = (
            "PBG036007"  # Total!!Male!!In labor force!!Civilian!!Unemployed
        )
        self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = (
            "PBG036010"  # Total!!Female!!In labor force
        )
        self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = (
            "PBG036014"  # Total!!Female!!In labor force!!Civilian!!Unemployed
        )

        self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009"

        var_list = [
            self.MEDIAN_INCOME_FIELD,
@@ -81,6 +150,14 @@ class CensusDecennialETL(ExtractTransformLoad):
            self.TOTAL_POPULATION_FIELD,
            self.MALE_HIGH_SCHOOL_ED_FIELD,
            self.FEMALE_HIGH_SCHOOL_ED_FIELD,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
            self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
            self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
            self.TOTAL_POP_FIELD,
        ]
        var_list = ",".join(var_list)

@@ -91,6 +168,14 @@ class CensusDecennialETL(ExtractTransformLoad):
            self.TOTAL_POPULATION_VI_FIELD,
            self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
            self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE,
            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD,
            self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD,
            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
            self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
            self.TOTAL_POP_VI_FIELD,
        ]
        var_list_vi = ",".join(var_list_vi)

@@ -107,6 +192,20 @@ class CensusDecennialETL(ExtractTransformLoad):
            self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
            self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
            self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
            self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
            self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
            self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
            self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
            self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
            self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
            self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
        }

        # To do: Ask Census Slack Group about whether you need to hardcode the county fips
@@ -117,24 +216,30 @@ class CensusDecennialETL(ExtractTransformLoad):
                "fips": "60",
                "county_fips": ["010", "020", "030", "040", "050"],
                "var_list": var_list,
                # Note: we hardcode the median income for each territory in this dict,
                # because that data is hard to programmatically access.
                self.TERRITORY_MEDIAN_INCOME_FIELD: 23892,
            },
            {
                "state_abbreviation": "gu",
                "fips": "66",
                "county_fips": ["010"],
                "var_list": var_list,
                self.TERRITORY_MEDIAN_INCOME_FIELD: 48274,
            },
            {
                "state_abbreviation": "mp",
                "fips": "69",
                "county_fips": ["085", "100", "110", "120"],
                "var_list": var_list,
                self.TERRITORY_MEDIAN_INCOME_FIELD: 19958,
            },
            {
                "state_abbreviation": "vi",
                "fips": "78",
                "county_fips": ["010", "020", "030"],
                "var_list": var_list_vi,
                self.TERRITORY_MEDIAN_INCOME_FIELD: 37254,
            },
        ]

@@ -198,6 +303,11 @@ class CensusDecennialETL(ExtractTransformLoad):
        # Combine the dfs after renaming
        self.df_all = pd.concat([self.df, self.df_vi])

        # Rename total population:
        self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[
            self.TOTAL_POP_FIELD
        ]

        # Percentage of households below 200% which is
        # [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
        self.df_all[
@@ -211,6 +321,25 @@ class CensusDecennialETL(ExtractTransformLoad):
            self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
        ]

        # Percentage of households below 100% FPL
        # which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, `Total!!.75 to .99`,
        # and then dividing by PBG083001 (total)
        self.df_all[
            self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME
        ] = (
            self.df_all[
                self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE
            ]
            + self.df_all[
                self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO
            ]
            + self.df_all[
                self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE
            ]
        ) / self.df_all[
            self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
        ]

        # Percentage High School Achievement is
        # Percentage = (Male + Female) / (Total)
        self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
@@ -218,6 +347,28 @@ class CensusDecennialETL(ExtractTransformLoad):
            + self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
        ) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]

        # Calculate employment.
        self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = (
            self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD]
            + self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD]
        ) / (
            self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD]
            + self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD]
        )

        # Calculate area median income
        median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES)
        median_income_df = median_income_df[
            ["fips", self.TERRITORY_MEDIAN_INCOME_FIELD]
        ]
        self.df_all = self.df_all.merge(
            right=median_income_df, left_on="state", right_on="fips", how="left"
        )
        self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = (
            self.df_all[self.MEDIAN_INCOME_FIELD_NAME]
            / self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD]
        )

        # Creating Geo ID (Census Block Group) Field Name
        self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
            self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
@@ -238,9 +389,14 @@ class CensusDecennialETL(ExtractTransformLoad):

        columns_to_include = [
            self.GEOID_TRACT_FIELD_NAME,
            self.TOTAL_POP_FIELD_NAME,
            self.MEDIAN_INCOME_FIELD_NAME,
            self.TERRITORY_MEDIAN_INCOME_FIELD,
            self.AREA_MEDIAN_INCOME_FIELD_NAME,
            self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME,
            self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
            self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
            self.UNEMPLOYMENT_FIELD_NAME,
        ]

        self.df_all[columns_to_include].to_csv(
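The area median income step above merges each territory's hardcoded 2009 median income onto its tracts by FIPS code and divides tract-level median income by the territory figure. A compact illustration with made-up tract incomes; the territory medians 23892 and 48274 are the values hardcoded above:

import pandas as pd

# Tract-level rows: made-up median household incomes for two island-area tracts.
tracts = pd.DataFrame(
    {
        "state": ["60", "66"],
        "Median household income in 2009 ($)": [18000.0, 52000.0],
    }
)

# Territory-level medians, as hardcoded in the ISLAND_TERRITORIES list above.
territory_medians = pd.DataFrame(
    {"fips": ["60", "66"], "Territory Median Income": [23892, 48274]}
)

merged = tracts.merge(
    right=territory_medians, left_on="state", right_on="fips", how="left"
)
merged["Median household income as a percent of territory median income in 2009"] = (
    merged["Median household income in 2009 ($)"]
    / merged["Territory Median Income"]
)
print(merged)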
@@ -14,6 +14,27 @@ class EJSCREENETL(ExtractTransformLoad):
        self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
        self.df: pd.DataFrame

        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            field_names.TOTAL_POP_FIELD,
            # pylint: disable=duplicate-code
            field_names.AIR_TOXICS_CANCER_RISK_FIELD,
            field_names.RESPITORY_HAZARD_FIELD,
            field_names.DIESEL_FIELD,
            field_names.PM25_FIELD,
            field_names.OZONE_FIELD,
            field_names.TRAFFIC_FIELD,
            field_names.RMP_FIELD,
            field_names.TSDF_FIELD,
            field_names.NPL_FIELD,
            field_names.WASTEWATER_FIELD,
            field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
            field_names.POVERTY_FIELD,
            field_names.OVER_64_FIELD,
            field_names.UNDER_5_FIELD,
            field_names.LEAD_PAINT_FIELD,
        ]

    def extract(self) -> None:
        logger.info("Downloading EJScreen Data")
        super().extract(
@@ -51,7 +72,6 @@ class EJSCREENETL(ExtractTransformLoad):
                "PWDIS": field_names.WASTEWATER_FIELD,
                "LINGISOPCT": field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
                "LOWINCPCT": field_names.POVERTY_FIELD,
                "LESSHSPCT": field_names.HIGH_SCHOOL_ED_FIELD,
                "OVER64PCT": field_names.OVER_64_FIELD,
                "UNDER5PCT": field_names.UNDER_5_FIELD,
                "PRE1960PCT": field_names.LEAD_PAINT_FIELD,
@@ -63,4 +83,6 @@ class EJSCREENETL(ExtractTransformLoad):
        logger.info("Saving EJScreen CSV")
        # write nationwide csv
        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)
        self.df[self.COLUMNS_TO_KEEP].to_csv(
            self.CSV_PATH / "usa.csv", index=False
        )
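The load change above writes only COLUMNS_TO_KEEP rather than every EJSCREEN column, and LESSHSPCT is no longer renamed or carried forward, since high school attainment now comes from the ACS ETL. A tiny sketch of subsetting on write; the column names and file path here are illustrative:

import pandas as pd

COLUMNS_TO_KEEP = ["GEOID10_TRACT", "Total population"]

df = pd.DataFrame(
    {
        "GEOID10_TRACT": ["01001020100"],
        "Total population": [1923],
        "LESSHSPCT": [0.13],  # example of a raw column no longer carried forward
    }
)

# Writing only the curated column list keeps the downstream CSV stable even if
# the raw EJSCREEN extract gains or drops columns.
df[COLUMNS_TO_KEEP].to_csv("usa.csv", index=False)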