Issue 844: Add island areas to Definition L (#957)

This ended up being a pretty large task. Here's what this PR does:

1. Pulls in Vincent's data from island areas into the score ETL. This is from the 2010 decennial census, the last census of any kind in the island areas.
2. Grabs a few new fields from 2010 island areas decennial census.
3. Calculates area median income for island areas.
4. Stops using EJSCREEN as the source of our high school education data and directly pulls that from census (this was related to this project so I went ahead and fixed it).
5. Grabs a bunch of data from the 2010 ACS in the states/Puerto Rico/DC, so that we can create percentiles comparing apples-to-apples (ish) from 2010 island areas decennial census data to 2010 ACS data. This required creating a new class because all the ACS fields are different between 2010 and 2019, so it wasn't as simple as looping over a year parameter.
6. Creates a combined population field of island areas and mainland so we can use those stats in our comparison tool, and updates the comparison tool accordingly.
This commit is contained in:
Lucas Merrill Brown 2021-12-03 15:46:10 -05:00 committed by GitHub
commit 1d101c93d2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 882 additions and 153 deletions

View file

@ -1,8 +1,7 @@
import pandas as pd
import censusdata
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -14,7 +13,15 @@ class CensusACSETL(ExtractTransformLoad):
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
)
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
self.EMPLOYMENT_FIELDS = [
self.TOTAL_UNEMPLOYED_FIELD,
self.TOTAL_IN_LABOR_FORCE,
]
self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
"Linguistic isolation (total)"
@ -55,59 +62,89 @@ class CensusACSETL(ExtractTransformLoad):
"Median value ($) of owner-occupied housing units"
)
# Educational attainment figures
self.EDUCATION_POPULATION_OVER_25 = "B15003_001E" # Estimate!!Total
self.EDUCATION_NO_SCHOOLING = (
"B15003_002E" # Estimate!!Total!!No schooling completed
)
self.EDUCATION_NURSERY = (
"B15003_003E" # Estimate!!Total!!Nursery school
)
self.EDUCATION_KINDERGARTEN = (
"B15003_004E" # Estimate!!Total!!Kindergarten
)
self.EDUCATION_FIRST = "B15003_005E" # Estimate!!Total!!1st grade
self.EDUCATION_SECOND = "B15003_006E" # Estimate!!Total!!2nd grade
self.EDUCATION_THIRD = "B15003_007E" # Estimate!!Total!!3rd grade
self.EDUCATION_FOURTH = "B15003_008E" # Estimate!!Total!!4th grade
self.EDUCATION_FIFTH = "B15003_009E" # Estimate!!Total!!5th grade
self.EDUCATION_SIXTH = "B15003_010E" # Estimate!!Total!!6th grade
self.EDUCATION_SEVENTH = "B15003_011E" # Estimate!!Total!!7th grade
self.EDUCATION_EIGHTH = "B15003_012E" # Estimate!!Total!!8th grade
self.EDUCATION_NINTH = "B15003_013E" # Estimate!!Total!!9th grade
self.EDUCATION_TENTH = "B15003_014E" # Estimate!!Total!!10th grade
self.EDUCATION_ELEVENTH = "B15003_015E" # Estimate!!Total!!11th grade
self.EDUCATION_TWELFTH_NO_DIPLOMA = (
"B15003_016E" # Estimate!!Total!!12th grade, no diploma
)
self.EDUCATIONAL_FIELDS = [
self.EDUCATION_POPULATION_OVER_25,
self.EDUCATION_NO_SCHOOLING,
self.EDUCATION_NURSERY,
self.EDUCATION_KINDERGARTEN,
self.EDUCATION_FIRST,
self.EDUCATION_SECOND,
self.EDUCATION_THIRD,
self.EDUCATION_FOURTH,
self.EDUCATION_FIFTH,
self.EDUCATION_SIXTH,
self.EDUCATION_SEVENTH,
self.EDUCATION_EIGHTH,
self.EDUCATION_NINTH,
self.EDUCATION_TENTH,
self.EDUCATION_ELEVENTH,
self.EDUCATION_TWELFTH_NO_DIPLOMA,
]
self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD = (
"Individuals age 25 or over with less than high school degree"
)
self.HIGH_SCHOOL_ED_FIELD = "Percent individuals age 25 or over with less than high school degree"
self.STATE_GEOID_FIELD_NAME = "GEOID2"
self.df: pd.DataFrame
def _fips_from_censusdata_censusgeo(
self, censusgeo: censusdata.censusgeo
) -> str:
"""Create a FIPS code from the proprietary censusgeo index."""
fips = "".join([value for (key, value) in censusgeo.params()])
return fips
def extract(self) -> None:
dfs = []
for fips in get_state_fips_codes(self.DATA_PATH):
logger.info(
f"Downloading data for state/territory with FIPS code {fips}"
)
# Define the variables to retrieve
variables = (
[
# Income field
self.MEDIAN_INCOME_FIELD,
# House value
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
)
try:
response = censusdata.download(
src="acs5",
year=self.ACS_YEAR,
geo=censusdata.censusgeo(
[("state", fips), ("county", "*"), ("tract", "*")]
),
var=[
# Emploment fields
"B23025_005E",
"B23025_003E",
# Income field
self.MEDIAN_INCOME_FIELD,
# House value
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS,
)
dfs.append(response)
except ValueError:
logger.error(
f"Could not download data for state/territory with FIPS code {fips}"
)
self.df = pd.concat(dfs)
self.df[self.GEOID_TRACT_FIELD_NAME] = self.df.index.to_series().apply(
func=self._fips_from_censusdata_censusgeo
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
)
def transform(self) -> None:
logger.info("Starting Census ACS Transform")
df = self.df
# Rename two fields.
self.df = self.df.rename(
df = df.rename(
columns={
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
@ -119,19 +156,17 @@ class CensusACSETL(ExtractTransformLoad):
self.MEDIAN_INCOME_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
]:
missing_value_count = sum(self.df[field] == -666666666)
missing_value_count = sum(df[field] == -666666666)
logger.info(
f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
+ f"`{field}` being marked as null values."
)
self.df[field] = self.df[field].replace(
to_replace=-666666666, value=None
)
df[field] = df[field].replace(to_replace=-666666666, value=None)
# Calculate percent unemployment.
# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
self.df[self.UNEMPLOYED_FIELD_NAME] = (
self.df.B23025_005E / self.df.B23025_003E
df[self.UNEMPLOYED_FIELD_NAME] = (
df[self.TOTAL_UNEMPLOYED_FIELD] / df[self.TOTAL_IN_LABOR_FORCE]
)
# Calculate linguistic isolation.
@ -142,34 +177,64 @@ class CensusACSETL(ExtractTransformLoad):
"C16002_013E",
]
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = self.df[
df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[
individual_limited_english_fields
].sum(axis=1, skipna=True)
self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
self.df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
/ self.df["C16002_001E"]
df[self.LINGUISTIC_ISOLATION_FIELD_NAME] = (
df[self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float)
/ df["C16002_001E"]
)
# Calculate percent at different poverty thresholds
self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
self.df["C17002_002E"] + self.df["C17002_003E"]
) / self.df["C17002_001E"]
df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
df["C17002_002E"] + df["C17002_003E"]
) / df["C17002_001E"]
self.df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
self.df["C17002_002E"]
+ self.df["C17002_003E"]
+ self.df["C17002_004E"]
+ self.df["C17002_005E"]
) / self.df["C17002_001E"]
df[self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME] = (
df["C17002_002E"]
+ df["C17002_003E"]
+ df["C17002_004E"]
+ df["C17002_005E"]
) / df["C17002_001E"]
self.df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
self.df["C17002_002E"]
+ self.df["C17002_003E"]
+ self.df["C17002_004E"]
+ self.df["C17002_005E"]
+ self.df["C17002_006E"]
+ self.df["C17002_007E"]
) / self.df["C17002_001E"]
df[self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME] = (
df["C17002_002E"]
+ df["C17002_003E"]
+ df["C17002_004E"]
+ df["C17002_005E"]
+ df["C17002_006E"]
+ df["C17002_007E"]
) / df["C17002_001E"]
# Calculate educational attainment
educational_numerator_fields = [
self.EDUCATION_NO_SCHOOLING,
self.EDUCATION_NURSERY,
self.EDUCATION_KINDERGARTEN,
self.EDUCATION_FIRST,
self.EDUCATION_SECOND,
self.EDUCATION_THIRD,
self.EDUCATION_FOURTH,
self.EDUCATION_FIFTH,
self.EDUCATION_SIXTH,
self.EDUCATION_SEVENTH,
self.EDUCATION_EIGHTH,
self.EDUCATION_NINTH,
self.EDUCATION_TENTH,
self.EDUCATION_ELEVENTH,
self.EDUCATION_TWELFTH_NO_DIPLOMA,
]
df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD] = df[
educational_numerator_fields
].sum(axis=1)
df[self.HIGH_SCHOOL_ED_FIELD] = (
df[self.HIGH_SCHOOL_ED_RAW_COUNT_FIELD]
/ df[self.EDUCATION_POPULATION_OVER_25]
)
# Save results to self.
self.df = df
def load(self) -> None:
logger.info("Saving Census ACS Data")
@ -186,6 +251,7 @@ class CensusACSETL(ExtractTransformLoad):
self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.HIGH_SCHOOL_ED_FIELD,
]
self.df[columns_to_include].to_csv(

View file

@ -0,0 +1,61 @@
from pathlib import Path
from typing import List
import censusdata
import pandas as pd
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
def _fips_from_censusdata_censusgeo(
censusgeo: censusdata.censusgeo
) -> str:
"""Create a FIPS code from the proprietary censusgeo index."""
fips = "".join([value for (key, value) in censusgeo.params()])
return fips
# pylint: disable=too-many-arguments
def retrieve_census_acs_data(
acs_year: int,
variables: List[str],
tract_output_field_name: str,
data_path_for_fips_codes: Path,
acs_type="acs5",
raise_errors: bool = False,
) -> pd.DataFrame:
"""Retrieves and combines census ACS data for a given year."""
dfs = []
for fips in get_state_fips_codes(data_path_for_fips_codes):
logger.info(
f"Downloading data for state/territory with FIPS code {fips}"
)
try:
response = censusdata.download(
src=acs_type,
year=acs_year,
geo=censusdata.censusgeo(
[("state", fips), ("county", "*"), ("tract", "*")]
),
var=variables,
)
dfs.append(response)
except ValueError as e:
logger.error(
f"Could not download data for state/territory with FIPS code {fips}"
)
if raise_errors:
raise e
df = pd.concat(dfs)
df[tract_output_field_name] = df.index.to_series().apply(
func=_fips_from_censusdata_censusgeo
)
return df