mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-24 18:44:20 -08:00
* Backfill population in island areas (#1882) * Update smoketest to account for backfills (#1882) As I wrote in the commend: We backfill island areas with data from the 2010 census, so if THOSE tracts have data beyond the data source, that's to be expected and is fine to pass. If some other state or territory does though, this should fail This ends up being a nice way of documenting that behavior i guess! * Fixup lint issues (#1882) * Add in race demos to 2010 census pull (#1851) * Add backfill data to score (#1851) * Change column name (#1851) * Fill demos after the score (#1851) * Add income back, adjust test (#1882) * Apply code-review feedback (#1851) * Add test for island area backfill (#1851) * Fix bad rename (#1851)
519 lines
23 KiB
Python
519 lines
23 KiB
Python
import json
|
|
from typing import List
|
|
import requests
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from data_pipeline.etl.base import ExtractTransformLoad
|
|
from data_pipeline.utils import get_module_logger
|
|
from data_pipeline.score import field_names
|
|
from data_pipeline.config import settings
|
|
|
|
pd.options.mode.chained_assignment = "raise"
|
|
|
|
logger = get_module_logger(__name__)
|
|
|
|
|
|
class CensusDecennialETL(ExtractTransformLoad):
|
|
def __init__(self):
|
|
self.DECENNIAL_YEAR = 2010
|
|
self.OUTPUT_PATH = (
|
|
self.DATA_PATH
|
|
/ "dataset"
|
|
/ f"census_decennial_{self.DECENNIAL_YEAR}"
|
|
)
|
|
|
|
# Income Fields
|
|
# AS, GU, and MP all share the same variable names, but VI is different
|
|
# https://api.census.gov/data/2010/dec/as.html
|
|
# https://api.census.gov/data/2010/dec/gu/variables.html
|
|
# https://api.census.gov/data/2010/dec/mp/variables.html
|
|
# https://api.census.gov/data/2010/dec/vi/variables.html
|
|
|
|
# Total population field is the same in all island areas
|
|
self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001"
|
|
self.TOTAL_POP_FIELD_NAME = "Total population in 2009"
|
|
|
|
self.MEDIAN_INCOME_FIELD = "PBG049001"
|
|
self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
|
|
self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)"
|
|
self.AREA_MEDIAN_INCOME_FIELD_NAME = (
|
|
"Median household income as a percent of "
|
|
"territory median income in 2009"
|
|
)
|
|
|
|
self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income"
|
|
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
|
|
"PBG077001"
|
|
)
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME = (
|
|
"TOTAL; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
|
)
|
|
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
|
"Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
|
)
|
|
|
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
|
"Percentage households below 200% of federal poverty line in 2009"
|
|
)
|
|
|
|
# We will combine three fields to get households < 100% FPL.
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = (
|
|
"PBG083002" # Total!!Under .50
|
|
)
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = (
|
|
"PBG083003" # Total!!.50 to .74
|
|
)
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = (
|
|
"PBG083004" # Total!!.75 to .99
|
|
)
|
|
|
|
# Same fields, for Virgin Islands.
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = (
|
|
"PBG077002" # Total!!Under .50
|
|
)
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = (
|
|
"PBG077003" # Total!!.50 to .74
|
|
)
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = (
|
|
"PBG077004" # Total!!.75 to .99
|
|
)
|
|
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
|
"Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
|
)
|
|
|
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
|
"Percentage households below 100% of federal poverty line in 2009"
|
|
)
|
|
|
|
# High School Education Fields
|
|
self.TOTAL_POPULATION_FIELD = "PBG026001"
|
|
self.TOTAL_POPULATION_VI_FIELD = "PCT032001"
|
|
self.TOTAL_POPULATION_FIELD_NAME = "Total; SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
|
|
|
self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
|
|
self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
|
|
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
|
"Total!!Male!!High school graduate, GED, or alternative; "
|
|
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
|
)
|
|
|
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
|
|
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
|
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
|
"Total!!Female!!High school graduate, GED, or alternative; "
|
|
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
|
)
|
|
|
|
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009"
|
|
|
|
# Employment fields
|
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = (
|
|
"PBG038003" # Total!!Male!!In labor force
|
|
)
|
|
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = (
|
|
"PBG038007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
|
)
|
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = (
|
|
"PBG038010" # Total!!Female!!In labor force
|
|
)
|
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = (
|
|
"PBG038014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
|
)
|
|
|
|
# Same fields, Virgin Islands.
|
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = (
|
|
"PBG036003" # Total!!Male!!In labor force
|
|
)
|
|
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = (
|
|
"PBG036007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
|
)
|
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = (
|
|
"PBG036010" # Total!!Female!!In labor force
|
|
)
|
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = (
|
|
"PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
|
)
|
|
|
|
self.UNEMPLOYMENT_FIELD_NAME = (
|
|
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
|
|
)
|
|
|
|
# Race/Ethnicity fields
|
|
self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total
|
|
self.ASIAN_FIELD = "PCT086002" # Total!!Asian
|
|
self.BLACK_FIELD = "PCT086003" # Total!!Black or African American
|
|
self.HAWAIIAN_FIELD = (
|
|
"PCT086004" # Total!!Native Hawaiian and Other Pacific Islander
|
|
)
|
|
# Note that the 2010 census for island araeas does not break out
|
|
# hispanic and non-hispanic white, so this is slightly different from
|
|
# our other demographic data
|
|
self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White
|
|
self.HISPANIC_FIELD = "PCT086006" # Total!!Hispanic or Latino
|
|
self.OTHER_RACE_FIELD = (
|
|
"PCT086007" # Total!!Other Ethnic Origin or Ra
|
|
)
|
|
|
|
self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
|
|
self.BLACK_VI_FIELD = (
|
|
"P003003" # Total!!One race!!Black or African American alone
|
|
)
|
|
self.AMERICAN_INDIAN_VI_FIELD = "P003005" # Total!!One race!!American Indian and Alaska Native alone
|
|
self.ASIAN_VI_FIELD = "P003006" # Total!!One race!!Asian alone
|
|
self.HAWAIIAN_VI_FIELD = "P003007" # Total!!One race!!Native Hawaiian and Other Pacific Islander alone
|
|
self.TWO_OR_MORE_RACES_VI_FIELD = "P003009" # Total!!Two or More Races
|
|
self.NON_HISPANIC_WHITE_VI_FIELD = (
|
|
"P005006" # Total!!Not Hispanic or Latino!!One race!!White alone
|
|
)
|
|
self.HISPANIC_VI_FIELD = "P005002" # Total!!Hispanic or Latino
|
|
self.OTHER_RACE_VI_FIELD = (
|
|
"P003008" # Total!!One race!!Some Other Race alone
|
|
)
|
|
self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
|
|
|
|
self.TOTAL_RACE_POPULATION_FIELD_NAME = (
|
|
"Total population surveyed on racial data"
|
|
)
|
|
self.BLACK_FIELD_NAME = "Black or African American"
|
|
self.AMERICAN_INDIAN_FIELD_NAME = "American Indian / Alaska Native"
|
|
self.ASIAN_FIELD_NAME = "Asian"
|
|
self.HAWAIIAN_FIELD_NAME = "Native Hawaiian or Pacific"
|
|
self.TWO_OR_MORE_RACES_FIELD_NAME = "two or more races"
|
|
self.NON_HISPANIC_WHITE_FIELD_NAME = "White"
|
|
self.HISPANIC_FIELD_NAME = "Hispanic or Latino"
|
|
# Note that `other` is lowercase because the whole field will show up in the download
|
|
# file as "Percent other races"
|
|
self.OTHER_RACE_FIELD_NAME = "other races"
|
|
|
|
# Name output demographics fields.
|
|
self.RE_OUTPUT_FIELDS = [
|
|
self.BLACK_FIELD_NAME,
|
|
self.AMERICAN_INDIAN_FIELD_NAME,
|
|
self.ASIAN_FIELD_NAME,
|
|
self.HAWAIIAN_FIELD_NAME,
|
|
self.TWO_OR_MORE_RACES_FIELD_NAME,
|
|
self.NON_HISPANIC_WHITE_FIELD_NAME,
|
|
self.HISPANIC_FIELD_NAME,
|
|
self.OTHER_RACE_FIELD_NAME,
|
|
]
|
|
|
|
var_list = [
|
|
self.MEDIAN_INCOME_FIELD,
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD,
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD,
|
|
self.TOTAL_POPULATION_FIELD,
|
|
self.MALE_HIGH_SCHOOL_ED_FIELD,
|
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
|
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
|
self.TOTAL_POP_FIELD,
|
|
self.TOTAL_RACE_POPULATION_FIELD,
|
|
self.ASIAN_FIELD,
|
|
self.BLACK_FIELD,
|
|
self.HAWAIIAN_FIELD,
|
|
self.NON_HISPANIC_WHITE_FIELD,
|
|
self.HISPANIC_FIELD,
|
|
self.OTHER_RACE_FIELD,
|
|
]
|
|
var_list = ",".join(var_list)
|
|
|
|
var_list_vi = [
|
|
self.MEDIAN_INCOME_VI_FIELD,
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD,
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD,
|
|
self.TOTAL_POPULATION_VI_FIELD,
|
|
self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
|
|
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE,
|
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD,
|
|
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD,
|
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
|
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
|
|
self.TOTAL_POP_VI_FIELD,
|
|
self.BLACK_VI_FIELD,
|
|
self.AMERICAN_INDIAN_VI_FIELD,
|
|
self.ASIAN_VI_FIELD,
|
|
self.HAWAIIAN_VI_FIELD,
|
|
self.TWO_OR_MORE_RACES_VI_FIELD,
|
|
self.NON_HISPANIC_WHITE_VI_FIELD,
|
|
self.HISPANIC_VI_FIELD,
|
|
self.OTHER_RACE_VI_FIELD,
|
|
self.TOTAL_RACE_POPULATION_VI_FIELD,
|
|
]
|
|
var_list_vi = ",".join(var_list_vi)
|
|
|
|
self.FIELD_NAME_XWALK = {
|
|
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
|
self.MEDIAN_INCOME_VI_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD: self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME,
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD: self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME,
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD: self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
|
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD: self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
|
self.TOTAL_POPULATION_FIELD: self.TOTAL_POPULATION_FIELD_NAME,
|
|
self.TOTAL_POPULATION_VI_FIELD: self.TOTAL_POPULATION_FIELD_NAME,
|
|
self.MALE_HIGH_SCHOOL_ED_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
|
self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
|
self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
|
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
|
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
|
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
|
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
|
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
|
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
|
self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
|
|
self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
|
|
# Note there is no American Indian data for AS/GU/MI
|
|
self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME,
|
|
self.ASIAN_FIELD: self.ASIAN_FIELD_NAME,
|
|
self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME,
|
|
self.BLACK_FIELD: self.BLACK_FIELD_NAME,
|
|
self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME,
|
|
self.HAWAIIAN_FIELD: self.HAWAIIAN_FIELD_NAME,
|
|
self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME,
|
|
self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME,
|
|
self.NON_HISPANIC_WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
|
|
self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
|
|
self.HISPANIC_FIELD: self.HISPANIC_FIELD_NAME,
|
|
self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME,
|
|
self.OTHER_RACE_FIELD: self.OTHER_RACE_FIELD_NAME,
|
|
self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME,
|
|
}
|
|
|
|
# To do: Ask Census Slack Group about whether you need to hardcode the county fips
|
|
# https://uscensusbureau.slack.com/archives/C6DGLC05B/p1635218909012600
|
|
self.ISLAND_TERRITORIES = [
|
|
{
|
|
"state_abbreviation": "as",
|
|
"fips": "60",
|
|
"county_fips": ["010", "020", "030", "040", "050"],
|
|
"var_list": var_list,
|
|
# Note: we hardcode the median income for each territory in this dict,
|
|
# because that data is hard to programmatically access.
|
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 23892,
|
|
},
|
|
{
|
|
"state_abbreviation": "gu",
|
|
"fips": "66",
|
|
"county_fips": ["010"],
|
|
"var_list": var_list,
|
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 48274,
|
|
},
|
|
{
|
|
"state_abbreviation": "mp",
|
|
"fips": "69",
|
|
"county_fips": ["085", "100", "110", "120"],
|
|
"var_list": var_list,
|
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 19958,
|
|
},
|
|
{
|
|
"state_abbreviation": "vi",
|
|
"fips": "78",
|
|
"county_fips": ["010", "020", "030"],
|
|
"var_list": var_list_vi,
|
|
self.TERRITORY_MEDIAN_INCOME_FIELD: 37254,
|
|
},
|
|
]
|
|
|
|
self.API_URL = (
|
|
"https://api.census.gov/data/{}/dec/{}?get=NAME,{}"
|
|
+ "&for=tract:*&in=state:{}%20county:{}"
|
|
)
|
|
|
|
self.final_race_fields: List[str] = []
|
|
|
|
self.df: pd.DataFrame
|
|
self.df_vi: pd.DataFrame
|
|
self.df_all: pd.DataFrame
|
|
|
|
def extract(self) -> None:
|
|
dfs = []
|
|
dfs_vi = []
|
|
for island in self.ISLAND_TERRITORIES:
|
|
logger.info(
|
|
f"Downloading data for state/territory {island['state_abbreviation']}"
|
|
)
|
|
for county in island["county_fips"]:
|
|
api_url = self.API_URL.format(
|
|
self.DECENNIAL_YEAR,
|
|
island["state_abbreviation"],
|
|
island["var_list"],
|
|
island["fips"],
|
|
county,
|
|
)
|
|
logger.debug(f"CENSUS: Requesting {api_url}")
|
|
download = requests.get(
|
|
api_url,
|
|
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
|
|
)
|
|
|
|
df = json.loads(download.content)
|
|
# First row is the header
|
|
df = pd.DataFrame(df[1:], columns=df[0])
|
|
|
|
for col in island["var_list"].split(","):
|
|
# Converting appropriate variables to numeric.
|
|
# Also replacing 0s with NaNs
|
|
df[col] = pd.to_numeric(df[col])
|
|
|
|
# TO-DO: CHECK THIS. I think it makes sense to replace 0 with NaN
|
|
# because for our variables of interest (e.g. Median Household Income,
|
|
# it doesn't make sense for that to be 0.)
|
|
# Likely, it's actually missing but can't find a cite for that in the docs
|
|
df[col] = df[col].replace(0, np.nan)
|
|
|
|
if island["state_abbreviation"] == "vi":
|
|
dfs_vi.append(df)
|
|
else:
|
|
dfs.append(df)
|
|
|
|
self.df = pd.concat(dfs)
|
|
self.df_vi = pd.concat(dfs_vi)
|
|
|
|
def transform(self) -> None:
|
|
logger.info("Starting Census Decennial Transform")
|
|
|
|
# Rename All Fields
|
|
self.df.rename(columns=self.FIELD_NAME_XWALK, inplace=True)
|
|
self.df_vi.rename(columns=self.FIELD_NAME_XWALK, inplace=True)
|
|
|
|
# Combine the dfs after renaming
|
|
self.df_all = pd.concat([self.df, self.df_vi])
|
|
|
|
# Rename total population:
|
|
self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[
|
|
self.TOTAL_POP_FIELD
|
|
]
|
|
|
|
# Percentage of households below 200% which is
|
|
# [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
|
|
self.df_all[
|
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME
|
|
] = (
|
|
self.df_all[
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
|
]
|
|
- self.df_all[self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME]
|
|
) / self.df_all[
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
|
]
|
|
|
|
# Percentage of households below 100% FPL
|
|
# which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, ` Total!!.75 to .99`,
|
|
# and then dividing by PBG083001 (total)
|
|
self.df_all[
|
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME
|
|
] = (
|
|
self.df_all[
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE
|
|
]
|
|
+ self.df_all[
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO
|
|
]
|
|
+ self.df_all[
|
|
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE
|
|
]
|
|
) / self.df_all[
|
|
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
|
]
|
|
|
|
# Percentage High School Achievement is
|
|
# Percentage = (Male + Female) / (Total)
|
|
self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
|
|
self.df_all[self.MALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
|
+ self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
|
) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]
|
|
|
|
# Calculate employment.
|
|
self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = (
|
|
self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD]
|
|
+ self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD]
|
|
) / (
|
|
self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD]
|
|
+ self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD]
|
|
)
|
|
|
|
# Calculate area median income
|
|
median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES)
|
|
median_income_df = median_income_df[
|
|
["fips", self.TERRITORY_MEDIAN_INCOME_FIELD]
|
|
]
|
|
self.df_all = self.df_all.merge(
|
|
right=median_income_df, left_on="state", right_on="fips", how="left"
|
|
)
|
|
self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = (
|
|
self.df_all[self.MEDIAN_INCOME_FIELD_NAME]
|
|
/ self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD]
|
|
)
|
|
|
|
# Creating Geo ID (Census Block Group) Field Name
|
|
self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
|
|
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
|
)
|
|
|
|
# Calculate stats by race
|
|
for race_field_name in self.RE_OUTPUT_FIELDS:
|
|
output_field_name = (
|
|
field_names.PERCENT_PREFIX
|
|
+ race_field_name
|
|
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX
|
|
)
|
|
self.final_race_fields.append(output_field_name)
|
|
self.df_all[output_field_name] = (
|
|
self.df_all[race_field_name]
|
|
/ self.df_all[self.TOTAL_RACE_POPULATION_FIELD_NAME]
|
|
)
|
|
|
|
# Reporting Missing Values
|
|
for col in self.df_all.columns:
|
|
missing_value_count = self.df_all[col].isnull().sum()
|
|
logger.info(
|
|
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
|
|
)
|
|
|
|
def load(self) -> None:
|
|
logger.info("Saving Census Decennial Data")
|
|
|
|
# mkdir census
|
|
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
|
|
|
|
columns_to_include = [
|
|
self.GEOID_TRACT_FIELD_NAME,
|
|
self.TOTAL_POP_FIELD_NAME,
|
|
self.MEDIAN_INCOME_FIELD_NAME,
|
|
self.TERRITORY_MEDIAN_INCOME_FIELD,
|
|
self.AREA_MEDIAN_INCOME_FIELD_NAME,
|
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME,
|
|
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
|
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
|
|
self.UNEMPLOYMENT_FIELD_NAME,
|
|
] + self.final_race_fields
|
|
|
|
self.df_all[columns_to_include].to_csv(
|
|
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
|
|
)
|