mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 20:21:17 -07:00
Use Census Decennial 2020 data for territories
This commit is contained in:
parent
460449f989
commit
6436dfa683
12 changed files with 767 additions and 513 deletions
|
@ -1,10 +1,10 @@
|
|||
# Decennial Census of Island Areas
|
||||
|
||||
Adding a new ETL folder for Decennial Census of Island Areas since the Island areas aren't included in ACS.
|
||||
This ETL loads the Decennial Census of Island Areas, since the Island Areas aren't included in the ACS.
|
||||
|
||||
There's the American Samoa Summary File, the Guam Summary File, the Commonwealth of the Northern Mariana Islands Summary File, and the US Virgin Islands Summary File:
|
||||
|
||||
1. https://api.census.gov/data/2010/dec/as.html
|
||||
1. https://api.census.gov/data/2010/dec/gu.html
|
||||
1. https://api.census.gov/data/2010/dec/mp.html
|
||||
1. https://api.census.gov/data/2010/dec/vi.html
|
||||
1. https://api.census.gov/data/2020/dec/dhcas.html
|
||||
1. https://api.census.gov/data/2020/dec/dhcgu.html
|
||||
1. https://api.census.gov/data/2020/dec/dhcmp.html
|
||||
1. https://api.census.gov/data/2020/dec/dhcvi.html
|
||||
|
|
|
@ -0,0 +1,190 @@
|
|||
from enum import Enum
|
||||
from data_pipeline.score import field_names
|
||||
|
||||
|
||||
class DEC_FIELD_NAMES(str, Enum):
    """Readable column names for the decennial census data.

    Members mix in ``str``, so each one compares equal to its plain-string
    value.
    """

    # --- Educational attainment -------------------------------------------
    MALE_HIGH_SCHOOL_ED = "Total male high school graduates 25 and over"
    FEMALE_HIGH_SCHOOL_ED = "Total female high school graduates 25 and over"
    IMPUTED_COLLEGE_ATTENDANCE = (
        "Percent enrollment in college, graduate or professional school, "
        "imputed"
    )

    # --- Race / ethnicity -------------------------------------------------
    TOTAL_RACE_POPULATION = "Total population surveyed on racial data"
    BLACK = "Black or African American"
    AMERICAN_INDIAN = "American Indian / Alaska Native"
    ASIAN = "Asian"
    HAWAIIAN = "Native Hawaiian or Pacific"
    TWO_OR_MORE_RACES = "two or more races"
    NON_HISPANIC_WHITE = "White"
    HISPANIC = "Hispanic or Latino"
    OTHER_RACE = "other races"

    # --- Household poverty-level buckets ----------------------------------
    HOUSEHOLD_POVERTY_LEVEL_UNDER_0_5 = "Household poverty level Under 0.50 IN 2019"
    HOUSEHOLD_POVERTY_LEVEL_UNDER_0_74 = "Household poverty level Under 0.74 IN 2019"
    HOUSEHOLD_POVERTY_LEVEL_UNDER_0_99 = "Household poverty level Under 0.99 IN 2019"
    HOUSEHOLD_POVERTY_LEVEL_OVER_2_0 = "Household poverty level Over 2.0 IN 2019"
    TOTAL_HOUSEHOLD_POVERTY_LEVEL = "Total Household poverty level IN 2019"

    # --- Income -----------------------------------------------------------
    TERRITORY_MEDIAN_INCOME = "Territory Median Income"

    # --- Employment -------------------------------------------------------
    # NOTE(review): the *_UNEMPLOYED members carry a "not in labor force"
    # label; confirm the label matches the Census variable they are mapped to.
    EMPLOYMENT_MALE_UNEMPLOYED = "Total males not in labor force"
    EMPLOYMENT_FEMALE_UNEMPLOYED = "Total females not in labor force"
    EMPLOYMENT_MALE_IN_LABOR_FORCE = "Total males in labor force"
    EMPLOYMENT_FEMALE_IN_LABOR_FORCE = "Total females in labor force"

    # --- College attendance -----------------------------------------------
    COLLEGE_ATTENDANCE_TOTAL_ENROLLED = (
        "Total asked enrolled in college or graduate school "
        "(excludes military housing)"
    )
    COLLEGE_NON_ATTENDANCE = (
        "Percent of population not currently enrolled in college, "
        "graduate or professional school"
    )
    COLLEGE_ATTENDANCE_MALE_ENROLLED = (
        "Males enrolled in college or graduate school "
        "(excludes military housing)"
    )
    COLLEGE_ATTENDANCE_FEMALE_ENROLLED = (
        "Females enrolled in college or graduate school "
        "(excludes military housing)"
    )
    COLLEGE_ATTENDANCE_POPULATION = (
        "Population enrolled in college, "
        "graduate or professional school"
    )
    COLLEGE_ATTENDANCE_PERCENT = (
        "Percent enrollment in college, "
        "graduate or professional school"
    )
    # Same value as COLLEGE_NON_ATTENDANCE above, so Enum treats this name as
    # an alias of that member rather than as a distinct member.
    COLLEGE_NON_ATTENDANCE_PERCENT = (
        "Percent of population not currently enrolled in college, "
        "graduate or professional school"
    )

    def __str__(self) -> str:
        """Return the member's plain string value.

        Lets callers use a member wherever its text is needed without
        reaching for ``.value``.
        """
        return str.__str__(self)
|
||||
|
||||
|
||||
# Census variable -> text column name mapping for the variables that every
# territory entry in DEC_TERRITORY_PARAMS includes. Key order is preserved
# because the keys are joined into the API request's variable list.
# For details on Census variables see:
#   https://api.census.gov/data/2020/dec/dhcas/variables.html
#   https://api.census.gov/data/2020/dec/dhcgu/variables.html
#   https://api.census.gov/data/2020/dec/dhcmp/variables.html
#   https://api.census.gov/data/2020/dec/dhcvi/variables.html
__FIELD_NAME_COMMON_XWALK = {
    # Population
    "P1_001N": field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019,
    # Education
    "PBG19_005N": DEC_FIELD_NAMES.MALE_HIGH_SCHOOL_ED,
    "PBG19_012N": DEC_FIELD_NAMES.FEMALE_HIGH_SCHOOL_ED,
    "PCT31_001N": DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_TOTAL_ENROLLED,
    # Employment
    "PBG32_003N": DEC_FIELD_NAMES.EMPLOYMENT_MALE_IN_LABOR_FORCE,
    "PBG32_007N": DEC_FIELD_NAMES.EMPLOYMENT_MALE_UNEMPLOYED,
    "PBG32_010N": DEC_FIELD_NAMES.EMPLOYMENT_FEMALE_IN_LABOR_FORCE,
    "PBG32_014N": DEC_FIELD_NAMES.EMPLOYMENT_FEMALE_UNEMPLOYED,
    # College enrollment by sex
    "PCT34_003N": DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_MALE_ENROLLED,
    "PCT34_016N": DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_FEMALE_ENROLLED,
    # Income
    "PBG43_001N": field_names.CENSUS_DECENNIAL_MEDIAN_INCOME_2019,
    # Household poverty level
    "PBG74_001N": DEC_FIELD_NAMES.TOTAL_HOUSEHOLD_POVERTY_LEVEL,
    "PBG74_002N": DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_UNDER_0_5,
    "PBG74_003N": DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_UNDER_0_74,
    "PBG74_004N": DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_UNDER_0_99,
    "PBG74_010N": DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_OVER_2_0,
}
|
||||
|
||||
# American Samoa specific race fields.
#
# Note that the 2010 census for island areas does not break out
# hispanic and non-hispanic white, so this is slightly different from
# our other demographic data.
# NOTE(review): the variable documentation linked in this module is for the
# 2020 tables; confirm whether the 2010 remark above still applies.
__FIELD_NAME_AS_XWALK = {
    "PCT9_001N": DEC_FIELD_NAMES.TOTAL_RACE_POPULATION,
    "PCT9_003N": DEC_FIELD_NAMES.HAWAIIAN,
    "PCT9_079N": DEC_FIELD_NAMES.ASIAN,
    "PCT9_130N": DEC_FIELD_NAMES.NON_HISPANIC_WHITE,
    "PCT9_155N": DEC_FIELD_NAMES.BLACK,
    "PCT9_180N": DEC_FIELD_NAMES.AMERICAN_INDIAN,
    "PCT9_205N": DEC_FIELD_NAMES.OTHER_RACE,
    "PCT9_230N": DEC_FIELD_NAMES.TWO_OR_MORE_RACES,
    "P5_002N": DEC_FIELD_NAMES.HISPANIC,
}
|
||||
|
||||
# US Virgin Islands specific race fields (Census variable -> column name).
__FIELD_NAME_VI_XWALK = {
    "PCT7_001N": DEC_FIELD_NAMES.TOTAL_RACE_POPULATION,
    "PCT7_003N": DEC_FIELD_NAMES.BLACK,
    "PCT7_205N": DEC_FIELD_NAMES.ASIAN,
    "PCT7_230N": DEC_FIELD_NAMES.AMERICAN_INDIAN,
    "PCT7_255N": DEC_FIELD_NAMES.HAWAIIAN,
    "PCT7_280N": DEC_FIELD_NAMES.OTHER_RACE,
    "PCT7_305N": DEC_FIELD_NAMES.TWO_OR_MORE_RACES,
    "P5_021N": DEC_FIELD_NAMES.NON_HISPANIC_WHITE,
    "PCT6_003N": DEC_FIELD_NAMES.HISPANIC,
}
|
||||
|
||||
# Guam specific race fields (Census variable -> column name).
__FIELD_NAME_GU_XWALK = {
    "PCT10_001N": DEC_FIELD_NAMES.TOTAL_RACE_POPULATION,
    "PCT10_003N": DEC_FIELD_NAMES.HAWAIIAN,
    "PCT10_204N": DEC_FIELD_NAMES.ASIAN,
    "PCT10_330N": DEC_FIELD_NAMES.BLACK,
    "PCT10_355N": DEC_FIELD_NAMES.AMERICAN_INDIAN,
    "PCT10_380N": DEC_FIELD_NAMES.OTHER_RACE,
    "PCT10_405N": DEC_FIELD_NAMES.TWO_OR_MORE_RACES,
    "P5_026N": DEC_FIELD_NAMES.NON_HISPANIC_WHITE,
    "PCT9_003N": DEC_FIELD_NAMES.HISPANIC,
}
|
||||
|
||||
# Northern Mariana Islands specific race fields (Census variable -> column name).
__FIELD_NAME_MP_XWALK = {
    "PCT9_001N": DEC_FIELD_NAMES.TOTAL_RACE_POPULATION,
    "PCT9_003N": DEC_FIELD_NAMES.ASIAN,
    "PCT9_129N": DEC_FIELD_NAMES.HAWAIIAN,
    "PCT9_330N": DEC_FIELD_NAMES.BLACK,
    "PCT9_355N": DEC_FIELD_NAMES.AMERICAN_INDIAN,
    "PCT9_380N": DEC_FIELD_NAMES.OTHER_RACE,
    "PCT9_405N": DEC_FIELD_NAMES.TWO_OR_MORE_RACES,
    "P5_002N": DEC_FIELD_NAMES.HISPANIC,
    "P5_024N": DEC_FIELD_NAMES.NON_HISPANIC_WHITE,
}
|
||||
|
||||
# Race fields to output in the results.
OUTPUT_RACE_FIELDS = [
    DEC_FIELD_NAMES.BLACK,
    DEC_FIELD_NAMES.AMERICAN_INDIAN,
    DEC_FIELD_NAMES.ASIAN,
    DEC_FIELD_NAMES.HAWAIIAN,
    DEC_FIELD_NAMES.TWO_OR_MORE_RACES,
    DEC_FIELD_NAMES.NON_HISPANIC_WHITE,
    DEC_FIELD_NAMES.HISPANIC,
    DEC_FIELD_NAMES.OTHER_RACE,
]
|
||||
|
||||
# List of territories to process. Each entry carries:
#   state_abbreviation - territory abbreviation
#   fips               - territory FIPS code
#   county_fips        - county FIPS codes within the territory
#   xwalk              - common variable mapping merged with the
#                        territory-specific race mapping
#   median_income      - territory-wide median income
#
# Note: we hardcode the median income for each territory in this list,
# because that data is hard to programmatically access.
DEC_TERRITORY_PARAMS = [
    # American Samoa
    {
        "state_abbreviation": "as",
        "fips": "60",
        # https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt
        "county_fips": ["010", "020", "030", "040", "050"],
        "xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK,
        # https://www.ruralhealthinfo.org/states/american-samoa
        "median_income": 26352,
    },
    # Guam
    {
        "state_abbreviation": "gu",
        "fips": "66",
        # https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt
        "county_fips": ["010"],
        "xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK,
        # https://www.ruralhealthinfo.org/states/guam
        # https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile
        "median_income": 58289,
    },
    # Commonwealth of the Northern Mariana Islands
    {
        "state_abbreviation": "mp",
        "fips": "69",
        # https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt
        "county_fips": ["085", "100", "110", "120"],
        "xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK,
        # https://www.ruralhealthinfo.org/states/northern-mariana
        # https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile
        "median_income": 31362,
    },
    # US Virgin Islands
    {
        "state_abbreviation": "vi",
        "fips": "78",
        # https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt
        "county_fips": ["010", "020", "030"],
        "xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK,
        # https://www.ruralhealthinfo.org/states/us-virgin-islands
        "median_income": 40408,
    },
]
|
|
@ -1,14 +1,19 @@
|
|||
import json
|
||||
from typing import List
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import json
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from data_pipeline.etl.sources.census_decennial.constants import (
|
||||
DEC_TERRITORY_PARAMS,
|
||||
DEC_FIELD_NAMES,
|
||||
OUTPUT_RACE_FIELDS,
|
||||
)
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.score import field_names
|
||||
from data_pipeline.utils import get_module_logger
|
||||
from data_pipeline.etl.datasource import DataSource
|
||||
from data_pipeline.etl.datasource import FileDataSource
|
||||
from data_pipeline.score import field_names
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
pd.options.mode.chained_assignment = "raise"
|
||||
|
||||
|
@ -16,514 +21,209 @@ logger = get_module_logger(__name__)
|
|||
|
||||
|
||||
class CensusDecennialETL(ExtractTransformLoad):
|
||||
def __init__(self):
|
||||
self.DECENNIAL_YEAR = 2010
|
||||
self.OUTPUT_PATH = (
|
||||
self.DATA_PATH
|
||||
/ "dataset"
|
||||
/ f"census_decennial_{self.DECENNIAL_YEAR}"
|
||||
)
|
||||
DECENNIAL_YEAR = 2020
|
||||
OUTPUT_PATH = (
|
||||
ExtractTransformLoad.DATA_PATH
|
||||
/ "dataset"
|
||||
/ f"census_decennial_{DECENNIAL_YEAR}"
|
||||
)
|
||||
|
||||
# Income Fields
|
||||
# AS, GU, and MP all share the same variable names, but VI is different
|
||||
# https://api.census.gov/data/2010/dec/as.html
|
||||
# https://api.census.gov/data/2010/dec/gu/variables.html
|
||||
# https://api.census.gov/data/2010/dec/mp/variables.html
|
||||
# https://api.census.gov/data/2010/dec/vi/variables.html
|
||||
|
||||
# Total population field is the same in all island areas
|
||||
self.TOTAL_POP_FIELD = self.TOTAL_POP_VI_FIELD = "P001001"
|
||||
self.TOTAL_POP_FIELD_NAME = "Total population in 2009"
|
||||
|
||||
self.MEDIAN_INCOME_FIELD = "PBG049001"
|
||||
self.MEDIAN_INCOME_VI_FIELD = "PBG047001"
|
||||
self.MEDIAN_INCOME_FIELD_NAME = "Median household income in 2009 ($)"
|
||||
self.AREA_MEDIAN_INCOME_FIELD_NAME = (
|
||||
"Median household income as a percent of "
|
||||
"territory median income in 2009"
|
||||
def __get_api_url(
|
||||
self,
|
||||
state_abbreviation: str,
|
||||
name_list: List[str],
|
||||
fips: str,
|
||||
county: str,
|
||||
) -> str:
|
||||
url = (
|
||||
f"https://api.census.gov/data/{self.DECENNIAL_YEAR}/dec/dhc{state_abbreviation}?get=NAME,{name_list}"
|
||||
+ f"&for=tract:*&in=state:{fips}%20county:{county}"
|
||||
)
|
||||
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD = "Territory Median Income"
|
||||
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD = "PBG083001"
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD = (
|
||||
"PBG077001"
|
||||
)
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"TOTAL; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
||||
)
|
||||
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
||||
)
|
||||
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"Percentage households below 200% of federal poverty line in 2009"
|
||||
)
|
||||
|
||||
# We will combine three fields to get households < 100% FPL.
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE = (
|
||||
"PBG083002" # Total!!Under .50
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO = (
|
||||
"PBG083003" # Total!!.50 to .74
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE = (
|
||||
"PBG083004" # Total!!.75 to .99
|
||||
)
|
||||
|
||||
# Same fields, for Virgin Islands.
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE = (
|
||||
"PBG077002" # Total!!Under .50
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO = (
|
||||
"PBG077003" # Total!!.50 to .74
|
||||
)
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE = (
|
||||
"PBG077004" # Total!!.75 to .99
|
||||
)
|
||||
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD = "PBG083010"
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD = "PBG077010"
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"Total!!2.00 and over; RATIO OF INCOME TO POVERTY LEVEL IN 2009"
|
||||
)
|
||||
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME = (
|
||||
"Percentage households below 100% of federal poverty line in 2009"
|
||||
)
|
||||
|
||||
# High School Education Fields
|
||||
self.TOTAL_POPULATION_FIELD = "PBG026001"
|
||||
self.TOTAL_POPULATION_VI_FIELD = "PCT032001"
|
||||
self.TOTAL_POPULATION_FIELD_NAME = "Total; SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||
|
||||
self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
|
||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
|
||||
self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
||||
"Total!!Male!!High school graduate, GED, or alternative; "
|
||||
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||
)
|
||||
|
||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
|
||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
|
||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = (
|
||||
"Total!!Female!!High school graduate, GED, or alternative; "
|
||||
"SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
|
||||
)
|
||||
|
||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree in 2009"
|
||||
|
||||
# Employment fields
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD = (
|
||||
"PBG038003" # Total!!Male!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD = (
|
||||
"PBG038007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD = (
|
||||
"PBG038010" # Total!!Female!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD = (
|
||||
"PBG038014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
|
||||
# Same fields, Virgin Islands.
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD = (
|
||||
"PBG036003" # Total!!Male!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD = (
|
||||
"PBG036007" # Total!!Male!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD = (
|
||||
"PBG036010" # Total!!Female!!In labor force
|
||||
)
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD = (
|
||||
"PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed
|
||||
)
|
||||
|
||||
self.UNEMPLOYMENT_FIELD_NAME = (
|
||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
|
||||
)
|
||||
|
||||
# Race/Ethnicity fields
|
||||
self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total
|
||||
self.ASIAN_FIELD = "PCT086002" # Total!!Asian
|
||||
self.BLACK_FIELD = "PCT086003" # Total!!Black or African American
|
||||
self.HAWAIIAN_FIELD = (
|
||||
"PCT086004" # Total!!Native Hawaiian and Other Pacific Islander
|
||||
)
|
||||
# Note that the 2010 census for island areas does not break out
|
||||
# hispanic and non-hispanic white, so this is slightly different from
|
||||
# our other demographic data
|
||||
self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White
|
||||
self.HISPANIC_FIELD = "PCT086006" # Total!!Hispanic or Latino
|
||||
self.OTHER_RACE_FIELD = "PCT086007" # Total!!Other Ethnic Origin or Ra
|
||||
|
||||
self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
|
||||
self.BLACK_VI_FIELD = (
|
||||
"P003003" # Total!!One race!!Black or African American alone
|
||||
)
|
||||
self.AMERICAN_INDIAN_VI_FIELD = "P003005" # Total!!One race!!American Indian and Alaska Native alone
|
||||
self.ASIAN_VI_FIELD = "P003006" # Total!!One race!!Asian alone
|
||||
self.HAWAIIAN_VI_FIELD = "P003007" # Total!!One race!!Native Hawaiian and Other Pacific Islander alone
|
||||
self.TWO_OR_MORE_RACES_VI_FIELD = "P003009" # Total!!Two or More Races
|
||||
self.NON_HISPANIC_WHITE_VI_FIELD = (
|
||||
"P005006" # Total!!Not Hispanic or Latino!!One race!!White alone
|
||||
)
|
||||
self.HISPANIC_VI_FIELD = "P005002" # Total!!Hispanic or Latino
|
||||
self.OTHER_RACE_VI_FIELD = (
|
||||
"P003008" # Total!!One race!!Some Other Race alone
|
||||
)
|
||||
self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
|
||||
|
||||
self.TOTAL_RACE_POPULATION_FIELD_NAME = (
|
||||
"Total population surveyed on racial data"
|
||||
)
|
||||
self.BLACK_FIELD_NAME = "Black or African American"
|
||||
self.AMERICAN_INDIAN_FIELD_NAME = "American Indian / Alaska Native"
|
||||
self.ASIAN_FIELD_NAME = "Asian"
|
||||
self.HAWAIIAN_FIELD_NAME = "Native Hawaiian or Pacific"
|
||||
self.TWO_OR_MORE_RACES_FIELD_NAME = "two or more races"
|
||||
self.NON_HISPANIC_WHITE_FIELD_NAME = "White"
|
||||
self.HISPANIC_FIELD_NAME = "Hispanic or Latino"
|
||||
# Note that `other` is lowercase because the whole field will show up in the download
|
||||
# file as "Percent other races"
|
||||
self.OTHER_RACE_FIELD_NAME = "other races"
|
||||
|
||||
# Name output demographics fields.
|
||||
self.RE_OUTPUT_FIELDS = [
|
||||
self.BLACK_FIELD_NAME,
|
||||
self.AMERICAN_INDIAN_FIELD_NAME,
|
||||
self.ASIAN_FIELD_NAME,
|
||||
self.HAWAIIAN_FIELD_NAME,
|
||||
self.TWO_OR_MORE_RACES_FIELD_NAME,
|
||||
self.NON_HISPANIC_WHITE_FIELD_NAME,
|
||||
self.HISPANIC_FIELD_NAME,
|
||||
self.OTHER_RACE_FIELD_NAME,
|
||||
]
|
||||
|
||||
var_list = [
|
||||
self.MEDIAN_INCOME_FIELD,
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD,
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD,
|
||||
self.TOTAL_POPULATION_FIELD,
|
||||
self.MALE_HIGH_SCHOOL_ED_FIELD,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||
self.TOTAL_POP_FIELD,
|
||||
self.TOTAL_RACE_POPULATION_FIELD,
|
||||
self.ASIAN_FIELD,
|
||||
self.BLACK_FIELD,
|
||||
self.HAWAIIAN_FIELD,
|
||||
self.NON_HISPANIC_WHITE_FIELD,
|
||||
self.HISPANIC_FIELD,
|
||||
self.OTHER_RACE_FIELD,
|
||||
]
|
||||
var_list = ",".join(var_list)
|
||||
|
||||
var_list_vi = [
|
||||
self.MEDIAN_INCOME_VI_FIELD,
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD,
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD,
|
||||
self.TOTAL_POPULATION_VI_FIELD,
|
||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
|
||||
self.TOTAL_POP_VI_FIELD,
|
||||
self.BLACK_VI_FIELD,
|
||||
self.AMERICAN_INDIAN_VI_FIELD,
|
||||
self.ASIAN_VI_FIELD,
|
||||
self.HAWAIIAN_VI_FIELD,
|
||||
self.TWO_OR_MORE_RACES_VI_FIELD,
|
||||
self.NON_HISPANIC_WHITE_VI_FIELD,
|
||||
self.HISPANIC_VI_FIELD,
|
||||
self.OTHER_RACE_VI_FIELD,
|
||||
self.TOTAL_RACE_POPULATION_VI_FIELD,
|
||||
]
|
||||
var_list_vi = ",".join(var_list_vi)
|
||||
|
||||
self.FIELD_NAME_XWALK = {
|
||||
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
||||
self.MEDIAN_INCOME_VI_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD: self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_VI_FIELD: self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD: self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_VI_FIELD: self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.TOTAL_POPULATION_FIELD: self.TOTAL_POPULATION_FIELD_NAME,
|
||||
self.TOTAL_POPULATION_VI_FIELD: self.TOTAL_POPULATION_FIELD_NAME,
|
||||
self.MALE_HIGH_SCHOOL_ED_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.MALE_HIGH_SCHOOL_ED_VI_FIELD: self.MALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD: self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_ONE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_TWO: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_VI_PART_THREE: self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
|
||||
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
|
||||
self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
|
||||
self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
|
||||
# Note there is no American Indian data for AS/GU/MP
|
||||
self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME,
|
||||
self.ASIAN_FIELD: self.ASIAN_FIELD_NAME,
|
||||
self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME,
|
||||
self.BLACK_FIELD: self.BLACK_FIELD_NAME,
|
||||
self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME,
|
||||
self.HAWAIIAN_FIELD: self.HAWAIIAN_FIELD_NAME,
|
||||
self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME,
|
||||
self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME,
|
||||
self.NON_HISPANIC_WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
|
||||
self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
|
||||
self.HISPANIC_FIELD: self.HISPANIC_FIELD_NAME,
|
||||
self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME,
|
||||
self.OTHER_RACE_FIELD: self.OTHER_RACE_FIELD_NAME,
|
||||
self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME,
|
||||
}
|
||||
|
||||
# To do: Ask Census Slack Group about whether you need to hardcode the county fips
|
||||
# https://uscensusbureau.slack.com/archives/C6DGLC05B/p1635218909012600
|
||||
self.ISLAND_TERRITORIES = [
|
||||
{
|
||||
"state_abbreviation": "as",
|
||||
"fips": "60",
|
||||
"county_fips": ["010", "020", "030", "040", "050"],
|
||||
"var_list": var_list,
|
||||
# Note: we hardcode the median income for each territory in this dict,
|
||||
# because that data is hard to programmatically access.
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 23892,
|
||||
},
|
||||
{
|
||||
"state_abbreviation": "gu",
|
||||
"fips": "66",
|
||||
"county_fips": ["010"],
|
||||
"var_list": var_list,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 48274,
|
||||
},
|
||||
{
|
||||
"state_abbreviation": "mp",
|
||||
"fips": "69",
|
||||
"county_fips": ["085", "100", "110", "120"],
|
||||
"var_list": var_list,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 19958,
|
||||
},
|
||||
{
|
||||
"state_abbreviation": "vi",
|
||||
"fips": "78",
|
||||
"county_fips": ["010", "020", "030"],
|
||||
"var_list": var_list_vi,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD: 37254,
|
||||
},
|
||||
]
|
||||
|
||||
self.API_URL = (
|
||||
"https://api.census.gov/data/{}/dec/{}?get=NAME,{}"
|
||||
+ "&for=tract:*&in=state:{}%20county:{}"
|
||||
)
|
||||
|
||||
census_api_key = os.environ.get("CENSUS_API_KEY")
|
||||
if census_api_key:
|
||||
self.API_URL = self.API_URL + f"&key={census_api_key}"
|
||||
url += f"&key={census_api_key}"
|
||||
return url
|
||||
|
||||
self.final_race_fields: List[str] = []
|
||||
def __get_destination_path(
    self,
    state_abbreviation: str,
    fips: str,
    county: str,
    test_path: Path = None,
) -> Path:
    """Build the local path of the ``census.json`` file for one county.

    Args:
        state_abbreviation: Territory abbreviation, used as a directory name.
        fips: Territory FIPS code, used as a directory name.
        county: County FIPS code, used as a directory name.
        test_path: Optional root directory to use instead of
            ``self.get_sources_path()``.

    Returns:
        ``<root>/<DECENNIAL_YEAR>/<state_abbreviation>/<fips>/<county>/census.json``
        as a path object (callers open it directly).
    """
    # Fall back to the regular sources directory when no override is given.
    root_path = test_path or self.get_sources_path()
    year_dir = root_path / str(self.DECENNIAL_YEAR)
    county_dir = year_dir / state_abbreviation / fips / county
    return county_dir / "census.json"
|
||||
|
||||
self.df: pd.DataFrame
|
||||
self.df_vi: pd.DataFrame
|
||||
self.df_all: pd.DataFrame
|
||||
|
||||
def get_data_sources(self) -> [DataSource]:
|
||||
def __init__(self):
    """Start with no race output fields and an empty combined frame."""
    self.final_race_fields: List[str] = []
    self.df_all: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
def get_data_sources(self) -> List[DataSource]:
|
||||
sources = []
|
||||
|
||||
for island in self.ISLAND_TERRITORIES:
|
||||
for island in DEC_TERRITORY_PARAMS:
|
||||
for county in island["county_fips"]:
|
||||
|
||||
api_url = self.API_URL.format(
|
||||
self.DECENNIAL_YEAR,
|
||||
api_url = self.__get_api_url(
|
||||
island["state_abbreviation"],
|
||||
island["var_list"],
|
||||
",".join(island["xwalk"].keys()),
|
||||
island["fips"],
|
||||
county,
|
||||
)
|
||||
|
||||
sources.append(
|
||||
FileDataSource(
|
||||
source=api_url,
|
||||
destination=self.get_sources_path()
|
||||
/ str(self.DECENNIAL_YEAR)
|
||||
/ island["state_abbreviation"]
|
||||
/ island["fips"]
|
||||
/ county
|
||||
/ "census.json",
|
||||
api_url,
|
||||
self.__get_destination_path(
|
||||
island["state_abbreviation"], island["fips"], county
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
return sources
|
||||
|
||||
def extract(self, use_cached_data_sources: bool = False) -> None:
|
||||
|
||||
super().extract(
|
||||
use_cached_data_sources
|
||||
) # download and extract data sources
|
||||
|
||||
dfs = []
|
||||
dfs_vi = []
|
||||
for island in self.ISLAND_TERRITORIES:
|
||||
logger.debug(
|
||||
f"Downloading data for state/territory {island['state_abbreviation']}"
|
||||
)
|
||||
for county in island["county_fips"]:
|
||||
|
||||
def extract(
|
||||
self,
|
||||
use_cached_data_sources: bool = False,
|
||||
test_territory_params=None,
|
||||
test_path: Path = None,
|
||||
) -> None:
|
||||
super().extract(use_cached_data_sources)
|
||||
for territory in test_territory_params or DEC_TERRITORY_PARAMS:
|
||||
for county in territory["county_fips"]:
|
||||
abbr = territory["state_abbreviation"]
|
||||
file_path = self.__get_destination_path(
|
||||
abbr, territory["fips"], county, test_path=test_path
|
||||
)
|
||||
try:
|
||||
filepath = (
|
||||
self.get_sources_path()
|
||||
/ str(self.DECENNIAL_YEAR)
|
||||
/ island["state_abbreviation"]
|
||||
/ island["fips"]
|
||||
/ county
|
||||
/ "census.json"
|
||||
)
|
||||
df = json.load(filepath.open())
|
||||
except ValueError as e:
|
||||
json_data = json.load(file_path.open())
|
||||
except (FileNotFoundError, ValueError) as e:
|
||||
logger.error(
|
||||
f"Could not load content in census decennial ETL because {e}."
|
||||
)
|
||||
raise
|
||||
df = pd.DataFrame(json_data[1:], columns=json_data[0])
|
||||
# Rename the columns to their common names
|
||||
df.rename(columns=territory["xwalk"], inplace=True)
|
||||
|
||||
# First row is the header
|
||||
df = pd.DataFrame(df[1:], columns=df[0])
|
||||
# Convert columns to numeric where applicable
|
||||
for column in df.columns:
|
||||
if column not in ["state", "county", "NAME", "tract"]:
|
||||
df[column] = pd.to_numeric(df[column], errors="ignore")
|
||||
|
||||
for col in island["var_list"].split(","):
|
||||
# Converting appropriate variables to numeric.
|
||||
# Also replacing 0s with NaNs
|
||||
df[col] = pd.to_numeric(df[col])
|
||||
# Add the territory median income
|
||||
df.loc[
|
||||
df[field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019]
|
||||
> 0,
|
||||
DEC_FIELD_NAMES.TERRITORY_MEDIAN_INCOME,
|
||||
] = territory["median_income"]
|
||||
self.df_all = pd.concat([self.df_all, df])
|
||||
|
||||
# TO-DO: CHECK THIS. I think it makes sense to replace 0 with NaN
|
||||
# because for our variables of interest (e.g. Median Household Income,
|
||||
# it doesn't make sense for that to be 0.)
|
||||
# Likely, it's actually missing but can't find a cite for that in the docs
|
||||
df[col] = df[col].replace(0, np.nan)
|
||||
|
||||
if island["state_abbreviation"] == "vi":
|
||||
dfs_vi.append(df)
|
||||
else:
|
||||
dfs.append(df)
|
||||
|
||||
self.df = pd.concat(dfs)
|
||||
self.df_vi = pd.concat(dfs_vi)
|
||||
def _merge_tracts_2010_compatibility(self):
    """Fold the split MP tract back into 2010 tract 69120950200.

    Tract 69120950200 (MP, state 69 / county 120) shows up in this data as
    69120950201 and 69120950202. Per the original note, 69120950202 has no
    data, so its rows are discarded and 69120950201 is relabelled with the
    2010 GEOID.

    Reads and reassigns ``self.df_all``; returns nothing.
    """
    geoid_column = field_names.GEOID_TRACT_FIELD

    # Filter with a boolean mask instead of ``drop(<index labels>)``.
    # ``df_all`` is assembled by concatenating per-county frames without
    # resetting the index, so index labels can repeat; a label-based drop
    # would also remove unrelated rows that happen to share the label.
    keep_rows = self.df_all[geoid_column] != "69120950202"
    # ``.copy()`` keeps the follow-up assignment from being flagged as a
    # write to a slice of the previous frame.
    self.df_all = self.df_all.loc[keep_rows].copy()

    # Relabel the surviving half of the split tract with the 2010 GEOID.
    self.df_all.loc[
        self.df_all[geoid_column] == "69120950201",
        geoid_column,
    ] = "69120950200"
|
||||
|
||||
def transform(self) -> None:
|
||||
# Rename All Fields
|
||||
self.df.rename(columns=self.FIELD_NAME_XWALK, inplace=True)
|
||||
self.df_vi.rename(columns=self.FIELD_NAME_XWALK, inplace=True)
|
||||
# Creating Geo ID (Census Tract) Field Name
|
||||
self.df_all[field_names.GEOID_TRACT_FIELD] = (
|
||||
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
||||
)
|
||||
|
||||
# Combine the dfs after renaming
|
||||
self.df_all = pd.concat([self.df, self.df_vi])
|
||||
# Combine the two MP 2020 tracts that were split from one 2010 tract
|
||||
self._merge_tracts_2010_compatibility()
|
||||
|
||||
# Rename total population:
|
||||
self.df_all[self.TOTAL_POP_FIELD_NAME] = self.df_all[
|
||||
self.TOTAL_POP_FIELD
|
||||
# Replace invalid numeric values with NaN
|
||||
numeric_columns = self.df_all.select_dtypes(include="number").columns
|
||||
for num_column in numeric_columns:
|
||||
self.df_all.loc[self.df_all[num_column] < -999, num_column] = np.nan
|
||||
|
||||
# Percentage of households below 100% FPL
|
||||
self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019
|
||||
] = (
|
||||
self.df_all[DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_UNDER_0_5]
|
||||
+ self.df_all[DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_UNDER_0_74]
|
||||
+ self.df_all[DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_UNDER_0_99]
|
||||
) / self.df_all[
|
||||
DEC_FIELD_NAMES.TOTAL_HOUSEHOLD_POVERTY_LEVEL
|
||||
]
|
||||
|
||||
# Percentage of households below 200% which is
|
||||
# [PBG083001 (total) - PBG083010 (num households over 200%)] / PBG083001 (total)
|
||||
self.df_all[
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
] = (
|
||||
self.df_all[
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||
]
|
||||
- self.df_all[self.HOUSEHOLD_OVER_200_PERC_POVERTY_LEVEL_FIELD_NAME]
|
||||
self.df_all[DEC_FIELD_NAMES.TOTAL_HOUSEHOLD_POVERTY_LEVEL]
|
||||
- self.df_all[DEC_FIELD_NAMES.HOUSEHOLD_POVERTY_LEVEL_OVER_2_0]
|
||||
) / self.df_all[
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||
]
|
||||
|
||||
# Percentage of households below 100% FPL
|
||||
# which we get by adding `Total!!Under .50`, `Total!!.50 to .74`, ` Total!!.75 to .99`,
|
||||
# and then dividing by PBG083001 (total)
|
||||
self.df_all[
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME
|
||||
] = (
|
||||
self.df_all[
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_ONE
|
||||
]
|
||||
+ self.df_all[
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_TWO
|
||||
]
|
||||
+ self.df_all[
|
||||
self.HOUSEHOLD_UNDER_100_PERC_POVERTY_LEVEL_FIELD_PART_THREE
|
||||
]
|
||||
) / self.df_all[
|
||||
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD_NAME
|
||||
DEC_FIELD_NAMES.TOTAL_HOUSEHOLD_POVERTY_LEVEL
|
||||
]
|
||||
|
||||
# Percentage High School Achievement is
|
||||
# Percentage = (Male + Female) / (Total)
|
||||
self.df_all[self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME] = (
|
||||
self.df_all[self.MALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
||||
+ self.df_all[self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME]
|
||||
) / self.df_all[self.TOTAL_POPULATION_FIELD_NAME]
|
||||
self.df_all[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019] = (
|
||||
self.df_all[DEC_FIELD_NAMES.MALE_HIGH_SCHOOL_ED]
|
||||
+ self.df_all[DEC_FIELD_NAMES.FEMALE_HIGH_SCHOOL_ED]
|
||||
) / self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||
]
|
||||
|
||||
# Calculate the unemployment rate (unemployed / in labor force).
|
||||
self.df_all[self.UNEMPLOYMENT_FIELD_NAME] = (
|
||||
self.df_all[self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD]
|
||||
+ self.df_all[self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD]
|
||||
self.df_all[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019] = (
|
||||
self.df_all[DEC_FIELD_NAMES.EMPLOYMENT_MALE_UNEMPLOYED]
|
||||
+ self.df_all[DEC_FIELD_NAMES.EMPLOYMENT_FEMALE_UNEMPLOYED]
|
||||
) / (
|
||||
self.df_all[self.EMPLOYMENT_MALE_IN_LABOR_FORCE_FIELD]
|
||||
+ self.df_all[self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD]
|
||||
self.df_all[DEC_FIELD_NAMES.EMPLOYMENT_MALE_IN_LABOR_FORCE]
|
||||
+ self.df_all[DEC_FIELD_NAMES.EMPLOYMENT_FEMALE_IN_LABOR_FORCE]
|
||||
)
|
||||
|
||||
# Calculate tract median income as a fraction of the territory median income
|
||||
median_income_df = pd.DataFrame(self.ISLAND_TERRITORIES)
|
||||
median_income_df = median_income_df[
|
||||
["fips", self.TERRITORY_MEDIAN_INCOME_FIELD]
|
||||
]
|
||||
self.df_all = self.df_all.merge(
|
||||
right=median_income_df, left_on="state", right_on="fips", how="left"
|
||||
)
|
||||
self.df_all[self.AREA_MEDIAN_INCOME_FIELD_NAME] = (
|
||||
self.df_all[self.MEDIAN_INCOME_FIELD_NAME]
|
||||
/ self.df_all[self.TERRITORY_MEDIAN_INCOME_FIELD]
|
||||
self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019
|
||||
] = (
|
||||
self.df_all[field_names.CENSUS_DECENNIAL_MEDIAN_INCOME_2019]
|
||||
/ self.df_all[DEC_FIELD_NAMES.TERRITORY_MEDIAN_INCOME]
|
||||
)
|
||||
|
||||
# Creating Geo ID (Census Tract) Field Name
|
||||
self.df_all[self.GEOID_TRACT_FIELD_NAME] = (
|
||||
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
||||
# Calculate college attendance
|
||||
self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION] = (
|
||||
self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_MALE_ENROLLED]
|
||||
+ self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_FEMALE_ENROLLED]
|
||||
)
|
||||
self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT] = (
|
||||
self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_MALE_ENROLLED]
|
||||
+ self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_FEMALE_ENROLLED]
|
||||
) / self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_TOTAL_ENROLLED]
|
||||
self.df_all[DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE_PERCENT] = (
|
||||
1 - self.df_all[DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT]
|
||||
)
|
||||
|
||||
# Calculate stats by race
|
||||
for race_field_name in self.RE_OUTPUT_FIELDS:
|
||||
for race_field_name in OUTPUT_RACE_FIELDS:
|
||||
output_field_name = (
|
||||
field_names.PERCENT_PREFIX
|
||||
+ race_field_name
|
||||
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX
|
||||
# 2010 vs 2020 WARNING
|
||||
# We must keep the old 2009 date to make it compatible with all the other 2010 data
|
||||
+ f" in {field_names.DEC_DATA_YEAR}"
|
||||
)
|
||||
self.final_race_fields.append(output_field_name)
|
||||
self.df_all[output_field_name] = (
|
||||
self.df_all[race_field_name]
|
||||
/ self.df_all[self.TOTAL_RACE_POPULATION_FIELD_NAME]
|
||||
/ self.df_all[DEC_FIELD_NAMES.TOTAL_RACE_POPULATION]
|
||||
)
|
||||
self.final_race_fields.append(output_field_name)
|
||||
|
||||
# Reporting Missing Values
|
||||
for col in self.df_all.columns:
|
||||
|
@ -533,21 +233,21 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
)
|
||||
|
||||
def load(self) -> None:
|
||||
# mkdir census
|
||||
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
columns_to_include = [
|
||||
self.GEOID_TRACT_FIELD_NAME,
|
||||
self.TOTAL_POP_FIELD_NAME,
|
||||
self.MEDIAN_INCOME_FIELD_NAME,
|
||||
self.TERRITORY_MEDIAN_INCOME_FIELD,
|
||||
self.AREA_MEDIAN_INCOME_FIELD_NAME,
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_100_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
|
||||
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
|
||||
self.UNEMPLOYMENT_FIELD_NAME,
|
||||
field_names.GEOID_TRACT_FIELD,
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_MEDIAN_INCOME_2019,
|
||||
DEC_FIELD_NAMES.TERRITORY_MEDIAN_INCOME,
|
||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
|
||||
DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE,
|
||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
|
||||
] + self.final_race_fields
|
||||
|
||||
self.df_all[columns_to_include].to_csv(
|
||||
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
|
||||
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue