In [None]:
import pandas as pd
import censusdata
import csv
from pathlib import Path
import os
import re
import sys

# Put the notebook's parent directory on the import path; presumably this is
# what lets the project-local imports below (`etl`, `utils`) resolve — confirm
# against the repository layout.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from etl.sources.census.etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, remove_all_from_dir

# All paths hang off the `data` folder that sits next to the notebook's
# working directory.
DATA_PATH = Path.cwd().parent.joinpath("data")
TMP_PATH = DATA_PATH.joinpath("tmp")
OUTPUT_PATH = DATA_PATH.joinpath("dataset", "hud_housing")

# Name of the census tract ID column in the output.
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

# Housing burden: households earning at most 80% of the HUD Area Median Family
# Income (by county) that pay more than 30% of their income in housing costs.
# Note that the notebook stores numerator / denominator under the "(percent)"
# name without multiplying by 100.
HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
HOUSING_BURDEN_NUMERATOR_FIELD_NAME = "HOUSING_BURDEN_NUMERATOR"
HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = "HOUSING_BURDEN_DENOMINATOR"

# Glossary:
# - HAMFI: HUD-adjusted median family income.
# - The four housing problems are: incomplete kitchen facilities, incomplete
#   plumbing facilities, more than 1 person per room, and cost burden greater
#   than 30%.
# - Table 8 of the CHAS download is the table this notebook reads.

In [None]:
# Download the HUD CHAS 2012-2016 archive. `unzip_file_from_url` is a project
# helper; presumably it downloads the zip via TMP_PATH and extracts it into
# `zip_file_dir` (the next cell reads the CSV from there) — confirm against
# utils.unzip_file_from_url.
zip_file_dir = TMP_PATH / "hud_housing"

# Plain string literal: there is nothing to interpolate, so no f-prefix.
print("Downloading 225MB housing data")
unzip_file_from_url(
    "https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip",
    TMP_PATH,
    zip_file_dir,
)

In [None]:
# Locate Table 8 inside the extracted archive; the path repeats the archive's
# folder name twice before reaching the `140` subfolder.
tmp_csv_file_path = zip_file_dir.joinpath(
    "2012thru2016-140-csv",
    "2012thru2016-140-csv",
    "140",
    "Table8.csv",
)
df = pd.read_csv(tmp_csv_file_path)

# Preview the raw table.
df.head()

In [None]:
# Rename and reformat the census tract ID.
# Rebinding `df` (rather than `inplace=True`) keeps the cell a plain
# input -> output step; renaming is a no-op if `geoid` is already renamed,
# so the cell stays safe to re-run.
df = df.rename(columns={"geoid": GEOID_TRACT_FIELD_NAME})

# The CHAS data has census tract ids such as `14000US01001020100`,
# whereas the rest of our data uses, for the same tract, `01001020100`.
# Remove everything up to and including the first `US`; values without a
# `US` marker are left unchanged by the pattern.
df[GEOID_TRACT_FIELD_NAME] = df[GEOID_TRACT_FIELD_NAME].str.replace(
    r"^.*?US", "", regex=True
)

# Preview the cleaned IDs.
df[GEOID_TRACT_FIELD_NAME].head()

In [None]:
# Calculate housing burden
# This is quite a number of steps. It does not appear to be accessible nationally in a simpler format, though.
# See "CHAS data dictionary 12-16.xlsx"
#
# The constants below name the Table 8 columns that feed the calculation.
# Each column ID is preceded by its row from the data dictionary, in the
# tab-separated layout given by the "Key:" line.

# Owner occupied numerator fields:
# the three income bands up to 80% of HAMFI, each with both cost-burden bands above 30%.
OWNER_OCCUPIED_NUMERATOR_FIELDS = [
 # Key: Column Name	Line_Type	Tenure	Household income	Cost burden	Facilities
 # T8_est7	Subtotal	Owner occupied	less than or equal to 30% of HAMFI	greater than 30% but less than or equal to 50%	All
 "T8_est7",
 # T8_est10	Subtotal	Owner occupied	less than or equal to 30% of HAMFI	greater than 50%	All
 "T8_est10",
 # T8_est20	Subtotal	Owner occupied	greater than 30% but less than or equal to 50% of HAMFI	greater than 30% but less than or equal to 50%	All
 "T8_est20",
 # T8_est23	Subtotal	Owner occupied	greater than 30% but less than or equal to 50% of HAMFI	greater than 50%	All
 "T8_est23",
 # T8_est33	Subtotal	Owner occupied	greater than 50% but less than or equal to 80% of HAMFI	greater than 30% but less than or equal to 50%	All
 "T8_est33",
 # T8_est36	Subtotal	Owner occupied	greater than 50% but less than or equal to 80% of HAMFI	greater than 50%	All
 "T8_est36",
]

# These columns hold the counts where the cost burden was not computed, b/c of no or negative income.
# They cover every income band (including those above 80% of HAMFI).
OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [
 # Key: Column Name	Line_Type	Tenure	Household income	Cost burden	Facilities
 # T8_est13	Subtotal	Owner occupied	less than or equal to 30% of HAMFI	not computed (no/negative income)	All
 "T8_est13",
 # T8_est26	Subtotal	Owner occupied	greater than 30% but less than or equal to 50% of HAMFI	not computed (no/negative income)	All
 "T8_est26",
 # T8_est39	Subtotal	Owner occupied	greater than 50% but less than or equal to 80% of HAMFI	not computed (no/negative income)	All
 "T8_est39",
 # T8_est52	Subtotal	Owner occupied	greater than 80% but less than or equal to 100% of HAMFI	not computed (no/negative income)	All
 "T8_est52",
 # T8_est65	Subtotal	Owner occupied	greater than 100% of HAMFI	not computed (no/negative income)	All
 "T8_est65",
]

# Total owner occupied count across all income and cost-burden bands.
# T8_est2	Subtotal	Owner occupied	All	All	All
OWNER_OCCUPIED_POPULATION_FIELD = "T8_est2"

# Renter occupied numerator fields:
# the three income bands up to 80% of HAMFI, each with both cost-burden bands above 30%.
RENTER_OCCUPIED_NUMERATOR_FIELDS = [
 # Key: Column Name	Line_Type	Tenure	Household income	Cost burden	Facilities
 # T8_est73	Subtotal	Renter occupied	less than or equal to 30% of HAMFI	greater than 30% but less than or equal to 50%	All
 "T8_est73",
 # T8_est76	Subtotal	Renter occupied	less than or equal to 30% of HAMFI	greater than 50%	All
 "T8_est76",
 # T8_est86	Subtotal	Renter occupied	greater than 30% but less than or equal to 50% of HAMFI	greater than 30% but less than or equal to 50%	All
 "T8_est86",
 # T8_est89	Subtotal	Renter occupied	greater than 30% but less than or equal to 50% of HAMFI	greater than 50%	All
 "T8_est89",
 # T8_est99	Subtotal	Renter occupied	greater than 50% but less than or equal to 80% of HAMFI	greater than 30% but less than or equal to 50%	All
 "T8_est99",
 # T8_est102	Subtotal	Renter occupied	greater than 50% but less than or equal to 80% of HAMFI	greater than 50%	All
 "T8_est102",
]

# These columns hold the counts where the cost burden was not computed, b/c of no or negative income.
# They cover every income band (including those above 80% of HAMFI).
RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [
 # Key: Column Name	Line_Type	Tenure	Household income	Cost burden	Facilities
 # T8_est79	Subtotal	Renter occupied	less than or equal to 30% of HAMFI	not computed (no/negative income)	All
 "T8_est79",
 # T8_est92	Subtotal	Renter occupied	greater than 30% but less than or equal to 50% of HAMFI	not computed (no/negative income)	All
 "T8_est92",
 # T8_est105	Subtotal	Renter occupied	greater than 50% but less than or equal to 80% of HAMFI	not computed (no/negative income)	All
 "T8_est105",
 # T8_est118	Subtotal	Renter occupied	greater than 80% but less than or equal to 100% of HAMFI	not computed (no/negative income)	All
 "T8_est118",
 # T8_est131	Subtotal	Renter occupied	greater than 100% of HAMFI	not computed (no/negative income)	All
 "T8_est131",
]


# Total renter occupied count across all income and cost-burden bands.
# T8_est68	Subtotal	Renter occupied	All	All	All
RENTER_OCCUPIED_POPULATION_FIELD = "T8_est68"


# Housing burden =
#     (burdened owner occupied units + burdened renter occupied units)
#   / (all owner occupied units + all renter occupied units
#      - owner occupied units whose cost burden was not computed
#      - renter occupied units whose cost burden was not computed)

# Numerator: row-wise totals of the "meets criteria" columns per tenure.
owner_burdened = df[OWNER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)
renter_burdened = df[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)
df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME] = owner_burdened + renter_burdened

# Denominator: every occupied unit, minus the units with no computed burden.
owner_not_computed = df[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)
renter_not_computed = df[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)
all_occupied = (
    df[OWNER_OCCUPIED_POPULATION_FIELD] + df[RENTER_OCCUPIED_POPULATION_FIELD]
)
df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME] = (
    all_occupied - owner_not_computed - renter_not_computed
)

# TODO: add small sample size checks
# Plain ratio of the two columns; a zero denominator is not special-cased.
burden_numerator = df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME].astype(float)
burden_denominator = df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(float)
df[HOUSING_BURDEN_FIELD_NAME] = burden_numerator.div(burden_denominator)

df.head()

In [None]:
# Make sure the destination folder exists before writing.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Only the tract ID and the three housing burden fields are exported;
# every other column is left out of the file.
output_columns = [
    GEOID_TRACT_FIELD_NAME,
    HOUSING_BURDEN_NUMERATOR_FIELD_NAME,
    HOUSING_BURDEN_DENOMINATOR_FIELD_NAME,
    HOUSING_BURDEN_FIELD_NAME,
]
df.loc[:, output_columns].to_csv(OUTPUT_PATH / "usa.csv", index=False)

In [None]:
# Cleanup: hand the temporary directory to the project helper once the CSV is written.
# NOTE(review): this passes TMP_PATH itself rather than the `hud_housing`
# subfolder used for the download, so presumably anything else under
# data/tmp is removed as well — confirm against utils.remove_all_from_dir.
remove_all_from_dir(TMP_PATH)