{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7f3a9c10",
   "metadata": {},
   "source": [
    "# HUD housing burden ETL\n",
    "\n",
    "Downloads the tract-level CHAS 2012-2016 archive, reads Table 8, and computes for every census tract the share of households that earn at most 80% of HAMFI and spend more than 30% of their income on housing costs. The result is written to `dataset/hud_housing/usa.csv` under the data directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c21b63a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "import re\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import censusdata\n",
    "import pandas as pd\n",
    "\n",
    "# Put the parent directory on sys.path before importing the project-local\n",
    "# `etl` and `utils` modules below.\n",
    "module_path = os.path.abspath(os.path.join(\"..\"))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "from etl.sources.census.etl_utils import get_state_fips_codes\n",
    "from utils import unzip_file_from_url, remove_all_from_dir\n",
    "\n",
    "DATA_PATH = Path.cwd().parent / \"data\"\n",
    "TMP_PATH = DATA_PATH / \"tmp\"\n",
    "OUTPUT_PATH = DATA_PATH / \"dataset\" / \"hud_housing\"\n",
    "\n",
    "# Zip archive holding the CHAS 2012-2016 CSV tables.\n",
    "HOUSING_ZIP_URL = \"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip\"\n",
    "\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "\n",
    "# We measure households earning less than 80% of HUD Area Median Family Income by county\n",
    "# and paying greater than 30% of their income to housing costs.\n",
    "HOUSING_BURDEN_FIELD_NAME = \"Housing burden (percent)\"\n",
    "HOUSING_BURDEN_NUMERATOR_FIELD_NAME = \"HOUSING_BURDEN_NUMERATOR\"\n",
    "HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = \"HOUSING_BURDEN_DENOMINATOR\"\n",
    "\n",
    "# Note: some variable definitions.\n",
    "# HUD-adjusted median family income (HAMFI).\n",
    "# The four housing problems are: incomplete kitchen facilities, incomplete plumbing facilities, more than 1 person per room, and cost burden greater than 30%.\n",
    "# Table 8 is the desired table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6696bc66",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Download the archive and unzip it into a dedicated temp directory.\n",
    "zip_file_dir = TMP_PATH / \"hud_housing\"\n",
    "\n",
    "print(\"Downloading 225MB housing data\")\n",
    "unzip_file_from_url(\n",
    "    HOUSING_ZIP_URL,\n",
    "    TMP_PATH,\n",
    "    zip_file_dir,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e954589",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of Table 8 inside the extracted archive.\n",
    "tmp_csv_file_path = (\n",
    "    zip_file_dir\n",
    "    / \"2012thru2016-140-csv\"\n",
    "    / \"2012thru2016-140-csv\"\n",
    "    / \"140\"\n",
    "    / \"Table8.csv\"\n",
    ")\n",
    "df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "244e0d03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename and reformat the census tract ID.\n",
    "df = df.rename(columns={\"geoid\": GEOID_TRACT_FIELD_NAME})\n",
    "\n",
    "# The CHAS data has census tract ids such as `14000US01001020100`,\n",
    "# whereas the rest of our data uses, for the same tract, `01001020100`.\n",
    "# Strip everything up to and including the first `US`:\n",
    "df[GEOID_TRACT_FIELD_NAME] = df[GEOID_TRACT_FIELD_NAME].str.replace(\n",
    "    r\"^.*?US\", \"\", regex=True\n",
    ")\n",
    "\n",
    "df[GEOID_TRACT_FIELD_NAME].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03250026",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate housing burden\n",
    "# This is quite a number of steps. It does not appear to be accessible nationally in a simpler format, though.\n",
    "# See \"CHAS data dictionary 12-16.xlsx\"\n",
    "\n",
    "# Owner occupied numerator fields\n",
    "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
    "    # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
    "    # T8_est7\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
    "    \"T8_est7\",\n",
    "    # T8_est10\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tgreater than 50%\tAll\n",
    "    \"T8_est10\",\n",
    "    # T8_est20\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
    "    \"T8_est20\",\n",
    "    # T8_est23\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 50%\tAll\n",
    "    \"T8_est23\",\n",
    "    # T8_est33\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
    "    \"T8_est33\",\n",
    "    # T8_est36\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 50%\tAll\n",
    "    \"T8_est36\",\n",
    "]\n",
    "\n",
    "# These columns count the households whose cost burden was not computed, b/c of no or negative income.\n",
    "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
    "    # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
    "    # T8_est13\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est13\",\n",
    "    # T8_est26\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est26\",\n",
    "    # T8_est39\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est39\",\n",
    "    # T8_est52\tSubtotal\tOwner occupied\tgreater than 80% but less than or equal to 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est52\",\n",
    "    # T8_est65\tSubtotal\tOwner occupied\tgreater than 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est65\",\n",
    "]\n",
    "\n",
    "# T8_est2\tSubtotal\tOwner occupied\tAll\tAll\tAll\n",
    "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
    "\n",
    "# Renter occupied numerator fields\n",
    "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
    "    # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
    "    # T8_est73\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
    "    \"T8_est73\",\n",
    "    # T8_est76\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tgreater than 50%\tAll\n",
    "    \"T8_est76\",\n",
    "    # T8_est86\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
    "    \"T8_est86\",\n",
    "    # T8_est89\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 50%\tAll\n",
    "    \"T8_est89\",\n",
    "    # T8_est99\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
    "    \"T8_est99\",\n",
    "    # T8_est102\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 50%\tAll\n",
    "    \"T8_est102\",\n",
    "]\n",
    "\n",
    "# These columns count the households whose cost burden was not computed, b/c of no or negative income.\n",
    "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
    "    # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
    "    # T8_est79\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est79\",\n",
    "    # T8_est92\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est92\",\n",
    "    # T8_est105\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est105\",\n",
    "    # T8_est118\tSubtotal\tRenter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est118\",\n",
    "    # T8_est131\tSubtotal\tRenter occupied\tgreater than 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
    "    \"T8_est131\",\n",
    "]\n",
    "\n",
    "\n",
    "# T8_est68\tSubtotal\tRenter occupied\tAll\tAll\tAll\n",
    "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\"\n",
    "\n",
    "\n",
    "# Math:\n",
    "# (\n",
    "#     # of Owner Occupied Units Meeting Criteria\n",
    "#     + # of Renter Occupied Units Meeting Criteria\n",
    "# )\n",
    "# divided by\n",
    "# (\n",
    "#     Total # of Owner Occupied Units\n",
    "#     + Total # of Renter Occupied Units\n",
    "#     - # of Owner Occupied Units with Cost Burden Not Computed\n",
    "#     - # of Renter Occupied Units with Cost Burden Not Computed\n",
    "# )\n",
    "\n",
    "df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME] = df[OWNER_OCCUPIED_NUMERATOR_FIELDS].sum(\n",
    "    axis=1\n",
    ") + df[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)\n",
    "\n",
    "df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME] = (\n",
    "    df[OWNER_OCCUPIED_POPULATION_FIELD]\n",
    "    + df[RENTER_OCCUPIED_POPULATION_FIELD]\n",
    "    - df[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
    "    - df[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
    ")\n",
    "\n",
    "# TODO: add small sample size checks\n",
    "# A tract whose denominator is not positive has no households to take a share of.\n",
    "# Mask those denominators so the ratio comes out missing (NaN) instead of inf\n",
    "# or a negative \"percent\".\n",
    "housing_burden_denominator = df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(float)\n",
    "df[HOUSING_BURDEN_FIELD_NAME] = df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME].astype(\n",
    "    float\n",
    ") / housing_burden_denominator.where(housing_burden_denominator > 0)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8275c1ef",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Write only the tract ID and the three housing burden fields.\n",
    "df[\n",
    "    [\n",
    "        GEOID_TRACT_FIELD_NAME,\n",
    "        HOUSING_BURDEN_NUMERATOR_FIELD_NAME,\n",
    "        HOUSING_BURDEN_DENOMINATOR_FIELD_NAME,\n",
    "        HOUSING_BURDEN_FIELD_NAME,\n",
    "    ]\n",
    "].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef5bb862",
   "metadata": {},
   "outputs": [],
   "source": [
    "# cleanup\n",
    "# NOTE(review): this is called on the whole TMP_PATH, not only `zip_file_dir`;\n",
    "# confirm nothing else needs files under TMP_PATH.\n",
    "remove_all_from_dir(TMP_PATH)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}