From f80d8c1880dc0e383eec8a6000be3dab34d4ba6a Mon Sep 17 00:00:00 2001 From: Saran Ahluwalia Date: Sat, 11 Dec 2021 08:28:01 -0500 Subject: [PATCH] added revisions including ETL process for table 8 acquisition --- ...da_se_12_09_2021-revised-denominator.ipynb | 349 +++++++++++++++++- .../ipython/hud_eda_se_12_09_2021.ipynb | 78 +++- 2 files changed, 407 insertions(+), 20 deletions(-) diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021-revised-denominator.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021-revised-denominator.ipynb index 3327ed80..fc53ac72 100644 --- a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021-revised-denominator.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021-revised-denominator.ipynb @@ -26,13 +26,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import math\n", "import numpy as np\n", "import os\n", + "import requests\n", + "import sys\n", + "import zipfile\n", + "import shutil\n", + "from pathlib import Path\n", "import pandas as pd" ] }, @@ -70,6 +75,63 @@ " " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ETL - you only need to run this cell block once to replicate the subsequent notebooks in this draft PR\n", + "\n", + "##### Once run, Table 8 will be in the notebook's current working directory" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Copy and adapt certain sections of code from data_pipeline.utils \n", + "\n", + "def download_hud_dataset():\n", + " DOWNLOAD_FILENAME = \"HUD_ZIPPED.csv\"\n", + " HOUSING_FTP_URL = \"https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip\" \n", + " response = requests.get(HOUSING_FTP_URL, verify=True)\n", + " if response.status_code == 200:\n", + " file_contents = response.content\n", + " else:\n", + " sys.exit(\n", + " f\"HTTP response 
{response.status_code} from url {HOUSING_FTP_URL}. Info: {response.content}\"\n", + " )\n", + "\n", + " # Write the contents to disk.\n", + " file = open(DOWNLOAD_FILENAME, \"wb\")\n", + " file.write(file_contents)\n", + " file.close()\n", + " \n", + "def extract_zipped_download(zip_file_path, unzipped_path):\n", + " with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(unzipped_path)\n", + " # cleanup temporary file\n", + " os.remove(zip_file_path)\n", + " \n", + "def up_one_directory(path):\n", + " try:\n", + " # from Python 3.6\n", + " parent_dir = Path(path).parents[1]\n", + " # for Python 3.4/3.5, use str to convert the path to string\n", + " # parent_dir = str(Path(path).parents[1])\n", + " shutil.move(path, parent_dir)\n", + " except IndexError:\n", + " # no upper directory\n", + " pass\n", + "\n", + "CURRENT_DIRECTORY = os.getcwd()\n", + "download_hud_dataset()\n", + "extract_zipped_download(CURRENT_DIRECTORY + \"/HUD_ZIPPED.csv\", CURRENT_DIRECTORY) \n", + "up_one_directory(CURRENT_DIRECTORY + \"/140/Table8.csv\")\n", + "shutil.rmtree(\"./140/\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -79,16 +141,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Read in the data from https://www.huduser.gov/portal/datasets/cp.html\n", "housing = pd.read_csv(\"Table8.csv\", \n", - " encoding = \"ISO-8859-1\", \n", + " encoding=\"latin-1\", \n", " dtype = {'Tract_ID': object, 'st': object, 'geoid': object})\n", "\n", - "\n", "# Remove data for states that aren't included in the census (e.g. 
American Samoa, Guam, etc.):\n", "housing.drop(housing.loc[housing['st'] == '72'].index, inplace = True)\n", "\n", @@ -285,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -325,7 +386,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -396,7 +457,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -469,7 +530,7 @@ "4 885 4135 1090" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -487,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -496,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -506,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -523,7 +584,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -652,7 +713,7 @@ "4 0.81 " ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -660,6 +721,268 @@ "source": [ "housing_df.head()" ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 8))\n", + "plt.title('Relative Housing Burden for Low-Income Hosuing Only')\n", + "# Set x-axis label\n", + "plt.xlabel('Ratio')\n", + "# Set y-axis label\n", + "plt.ylabel('Frequency')\n", + "\n", + "sns.histplot(housing_df[\"ratio_post\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameFIPS_tract_idratio_post
4203Census Tract 3923, Contra Costa County, Califo...14000US060133923001.01
4617Census Tract 38.06, Kern County, California14000US060290038061.03
5262Census Tract 2060.20, Los Angeles County, Cali...14000US060372060201.07
5565Census Tract 2626.01, Los Angeles County, Cali...14000US060372626011.06
12908Census Tract 303, Fairfield County, Connecticut14000US090010303001.04
14086Census Tract 102, District of Columbia, Distri...14000US110010102001.02
17668Census Tract 273.23, Pinellas County, Florida14000US121030273231.02
18102Census Tract 208.10, Seminole County, Florida14000US121170208101.03
19796Census Tract 101.02, Liberty County, Georgia14000US131790101021.06
21166Census Tract 507, Cook County, Illinois14000US170310507001.03
30658Census Tract 7053, Montgomery County, Maryland14000US240317053001.09
41123Census Tract 6075.04, Camden County, New Jersey14000US340076075041.05
42632Census Tract 363.02, Union County, New Jersey14000US340390363021.02
45335Census Tract 3033.01, Nassau County, New York14000US360593033011.02
51549Census Tract 69.10, Franklin County, Ohio14000US390490069101.10
62114Census Tract 1917.01, Bexar County, Texas14000US480291917011.10
66504Census Tract 22.12, Travis County, Texas14000US484530022121.02
67740Census Tract 1016.03, Arlington County, Virginia14000US510131016031.01
69094Census Tract 111, Hampton city, Virginia14000US516500111001.13
69486Census Tract 432, Virginia Beach city, Virginia14000US518100432001.04
\n", + "
" + ], + "text/plain": [ + " name FIPS_tract_id \\\n", + "4203 Census Tract 3923, Contra Costa County, Califo... 14000US06013392300 \n", + "4617 Census Tract 38.06, Kern County, California 14000US06029003806 \n", + "5262 Census Tract 2060.20, Los Angeles County, Cali... 14000US06037206020 \n", + "5565 Census Tract 2626.01, Los Angeles County, Cali... 14000US06037262601 \n", + "12908 Census Tract 303, Fairfield County, Connecticut 14000US09001030300 \n", + "14086 Census Tract 102, District of Columbia, Distri... 14000US11001010200 \n", + "17668 Census Tract 273.23, Pinellas County, Florida 14000US12103027323 \n", + "18102 Census Tract 208.10, Seminole County, Florida 14000US12117020810 \n", + "19796 Census Tract 101.02, Liberty County, Georgia 14000US13179010102 \n", + "21166 Census Tract 507, Cook County, Illinois 14000US17031050700 \n", + "30658 Census Tract 7053, Montgomery County, Maryland 14000US24031705300 \n", + "41123 Census Tract 6075.04, Camden County, New Jersey 14000US34007607504 \n", + "42632 Census Tract 363.02, Union County, New Jersey 14000US34039036302 \n", + "45335 Census Tract 3033.01, Nassau County, New York 14000US36059303301 \n", + "51549 Census Tract 69.10, Franklin County, Ohio 14000US39049006910 \n", + "62114 Census Tract 1917.01, Bexar County, Texas 14000US48029191701 \n", + "66504 Census Tract 22.12, Travis County, Texas 14000US48453002212 \n", + "67740 Census Tract 1016.03, Arlington County, Virginia 14000US51013101603 \n", + "69094 Census Tract 111, Hampton city, Virginia 14000US51650011100 \n", + "69486 Census Tract 432, Virginia Beach city, Virginia 14000US51810043200 \n", + "\n", + " ratio_post \n", + "4203 1.01 \n", + "4617 1.03 \n", + "5262 1.07 \n", + "5565 1.06 \n", + "12908 1.04 \n", + "14086 1.02 \n", + "17668 1.02 \n", + "18102 1.03 \n", + "19796 1.06 \n", + "21166 1.03 \n", + "30658 1.09 \n", + "41123 1.05 \n", + "42632 1.02 \n", + "45335 1.02 \n", + "51549 1.10 \n", + "62114 1.10 \n", + "66504 1.02 \n", + "67740 1.01 \n", + 
"69094 1.13 \n", + "69486 1.04 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "housing_df[housing_df[\"ratio_post\"] > 1][['name', \"FIPS_tract_id\", 'ratio_post']]" + ] } ], "metadata": { diff --git a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021.ipynb b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021.ipynb index 648f8bc5..edddcec9 100644 --- a/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/hud_eda_se_12_09_2021.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Methodology to address fundamental problem 1 tiemized in Issue 1024" + "## Methodology to address fundamental problem 1 itemized in Issue 1024" ] }, { @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -36,6 +36,68 @@ "import pandas as pd" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ETL process for acquiring relevant tables" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### NOTE: If you ran the ETL Process to acquire Table 8 in the other notebook of this draft PR, you do not need to run the ETL cell block again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copy and adapt certain sections of code from data_pipeline.utils \n", + "\n", + "def download_hud_dataset():\n", + " DOWNLOAD_FILENAME = \"HUD_ZIPPED.csv\"\n", + " HOUSING_FTP_URL = \"https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip\" \n", + " response = requests.get(HOUSING_FTP_URL, verify=True)\n", + " if response.status_code == 200:\n", + " file_contents = response.content\n", + " else:\n", + " sys.exit(\n", + " f\"HTTP response {response.status_code} from url {HOUSING_FTP_URL}. 
Info: {response.content}\"\n", + " )\n", + "\n", + " # Write the contents to disk.\n", + " file = open(DOWNLOAD_FILENAME, \"wb\")\n", + " file.write(file_contents)\n", + " file.close()\n", + " \n", + "def extract_zipped_download(zip_file_path, unzipped_path):\n", + " with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(unzipped_path)\n", + " # cleanup temporary file\n", + " os.remove(zip_file_path)\n", + " \n", + "def up_one_directory(path):\n", + " try:\n", + " # from Python 3.6\n", + " parent_dir = Path(path).parents[1]\n", + " # for Python 3.4/3.5, use str to convert the path to string\n", + " # parent_dir = str(Path(path).parents[1])\n", + " shutil.move(path, parent_dir)\n", + " except IndexError:\n", + " # no upper directory\n", + " pass\n", + "\n", + "CURRENT_DIRECTORY = os.getcwd()\n", + "download_hud_dataset()\n", + "extract_zipped_download(CURRENT_DIRECTORY + \"/HUD_ZIPPED.csv\", CURRENT_DIRECTORY) \n", + "up_one_directory(CURRENT_DIRECTORY + \"/140/Table8.csv\")\n", + "shutil.rmtree(\"./140/\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -93,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -205,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -284,7 +346,7 @@ "4 01 01001020500 0.142515 68.221154" ] }, - "execution_count": 6, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -315,10 +377,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "housingburden.to_csv(\"housing_burden.csv\", index=False)" + ] } ], "metadata": {