added revisions including ETL process for Table 8 acquisition

This commit is contained in:
Saran Ahluwalia 2021-12-11 08:28:01 -05:00
parent aa27f5d6e2
commit f80d8c1880
2 changed files with 407 additions and 20 deletions

File diff suppressed because one or more lines are too long

View file

@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Methodology to address fundamental problem 1 tiemized in Issue 1024"
"## Methodology to address fundamental problem 1 itemized in Issue 1024"
]
},
{
@ -26,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -36,6 +36,68 @@
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ETL process for acquiring relevant tables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### NOTE: If you ran the ETL Process to acquire Table 8 in the other notebook of this draft PR you do not need to run the ETL cell block again"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copy and adapt certain sections of code from data_pipeline.utils\n",
"# Imports repeated here so this ETL cell is runnable on its own\n",
"# (the notebook's import cell only brings in pandas).\n",
"import os\n",
"import shutil\n",
"import sys\n",
"import zipfile\n",
"from pathlib import Path\n",
"\n",
"import requests\n",
"\n",
"def download_hud_dataset():\n",
"    \"\"\"Download the zipped HUD CHAS 2014-2018 dataset into the working directory.\"\"\"\n",
"    DOWNLOAD_FILENAME = \"HUD_ZIPPED.csv\"\n",
"    HOUSING_FTP_URL = \"https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip\"\n",
"    response = requests.get(HOUSING_FTP_URL, verify=True)\n",
"    if response.status_code == 200:\n",
"        file_contents = response.content\n",
"    else:\n",
"        # Bug fix: original referenced undefined name `file_url` here,\n",
"        # which would raise NameError instead of reporting the failed URL.\n",
"        sys.exit(\n",
"            f\"HTTP response {response.status_code} from url {HOUSING_FTP_URL}. Info: {response.content}\"\n",
"        )\n",
"\n",
"    # Write the contents to disk; the context manager guarantees the handle is closed.\n",
"    with open(DOWNLOAD_FILENAME, \"wb\") as file:\n",
"        file.write(file_contents)\n",
"\n",
"def extract_zipped_download(zip_file_path, unzipped_path):\n",
"    \"\"\"Extract the archive at `zip_file_path` into `unzipped_path`, then delete it.\"\"\"\n",
"    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
"        zip_ref.extractall(unzipped_path)\n",
"    # cleanup temporary file\n",
"    os.remove(zip_file_path)\n",
"\n",
"def up_one_directory(path):\n",
"    \"\"\"Move `path` up one directory level; no-op when there is no upper directory.\"\"\"\n",
"    try:\n",
"        # Path.parents requires Python 3.6+ for use with shutil.move\n",
"        parent_dir = Path(path).parents[1]\n",
"        shutil.move(path, parent_dir)\n",
"    except IndexError:\n",
"        # no upper directory\n",
"        pass\n",
"\n",
"CURRENT_DIRECTORY = os.getcwd()\n",
"download_hud_dataset()\n",
"extract_zipped_download(CURRENT_DIRECTORY + \"/HUD_ZIPPED.csv\", CURRENT_DIRECTORY)\n",
"up_one_directory(CURRENT_DIRECTORY + \"/140/Table8.csv\")\n",
"shutil.rmtree(\"./140/\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -93,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -205,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -284,7 +346,7 @@
"4 01 01001020500 0.142515 68.221154"
]
},
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -315,10 +377,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
"source": [
"housingburden.to_csv(\"housing_burden.csv\", index=False)"
]
}
],
"metadata": {