From 678ab8c081f0bb3d5c3dc79eb9bc4e025ae745d4 Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Wed, 23 Jun 2021 12:26:45 -0700 Subject: [PATCH] adding housing CSV to score --- .../housing_and_transportation_etl.ipynb | 124 ++++++++++++++++++ score/ipython/score_calc.ipynb | 41 +++++- score/ipython/scoring_comparison.ipynb | 12 +- 3 files changed, 165 insertions(+), 12 deletions(-) create mode 100644 score/ipython/housing_and_transportation_etl.ipynb diff --git a/score/ipython/housing_and_transportation_etl.ipynb b/score/ipython/housing_and_transportation_etl.ipynb new file mode 100644 index 00000000..e7fb0daa --- /dev/null +++ b/score/ipython/housing_and_transportation_etl.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c21b63a3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import censusdata\n", + "import csv\n", + "import requests\n", + "import zipfile\n", + "\n", + "from pathlib import Path\n", + "\n", + "ACS_YEAR = 2019\n", + "\n", + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n", + "OUTPUT_PATH = DATA_PATH / \"dataset\" / \"housing_and_transportation_index\"\n", + "\n", + "GEOID_FIELD_NAME = \"GEOID10\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6696bc66", + "metadata": {}, + "outputs": [], + "source": [ + "# https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid=01\n", + "\n", + "# Download each state / territory individually\n", + "dfs = []\n", + "with open(FIPS_CSV_PATH) as csv_file:\n", + " csv_reader = csv.reader(csv_file, delimiter=\",\")\n", + " line_count = 0\n", + "\n", + " for row in csv_reader:\n", + " if line_count == 0:\n", + " line_count += 1\n", + " else:\n", + " fips = row[0].strip()\n", + "\n", + " print(f\"Downloading data for state/territory with FIPS code {fips}\")\n", + "\n", + " download = requests.get(\n", + " 
f\"https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid={fips}\",\n", + " verify=False,\n", + " )\n", + " file_contents = download.content\n", + " zip_file_dir = DATA_PATH / \"tmp\" / \"housing_and_transportation_index\"\n", + "\n", + " # Make the directory if it doesn't exist\n", + " zip_file_dir.mkdir(parents=True, exist_ok=True)\n", + " zip_file_path = zip_file_dir / f\"{fips}-downloaded.zip\"\n", + " zip_file = open(zip_file_path, \"wb\")\n", + " zip_file.write(file_contents)\n", + " zip_file.close()\n", + "\n", + " with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(zip_file_dir)\n", + "\n", + " # Path of the per-state CSV extracted from the zip archive:\n", + " tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n", + " tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n", + "\n", + " dfs.append(tmp_df)\n", + "\n", + "df = pd.concat(dfs)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "244e0d03", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename and reformat block group ID\n", + "df.rename(columns={\"blkgrp\": GEOID_FIELD_NAME}, inplace=True)\n", + "df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace('\"', \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8275c1ef", + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "df.to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/ipython/score_calc.ipynb b/score/ipython/score_calc.ipynb 
index 277c9646..a4a9652e 100644 --- a/score/ipython/score_calc.ipynb +++ b/score/ipython/score_calc.ipynb @@ -7,11 +7,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Before running this notebook, you must run the following notebooks:\n", + "# Before running this notebook, you must run the following notebooks (in any order):\n", "# 1. `ejscreen_etl.ipynb`\n", "# 2. `census_etl.ipynb`\n", + "# 3. `housing_and_transportation_etl.ipynb`\n", "\n", "import collections\n", + "import functools\n", "from pathlib import Path\n", "import pandas as pd\n", "import csv\n", @@ -74,6 +76,25 @@ "census_df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "144bdde2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load housing and transportation data\n", + "housing_and_transportation_index_csv = (\n", + " data_path / \"dataset\" / \"housing_and_transportation_index\" / \"usa.csv\"\n", + ")\n", + "housing_and_transportation_df = pd.read_csv(\n", + " housing_and_transportation_index_csv,\n", + " dtype={GEOID_FIELD_NAME: \"string\"},\n", + " low_memory=False,\n", + ")\n", + "housing_and_transportation_df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -81,11 +102,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Join the two datasets\n", - "df = ejscreen_df.merge(\n", - " census_df,\n", - " how=\"left\",\n", - " on=GEOID_FIELD_NAME,\n", + "# Join all the data sources that use census block groups\n", + "dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n", + "\n", + "df = functools.reduce(\n", + " lambda left, right: pd.merge(\n", + " left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n", + " ),\n", + " dfs,\n", ")\n", "\n", "df.head()" @@ -195,6 +219,11 @@ " renamed_field=\"Unemployed Civilians (percent)\",\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", + " DataSet(\n", + " input_field=\"ht_ami\",\n", + " renamed_field=\"Housing + Transportation Costs % Income for the Regional Typical Household\",\n", + " 
bucket=BUCKET_SOCIOECONOMIC,\n", + " ),\n", "]" ] }, diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb index 1b3f2683..49e66835 100644 --- a/score/ipython/scoring_comparison.ipynb +++ b/score/ipython/scoring_comparison.ipynb @@ -301,12 +301,12 @@ "id": "0c534966", "metadata": { "variables": { - "all_100_sum": {}, - "all_100_sum_percent": {}, - "at_least_one_sum": {}, - "at_least_one_sum_percent": {}, - "cejst_cbgs_ca_only": {}, - "ces_tracts_count": {} + "all_100_sum": "1373", + "all_100_sum_percent": "69%", + "at_least_one_sum": "1866", + "at_least_one_sum_percent": "94%", + "cejst_cbgs_ca_only": "10849", + "ces_tracts_count": "1983" } }, "source": [