wip on lead score

lucasmbrown-usds 2021-06-25 09:44:55 -07:00
parent 678ab8c081
commit 4ed43226cf
3 changed files with 189 additions and 2 deletions

View file

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 112,
"id": "0491828b",
"metadata": {},
"outputs": [],
@@ -12,7 +12,7 @@
"import csv\n",
"from pathlib import Path\n",
"\n",
"ACS_YEAR = 2019\n",
"ACS_YEAR = 2016\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
@@ -26,6 +26,49 @@
"pd.set_option(\"display.precision\", 2)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "6ba65bff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('B25040_005E',\n",
" 'HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Fuel oil, kerosene, etc.'),\n",
" ('B25040_009E', 'HOUSE HEATING FUEL', 'Estimate!!Total:!!Other fuel'),\n",
" ('B25040_010E', 'HOUSE HEATING FUEL', 'Estimate!!Total:!!No fuel used'),\n",
" ('B25117_006E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Owner occupied:!!Fuel oil, kerosene, etc.'),\n",
" ('B25117_010E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Owner occupied:!!Other fuel'),\n",
" ('B25117_011E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Owner occupied:!!No fuel used'),\n",
" ('B25117_016E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Renter occupied:!!Fuel oil, kerosene, etc.'),\n",
" ('B25117_020E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Renter occupied:!!Other fuel'),\n",
" ('B25117_021E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Renter occupied:!!No fuel used')]"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"censusdata.search('acs5', 2019, 'label', 'fuel')"
]
},
{
"cell_type": "code",
"execution_count": null,

View file

@@ -0,0 +1,143 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "c21b63a3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
"import csv\n",
"import requests\n",
"import us\n",
"import zipfile\n",
"\n",
"from pathlib import Path\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"lead_score\"\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6696bc66",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading data for state/territory with abbreviation AL\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lucas/.virtualenvs/scoring/lib/python3.7/site-packages/urllib3/connectionpool.py:1020: InsecureRequestWarning: Unverified HTTPS request is being made to host 'data.openei.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
" InsecureRequestWarning,\n"
]
},
{
"ename": "NameError",
"evalue": "name 'fips' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-8867059d74b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Make the directory if it doesn't exist\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mzip_file_dir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparents\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0mzip_file_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip_file_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34mf\"{fips}-downloaded.zip\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mzip_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip_file_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mzip_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_contents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'fips' is not defined"
]
}
],
"source": [
"# See data at https://data.openei.org/submissions/573\n",
"\n",
"state_abbreviations = [state.abbr for state in us.states.STATES_AND_TERRITORIES]\n",
"\n",
"# Download each state / territory individually\n",
"dfs = []\n",
"for state_abbreviation in state_abbreviations:\n",
" print(f\"Downloading data for state/territory with abbreviation {state_abbreviation}\")\n",
"\n",
" download = requests.get(\n",
" f\"https://data.openei.org/files/573/{state_abbreviation}-2018-LEAD-data.zip\",\n",
" verify=False,\n",
" )\n",
" file_contents = download.content\n",
" zip_file_dir = DATA_PATH / \"tmp\" / \"lead_score\"\n",
"\n",
" # Make the directory if it doesn't exist\n",
" zip_file_dir.mkdir(parents=True, exist_ok=True)\n",
" zip_file_path = zip_file_dir / f\"{state_abbreviation}-downloaded.zip\"\n",
" zip_file = open(zip_file_name, \"wb\")\n",
" zip_file.write(file_contents)\n",
" zip_file.close()\n",
"\n",
" with zipfile.ZipFile(zip_file_name, \"r\") as zip_ref:\n",
" zip_ref.extractall(zip_file_dir)\n",
"\n",
" # New file name:\n",
" tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
" tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
"\n",
" dfs.append(tmp_df)\n",
"\n",
"df = pd.concat(dfs)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "244e0d03",
"metadata": {},
"outputs": [],
"source": [
"# Rename and reformat block group ID\n",
"df.rename(columns={\"blkgrp\": GEOID_FIELD_NAME}, inplace=True)\n",
"df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace('\"', \"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8275c1ef",
"metadata": {},
"outputs": [],
"source": [
"OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
"df.to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@@ -5,3 +5,4 @@ jupyter_contrib_nbextensions
numpy
pandas
requests
us