diff --git a/score/ipython/census_etl.ipynb b/score/ipython/census_etl.ipynb index 2adfa1c0..68dd32bb 100644 --- a/score/ipython/census_etl.ipynb +++ b/score/ipython/census_etl.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "id": "0491828b", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ "import csv\n", "from pathlib import Path\n", "\n", - "ACS_YEAR = 2019\n", + "ACS_YEAR = 2016\n", "\n", "DATA_PATH = Path.cwd().parent / \"data\"\n", "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n", @@ -26,6 +26,49 @@ "pd.set_option(\"display.precision\", 2)" ] }, + { + "cell_type": "code", + "execution_count": 116, + "id": "6ba65bff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('B25040_005E',\n", + " 'HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Fuel oil, kerosene, etc.'),\n", + " ('B25040_009E', 'HOUSE HEATING FUEL', 'Estimate!!Total:!!Other fuel'),\n", + " ('B25040_010E', 'HOUSE HEATING FUEL', 'Estimate!!Total:!!No fuel used'),\n", + " ('B25117_006E',\n", + " 'TENURE BY HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Owner occupied:!!Fuel oil, kerosene, etc.'),\n", + " ('B25117_010E',\n", + " 'TENURE BY HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Owner occupied:!!Other fuel'),\n", + " ('B25117_011E',\n", + " 'TENURE BY HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Owner occupied:!!No fuel used'),\n", + " ('B25117_016E',\n", + " 'TENURE BY HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Renter occupied:!!Fuel oil, kerosene, etc.'),\n", + " ('B25117_020E',\n", + " 'TENURE BY HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Renter occupied:!!Other fuel'),\n", + " ('B25117_021E',\n", + " 'TENURE BY HOUSE HEATING FUEL',\n", + " 'Estimate!!Total:!!Renter occupied:!!No fuel used')]" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "censusdata.search('acs5', 2019, 'label', 'fuel')" + ] + }, { "cell_type": "code", 
"execution_count": null, diff --git a/score/ipython/lead_score_etl.ipynb b/score/ipython/lead_score_etl.ipynb new file mode 100644 index 00000000..9dfc68e6 --- /dev/null +++ b/score/ipython/lead_score_etl.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "c21b63a3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import censusdata\n", + "import csv\n", + "import requests\n", + "import us\n", + "import zipfile\n", + "\n", + "from pathlib import Path\n", + "\n", + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n", + "OUTPUT_PATH = DATA_PATH / \"dataset\" / \"lead_score\"\n", + "\n", + "GEOID_FIELD_NAME = \"GEOID10\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6696bc66", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading data for state/territory with abbreviation AL\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lucas/.virtualenvs/scoring/lib/python3.7/site-packages/urllib3/connectionpool.py:1020: InsecureRequestWarning: Unverified HTTPS request is being made to host 'data.openei.org'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", + " InsecureRequestWarning,\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'fips' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Make the directory if it doesn't exist\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mzip_file_dir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparents\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0mzip_file_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip_file_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34mf\"{fips}-downloaded.zip\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mzip_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip_file_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mzip_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_contents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'fips' is not defined" + ] + } + ], + "source": [ + "# See data at https://data.openei.org/submissions/573\n", + "\n", + "state_abbreviations = [state.abbr for state in us.states.STATES_AND_TERRITORIES]\n", + 
"\n", + "# Download each state / territory individually\n", + "dfs = []\n", + "for state_abbreviation in state_abbreviations:\n", + "    print(f\"Downloading data for state/territory with abbreviation {state_abbreviation}\")\n", + "\n", + "    download = requests.get(\n", + "        f\"https://data.openei.org/files/573/{state_abbreviation}-2018-LEAD-data.zip\",\n", + "        verify=False,\n", + "    )\n", + "    # Fail fast on an HTTP error instead of saving an error page as a zip\n", + "    download.raise_for_status()\n", + "    zip_file_dir = DATA_PATH / \"tmp\" / \"lead_score\"\n", + "\n", + "    # Make the directory if it doesn't exist\n", + "    zip_file_dir.mkdir(parents=True, exist_ok=True)\n", + "    zip_file_path = zip_file_dir / f\"{state_abbreviation}-downloaded.zip\"\n", + "    zip_file_path.write_bytes(download.content)\n", + "\n", + "    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", + "        zip_ref.extractall(zip_file_dir)\n", + "\n", + "    # NOTE(review): the htaindex_data_blkgrps_ prefix does not match the LEAD\n", + "    # archive downloaded above; confirm the extracted CSV name before running.\n", + "    tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{state_abbreviation}.csv\"\n", + "    tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n", + "\n", + "    dfs.append(tmp_df)\n", + "\n", + "df = pd.concat(dfs)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "244e0d03", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename and reformat block group ID\n", + "df.rename(columns={\"blkgrp\": GEOID_FIELD_NAME}, inplace=True)\n", + "df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace('\"', \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8275c1ef", + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "df.to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/requirements.txt b/score/requirements.txt index e8b20fbe..538ffef0 100644 --- a/score/requirements.txt +++ b/score/requirements.txt @@ -5,3 +5,4 @@ jupyter_contrib_nbextensions numpy pandas requests +us