wip on lead score

lucasmbrown-usds 2021-06-25 09:44:55 -07:00
parent 678ab8c081
commit 4ed43226cf
3 changed files with 189 additions and 2 deletions

View file

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 112,
"id": "0491828b",
"metadata": {},
"outputs": [],
@@ -12,7 +12,7 @@
"import csv\n",
"from pathlib import Path\n",
"\n",
"ACS_YEAR = 2019\n",
"ACS_YEAR = 2016\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
@@ -26,6 +26,49 @@
"pd.set_option(\"display.precision\", 2)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "6ba65bff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('B25040_005E',\n",
" 'HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Fuel oil, kerosene, etc.'),\n",
" ('B25040_009E', 'HOUSE HEATING FUEL', 'Estimate!!Total:!!Other fuel'),\n",
" ('B25040_010E', 'HOUSE HEATING FUEL', 'Estimate!!Total:!!No fuel used'),\n",
" ('B25117_006E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Owner occupied:!!Fuel oil, kerosene, etc.'),\n",
" ('B25117_010E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Owner occupied:!!Other fuel'),\n",
" ('B25117_011E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Owner occupied:!!No fuel used'),\n",
" ('B25117_016E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Renter occupied:!!Fuel oil, kerosene, etc.'),\n",
" ('B25117_020E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Renter occupied:!!Other fuel'),\n",
" ('B25117_021E',\n",
" 'TENURE BY HOUSE HEATING FUEL',\n",
" 'Estimate!!Total:!!Renter occupied:!!No fuel used')]"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"censusdata.search('acs5', 2019, 'label', 'fuel')"
]
},
{
"cell_type": "code",
"execution_count": null,

View file

@@ -0,0 +1,143 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "c21b63a3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
"import csv\n",
"import requests\n",
"import us\n",
"import zipfile\n",
"\n",
"from pathlib import Path\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"lead_score\"\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6696bc66",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading data for state/territory with abbreviation AL\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lucas/.virtualenvs/scoring/lib/python3.7/site-packages/urllib3/connectionpool.py:1020: InsecureRequestWarning: Unverified HTTPS request is being made to host 'data.openei.org'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
" InsecureRequestWarning,\n"
]
},
{
"ename": "NameError",
"evalue": "name 'fips' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-8867059d74b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Make the directory if it doesn't exist\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mzip_file_dir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparents\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0mzip_file_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip_file_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34mf\"{fips}-downloaded.zip\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mzip_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip_file_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mzip_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_contents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'fips' is not defined"
]
}
],
"source": [
"# See data at https://data.openei.org/submissions/573\n",
"\n",
"state_abbreviations = [state.abbr for state in us.states.STATES_AND_TERRITORIES]\n",
"\n",
"# Download each state / territory individually\n",
"dfs = []\n",
"for state_abbreviation in state_abbreviations:\n",
" print(f\"Downloading data for state/territory with abbreviation {state_abbreviation}\")\n",
"\n",
" download = requests.get(\n",
" f\"https://data.openei.org/files/573/{state_abbreviation}-2018-LEAD-data.zip\",\n",
" verify=False,\n",
" )\n",
" file_contents = download.content\n",
" zip_file_dir = DATA_PATH / \"tmp\" / \"lead_score\"\n",
"\n",
" # Make the directory if it doesn't exist\n",
" zip_file_dir.mkdir(parents=True, exist_ok=True)\n",
" zip_file_path = zip_file_dir / f\"{state_abbreviation}-downloaded.zip\"\n",
" zip_file = open(zip_file_name, \"wb\")\n",
" zip_file.write(file_contents)\n",
" zip_file.close()\n",
"\n",
" with zipfile.ZipFile(zip_file_name, \"r\") as zip_ref:\n",
" zip_ref.extractall(zip_file_dir)\n",
"\n",
" # New file name:\n",
" tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
" tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
"\n",
" dfs.append(tmp_df)\n",
"\n",
"df = pd.concat(dfs)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "244e0d03",
"metadata": {},
"outputs": [],
"source": [
"# Rename and reformat block group ID\n",
"df.rename(columns={\"blkgrp\": GEOID_FIELD_NAME}, inplace=True)\n",
"df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace('\"', \"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8275c1ef",
"metadata": {},
"outputs": [],
"source": [
"OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
"df.to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@@ -5,3 +5,4 @@ jupyter_contrib_nbextensions
numpy
pandas
requests
us