diff --git a/score/ipython/census_etl.ipynb b/score/ipython/census_etl.ipynb
new file mode 100644
index 00000000..1e43b94b
--- /dev/null
+++ b/score/ipython/census_etl.ipynb
@@ -0,0 +1,148 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0491828b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import csv\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import censusdata\n",
+    "import pandas as pd\n",
+    "\n",
+    "ACS_YEAR = 2019\n",
+    "\n",
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
+    "\n",
+    "GEOID_FIELD_NAME = \"GEOID10\"\n",
+    "UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n",
+    "\n",
+    "# Some display settings to make pandas outputs more readable.\n",
+    "pd.set_option(\"display.expand_frame_repr\", False)\n",
+    "pd.set_option(\"display.precision\", 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "654f25a1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
+    "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
+    "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8999cea4",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
+    "    \"\"\"Create a FIPS code from the proprietary censusgeo index.\n",
+    "\n",
+    "    Joins the values of `censusgeo.params()` in the order they are returned.\n",
+    "    \"\"\"\n",
+    "    return \"\".join(value for (_, value) in censusgeo.params())\n",
+    "\n",
+    "\n",
+    "# Read the state FIPS codes from the first column of the CSV, skipping the header row.\n",
+    "# `newline=\"\"` is the mode the `csv` module documentation recommends for reader input.\n",
+    "with open(FIPS_CSV_PATH, newline=\"\") as csv_file:\n",
+    "    csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
+    "    next(csv_reader, None)  # Skip the header row.\n",
+    "    # Blank lines come back as empty rows; skip them instead of failing on `row[0]`.\n",
+    "    state_fips_codes = [row[0].strip() for row in csv_reader if row]\n",
+    "\n",
+    "dfs = []\n",
+    "for fips in state_fips_codes:\n",
+    "    print(f\"Downloading data for state with FIPS code {fips}\")\n",
+    "\n",
+    "    dfs.append(\n",
+    "        censusdata.download(\n",
+    "            src=\"acs5\",\n",
+    "            year=ACS_YEAR,\n",
+    "            geo=censusdata.censusgeo(\n",
+    "                [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n",
+    "            ),\n",
+    "            var=[\"B23025_005E\", \"B23025_003E\"],\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "df = pd.concat(dfs)\n",
+    "\n",
+    "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "803cce31",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Calculate unemployment as a fraction (not a percentage): B23025_005E / B23025_003E.\n",
+    "# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.\n",
+    "# A non-positive denominator yields a missing value instead of `inf`.\n",
+    "denominator = df[\"B23025_003E\"].where(df[\"B23025_003E\"] > 0)\n",
+    "df[UNEMPLOYED_FIELD_NAME] = df[\"B23025_005E\"] / denominator\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a269bb1",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "columns_to_include = [GEOID_FIELD_NAME, UNEMPLOYED_FIELD_NAME]\n",
+    "\n",
+    "# Create the output directory if needed so the export does not depend on a manual `mkdir`.\n",
+    "output_dir = DATA_PATH / \"tmp\" / \"census\"\n",
+    "output_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "df[columns_to_include].to_csv(path_or_buf=output_dir / \"census.csv\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}