{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "0491828b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import censusdata\n", "import csv\n", "from pathlib import Path\n", "import os\n", "import sys\n", "\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", "from etl.sources.census.etl_utils import get_state_fips_codes\n", "\n", "ACS_YEAR = 2019\n", "\n", "DATA_PATH = Path.cwd().parent / \"data\"\n", "OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", "\n", "GEOID_FIELD_NAME = \"GEOID10\"\n", "UNEMPLOYED_FIELD_NAME = \"Unemployed civilians (percent)\"\n", "LINGUISTIC_ISOLATION_FIELD_NAME = \"Linguistic isolation (percent)\"\n", "LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = \"Linguistic isolation (total)\"\n", "\n", "LINGUISTIC_ISOLATION_FIELDS = [\n", " \"C16002_001E\",\n", " \"C16002_004E\",\n", " \"C16002_007E\",\n", " \"C16002_010E\",\n", " \"C16002_013E\",\n", "]\n", "\n", "# Some display settings to make pandas outputs more readable.\n", "pd.set_option(\"display.expand_frame_repr\", False)\n", "pd.set_option(\"display.precision\", 2)" ] }, { "cell_type": "code", "execution_count": null, "id": "64df0b63", "metadata": {}, "outputs": [], "source": [ "# For variable discovery, if necessary.\n", "# censusdata.search(\n", "# \"acs5\", 2019, \"label\", \"Limited English speaking\"\n", "# )" ] }, { "cell_type": "code", "execution_count": null, "id": "654f25a1", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))\n", "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"C16002\"))" ] }, { "cell_type": "code", "execution_count": null, "id": "8999cea4", "metadata": { "scrolled": false }, "outputs": [], "source": [ "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n", " \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n", " fips = \"\".join([value for (key, value) in censusgeo.params()])\n", " return fips\n", "\n", "\n", "dfs = []\n", "for fips in get_state_fips_codes(DATA_PATH):\n", " print(f\"Downloading data for state/territory with FIPS code {fips}\")\n", "\n", " dfs.append(\n", " censusdata.download(\n", " src=\"acs5\",\n", " year=ACS_YEAR,\n", " geo=censusdata.censusgeo(\n", " [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n", " ),\n", " var=[\n", " # Emploment fields\n", " \"B23025_005E\",\n", " \"B23025_003E\",\n", " ]\n", " + LINGUISTIC_ISOLATION_FIELDS,\n", " )\n", " )\n", "\n", "\n", "df = pd.concat(dfs)\n", "\n", "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "803cce31", "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Calculate percent unemployment.\n", "# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.\n", "df[UNEMPLOYED_FIELD_NAME] = df.B23025_005E / df.B23025_003E\n", "\n", "df[UNEMPLOYED_FIELD_NAME].describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "e475472c", "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Calculate linguistic isolation.\n", "individual_limited_english_fields = [\n", " \"C16002_004E\",\n", " \"C16002_007E\",\n", " \"C16002_010E\",\n", " \"C16002_013E\",\n", "]\n", "\n", "df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[individual_limited_english_fields].sum(\n", " axis=1, skipna=True\n", ")\n", "df[LINGUISTIC_ISOLATION_FIELD_NAME] = (\n", " df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float) / df[\"C16002_001E\"]\n", ")\n", "\n", "df[LINGUISTIC_ISOLATION_FIELD_NAME].describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "2a269bb1", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# mkdir census\n", "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n", "\n", "columns_to_include = [\n", " GEOID_FIELD_NAME,\n", " UNEMPLOYED_FIELD_NAME,\n", " LINGUISTIC_ISOLATION_FIELD_NAME,\n", "]\n", "\n", "df[columns_to_include].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 5 }