j40-cejst-2/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51412a14",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "from datetime import datetime\n",
    "import functools\n",
    "import itertools\n",
    "import os\n",
    "import pathlib\n",
    "import requests\n",
    "import string\n",
    "import sys\n",
    "import typing\n",
    "import zipfile\n",
    "\n",
    "import IPython\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pypandoc\n",
    "\n",
    "from tqdm.notebook import tqdm_notebook\n",
    "\n",
    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
    "\n",
    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
    "tqdm_notebook.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3234c61",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
    "pd.options.display.float_format = \"{:.2f}\".format\n",
    "\n",
    "# Set some global parameters\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
    "\n",
    "## I (Vincent) created this manually locally. Will need to change potentially when putting into official ETL scripts\n",
    "GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
    "\n",
    "# Make the dirs if they don't exist\n",
    "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
    "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
    "\n",
    "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
    "# and introducing the risk of misspelling the field name.)\n",
    "\n",
    "GEOID_FIELD_NAME = \"GEOID10\"\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "\n",
    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
    "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
    "\n",
    "# Define some suffixes\n",
    "POPULATION_SUFFIX = \" (priority population)\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "376f5b2e",
   "metadata": {},
   "source": [
    "## Mapping Census Block Group to Urban and Rural Indicators using Geocorr Data\n",
    "\n",
    "The end result is a dataframe `urban_rural_map`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4147c081",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSV was manually generated\n",
    "# Instructions for how to generate the CSV from Geocorr are here: https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787\n",
    "geocorr_urban_rural_map = pd.read_csv(\n",
    "    os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n",
    "    encoding=\"ISO-8859-1\",\n",
    "    skiprows=[1],\n",
    "    dtype=\"str\",\n",
    ")\n",
    "\n",
    "geocorr_urban_rural_map[\"pop10\"] = pd.to_numeric(\n",
    "    geocorr_urban_rural_map[\"pop10\"]\n",
    ")\n",
    "geocorr_urban_rural_map[\"afact\"] = pd.to_numeric(\n",
    "    geocorr_urban_rural_map[\"afact\"]\n",
    ")\n",
    "\n",
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n",
    "    geocorr_urban_rural_map[\"county\"] + geocorr_urban_rural_map[\"tract\"]\n",
    ")  # + geocorr_urban_rural_map['bg']\n",
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[\n",
    "    GEOID_TRACT_FIELD_NAME\n",
    "].str.replace(\".\", \"\", regex=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78276a83",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2890779",
   "metadata": {},
   "source": [
    "We want to see that the length of the derived Census Block Group is always 12 digits. Census Tracts are always 11 digits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd89f6c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map = geocorr_urban_rural_map[\n",
    "    [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\", \"cntyname\", \"uaname\", \"pop10\", \"afact\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e597d7e2",
   "metadata": {},
   "source": [
    "Checking Primary Key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29929046",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map.groupby(\n",
    "    [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\"], dropna=False\n",
    ").size().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e4c0c3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map.loc[\n",
    "    geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == \"36117020302\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d52761e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "total_geo_population = (\n",
    "    geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME)\n",
    "    .agg({\"pop10\": np.sum})\n",
    "    .reset_index()\n",
    ")\n",
    "total_geo_population.rename(columns={\"pop10\": \"total_population\"}, inplace=True)\n",
    "total_geo_population.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38225b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map = (\n",
    "    geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, \"ur\"])\n",
    "    .agg({\"pop10\": np.sum})\n",
    "    .reset_index()\n",
    ")\n",
    "geocorr_urban_rural_with_total_pop_map = (\n",
    "    geocorr_urban_rural_with_total_pop_map.merge(\n",
    "        total_geo_population, how=\"inner\", on=GEOID_TRACT_FIELD_NAME\n",
    "    )\n",
    ")\n",
    "geocorr_urban_rural_with_total_pop_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41b9448a",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map[\"afact\"] = (\n",
    "    geocorr_urban_rural_with_total_pop_map[\"pop10\"]\n",
    "    / geocorr_urban_rural_with_total_pop_map[\"total_population\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb4ddb9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e03d1e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map.loc[\n",
    "    geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME]\n",
    "    == \"01001020200\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d976cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n",
    "    index=GEOID_TRACT_FIELD_NAME, columns=\"ur\", values=[\"pop10\", \"afact\"]\n",
    ")\n",
    "urban_rural_map.columns = [\n",
    "    \"_\".join(col).strip() for col in urban_rural_map.columns.values\n",
    "]\n",
    "urban_rural_map.reset_index(inplace=True)\n",
    "urban_rural_map[\"urban_heuristic_flag\"] = 0\n",
    "mask = urban_rural_map[\"afact_U\"] >= 0.5\n",
    "urban_rural_map.loc[mask, \"urban_heuristic_flag\"] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f3a0993",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.rename(\n",
    "    columns={\n",
    "        \"pop10_R\": \"population_in_rural_areas\",\n",
    "        \"pop10_U\": \"population_in_urban_areas\",\n",
    "        \"afact_R\": \"perc_population_in_rural_areas\",\n",
    "        \"afact_U\": \"perc_population_in_urban_areas\",\n",
    "    },\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba10f07c",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56098d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.to_csv(\n",
    "    path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}