Ticket 355: Adding map to Urban vs Rural Census Tracts (#696)

* Adding urban vs rural notebook * Adding new code * Adding settings * Adding usa.csv * Adding etl * Adding etl * Adding to etl_score * quick changes to notebook * Ensuring notebook can run * Adding urban vs rural notebook * Adding new code * Adding settings * Adding usa.csv * Adding etl * Adding etl * Adding to etl_score * quick changes to notebook * Ensuring notebook can run * adding urban to comparison tool * renaming file * adding urban rural to more comp tool outputs * updating requirements and poetry * Adding ej screen notebook * removing ej screen notebook since it's in justice40-tool-iss-719 Co-authored-by: La <ryy0@cdc.gov> Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
2025-07-30 12:01:16 -07:00 · 2021-09-22 12:31:03 -04:00 · 2021-09-22 12:31:03 -04:00 · 7709836a12
commit 7709836a12
parent aaf304fc89
10 changed files with 563 additions and 142 deletions
--- a/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb
@ -0,0 +1,311 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51412a14",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import collections\n",
+    "from datetime import datetime\n",
+    "import functools\n",
+    "import itertools\n",
+    "import os\n",
+    "import pathlib\n",
+    "import requests\n",
+    "import string\n",
+    "import sys\n",
+    "import typing\n",
+    "import zipfile\n",
+    "\n",
+    "import IPython\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pypandoc\n",
+    "\n",
+    "from tqdm.notebook import tqdm_notebook\n",
+    "\n",
+    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "\n",
+    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
+    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
+    "\n",
+    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
+    "tqdm_notebook.pandas()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3234c61",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
+    "pd.options.display.float_format = \"{:.2f}\".format\n",
+    "\n",
+    "# Set some global parameters\n",
+    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
+    "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
+    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
+    "\n",
+    "## I (Vincent) created this manually locally. Will need to change potentially when putting into official ETL scripts\n",
+    "GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
+    "\n",
+    "# Make the dirs if they don't exist\n",
+    "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
+    "\n",
+    "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
+    "# and introducing the risk of misspelling the field name.)\n",
+    "\n",
+    "GEOID_FIELD_NAME = \"GEOID10\"\n",
+    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
+    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
+    "GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
+    "COUNTRY_FIELD_NAME = \"Country\"\n",
+    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
+    "\n",
+    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
+    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
+    "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
+    "\n",
+    "# Define some suffixes\n",
+    "POPULATION_SUFFIX = \" (priority population)\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "376f5b2e",
+   "metadata": {},
+   "source": [
+    "## Mapping Census Block Group to Urban and Rural Indicators using Geocorr Data\n",
+    "\n",
+    "The end result is a dataframe `urban_rural_map`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4147c081",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_map = pd.read_csv(\n",
+    "    os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
+    "    encoding = \"ISO-8859-1\",\n",
+    "    skiprows=[1],\n",
+    "    dtype='str',\n",
+    ")\n",
+    "\n",
+    "geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
+    "geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
+    "\n",
+    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
+    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78276a83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2890779",
+   "metadata": {},
+   "source": [
+    "We want to see that the length of the derived Census Block Group is always 12 digits. Census Tracts are always 11 digits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fd89f6c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
+    "    GEOID_TRACT_FIELD_NAME,\n",
+    "    'ur',\n",
+    "    'ua',\n",
+    "    'cntyname',\n",
+    "    'uaname',\n",
+    "    'pop10',\n",
+    "    'afact'\n",
+    "]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e597d7e2",
+   "metadata": {},
+   "source": [
+    "Checking Primary Key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29929046",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e4c0c3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d52761e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
+    "total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
+    "total_geo_population.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38225b78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
+    "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
+    "geocorr_urban_rural_with_total_pop_map.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41b9448a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb4ddb9b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_with_total_pop_map.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e03d1e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d976cb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
+    "urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
+    "urban_rural_map.reset_index(inplace=True)\n",
+    "urban_rural_map['urban_heuristic_flag'] = 0\n",
+    "mask = urban_rural_map['afact_U'] >= 0.5\n",
+    "urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f3a0993",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urban_rural_map.rename(\n",
+    "    columns={\n",
+    "        'pop10_R': 'population_in_rural_areas',\n",
+    "        'pop10_U': 'population_in_urban_areas',\n",
+    "        'afact_R': 'perc_population_in_rural_areas',\n",
+    "        'afact_U': 'perc_population_in_urban_areas',\n",
+    "    }, \n",
+    "    inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba10f07c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urban_rural_map.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56098d7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urban_rural_map.to_csv(\n",
+    "    path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}