mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-30 12:01:16 -07:00
Ticket 355: Adding map to Urban vs Rural Census Tracts (#696)
* Adding urban vs rural notebook * Adding new code * Adding settings * Adding usa.csv * Adding etl * Adding etl * Adding to etl_score * quick changes to notebook * Ensuring notebook can run * Adding urban vs rural notebook * Adding new code * Adding settings * Adding usa.csv * Adding etl * Adding etl * Adding to etl_score * quick changes to notebook * Ensuring notebook can run * adding urban to comparison tool * renaming file * adding urban rural to more comp tool outputs * updating requirements and poetry * Adding ej screen notebook * removing ej screen notebook since it's in justice40-tool-iss-719 Co-authored-by: La <ryy0@cdc.gov> Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
parent
aaf304fc89
commit
7709836a12
10 changed files with 563 additions and 142 deletions
311
data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb
Normal file
311
data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb
Normal file
|
@ -0,0 +1,311 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51412a14",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import collections\n",
|
||||
"from datetime import datetime\n",
|
||||
"import functools\n",
|
||||
"import itertools\n",
|
||||
"import os\n",
|
||||
"import pathlib\n",
|
||||
"import requests\n",
|
||||
"import string\n",
|
||||
"import sys\n",
|
||||
"import typing\n",
|
||||
"import zipfile\n",
|
||||
"\n",
|
||||
"import IPython\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import pypandoc\n",
|
||||
"\n",
|
||||
"from tqdm.notebook import tqdm_notebook\n",
|
||||
"\n",
|
||||
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
"\n",
|
||||
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
|
||||
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
|
||||
"\n",
|
||||
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
||||
"tqdm_notebook.pandas()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e3234c61",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
|
||||
"pd.options.display.float_format = \"{:.2f}\".format\n",
|
||||
"\n",
|
||||
"# Set some global parameters\n",
|
||||
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
|
||||
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
|
||||
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
|
||||
"\n",
|
||||
"# NOTE (Vincent): this directory was created manually on a local machine; the path may need to change when this logic moves into the official ETL scripts.\n",
|
||||
"GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
|
||||
"\n",
|
||||
"# Make the dirs if they don't exist\n",
|
||||
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||||
"COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||||
"\n",
|
||||
"CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
|
||||
"\n",
|
||||
"# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
|
||||
"# and introducing the risk of misspelling the field name.)\n",
|
||||
"\n",
|
||||
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
||||
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
|
||||
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
|
||||
"GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
|
||||
"COUNTRY_FIELD_NAME = \"Country\"\n",
|
||||
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
|
||||
"\n",
|
||||
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
|
||||
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
|
||||
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
|
||||
"\n",
|
||||
"# Define some suffixes\n",
|
||||
"POPULATION_SUFFIX = \" (priority population)\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "376f5b2e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mapping Census Tracts to Urban and Rural Indicators using Geocorr Data\n",
|
||||
"\n",
|
||||
"The end result is a dataframe, `urban_rural_map`, with one row per census tract."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4147c081",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Read every column as a string; `skiprows=[1]` drops the second line of the file.\n",
|
||||
"geocorr_csv_path = os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv')\n",
|
||||
"geocorr_urban_rural_map = pd.read_csv(\n",
|
||||
"    geocorr_csv_path,\n",
|
||||
"    encoding='ISO-8859-1',\n",
|
||||
"    skiprows=[1],\n",
|
||||
"    dtype='str',\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# These two columns are needed as numbers for the aggregations further down.\n",
|
||||
"for numeric_column in ('pop10', 'afact'):\n",
|
||||
"    geocorr_urban_rural_map[numeric_column] = pd.to_numeric(geocorr_urban_rural_map[numeric_column])\n",
|
||||
"\n",
|
||||
"# Tract ID = `county` + `tract` with the '.' removed (the block group column `bg` is deliberately not appended).\n",
|
||||
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n",
|
||||
"    geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract']\n",
|
||||
").str.replace('.', '', regex=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "78276a83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f2890779",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We want to see that the length of the derived Census Tract ID is always 11 digits. (Census Block Group IDs, which would also include the block group digit, are 12 digits.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fd89f6c8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Narrow the frame to the tract ID and a fixed set of geocorr columns.\n",
|
||||
"geocorr_columns_to_keep = [\n",
|
||||
"    GEOID_TRACT_FIELD_NAME,\n",
|
||||
"    'ur',\n",
|
||||
"    'ua',\n",
|
||||
"    'cntyname',\n",
|
||||
"    'uaname',\n",
|
||||
"    'pop10',\n",
|
||||
"    'afact',\n",
|
||||
"]\n",
|
||||
"geocorr_urban_rural_map = geocorr_urban_rural_map[geocorr_columns_to_keep]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e597d7e2",
|
||||
"metadata": {},
|
||||
"source": [
|
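||||
"Checking the primary key: count the rows for each (tract, `ur`, `ua`) combination, largest first, to see whether that combination uniquely identifies a row."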
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29929046",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9e4c0c3f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d52761e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Sum `pop10` over all rows of each tract to get the tract's total population.\n",
|
||||
"# The string 'sum' is used instead of `np.sum`: pandas already dispatches `np.sum` to its own\n",
|
||||
"# groupby sum, and passing the callable raises a FutureWarning on pandas >= 2.1.\n",
|
||||
"total_geo_population = (\n",
|
||||
"    geocorr_urban_rural_map\n",
|
||||
"    .groupby(GEOID_TRACT_FIELD_NAME)\n",
|
||||
"    .agg({'pop10': 'sum'})\n",
|
||||
"    .rename(columns={'pop10': 'total_population'})\n",
|
||||
"    .reset_index()\n",
|
||||
")\n",
|
||||
"total_geo_population.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "38225b78",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Sum `pop10` for each (tract, `ur`) pair, then attach the tract-level total from `total_geo_population`.\n",
|
||||
"# The string 'sum' is used instead of `np.sum`: pandas already dispatches `np.sum` to its own\n",
|
||||
"# groupby sum, and passing the callable raises a FutureWarning on pandas >= 2.1.\n",
|
||||
"population_by_tract_and_ur = (\n",
|
||||
"    geocorr_urban_rural_map\n",
|
||||
"    .groupby([GEOID_TRACT_FIELD_NAME, 'ur'])\n",
|
||||
"    .agg({'pop10': 'sum'})\n",
|
||||
"    .reset_index()\n",
|
||||
")\n",
|
||||
"geocorr_urban_rural_with_total_pop_map = population_by_tract_and_ur.merge(\n",
|
||||
"    total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME\n",
|
||||
")\n",
|
||||
"geocorr_urban_rural_with_total_pop_map.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "41b9448a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# `afact` here is the share of the tract's total population that falls in the row's `ur` category.\n",
|
||||
"geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'].div(\n",
|
||||
"    geocorr_urban_rural_with_total_pop_map['total_population']\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eb4ddb9b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geocorr_urban_rural_with_total_pop_map.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1e03d1e6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1d976cb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Reshape to one row per tract, with separate `pop10` / `afact` columns for each `ur` value.\n",
|
||||
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n",
|
||||
"    index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact']\n",
|
||||
")\n",
|
||||
"# Flatten the two-level (value, ur) column labels into single names such as 'afact_U'.\n",
|
||||
"urban_rural_map.columns = ['_'.join(column_levels).strip() for column_levels in urban_rural_map.columns]\n",
|
||||
"urban_rural_map = urban_rural_map.reset_index()\n",
|
||||
"# Flag is 1 when at least half of the tract's population is urban, else 0.\n",
|
||||
"# A missing `afact_U` compares as False, so those tracts get 0.\n",
|
||||
"is_mostly_urban = urban_rural_map['afact_U'] >= 0.5\n",
|
||||
"urban_rural_map['urban_heuristic_flag'] = is_mostly_urban.astype('int64')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0f3a0993",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Replace the pivot-generated column names with descriptive ones.\n",
|
||||
"descriptive_column_names = {\n",
|
||||
"    'pop10_R': 'population_in_rural_areas',\n",
|
||||
"    'pop10_U': 'population_in_urban_areas',\n",
|
||||
"    'afact_R': 'perc_population_in_rural_areas',\n",
|
||||
"    'afact_U': 'perc_population_in_urban_areas',\n",
|
||||
"}\n",
|
||||
"urban_rural_map = urban_rural_map.rename(columns=descriptive_column_names)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ba10f07c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"urban_rural_map.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "56098d7b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Write the map into the geocorr data directory; missing values are written as empty strings.\n",
|
||||
"urban_rural_map_path = GEOCORR_DATA_DIR / 'urban_rural_map.csv'\n",
|
||||
"urban_rural_map.to_csv(path_or_buf=urban_rural_map_path, na_rep='', index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue