mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-06 01:54:18 -07:00
* Adding urban vs rural notebook * Adding new code * Adding settings * Adding usa.csv * Adding etl * Adding etl * Adding to etl_score * quick changes to notebook * Ensuring notebook can run * Adding urban vs rural notebook * Adding new code * Adding settings * Adding usa.csv * Adding etl * Adding etl * Adding to etl_score * quick changes to notebook * Ensuring notebook can run * adding urban to comparison tool * renaming file * adding urban rural to more comp tool outputs * updating requirements and poetry * Adding ej screen notebook * removing ej screen notebook since it's in justice40-tool-iss-719 Co-authored-by: La <ryy0@cdc.gov> Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
311 lines
8.8 KiB
Text
311 lines
8.8 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "51412a14",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import collections\n",
|
|
"from datetime import datetime\n",
|
|
"import functools\n",
|
|
"import itertools\n",
|
|
"import os\n",
|
|
"import pathlib\n",
|
|
"import requests\n",
|
|
"import string\n",
|
|
"import sys\n",
|
|
"import typing\n",
|
|
"import zipfile\n",
|
|
"\n",
|
|
"import IPython\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import pypandoc\n",
|
|
"\n",
|
|
"from tqdm.notebook import tqdm_notebook\n",
|
|
"\n",
|
|
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
|
|
"if module_path not in sys.path:\n",
|
|
" sys.path.append(module_path)\n",
|
|
"\n",
|
|
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
|
|
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
|
|
"\n",
|
|
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
|
"tqdm_notebook.pandas()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e3234c61",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
|
|
"pd.options.display.float_format = \"{:.2f}\".format\n",
|
|
"\n",
|
|
"# Set some global parameters\n",
|
|
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
|
|
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
|
|
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
|
|
"\n",
|
|
"# NOTE (Vincent): I created this directory manually on my local machine. This path may need to change when this logic is moved into the official ETL scripts.\n",
|
|
"GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
|
|
"\n",
|
|
"# Make the dirs if they don't exist\n",
|
|
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
|
|
"COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
|
|
"\n",
|
|
"CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
|
|
"\n",
|
|
"# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
|
|
"# and introducing the risk of misspelling the field name.)\n",
|
|
"\n",
|
|
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
|
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
|
|
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
|
|
"GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
|
|
"COUNTRY_FIELD_NAME = \"Country\"\n",
|
|
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
|
|
"\n",
|
|
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
|
|
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
|
|
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
|
|
"\n",
|
|
"# Define some suffixes\n",
|
|
"POPULATION_SUFFIX = \" (priority population)\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "376f5b2e",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Mapping Census Tracts to Urban and Rural Indicators using Geocorr Data\n",
|
|
"\n",
|
|
"The end result is a dataframe `urban_rural_map`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4147c081",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map = pd.read_csv(\n",
|
|
" os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
|
|
" encoding = \"ISO-8859-1\",\n",
|
|
" skiprows=[1],\n",
|
|
" dtype='str',\n",
|
|
")\n",
|
|
"\n",
|
|
"geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
|
|
"geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
|
|
"\n",
|
|
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
|
|
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "78276a83",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f2890779",
|
|
"metadata": {},
|
|
"source": [
|
|
"We want to see that the length of the derived Census Tract ID is always 11 digits. (A Census Block Group ID would be 12 digits.)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fd89f6c8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
|
|
" GEOID_TRACT_FIELD_NAME,\n",
|
|
" 'ur',\n",
|
|
" 'ua',\n",
|
|
" 'cntyname',\n",
|
|
" 'uaname',\n",
|
|
" 'pop10',\n",
|
|
" 'afact'\n",
|
|
"]]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e597d7e2",
|
|
"metadata": {},
|
|
"source": [
|
|
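"Checking the primary key: count the rows for each combination of tract, `ur`, and `ua`. Any count above 1 means that combination is not unique."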
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "29929046",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9e4c0c3f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d52761e8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
|
|
"total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
|
|
"total_geo_population.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "38225b78",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
|
|
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
|
|
"geocorr_urban_rural_with_total_pop_map.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "41b9448a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "eb4ddb9b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1e03d1e6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1d976cb5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
|
|
"urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
|
|
"urban_rural_map.reset_index(inplace=True)\n",
|
|
"urban_rural_map['urban_heuristic_flag'] = 0\n",
|
|
"mask = urban_rural_map['afact_U'] >= 0.5\n",
|
|
"urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0f3a0993",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map.rename(\n",
|
|
" columns={\n",
|
|
" 'pop10_R': 'population_in_rural_areas',\n",
|
|
" 'pop10_U': 'population_in_urban_areas',\n",
|
|
" 'afact_R': 'perc_population_in_rural_areas',\n",
|
|
" 'afact_U': 'perc_population_in_urban_areas',\n",
|
|
" }, \n",
|
|
" inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ba10f07c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map.head(5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "56098d7b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map.to_csv(\n",
|
|
" path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
|
|
")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|