mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-10-23 21:03:58 -07:00
342 lines
9.4 KiB
Text
342 lines
9.4 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "51412a14",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import collections\n",
|
|
"from datetime import datetime\n",
|
|
"import functools\n",
|
|
"import itertools\n",
|
|
"import os\n",
|
|
"import pathlib\n",
|
|
"import requests\n",
|
|
"import string\n",
|
|
"import sys\n",
|
|
"import typing\n",
|
|
"import zipfile\n",
|
|
"\n",
|
|
"import IPython\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import pypandoc\n",
|
|
"\n",
|
|
"from tqdm.notebook import tqdm_notebook\n",
|
|
"\n",
|
|
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
|
|
"if module_path not in sys.path:\n",
|
|
" sys.path.append(module_path)\n",
|
|
"\n",
|
|
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
|
|
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
|
|
"\n",
|
|
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
|
"tqdm_notebook.pandas()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e3234c61",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
|
|
"pd.options.display.float_format = \"{:.2f}\".format\n",
|
|
"\n",
|
|
"# Set some global parameters\n",
|
|
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
|
|
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
|
|
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
|
|
"\n",
|
|
"# NOTE (Vincent): I created this directory manually on my local machine; the path may need to change when this is moved into the official ETL scripts.\n",
|
|
"GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
|
|
"\n",
|
|
"# Make the dirs if they don't exist\n",
|
|
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
|
|
"COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
|
|
"\n",
|
|
"CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
|
|
"\n",
|
|
"# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
|
|
"# and introducing the risk of misspelling the field name.)\n",
|
|
"\n",
|
|
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
|
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
|
|
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
|
|
"GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
|
|
"COUNTRY_FIELD_NAME = \"Country\"\n",
|
|
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
|
|
"\n",
|
|
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
|
|
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
|
|
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
|
|
"\n",
|
|
"# Define some suffixes\n",
|
|
"POPULATION_SUFFIX = \" (priority population)\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "376f5b2e",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Mapping Census Tracts to Urban and Rural Indicators using Geocorr Data\n",
|
|
"\n",
|
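|
"The end result is a dataframe, `urban_rural_map`, with one row per Census tract; it is also written to `urban_rural_map.csv` in the Geocorr data directory."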
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4147c081",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# CSV was manually generated\n",
|
|
"# Instructions for how to generate the CSV from Geocorr are here: https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787\n",
|
|
"geocorr_urban_rural_map = pd.read_csv(\n",
|
|
" os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n",
|
|
" encoding=\"ISO-8859-1\",\n",
|
|
" skiprows=[1],\n",
|
|
" dtype=\"str\",\n",
|
|
")\n",
|
|
"\n",
|
|
"geocorr_urban_rural_map[\"pop10\"] = pd.to_numeric(\n",
|
|
" geocorr_urban_rural_map[\"pop10\"]\n",
|
|
")\n",
|
|
"geocorr_urban_rural_map[\"afact\"] = pd.to_numeric(\n",
|
|
" geocorr_urban_rural_map[\"afact\"]\n",
|
|
")\n",
|
|
"\n",
|
|
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n",
|
|
" geocorr_urban_rural_map[\"county\"] + geocorr_urban_rural_map[\"tract\"]\n",
|
|
") # + geocorr_urban_rural_map['bg']\n",
|
|
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[\n",
|
|
" GEOID_TRACT_FIELD_NAME\n",
|
|
"].str.replace(\".\", \"\", regex=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "78276a83",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f2890779",
|
|
"metadata": {},
|
|
"source": [
|
|
"The cell above checks the length of the derived Census Tract IDs: they should always be 11 digits. (Census Block Group IDs would be 12 digits, but the block group component is not appended here.)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fd89f6c8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map = geocorr_urban_rural_map[\n",
|
|
" [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\", \"cntyname\", \"uaname\", \"pop10\", \"afact\"]\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e597d7e2",
|
|
"metadata": {},
|
|
"source": [
|
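|
"Checking the primary key: count the rows for each (tract, `ur`, `ua`) combination, largest first, to see whether that combination uniquely identifies a row."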
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "29929046",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map.groupby(\n",
|
|
" [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\"], dropna=False\n",
|
|
").size().sort_values(ascending=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9e4c0c3f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_map.loc[\n",
|
|
" geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == \"36117020302\"\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d52761e8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Sum `pop10` over every Geocorr row of a tract to get the tract's total.\n",
|
|
"# The aggregation is named with the string \"sum\" rather than `np.sum`:\n",
|
|
"# newer pandas releases deprecate passing NumPy callables to `agg`.\n",
|
|
"total_geo_population = (\n",
|
|
"    geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME)\n",
|
|
"    .agg({\"pop10\": \"sum\"})\n",
|
|
"    .reset_index()\n",
|
|
"    # Rename inside the chain (no `inplace=True`) so the frame is built in one expression.\n",
|
|
"    .rename(columns={\"pop10\": \"total_population\"})\n",
|
|
")\n",
|
|
"total_geo_population.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "38225b78",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Sum `pop10` per (tract, `ur`) pair, then attach each tract's `total_population`\n",
|
|
"# so the two figures sit side by side on every row.\n",
|
|
"geocorr_urban_rural_with_total_pop_map = (\n",
|
|
"    geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, \"ur\"])\n",
|
|
"    # The string \"sum\" replaces `np.sum`, which newer pandas deprecates inside `agg`.\n",
|
|
"    .agg({\"pop10\": \"sum\"})\n",
|
|
"    .reset_index()\n",
|
|
"    .merge(total_geo_population, how=\"inner\", on=GEOID_TRACT_FIELD_NAME)\n",
|
|
")\n",
|
|
"geocorr_urban_rural_with_total_pop_map.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "41b9448a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map[\"afact\"] = (\n",
|
|
" geocorr_urban_rural_with_total_pop_map[\"pop10\"]\n",
|
|
" / geocorr_urban_rural_with_total_pop_map[\"total_population\"]\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "eb4ddb9b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1e03d1e6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"geocorr_urban_rural_with_total_pop_map.loc[\n",
|
|
" geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME]\n",
|
|
" == \"01001020200\"\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1d976cb5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n",
|
|
" index=GEOID_TRACT_FIELD_NAME, columns=\"ur\", values=[\"pop10\", \"afact\"]\n",
|
|
")\n",
|
|
"urban_rural_map.columns = [\n",
|
|
" \"_\".join(col).strip() for col in urban_rural_map.columns.values\n",
|
|
"]\n",
|
|
"urban_rural_map.reset_index(inplace=True)\n",
|
|
"urban_rural_map[\"urban_heuristic_flag\"] = 0\n",
|
|
"mask = urban_rural_map[\"afact_U\"] >= 0.5\n",
|
|
"urban_rural_map.loc[mask, \"urban_heuristic_flag\"] = 1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0f3a0993",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map.rename(\n",
|
|
" columns={\n",
|
|
" \"pop10_R\": \"population_in_rural_areas\",\n",
|
|
" \"pop10_U\": \"population_in_urban_areas\",\n",
|
|
" \"afact_R\": \"perc_population_in_rural_areas\",\n",
|
|
" \"afact_U\": \"perc_population_in_urban_areas\",\n",
|
|
" },\n",
|
|
" inplace=True,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ba10f07c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map.head(5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "56098d7b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urban_rural_map.to_csv(\n",
|
|
" path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
|
|
")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|