mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-25 07:10:16 -07:00
Run ETL processes in parallel (#1253)
* WIP on parallelizing * switching to get_tmp_path for nri * switching to get_tmp_path everywhere necessary * fixing linter errors * moving heavy ETLs to front of line * add hold * moving cdc places up * removing unnecessary print * moving h&t up * adding parallel to geo post * better census labels * switching to concurrent futures * fixing output
This commit is contained in:
parent
389eb59ac4
commit
a0d6e55f0a
30 changed files with 286 additions and 160 deletions
|
@ -2,12 +2,33 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"id": "71c4acd0",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'lab_black'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_29987/670980058.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mdata_pipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mfield_names\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'load_ext'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'lab_black'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;31m# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2349\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'local_ns'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_local_scope\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstack_depth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2350\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2351\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2352\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2353\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/decorator.py\u001b[0m in \u001b[0;36mfun\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mkwsyntax\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 231\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 232\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcaller\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextras\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 233\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__doc__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__doc__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/IPython/core/magics/extension.py\u001b[0m in \u001b[0;36mload_ext\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mmodule_str\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mUsageError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Missing module name.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshell\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextension_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_extension\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'already loaded'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/IPython/core/extensions.py\u001b[0m in \u001b[0;36mload_extension\u001b[0;34m(self, module_str)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmodule_str\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodules\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mprepended_to_syspath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mipython_extension_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 80\u001b[0;31m \u001b[0mmod\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimport_module\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 81\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmod\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__file__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mipython_extension_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m print((\"Loading extensions from {dir} is deprecated. \"\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.9.6/lib/python3.9/importlib/__init__.py\u001b[0m in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0mlevel\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_bootstrap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_gcd_import\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpackage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.9.6/lib/python3.9/importlib/_bootstrap.py\u001b[0m in \u001b[0;36m_gcd_import\u001b[0;34m(name, package, level)\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.9.6/lib/python3.9/importlib/_bootstrap.py\u001b[0m in \u001b[0;36m_find_and_load\u001b[0;34m(name, import_)\u001b[0m\n",
|
||||
"\u001b[0;32m~/.pyenv/versions/3.9.6/lib/python3.9/importlib/_bootstrap.py\u001b[0m in \u001b[0;36m_find_and_load_unlocked\u001b[0;34m(name, import_)\u001b[0m\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'lab_black'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import collections\n",
|
||||
"import functools\n",
|
||||
|
@ -102,7 +123,9 @@
|
|||
"# Create the state ID by taking the first two digits of the FIPS CODE of the tract.\n",
|
||||
"# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n",
|
||||
"cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n",
|
||||
" cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]\n",
|
||||
" cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
|
||||
" .astype(str)\n",
|
||||
" .str[0:2]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"cejst_df.head()"
|
||||
|
@ -174,7 +197,7 @@
|
|||
"source": [
|
||||
"# Analyze one field at a time (useful for setting thresholds)\n",
|
||||
"\n",
|
||||
"quantile = 0.95\n",
|
||||
"quantile = 0.90\n",
|
||||
"\n",
|
||||
"for field in [\n",
|
||||
" field_names.COLLEGE_ATTENDANCE_FIELD,\n",
|
||||
|
@ -207,16 +230,18 @@
|
|||
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
|
||||
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
|
||||
"\n",
|
||||
"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
|
||||
"calenviroscreen_data_path = (\n",
|
||||
" DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
|
||||
")\n",
|
||||
"calenviroscreen_df = pd.read_csv(\n",
|
||||
" calenviroscreen_data_path,\n",
|
||||
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Convert priority community field to a bool.\n",
|
||||
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
|
||||
"calenviroscreen_df[\n",
|
||||
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
|
||||
"].astype(bool)\n",
|
||||
"] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
|
||||
"\n",
|
||||
"calenviroscreen_df.head()"
|
||||
]
|
||||
|
@ -245,7 +270,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b1ac2854-80c8-42a8-85e8-84c5684bbe43",
|
||||
"id": "891b5bfc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -271,7 +296,9 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Load persistent poverty data\n",
|
||||
"persistent_poverty_path = DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
|
||||
"persistent_poverty_path = (\n",
|
||||
" DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
|
||||
")\n",
|
||||
"persistent_poverty_df = pd.read_csv(\n",
|
||||
" persistent_poverty_path,\n",
|
||||
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
||||
|
@ -284,7 +311,9 @@
|
|||
"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
|
||||
"\n",
|
||||
"persistent_poverty_df.rename(\n",
|
||||
" columns={PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD},\n",
|
||||
" columns={\n",
|
||||
" PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
|
||||
" },\n",
|
||||
" inplace=True,\n",
|
||||
" errors=\"raise\",\n",
|
||||
")\n",
|
||||
|
@ -305,7 +334,9 @@
|
|||
" field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n",
|
||||
" field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n",
|
||||
"]\n",
|
||||
"mapping_inequality_path = DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
|
||||
"mapping_inequality_path = (\n",
|
||||
" DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
|
||||
")\n",
|
||||
"mapping_inequality_df = pd.read_csv(\n",
|
||||
" mapping_inequality_path,\n",
|
||||
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
|
||||
|
@ -436,7 +467,9 @@
|
|||
" census_tract_dfs,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"tract_values = merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
|
||||
"tract_values = (\n",
|
||||
" merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
|
||||
")\n",
|
||||
"if any(tract_values != [11]):\n",
|
||||
" print(tract_values)\n",
|
||||
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
||||
|
@ -728,13 +761,13 @@
|
|||
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
|
||||
"\n",
|
||||
" for priority_communities_field in priority_communities_fields:\n",
|
||||
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
|
||||
" summary_dict[\n",
|
||||
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
|
||||
" ].sum()\n",
|
||||
" ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
|
||||
"\n",
|
||||
" summary_dict[f\"{priority_communities_field} (total tracts)\"] = frame[\n",
|
||||
" f\"{priority_communities_field}\"\n",
|
||||
" ].sum()\n",
|
||||
" summary_dict[\n",
|
||||
" f\"{priority_communities_field} (total tracts)\"\n",
|
||||
" ] = frame[f\"{priority_communities_field}\"].sum()\n",
|
||||
"\n",
|
||||
" # Calculate some combinations of other variables.\n",
|
||||
" summary_dict[f\"{priority_communities_field} (percent tracts)\"] = (\n",
|
||||
|
@ -742,7 +775,9 @@
|
|||
" / total_tracts_in_geography\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
|
||||
" summary_dict[\n",
|
||||
" f\"{priority_communities_field} (percent population)\"\n",
|
||||
" ] = (\n",
|
||||
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
|
||||
" / total_population_in_geography\n",
|
||||
" )\n",
|
||||
|
@ -788,7 +823,9 @@
|
|||
"\n",
|
||||
" # Run the comparison function on the groups.\n",
|
||||
" region_distribution_df = region_grouped_df.progress_apply(\n",
|
||||
" lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
|
||||
" lambda frame: calculate_state_comparison(\n",
|
||||
" frame, geography_field=\"region\"\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Next, run the comparison by division\n",
|
||||
|
@ -796,7 +833,9 @@
|
|||
"\n",
|
||||
" # Run the comparison function on the groups.\n",
|
||||
" division_distribution_df = division_grouped_df.progress_apply(\n",
|
||||
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
|
||||
" lambda frame: calculate_state_comparison(\n",
|
||||
" frame, geography_field=\"division\"\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Next, run the comparison by urban/rural\n",
|
||||
|
@ -851,7 +890,9 @@
|
|||
" column_character = get_excel_column_name(column_index)\n",
|
||||
"\n",
|
||||
" # Set all columns to larger width\n",
|
||||
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||||
" worksheet.set_column(\n",
|
||||
" f\"{column_character}:{column_character}\", column_width\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Special formatting for all percent columns\n",
|
||||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||||
|
@ -866,9 +907,7 @@
|
|||
"\n",
|
||||
" # Special formatting for columns that capture the percent of population considered priority.\n",
|
||||
" if \"(percent population)\" in column:\n",
|
||||
" column_ranges = (\n",
|
||||
" f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
|
||||
" )\n",
|
||||
" column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
|
||||
"\n",
|
||||
" # Add green to red conditional formatting.\n",
|
||||
" worksheet.conditional_format(\n",
|
||||
|
@ -894,14 +933,18 @@
|
|||
" writer.save()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"fields_to_analyze = [index.priority_communities_field for index in census_tract_indices]\n",
|
||||
"fields_to_analyze = [\n",
|
||||
" index.priority_communities_field for index in census_tract_indices\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Convert all indices to boolean\n",
|
||||
"for field_to_analyze in fields_to_analyze:\n",
|
||||
" if \"Areas of Concern\" in field_to_analyze:\n",
|
||||
" print(f\"Converting {field_to_analyze} to boolean.\")\n",
|
||||
"\n",
|
||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(value=0)\n",
|
||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n",
|
||||
" value=0\n",
|
||||
" )\n",
|
||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
@ -968,10 +1011,14 @@
|
|||
" column_character = get_excel_column_name(column_index)\n",
|
||||
"\n",
|
||||
" # Set all columns to larger width\n",
|
||||
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||||
" worksheet.set_column(\n",
|
||||
" f\"{column_character}:{column_character}\", column_width\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Add green to red conditional formatting.\n",
|
||||
" column_ranges = f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n",
|
||||
" column_ranges = (\n",
|
||||
" f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n",
|
||||
" )\n",
|
||||
" worksheet.conditional_format(\n",
|
||||
" column_ranges,\n",
|
||||
" # Min: green, max: red.\n",
|
||||
|
@ -984,7 +1031,11 @@
|
|||
"\n",
|
||||
" # Special formatting for all percent columns\n",
|
||||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||||
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
|
||||
" if (\n",
|
||||
" \"percent \" in column\n",
|
||||
" or \"(percent)\" in column\n",
|
||||
" or \"Percent \" in column\n",
|
||||
" ):\n",
|
||||
" # Make these columns percentages.\n",
|
||||
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
||||
" worksheet.set_column(\n",
|
||||
|
@ -1013,9 +1064,15 @@
|
|||
" temp_df[index.priority_communities_field] == True\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" grouped_df = temp_df.groupby(index.priority_communities_field).mean().reset_index()\n",
|
||||
" result_df = grouped_df[[index.priority_communities_field] + comparison_fields]\n",
|
||||
" result_df.to_csv(directory / f\"{index.method_name} Basic Stats.csv\", index=False)\n",
|
||||
" grouped_df = (\n",
|
||||
" temp_df.groupby(index.priority_communities_field).mean().reset_index()\n",
|
||||
" )\n",
|
||||
" result_df = grouped_df[\n",
|
||||
" [index.priority_communities_field] + comparison_fields\n",
|
||||
" ]\n",
|
||||
" result_df.to_csv(\n",
|
||||
" directory / f\"{index.method_name} Basic Stats.csv\", index=False\n",
|
||||
" )\n",
|
||||
" write_basic_stats_excel(\n",
|
||||
" basic_stats_df=result_df,\n",
|
||||
" file_path=directory / f\"{index.method_name} Basic Stats.xlsx\",\n",
|
||||
|
@ -1064,7 +1121,9 @@
|
|||
"\n",
|
||||
" # Also add in the count of census tracts.\n",
|
||||
" count_field_name = \"Count of census tracts\"\n",
|
||||
" comparison_df[count_field_name] = grouped_df.size().to_frame(count_field_name)\n",
|
||||
" comparison_df[count_field_name] = grouped_df.size().to_frame(\n",
|
||||
" count_field_name\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" comparison_df = comparison_df.reset_index()\n",
|
||||
"\n",
|
||||
|
@ -1079,7 +1138,9 @@
|
|||
"\n",
|
||||
" # Put criteria description column first.\n",
|
||||
" columns_to_put_first = (\n",
|
||||
" [criteria_description_field_name] + fields_to_group_by + [count_field_name]\n",
|
||||
" [criteria_description_field_name]\n",
|
||||
" + fields_to_group_by\n",
|
||||
" + [count_field_name]\n",
|
||||
" )\n",
|
||||
" new_column_order = columns_to_put_first + [\n",
|
||||
" col for col in comparison_df.columns if col not in columns_to_put_first\n",
|
||||
|
@ -1110,7 +1171,9 @@
|
|||
"\n",
|
||||
" # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n",
|
||||
" # index column at the left of the output dataframe.\n",
|
||||
" census_tracts_score_comparison_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n",
|
||||
" census_tracts_score_comparison_df.to_excel(\n",
|
||||
" writer, sheet_name=\"Sheet1\", index=False\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get the xlsxwriter workbook and worksheet objects.\n",
|
||||
" workbook = writer.book\n",
|
||||
|
@ -1132,7 +1195,9 @@
|
|||
" column_character = get_excel_column_name(column_index)\n",
|
||||
"\n",
|
||||
" # Set all columns to larger width\n",
|
||||
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||||
" worksheet.set_column(\n",
|
||||
" f\"{column_character}:{column_character}\", column_width\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Add green to red conditional formatting.\n",
|
||||
" column_ranges = f\"{column_character}2:{column_character}{len(census_tracts_score_comparison_df)+1}\"\n",
|
||||
|
@ -1148,7 +1213,11 @@
|
|||
"\n",
|
||||
" # Special formatting for all percent columns\n",
|
||||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||||
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
|
||||
" if (\n",
|
||||
" \"percent \" in column\n",
|
||||
" or \"(percent)\" in column\n",
|
||||
" or \"Percent \" in column\n",
|
||||
" ):\n",
|
||||
" # Make these columns percentages.\n",
|
||||
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
||||
" worksheet.set_column(\n",
|
||||
|
@ -1164,7 +1233,9 @@
|
|||
" # Overwrite both the value and the format of each header cell\n",
|
||||
" # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n",
|
||||
" # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n",
|
||||
" for col_num, value in enumerate(census_tracts_score_comparison_df.columns.values):\n",
|
||||
" for col_num, value in enumerate(\n",
|
||||
" census_tracts_score_comparison_df.columns.values\n",
|
||||
" ):\n",
|
||||
" worksheet.write(0, col_num, value, header_format)\n",
|
||||
"\n",
|
||||
" writer.save()\n",
|
||||
|
@ -1422,7 +1493,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.10"
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue