j40-cejst-2/data/data-pipeline/data_pipeline/ipython/aggregate_burden_exploration.ipynb
Travis Newby a27ca46b1d
Update dependencies to fix safety check failures (#2142)
* Update dependencies

Update dependencies causing safety check to fail

* Remove nb_black from jupyter notebooks

Because of the build issue on M1 macs, nb_black was removed as a dev dependency. This change removes the lines referencing nb_black (%load_ext lab_black) from all jupyter notebooks.
2023-02-02 16:43:59 -06:00

1236 lines
40 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "029b2e96-5a58-4816-81cc-4f60da27e518",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pathlib"
]
},
{
"cell_type": "markdown",
"id": "c20b1785-f802-474d-8aec-423ec574bd05",
"metadata": {},
"source": [
"# Creating an aggregate burden index\n",
"\n",
"Although we will not be using an aggregate burden index for v1.0 of the CEJST, the USDS team wanted to demonstrate how even duplicating CalEnviroScreen's cumulative index (or a loose interpretation of it) would change which communities are highlighted. Here, the scoring procedure that we use is lifted as closely as possible from CalEnviroScreen, including the categorization of burdens and the weighting of the components that make up the cumulative score. \n",
"\n",
"The data team believes that a threshold methodology has significant limitations that an aggregate or cumulative burden index could remediate, and presents the following as an example of such an index. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3dac5f2a-385d-4d23-9564-cfbe2d0706d1",
"metadata": {},
"outputs": [],
"source": [
"SCORE_DIR = pathlib.Path.cwd().parent / \"data\" / \"score\" / \"csv\" / \"full\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9fb12b5e-1f53-4eaf-a306-42ea9c1422fd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/Cellar/jupyterlab/3.2.8/libexec/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3251: DtypeWarning: Columns (1,2,3,4,75,76,78) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
}
],
"source": [
"usa = pd.read_csv(\n",
" SCORE_DIR / \"usa.csv\",\n",
" dtype={\"GEOID10_TRACT\": str},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f80bdf5f-11e3-4967-a0b3-a7504374c803",
"metadata": {},
"outputs": [],
"source": [
"## environment\n",
"toxins_category = [\n",
" \"Percent pre-1960s housing (lead paint indicator) (percentile)\",\n",
" \"Proximity to Risk Management Plan (RMP) facilities (percentile)\",\n",
" \"Proximity to NPL sites (percentile)\",\n",
" \"Proximity to hazardous waste sites (percentile)\",\n",
" \"Wastewater discharge (percentile)\",\n",
"]\n",
"\n",
"## sensitive populations\n",
"health_category = [\n",
" \"Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)\",\n",
" \"Current asthma among adults aged greater than or equal to 18 years (percentile)\",\n",
" \"Coronary heart disease among adults aged greater than or equal to 18 years (percentile)\",\n",
" \"Low life expectancy (percentile)\",\n",
"]\n",
"\n",
"## exposure\n",
"built_environment_category = [\n",
" \"Expected building loss rate (Natural Hazards Risk Index) (percentile)\",\n",
" \"Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)\",\n",
" \"Expected population loss rate (Natural Hazards Risk Index) (percentile)\",\n",
" \"Energy burden (percentile)\",\n",
" \"Diesel particulate matter exposure (percentile)\",\n",
" \"Traffic proximity and volume (percentile)\",\n",
" \"PM2.5 in the air (percentile)\",\n",
"]\n",
"\n",
"## socioeconomic\n",
"socioeconomic_category = [\n",
" \"Unemployment (percent) (percentile)\",\n",
" \"Housing burden (percent) (percentile)\",\n",
" \"Low median household income as a percent of area median income (percentile)\",\n",
" \"Percent of households in linguistic isolation (percentile)\",\n",
" \"Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile)\",\n",
" \"Percent individuals age 25 or over with less than high school degree (percentile)\",\n",
" \"Percent of individuals < 100% Federal Poverty Line (percentile)\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a5057a21-318c-438e-84da-8f2e765c02d5",
"metadata": {},
"outputs": [],
"source": [
"usa[\"toxins_cat\"] = usa[toxins_category].mean(axis=1)\n",
"usa[\"built_env_cat\"] = usa[built_environment_category].mean(axis=1)\n",
"usa[\"health_cat\"] = usa[health_category].mean(axis=1)\n",
"usa[\"ses_cat\"] = usa[socioeconomic_category].mean(axis=1)\n",
"\n",
"\n",
"usa[\"pollution_burden\"] = 0.5 * usa[\"toxins_cat\"] + usa[\"built_env_cat\"]\n",
"usa[\"population_characteristics\"] = usa[\"health_cat\"] + usa[\"ses_cat\"]\n",
"poll_max = usa[\"pollution_burden\"].max()\n",
"pop_max = usa[\"population_characteristics\"].max()\n",
"\n",
"usa[\"scaled_pollution_burden\"] = usa[\"pollution_burden\"] / poll_max\n",
"usa[\"scaled_population_characteristics\"] = usa[\"population_characteristics\"] / pop_max\n",
"\n",
"usa[\"cal_score\"] = (\n",
" usa[\"scaled_pollution_burden\"] * usa[\"scaled_population_characteristics\"]\n",
")\n",
"usa[\"pct_cal_score\"] = usa[\"cal_score\"].rank(pct=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3217e0ce-d9ec-46b8-9584-4cf9f6b35686",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.65</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>42039</td>\n",
" <td>6823</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>5423</td>\n",
" <td>19849</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.65 \n",
"False 42039 6823\n",
"True 5423 19849"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.65</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>0.567068</td>\n",
" <td>0.092036</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>0.073151</td>\n",
" <td>0.267745</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.65 \n",
"False 0.567068 0.092036\n",
"True 0.073151 0.267745"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.8</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>46152</td>\n",
" <td>13540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>1310</td>\n",
" <td>13132</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.8 \n",
"False 46152 13540\n",
"True 1310 13132"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.8</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>0.622548</td>\n",
" <td>0.182642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>0.017671</td>\n",
" <td>0.177139</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.8 \n",
"False 0.622548 0.182642\n",
"True 0.017671 0.177139"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.825</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>46575</td>\n",
" <td>14923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>887</td>\n",
" <td>11749</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.825 \n",
"False 46575 14923\n",
"True 887 11749"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.825</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>0.628254</td>\n",
" <td>0.201298</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>0.011965</td>\n",
" <td>0.158483</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.825 \n",
"False 0.628254 0.201298\n",
"True 0.011965 0.158483"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.85</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>46898</td>\n",
" <td>16405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>564</td>\n",
" <td>10267</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.85 \n",
"False 46898 16405\n",
"True 564 10267"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.85</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>0.632611</td>\n",
" <td>0.221288</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>0.007608</td>\n",
" <td>0.138492</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.85 \n",
"False 0.632611 0.221288\n",
"True 0.007608 0.138492"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.9</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>47288</td>\n",
" <td>19625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>174</td>\n",
" <td>7047</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.9 \n",
"False 47288 19625\n",
"True 174 7047"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Definition N (communities)</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.9</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>0.637872</td>\n",
" <td>0.264723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>0.002347</td>\n",
" <td>0.095058</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Definition N (communities) False True\n",
"pct_cal_score_0.9 \n",
"False 0.637872 0.264723\n",
"True 0.002347 0.095058"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
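"# For each cutoff, flag tracts whose cumulative burden percentile is at or above the cutoff, then\n",
"# cross-tabulate that flag against \"Definition N (communities)\". This shows how many communities\n",
"# identified by Definition N are also identified by our cumulative burden metric at that threshold.\n",
"# The first table shows tract counts; the second (normalize=True) shows each cell as a share of all tracts.\n",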
"\n",
"for cutoff in [0.65, 0.8, 0.825, 0.85, 0.9]:\n",
"\n",
" usa[\"pct_cal_score_\" + str(cutoff)] = usa[\"pct_cal_score\"] >= cutoff\n",
" display(\n",
" pd.crosstab(\n",
" usa[\"pct_cal_score_\" + str(cutoff)],\n",
" usa[\"Definition N (communities)\"],\n",
" )\n",
" )\n",
" display(\n",
" pd.crosstab(\n",
" usa[\"pct_cal_score_\" + str(cutoff)],\n",
" usa[\"Definition N (communities)\"],\n",
" normalize=True,\n",
" )\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "fa47f4f8-16bd-47ec-b2a5-e20fe2d83c83",
"metadata": {},
"source": [
"## Does it square with CalEnviroScreen? \n",
"\n",
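"Here we compare OUR work with the data from CalEnviroScreen, limiting to California. We see reasonable agreement -- the vast majority (>90%) of tracts at or above the 90th percentile on our metric fall at or above the 75th percentile in CalEnviroScreen, and roughly 61% fall at or above its 90th percentile (see the breakdown tables below)."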
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5498bbd7-b40f-4e17-9dd4-c102f071b250",
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = pathlib.Path.cwd().parent / \"data\" / \"dataset\"\n",
"\n",
"true_ces = pd.read_csv(\n",
" DATA_DIR / \"calenviroscreen4/data06.csv\",\n",
" dtype={\"GEOID10_TRACT\": str},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "edf211f6-84ee-4a9f-b0a6-31ffc2622adb",
"metadata": {},
"outputs": [],
"source": [
"ces_merged = usa.merge(true_ces, on=\"GEOID10_TRACT\", how=\"right\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "cd54de47-2025-460e-beda-7807adf334df",
"metadata": {},
"outputs": [],
"source": [
"ces_merged[\"new_cal_score\"] = ces_merged[\"pct_cal_score\"].rank(pct=True)\n",
"ces_merged[\"new_cal_flag\"] = ces_merged[\"new_cal_score\"] >= 0.9"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ea5aabe0-d3e3-47b2-98df-335538cbf8e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 0.900311\n",
"True 0.099689\n",
"Name: new_cal_flag, dtype: float64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ces_merged[\"new_cal_flag\"].value_counts(normalize=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d63f6db6-aae3-43e2-9ea8-5766d630fe8a",
"metadata": {},
"outputs": [],
"source": [
"ces_merged[\"any_flag\"] = (\n",
" ces_merged[\"pct_cal_score_0.9\"] | ces_merged[\"Definition N (communities)\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "23360b80-a3d3-4d35-b56c-cc7f6423646c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pct_cal_score_0.9 DRAFT CES 4.0\\nPercentile Range\n",
"False 10-15% 0.052527\n",
" 20-25% 0.052527\n",
" 40-45% 0.052527\n",
" 1-5% (lowest scores) 0.052395\n",
" 15-20% 0.052395\n",
" 25-30% 0.052395\n",
" 30-35% 0.052395\n",
" 35-40% 0.052395\n",
" 5-10% 0.052395\n",
" 45-50% 0.052263\n",
" 50-55% 0.052263\n",
" 60-65% 0.052130\n",
" 55-60% 0.051998\n",
" 65-70% 0.051336\n",
" 70-75% 0.050410\n",
" 75-80% 0.048690\n",
" 80-85% 0.047499\n",
" 85-90% 0.041413\n",
" 90-95% 0.037841\n",
" 95-100% (highest scores) 0.028844\n",
" NaN 0.013363\n",
"True 95-100% (highest scores) 0.375262\n",
" 90-95% 0.232704\n",
" 85-90% 0.174004\n",
" 80-85% 0.079665\n",
" 75-80% 0.058700\n",
" 70-75% 0.033543\n",
" 65-70% 0.016771\n",
" NaN 0.008386\n",
" 55-60% 0.006289\n",
" 60-65% 0.006289\n",
" 50-55% 0.004193\n",
" 30-35% 0.002096\n",
" 45-50% 0.002096\n",
"Name: DRAFT CES 4.0\\nPercentile Range, dtype: float64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The rows under `True` in the following output answer the question:\n",
"# what share of communities at or above the 90th percentile for our version of aggregate burden\n",
"# (ranked across all tracts in `usa`) fall into each percentile range for the true CalEnviroScreen?\n",
"# For example, 38% of tracts that are at or above the 90th percentile for our metric are in the highest\n",
"# score bucket on CES. The rows under `False` give the same breakdown for tracts that are not flagged.\n",
"\n",
"ces_merged.groupby(\"pct_cal_score_0.9\")[\"DRAFT CES 4.0\\nPercentile Range\"].value_counts(\n",
" dropna=False, normalize=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "72bcce80-ed57-4f1c-ad01-c88b0b9046a8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>DRAFT CES 4.0\n",
"Percentile Range</th>\n",
" <th>1-5% (lowest scores)</th>\n",
" <th>10-15%</th>\n",
" <th>15-20%</th>\n",
" <th>20-25%</th>\n",
" <th>25-30%</th>\n",
" <th>30-35%</th>\n",
" <th>35-40%</th>\n",
" <th>40-45%</th>\n",
" <th>45-50%</th>\n",
" <th>5-10%</th>\n",
" <th>50-55%</th>\n",
" <th>55-60%</th>\n",
" <th>60-65%</th>\n",
" <th>65-70%</th>\n",
" <th>70-75%</th>\n",
" <th>75-80%</th>\n",
" <th>80-85%</th>\n",
" <th>85-90%</th>\n",
" <th>90-95%</th>\n",
" <th>95-100% (highest scores)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pct_cal_score_0.9</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>False</th>\n",
" <td>0.052395</td>\n",
" <td>0.052527</td>\n",
" <td>0.052395</td>\n",
" <td>0.052527</td>\n",
" <td>0.052395</td>\n",
" <td>0.052395</td>\n",
" <td>0.052395</td>\n",
" <td>0.052527</td>\n",
" <td>0.052263</td>\n",
" <td>0.052395</td>\n",
" <td>0.052263</td>\n",
" <td>0.051998</td>\n",
" <td>0.052130</td>\n",
" <td>0.051336</td>\n",
" <td>0.050410</td>\n",
" <td>0.04869</td>\n",
" <td>0.047499</td>\n",
" <td>0.041413</td>\n",
" <td>0.037841</td>\n",
" <td>0.028844</td>\n",
" </tr>\n",
" <tr>\n",
" <th>True</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.002096</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.002096</td>\n",
" <td>NaN</td>\n",
" <td>0.004193</td>\n",
" <td>0.006289</td>\n",
" <td>0.006289</td>\n",
" <td>0.016771</td>\n",
" <td>0.033543</td>\n",
" <td>0.05870</td>\n",
" <td>0.079665</td>\n",
" <td>0.174004</td>\n",
" <td>0.232704</td>\n",
" <td>0.375262</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"DRAFT CES 4.0\n",
"Percentile Range 1-5% (lowest scores) 10-15% 15-20% \\\n",
"pct_cal_score_0.9 \n",
"False 0.052395 0.052527 0.052395 \n",
"True NaN NaN NaN \n",
"\n",
"DRAFT CES 4.0\n",
"Percentile Range 20-25% 25-30% 30-35% 35-40% \\\n",
"pct_cal_score_0.9 \n",
"False 0.052527 0.052395 0.052395 0.052395 \n",
"True NaN NaN 0.002096 NaN \n",
"\n",
"DRAFT CES 4.0\n",
"Percentile Range 40-45% 45-50% 5-10% 50-55% \\\n",
"pct_cal_score_0.9 \n",
"False 0.052527 0.052263 0.052395 0.052263 \n",
"True NaN 0.002096 NaN 0.004193 \n",
"\n",
"DRAFT CES 4.0\n",
"Percentile Range 55-60% 60-65% 65-70% 70-75% \\\n",
"pct_cal_score_0.9 \n",
"False 0.051998 0.052130 0.051336 0.050410 \n",
"True 0.006289 0.006289 0.016771 0.033543 \n",
"\n",
"DRAFT CES 4.0\n",
"Percentile Range 75-80% 80-85% 85-90% 90-95% \\\n",
"pct_cal_score_0.9 \n",
"False 0.04869 0.047499 0.041413 0.037841 \n",
"True 0.05870 0.079665 0.174004 0.232704 \n",
"\n",
"DRAFT CES 4.0\n",
"Percentile Range 95-100% (highest scores) \n",
"pct_cal_score_0.9 \n",
"False 0.028844 \n",
"True 0.375262 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
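"# Easier to read! Same shares as above, pivoted so that each CES percentile range is a column.\n",
"# Note: the NaN percentile-range bucket listed in the previous output does not appear in this table.\n",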
"ces_merged.groupby(\"pct_cal_score_0.9\")[\"DRAFT CES 4.0\\nPercentile Range\"].value_counts(\n",
" dropna=False, normalize=True\n",
").rename(\"share\").reset_index().pivot_table(\n",
" index=\"pct_cal_score_0.9\", columns=\"DRAFT CES 4.0\\nPercentile Range\", values=\"share\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "86c66114-b090-4f96-877d-60dad7e18252",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>any_flag</th>\n",
" <th>False</th>\n",
" <th>True</th>\n",
" </tr>\n",
" <tr>\n",
" <th>DRAFT CES 4.0\n",
"Percentile Range</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1-5% (lowest scores)</th>\n",
" <td>0.079518</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10-15%</th>\n",
" <td>0.077108</td>\n",
" <td>0.004255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15-20%</th>\n",
" <td>0.075904</td>\n",
" <td>0.005892</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20-25%</th>\n",
" <td>0.072691</td>\n",
" <td>0.011457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25-30%</th>\n",
" <td>0.071285</td>\n",
" <td>0.013421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30-35%</th>\n",
" <td>0.068876</td>\n",
" <td>0.017676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35-40%</th>\n",
" <td>0.066466</td>\n",
" <td>0.021277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40-45%</th>\n",
" <td>0.061847</td>\n",
" <td>0.029133</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45-50%</th>\n",
" <td>0.056225</td>\n",
" <td>0.037971</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5-10%</th>\n",
" <td>0.078715</td>\n",
" <td>0.001309</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50-55%</th>\n",
" <td>0.050000</td>\n",
" <td>0.048445</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55-60%</th>\n",
" <td>0.046787</td>\n",
" <td>0.053355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60-65%</th>\n",
" <td>0.046185</td>\n",
" <td>0.054664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65-70%</th>\n",
" <td>0.039960</td>\n",
" <td>0.064484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70-75%</th>\n",
" <td>0.026908</td>\n",
" <td>0.086088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75-80%</th>\n",
" <td>0.023695</td>\n",
" <td>0.090998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80-85%</th>\n",
" <td>0.018273</td>\n",
" <td>0.100164</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85-90%</th>\n",
" <td>0.013253</td>\n",
" <td>0.108020</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90-95%</th>\n",
" <td>0.007831</td>\n",
" <td>0.117185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95-100% (highest scores)</th>\n",
" <td>0.003213</td>\n",
" <td>0.124714</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"any_flag False True\n",
"DRAFT CES 4.0\\nPercentile Range \n",
"1-5% (lowest scores) 0.079518 NaN\n",
"10-15% 0.077108 0.004255\n",
"15-20% 0.075904 0.005892\n",
"20-25% 0.072691 0.011457\n",
"25-30% 0.071285 0.013421\n",
"30-35% 0.068876 0.017676\n",
"35-40% 0.066466 0.021277\n",
"40-45% 0.061847 0.029133\n",
"45-50% 0.056225 0.037971\n",
"5-10% 0.078715 0.001309\n",
"50-55% 0.050000 0.048445\n",
"55-60% 0.046787 0.053355\n",
"60-65% 0.046185 0.054664\n",
"65-70% 0.039960 0.064484\n",
"70-75% 0.026908 0.086088\n",
"75-80% 0.023695 0.090998\n",
"80-85% 0.018273 0.100164\n",
"85-90% 0.013253 0.108020\n",
"90-95% 0.007831 0.117185\n",
"95-100% (highest scores) 0.003213 0.124714"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
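"# `any_flag` is True when a tract is flagged by Definition N or by our version of cumulative burden\n",
"# (pct_cal_score_0.9). Shares are computed within each any_flag group, as above, but this table is\n",
"# transposed: each column is an any_flag group and each row is a CES percentile range.\n",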
"\n",
"ces_merged.groupby(\"any_flag\")[\"DRAFT CES 4.0\\nPercentile Range\"].value_counts(\n",
" dropna=False, normalize=True\n",
").rename(\"share\").reset_index().pivot_table(\n",
" index=\"any_flag\", columns=\"DRAFT CES 4.0\\nPercentile Range\", values=\"share\"\n",
").T"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}