j40-cejst-2/data/data-pipeline/data_pipeline/ipython/aggregate_burden_exploration.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "029b2e96-5a58-4816-81cc-4f60da27e518",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pathlib"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c20b1785-f802-474d-8aec-423ec574bd05",
   "metadata": {},
   "source": [
    "# Creating an aggregate burden index\n",
    "\n",
    "Although we will not be using an aggregate burden index for v1.0 of the CEJST, the USDS team wanted to demonstrate how even duplicating CalEnviroScreen's cumulative index (or a loose interpretation of it) would impact the communities highlighted. Here, the scoring procedure that we use is lifted as closely as possible from CalEnviroScreen, including the categorization of burdens and the weighting of the components of the cumulative score.\n",
    "\n",
    "The data team believes that a threshold methodology has significant limitations that an aggregate or cumulative burden index could remediate, and presents the following as an example of such an index. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3dac5f2a-385d-4d23-9564-cfbe2d0706d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the full score CSVs: <parent of the working directory>/data/score/csv/full.\n",
    "SCORE_DIR = pathlib.Path.cwd().parent.joinpath(\"data\", \"score\", \"csv\", \"full\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9fb12b5e-1f53-4eaf-a306-42ea9c1422fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/jupyterlab/3.2.8/libexec/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3251: DtypeWarning: Columns (1,2,3,4,75,76,78) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Load the full score file, one row per record keyed by GEOID10_TRACT.\n",
    "# GEOID10_TRACT is forced to str (presumably to preserve leading zeros -- confirm).\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole file instead of\n",
    "# chunk by chunk, which is what triggers the mixed-type DtypeWarning on this CSV.\n",
    "usa = pd.read_csv(\n",
    "    SCORE_DIR / \"usa.csv\",\n",
    "    dtype={\"GEOID10_TRACT\": str},\n",
    "    low_memory=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f80bdf5f-11e3-4967-a0b3-a7504374c803",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each list names the percentile columns of `usa` that are averaged into one category score.\n",
    "\n",
    "# \"Environment\" category (feeds the pollution burden score).\n",
    "toxins_category = [\n",
    "    \"Percent pre-1960s housing (lead paint indicator) (percentile)\",\n",
    "    \"Proximity to Risk Management Plan (RMP) facilities (percentile)\",\n",
    "    \"Proximity to NPL sites (percentile)\",\n",
    "    \"Proximity to hazardous waste sites (percentile)\",\n",
    "    \"Wastewater discharge (percentile)\",\n",
    "]\n",
    "\n",
    "# \"Sensitive populations\" category (feeds the population characteristics score).\n",
    "health_category = [\n",
    "    \"Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)\",\n",
    "    \"Current asthma among adults aged greater than or equal to 18 years (percentile)\",\n",
    "    \"Coronary heart disease among adults aged greater than or equal to 18 years (percentile)\",\n",
    "    \"Low life expectancy (percentile)\",\n",
    "]\n",
    "\n",
    "# \"Exposure\" category (feeds the pollution burden score).\n",
    "built_environment_category = [\n",
    "    \"Expected building loss rate (Natural Hazards Risk Index) (percentile)\",\n",
    "    \"Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)\",\n",
    "    \"Expected population loss rate (Natural Hazards Risk Index) (percentile)\",\n",
    "    \"Energy burden (percentile)\",\n",
    "    \"Diesel particulate matter exposure (percentile)\",\n",
    "    \"Traffic proximity and volume (percentile)\",\n",
    "    \"PM2.5 in the air (percentile)\",\n",
    "]\n",
    "\n",
    "# \"Socioeconomic\" category (feeds the population characteristics score).\n",
    "socioeconomic_category = [\n",
    "    \"Unemployment (percent) (percentile)\",\n",
    "    \"Housing burden (percent) (percentile)\",\n",
    "    \"Low median household income as a percent of area median income (percentile)\",\n",
    "    \"Percent of households in linguistic isolation (percentile)\",\n",
    "    \"Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile)\",\n",
    "    \"Percent individuals age 25 or over with less than high school degree (percentile)\",\n",
    "    \"Percent of individuals < 100% Federal Poverty Line (percentile)\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a5057a21-318c-438e-84da-8f2e765c02d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The \"environment\" category (toxins_cat) counts half as much as the \"exposure\" category\n",
    "# (built_env_cat) in the pollution burden score, following CalEnviroScreen's weighting.\n",
    "ENVIRONMENTAL_EFFECTS_WEIGHT = 0.5\n",
    "\n",
    "# Category score = row-wise mean of that category's percentile columns.\n",
    "# DataFrame.mean skips missing values by default, so a row with some indicators\n",
    "# missing is averaged over the indicators it does have.\n",
    "usa[\"toxins_cat\"] = usa[toxins_category].mean(axis=1)\n",
    "usa[\"built_env_cat\"] = usa[built_environment_category].mean(axis=1)\n",
    "usa[\"health_cat\"] = usa[health_category].mean(axis=1)\n",
    "usa[\"ses_cat\"] = usa[socioeconomic_category].mean(axis=1)\n",
    "\n",
    "# Combine the categories into the two component scores.\n",
    "usa[\"pollution_burden\"] = (\n",
    "    ENVIRONMENTAL_EFFECTS_WEIGHT * usa[\"toxins_cat\"] + usa[\"built_env_cat\"]\n",
    ")\n",
    "usa[\"population_characteristics\"] = usa[\"health_cat\"] + usa[\"ses_cat\"]\n",
    "\n",
    "# Rescale each component by its observed maximum so the largest value is 1.\n",
    "poll_max = usa[\"pollution_burden\"].max()\n",
    "pop_max = usa[\"population_characteristics\"].max()\n",
    "usa[\"scaled_pollution_burden\"] = usa[\"pollution_burden\"] / poll_max\n",
    "usa[\"scaled_population_characteristics\"] = usa[\"population_characteristics\"] / pop_max\n",
    "\n",
    "# Final score is the product of the two scaled components; pct_cal_score is its\n",
    "# percentile rank across all rows of `usa`.\n",
    "usa[\"cal_score\"] = (\n",
    "    usa[\"scaled_pollution_burden\"] * usa[\"scaled_population_characteristics\"]\n",
    ")\n",
    "usa[\"pct_cal_score\"] = usa[\"cal_score\"].rank(pct=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3217e0ce-d9ec-46b8-9584-4cf9f6b35686",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.65</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>42039</td>\n",
       "      <td>6823</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>5423</td>\n",
       "      <td>19849</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)  False   True\n",
       "pct_cal_score_0.65                      \n",
       "False                       42039   6823\n",
       "True                         5423  19849"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.65</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>0.567068</td>\n",
       "      <td>0.092036</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.073151</td>\n",
       "      <td>0.267745</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)     False      True\n",
       "pct_cal_score_0.65                            \n",
       "False                       0.567068  0.092036\n",
       "True                        0.073151  0.267745"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.8</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>46152</td>\n",
       "      <td>13540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>1310</td>\n",
       "      <td>13132</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)  False   True\n",
       "pct_cal_score_0.8                       \n",
       "False                       46152  13540\n",
       "True                         1310  13132"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.8</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>0.622548</td>\n",
       "      <td>0.182642</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.017671</td>\n",
       "      <td>0.177139</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)     False      True\n",
       "pct_cal_score_0.8                             \n",
       "False                       0.622548  0.182642\n",
       "True                        0.017671  0.177139"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.825</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>46575</td>\n",
       "      <td>14923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>887</td>\n",
       "      <td>11749</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)  False   True\n",
       "pct_cal_score_0.825                     \n",
       "False                       46575  14923\n",
       "True                          887  11749"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.825</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>0.628254</td>\n",
       "      <td>0.201298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.011965</td>\n",
       "      <td>0.158483</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)     False      True\n",
       "pct_cal_score_0.825                           \n",
       "False                       0.628254  0.201298\n",
       "True                        0.011965  0.158483"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.85</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>46898</td>\n",
       "      <td>16405</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>564</td>\n",
       "      <td>10267</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)  False   True\n",
       "pct_cal_score_0.85                      \n",
       "False                       46898  16405\n",
       "True                          564  10267"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.85</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>0.632611</td>\n",
       "      <td>0.221288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.007608</td>\n",
       "      <td>0.138492</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)     False      True\n",
       "pct_cal_score_0.85                            \n",
       "False                       0.632611  0.221288\n",
       "True                        0.007608  0.138492"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.9</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>47288</td>\n",
       "      <td>19625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>174</td>\n",
       "      <td>7047</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)  False   True\n",
       "pct_cal_score_0.9                       \n",
       "False                       47288  19625\n",
       "True                          174   7047"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Definition N (communities)</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.9</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>0.637872</td>\n",
       "      <td>0.264723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.002347</td>\n",
       "      <td>0.095058</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Definition N (communities)     False      True\n",
       "pct_cal_score_0.9                             \n",
       "False                       0.637872  0.264723\n",
       "True                        0.002347  0.095058"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# For each percentile cutoff, flag the rows whose cumulative burden percentile is at or\n",
    "# above the cutoff, then cross-tabulate that flag against Score N (base): first as raw\n",
    "# counts, then as shares of all rows.\n",
    "\n",
    "for cutoff in [0.65, 0.8, 0.825, 0.85, 0.9]:\n",
    "    flag_col = \"pct_cal_score_\" + str(cutoff)\n",
    "    usa[flag_col] = usa[\"pct_cal_score\"] >= cutoff\n",
    "\n",
    "    for as_share in (False, True):\n",
    "        display(\n",
    "            pd.crosstab(\n",
    "                usa[flag_col],\n",
    "                usa[\"Definition N (communities)\"],\n",
    "                normalize=as_share,\n",
    "            )\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa47f4f8-16bd-47ec-b2a5-e20fe2d83c83",
   "metadata": {},
   "source": [
    "## Does it square with CalEnviroScreen?\n",
    "\n",
    "Here we compare OUR work with the data from CalEnviroScreen, limiting to California. We see reasonable agreement -- the vast majority (>90%) of tracts at or above 90th percentile match."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5498bbd7-b40f-4e17-9dd4-c102f071b250",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset directory: <parent of the working directory>/data/dataset.\n",
    "DATA_DIR = pathlib.Path.cwd().parent.joinpath(\"data\", \"dataset\")\n",
    "\n",
    "# CalEnviroScreen data; the tract ID column is read as a string.\n",
    "true_ces = pd.read_csv(\n",
    "    DATA_DIR.joinpath(\"calenviroscreen4/data06.csv\"),\n",
    "    dtype={\"GEOID10_TRACT\": str},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "edf211f6-84ee-4a9f-b0a6-31ffc2622adb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Right join: keep every row of the CalEnviroScreen file and attach our columns where the tract ID matches.\n",
    "ces_merged = pd.merge(usa, true_ces, how=\"right\", on=\"GEOID10_TRACT\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cd54de47-2025-460e-beda-7807adf334df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-rank our percentile score within the merged rows only, then flag the rows at or above 0.9.\n",
    "merged_rank = ces_merged[\"pct_cal_score\"].rank(pct=True)\n",
    "ces_merged[\"new_cal_score\"] = merged_rank\n",
    "ces_merged[\"new_cal_flag\"] = merged_rank.ge(0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ea5aabe0-d3e3-47b2-98df-335538cbf8e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False    0.900311\n",
       "True     0.099689\n",
       "Name: new_cal_flag, dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of merged rows on each side of the re-ranked 0.9 flag.\n",
    "ces_merged.loc[:, \"new_cal_flag\"].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d63f6db6-aae3-43e2-9ea8-5766d630fe8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A row is flagged when either the 0.9 cumulative-burden flag or Definition N is set.\n",
    "cumulative_flag = ces_merged[\"pct_cal_score_0.9\"]\n",
    "definition_n_flag = ces_merged[\"Definition N (communities)\"]\n",
    "ces_merged[\"any_flag\"] = cumulative_flag | definition_n_flag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "23360b80-a3d3-4d35-b56c-cc7f6423646c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pct_cal_score_0.9  DRAFT CES 4.0\\nPercentile Range\n",
       "False              10-15%                             0.052527\n",
       "                   20-25%                             0.052527\n",
       "                   40-45%                             0.052527\n",
       "                   1-5% (lowest scores)               0.052395\n",
       "                   15-20%                             0.052395\n",
       "                   25-30%                             0.052395\n",
       "                   30-35%                             0.052395\n",
       "                   35-40%                             0.052395\n",
       "                   5-10%                              0.052395\n",
       "                   45-50%                             0.052263\n",
       "                   50-55%                             0.052263\n",
       "                   60-65%                             0.052130\n",
       "                   55-60%                             0.051998\n",
       "                   65-70%                             0.051336\n",
       "                   70-75%                             0.050410\n",
       "                   75-80%                             0.048690\n",
       "                   80-85%                             0.047499\n",
       "                   85-90%                             0.041413\n",
       "                   90-95%                             0.037841\n",
       "                   95-100% (highest scores)           0.028844\n",
       "                   NaN                                0.013363\n",
       "True               95-100% (highest scores)           0.375262\n",
       "                   90-95%                             0.232704\n",
       "                   85-90%                             0.174004\n",
       "                   80-85%                             0.079665\n",
       "                   75-80%                             0.058700\n",
       "                   70-75%                             0.033543\n",
       "                   65-70%                             0.016771\n",
       "                   NaN                                0.008386\n",
       "                   55-60%                             0.006289\n",
       "                   60-65%                             0.006289\n",
       "                   50-55%                             0.004193\n",
       "                   30-35%                             0.002096\n",
       "                   45-50%                             0.002096\n",
       "Name: DRAFT CES 4.0\\nPercentile Range, dtype: float64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Within each value of our 0.9 aggregate-burden flag, show the share of tracts that fall\n",
    "# into each CalEnviroScreen percentile range (missing ranges are counted too).\n",
    "# For example, about 38% of the tracts we flag are in the highest-score bucket on CES.\n",
    "\n",
    "ces_range_by_flag = ces_merged.groupby(\"pct_cal_score_0.9\")[\"DRAFT CES 4.0\\nPercentile Range\"]\n",
    "ces_range_by_flag.value_counts(dropna=False, normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "72bcce80-ed57-4f1c-ad01-c88b0b9046a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>DRAFT CES 4.0\n",
       "Percentile Range</th>\n",
       "      <th>1-5% (lowest scores)</th>\n",
       "      <th>10-15%</th>\n",
       "      <th>15-20%</th>\n",
       "      <th>20-25%</th>\n",
       "      <th>25-30%</th>\n",
       "      <th>30-35%</th>\n",
       "      <th>35-40%</th>\n",
       "      <th>40-45%</th>\n",
       "      <th>45-50%</th>\n",
       "      <th>5-10%</th>\n",
       "      <th>50-55%</th>\n",
       "      <th>55-60%</th>\n",
       "      <th>60-65%</th>\n",
       "      <th>65-70%</th>\n",
       "      <th>70-75%</th>\n",
       "      <th>75-80%</th>\n",
       "      <th>80-85%</th>\n",
       "      <th>85-90%</th>\n",
       "      <th>90-95%</th>\n",
       "      <th>95-100% (highest scores)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pct_cal_score_0.9</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>0.052395</td>\n",
       "      <td>0.052527</td>\n",
       "      <td>0.052395</td>\n",
       "      <td>0.052527</td>\n",
       "      <td>0.052395</td>\n",
       "      <td>0.052395</td>\n",
       "      <td>0.052395</td>\n",
       "      <td>0.052527</td>\n",
       "      <td>0.052263</td>\n",
       "      <td>0.052395</td>\n",
       "      <td>0.052263</td>\n",
       "      <td>0.051998</td>\n",
       "      <td>0.052130</td>\n",
       "      <td>0.051336</td>\n",
       "      <td>0.050410</td>\n",
       "      <td>0.04869</td>\n",
       "      <td>0.047499</td>\n",
       "      <td>0.041413</td>\n",
       "      <td>0.037841</td>\n",
       "      <td>0.028844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.002096</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.002096</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.004193</td>\n",
       "      <td>0.006289</td>\n",
       "      <td>0.006289</td>\n",
       "      <td>0.016771</td>\n",
       "      <td>0.033543</td>\n",
       "      <td>0.05870</td>\n",
       "      <td>0.079665</td>\n",
       "      <td>0.174004</td>\n",
       "      <td>0.232704</td>\n",
       "      <td>0.375262</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "DRAFT CES 4.0\n",
       "Percentile Range  1-5% (lowest scores)    10-15%    15-20%  \\\n",
       "pct_cal_score_0.9                                                          \n",
       "False                                       0.052395  0.052527  0.052395   \n",
       "True                                             NaN       NaN       NaN   \n",
       "\n",
       "DRAFT CES 4.0\n",
       "Percentile Range    20-25%    25-30%    30-35%    35-40%  \\\n",
       "pct_cal_score_0.9                                                        \n",
       "False                           0.052527  0.052395  0.052395  0.052395   \n",
       "True                                 NaN       NaN  0.002096       NaN   \n",
       "\n",
       "DRAFT CES 4.0\n",
       "Percentile Range    40-45%    45-50%     5-10%    50-55%  \\\n",
       "pct_cal_score_0.9                                                        \n",
       "False                           0.052527  0.052263  0.052395  0.052263   \n",
       "True                                 NaN  0.002096       NaN  0.004193   \n",
       "\n",
       "DRAFT CES 4.0\n",
       "Percentile Range    55-60%    60-65%    65-70%    70-75%  \\\n",
       "pct_cal_score_0.9                                                        \n",
       "False                           0.051998  0.052130  0.051336  0.050410   \n",
       "True                            0.006289  0.006289  0.016771  0.033543   \n",
       "\n",
       "DRAFT CES 4.0\n",
       "Percentile Range   75-80%    80-85%    85-90%    90-95%  \\\n",
       "pct_cal_score_0.9                                                       \n",
       "False                           0.04869  0.047499  0.041413  0.037841   \n",
       "True                            0.05870  0.079665  0.174004  0.232704   \n",
       "\n",
       "DRAFT CES 4.0\n",
       "Percentile Range  95-100% (highest scores)  \n",
       "pct_cal_score_0.9                                         \n",
       "False                                           0.028844  \n",
       "True                                            0.375262  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Easier to read as a wide table: one row per value of pct_cal_score_0.9 and one\n",
    "# column per CES percentile range. Each cell is the share of that group's rows that\n",
    "# fall in the range (dropna=False keeps missing ranges in the denominator).\n",
    "(\n",
    "    ces_merged.groupby(\"pct_cal_score_0.9\")[\"DRAFT CES 4.0\nPercentile Range\"]\n",
    "    .value_counts(normalize=True, dropna=False)\n",
    "    .rename(\"share\")\n",
    "    .reset_index()\n",
    "    .pivot_table(\n",
    "        index=\"pct_cal_score_0.9\",\n",
    "        columns=\"DRAFT CES 4.0\nPercentile Range\",\n",
    "        values=\"share\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "86c66114-b090-4f96-877d-60dad7e18252",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>any_flag</th>\n",
       "      <th>False</th>\n",
       "      <th>True</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DRAFT CES 4.0\n",
       "Percentile Range</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1-5% (lowest scores)</th>\n",
       "      <td>0.079518</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10-15%</th>\n",
       "      <td>0.077108</td>\n",
       "      <td>0.004255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15-20%</th>\n",
       "      <td>0.075904</td>\n",
       "      <td>0.005892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20-25%</th>\n",
       "      <td>0.072691</td>\n",
       "      <td>0.011457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25-30%</th>\n",
       "      <td>0.071285</td>\n",
       "      <td>0.013421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30-35%</th>\n",
       "      <td>0.068876</td>\n",
       "      <td>0.017676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35-40%</th>\n",
       "      <td>0.066466</td>\n",
       "      <td>0.021277</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40-45%</th>\n",
       "      <td>0.061847</td>\n",
       "      <td>0.029133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45-50%</th>\n",
       "      <td>0.056225</td>\n",
       "      <td>0.037971</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5-10%</th>\n",
       "      <td>0.078715</td>\n",
       "      <td>0.001309</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50-55%</th>\n",
       "      <td>0.050000</td>\n",
       "      <td>0.048445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55-60%</th>\n",
       "      <td>0.046787</td>\n",
       "      <td>0.053355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60-65%</th>\n",
       "      <td>0.046185</td>\n",
       "      <td>0.054664</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65-70%</th>\n",
       "      <td>0.039960</td>\n",
       "      <td>0.064484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70-75%</th>\n",
       "      <td>0.026908</td>\n",
       "      <td>0.086088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75-80%</th>\n",
       "      <td>0.023695</td>\n",
       "      <td>0.090998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80-85%</th>\n",
       "      <td>0.018273</td>\n",
       "      <td>0.100164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85-90%</th>\n",
       "      <td>0.013253</td>\n",
       "      <td>0.108020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90-95%</th>\n",
       "      <td>0.007831</td>\n",
       "      <td>0.117185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95-100% (highest scores)</th>\n",
       "      <td>0.003213</td>\n",
       "      <td>0.124714</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "any_flag                            False      True\n",
       "DRAFT CES 4.0\\nPercentile Range                    \n",
       "1-5% (lowest scores)             0.079518       NaN\n",
       "10-15%                           0.077108  0.004255\n",
       "15-20%                           0.075904  0.005892\n",
       "20-25%                           0.072691  0.011457\n",
       "25-30%                           0.071285  0.013421\n",
       "30-35%                           0.068876  0.017676\n",
       "35-40%                           0.066466  0.021277\n",
       "40-45%                           0.061847  0.029133\n",
       "45-50%                           0.056225  0.037971\n",
       "5-10%                            0.078715  0.001309\n",
       "50-55%                           0.050000  0.048445\n",
       "55-60%                           0.046787  0.053355\n",
       "60-65%                           0.046185  0.054664\n",
       "65-70%                           0.039960  0.064484\n",
       "70-75%                           0.026908  0.086088\n",
       "75-80%                           0.023695  0.090998\n",
       "80-85%                           0.018273  0.100164\n",
       "85-90%                           0.013253  0.108020\n",
       "90-95%                           0.007831  0.117185\n",
       "95-100% (highest scores)         0.003213  0.124714"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Any flag here is score N, or our version of cumulative burden. Same share breakdown\n",
    "# as the table above, but transposed: each CES percentile range is a row and each\n",
    "# any_flag value is a column.\n",
    "\n",
    "(\n",
    "    ces_merged.groupby(\"any_flag\")[\"DRAFT CES 4.0\nPercentile Range\"]\n",
    "    .value_counts(normalize=True, dropna=False)\n",
    "    .rename(\"share\")\n",
    "    .reset_index()\n",
    "    .pivot_table(\n",
    "        index=\"any_flag\",\n",
    "        columns=\"DRAFT CES 4.0\nPercentile Range\",\n",
    "        values=\"share\",\n",
    "    )\n",
    "    .T\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}