j40-cejst-2/data/data-pipeline/ipython/scoring_comparison.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93c7b73b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import functools\n",
    "import IPython\n",
    "import itertools\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import pathlib\n",
    "import pypandoc\n",
    "import requests\n",
    "import string\n",
    "import sys\n",
    "import typing\n",
    "import us\n",
    "import zipfile\n",
    "\n",
    "from datetime import datetime\n",
    "from tqdm.notebook import tqdm_notebook\n",
    "\n",
    "module_path = os.path.abspath(os.path.join(\"..\"))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "from utils import remove_all_from_dir, get_excel_column_name\n",
    "from etl.sources.census.etl_utils import get_state_information\n",
    "\n",
    "\n",
    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
    "tqdm_notebook.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "881424fd",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
    "pd.options.display.float_format = \"{:.2f}\".format\n",
    "\n",
    "# Set some global parameters\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
    "\n",
    "# Make the dirs if they don't exist\n",
    "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
    "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
    "\n",
    "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
    "# and introducing the risk of misspelling the field name.)\n",
    "\n",
    "GEOID_FIELD_NAME = \"GEOID10\"\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "\n",
    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
    "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
    "\n",
    "# Define some suffixes\n",
    "POPULATION_SUFFIX = \" (priority population)\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5f3eaa5",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (87,88,90) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GEOID10</th>\n",
       "      <th>Housing burden (percent)</th>\n",
       "      <th>Total population</th>\n",
       "      <th>Air toxics cancer risk</th>\n",
       "      <th>Respiratory hazard index</th>\n",
       "      <th>Diesel particulate matter</th>\n",
       "      <th>Particulate matter (PM2.5)</th>\n",
       "      <th>Ozone</th>\n",
       "      <th>Traffic proximity and volume</th>\n",
       "      <th>Proximity to RMP sites</th>\n",
       "      <th>...</th>\n",
       "      <th>Score D (top 25th percentile)</th>\n",
       "      <th>Score E (percentile)</th>\n",
       "      <th>Score E (top 25th percentile)</th>\n",
       "      <th>GEOID</th>\n",
       "      <th>State Abbreviation</th>\n",
       "      <th>County Name</th>\n",
       "      <th>State Code</th>\n",
       "      <th>State Name</th>\n",
       "      <th>GEOID10_TRACT</th>\n",
       "      <th>GEOID10_STATE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>010010201001</td>\n",
       "      <td>0.15</td>\n",
       "      <td>692</td>\n",
       "      <td>49.38</td>\n",
       "      <td>0.79</td>\n",
       "      <td>0.28</td>\n",
       "      <td>10.00</td>\n",
       "      <td>40.12</td>\n",
       "      <td>91.02</td>\n",
       "      <td>0.09</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>0.35</td>\n",
       "      <td>False</td>\n",
       "      <td>1001</td>\n",
       "      <td>AL</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>1.00</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>01001020100</td>\n",
       "      <td>01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>010010201002</td>\n",
       "      <td>0.15</td>\n",
       "      <td>1153</td>\n",
       "      <td>49.38</td>\n",
       "      <td>0.79</td>\n",
       "      <td>0.28</td>\n",
       "      <td>10.00</td>\n",
       "      <td>40.12</td>\n",
       "      <td>2.62</td>\n",
       "      <td>0.07</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>0.11</td>\n",
       "      <td>False</td>\n",
       "      <td>1001</td>\n",
       "      <td>AL</td>\n",
       "      <td>Baldwin County</td>\n",
       "      <td>2.00</td>\n",
       "      <td>Alaska</td>\n",
       "      <td>01001020100</td>\n",
       "      <td>01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>010010202001</td>\n",
       "      <td>0.25</td>\n",
       "      <td>1020</td>\n",
       "      <td>50.32</td>\n",
       "      <td>0.81</td>\n",
       "      <td>0.30</td>\n",
       "      <td>10.07</td>\n",
       "      <td>40.22</td>\n",
       "      <td>4.68</td>\n",
       "      <td>0.08</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>0.51</td>\n",
       "      <td>False</td>\n",
       "      <td>1001</td>\n",
       "      <td>AL</td>\n",
       "      <td>Barbour County</td>\n",
       "      <td>4.00</td>\n",
       "      <td>Arizona</td>\n",
       "      <td>01001020200</td>\n",
       "      <td>01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>010010202002</td>\n",
       "      <td>0.25</td>\n",
       "      <td>1152</td>\n",
       "      <td>50.32</td>\n",
       "      <td>0.81</td>\n",
       "      <td>0.30</td>\n",
       "      <td>10.07</td>\n",
       "      <td>40.22</td>\n",
       "      <td>218.65</td>\n",
       "      <td>0.09</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>0.59</td>\n",
       "      <td>False</td>\n",
       "      <td>1001</td>\n",
       "      <td>AL</td>\n",
       "      <td>Bibb County</td>\n",
       "      <td>5.00</td>\n",
       "      <td>Arkansas</td>\n",
       "      <td>01001020200</td>\n",
       "      <td>01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>010010203001</td>\n",
       "      <td>0.21</td>\n",
       "      <td>2555</td>\n",
       "      <td>50.77</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.36</td>\n",
       "      <td>10.12</td>\n",
       "      <td>40.31</td>\n",
       "      <td>69.64</td>\n",
       "      <td>0.08</td>\n",
       "      <td>...</td>\n",
       "      <td>False</td>\n",
       "      <td>0.47</td>\n",
       "      <td>False</td>\n",
       "      <td>1001</td>\n",
       "      <td>AL</td>\n",
       "      <td>Blount County</td>\n",
       "      <td>6.00</td>\n",
       "      <td>California</td>\n",
       "      <td>01001020300</td>\n",
       "      <td>01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 93 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        GEOID10  Housing burden (percent)  Total population  \\\n",
       "0  010010201001                      0.15               692   \n",
       "1  010010201002                      0.15              1153   \n",
       "2  010010202001                      0.25              1020   \n",
       "3  010010202002                      0.25              1152   \n",
       "4  010010203001                      0.21              2555   \n",
       "\n",
       "   Air toxics cancer risk  Respiratory hazard index  \\\n",
       "0                   49.38                      0.79   \n",
       "1                   49.38                      0.79   \n",
       "2                   50.32                      0.81   \n",
       "3                   50.32                      0.81   \n",
       "4                   50.77                      0.82   \n",
       "\n",
       "   Diesel particulate matter  Particulate matter (PM2.5)  Ozone  \\\n",
       "0                       0.28                       10.00  40.12   \n",
       "1                       0.28                       10.00  40.12   \n",
       "2                       0.30                       10.07  40.22   \n",
       "3                       0.30                       10.07  40.22   \n",
       "4                       0.36                       10.12  40.31   \n",
       "\n",
       "   Traffic proximity and volume  Proximity to RMP sites  ...  \\\n",
       "0                         91.02                    0.09  ...   \n",
       "1                          2.62                    0.07  ...   \n",
       "2                          4.68                    0.08  ...   \n",
       "3                        218.65                    0.09  ...   \n",
       "4                         69.64                    0.08  ...   \n",
       "\n",
       "   Score D (top 25th percentile)  Score E (percentile)  \\\n",
       "0                          False                  0.35   \n",
       "1                          False                  0.11   \n",
       "2                          False                  0.51   \n",
       "3                          False                  0.59   \n",
       "4                          False                  0.47   \n",
       "\n",
       "   Score E (top 25th percentile)  GEOID  State Abbreviation     County Name  \\\n",
       "0                          False   1001                  AL  Autauga County   \n",
       "1                          False   1001                  AL  Baldwin County   \n",
       "2                          False   1001                  AL  Barbour County   \n",
       "3                          False   1001                  AL     Bibb County   \n",
       "4                          False   1001                  AL   Blount County   \n",
       "\n",
       "   State Code  State Name  GEOID10_TRACT  GEOID10_STATE  \n",
       "0        1.00     Alabama    01001020100             01  \n",
       "1        2.00      Alaska    01001020100             01  \n",
       "2        4.00     Arizona    01001020200             01  \n",
       "3        5.00    Arkansas    01001020200             01  \n",
       "4        6.00  California    01001020300             01  \n",
       "\n",
       "[5 rows x 93 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load CEJST score data\n",
    "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
    "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
    "\n",
    "# Create the CBG's Census Tract ID by dropping the last number from the FIPS CODE of the CBG.\n",
    "# The CBG ID is the last one character.\n",
    "# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n",
    "cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (\n",
    "    cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[:-1]\n",
    ")\n",
    "\n",
    "cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n",
    "    cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[0:2]\n",
    ")\n",
    "\n",
    "cejst_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2448dcd",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GEOID10_TRACT</th>\n",
       "      <th>Total Population</th>\n",
       "      <th>California County</th>\n",
       "      <th>ZIP</th>\n",
       "      <th>Nearby City \\r\\n(to help approximate location only)</th>\n",
       "      <th>Longitude</th>\n",
       "      <th>Latitude</th>\n",
       "      <th>calenviroscreen_score</th>\n",
       "      <th>calenviroscreen_percentile</th>\n",
       "      <th>DRAFT CES 4.0\\r\\nPercentile Range</th>\n",
       "      <th>...</th>\n",
       "      <th>Poverty</th>\n",
       "      <th>Poverty Pctl</th>\n",
       "      <th>Unemployment</th>\n",
       "      <th>Unemployment Pctl</th>\n",
       "      <th>Housing Burden</th>\n",
       "      <th>Housing Burden Pctl</th>\n",
       "      <th>Pop. Char.</th>\n",
       "      <th>Pop. Char. Score</th>\n",
       "      <th>Pop. Char. Pctl</th>\n",
       "      <th>calenviroscreen_priority_community</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>06019001100</td>\n",
       "      <td>2760</td>\n",
       "      <td>Fresno</td>\n",
       "      <td>93706</td>\n",
       "      <td>Fresno</td>\n",
       "      <td>-119.78</td>\n",
       "      <td>36.71</td>\n",
       "      <td>94.61</td>\n",
       "      <td>100.00</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>76.60</td>\n",
       "      <td>98.43</td>\n",
       "      <td>16.20</td>\n",
       "      <td>97.15</td>\n",
       "      <td>30.70</td>\n",
       "      <td>90.61</td>\n",
       "      <td>93.73</td>\n",
       "      <td>9.72</td>\n",
       "      <td>99.87</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>06077000700</td>\n",
       "      <td>4177</td>\n",
       "      <td>San Joaquin</td>\n",
       "      <td>95206</td>\n",
       "      <td>Stockton</td>\n",
       "      <td>-121.29</td>\n",
       "      <td>37.94</td>\n",
       "      <td>90.83</td>\n",
       "      <td>99.99</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>70.60</td>\n",
       "      <td>96.43</td>\n",
       "      <td>18.50</td>\n",
       "      <td>98.45</td>\n",
       "      <td>35.20</td>\n",
       "      <td>95.61</td>\n",
       "      <td>93.40</td>\n",
       "      <td>9.68</td>\n",
       "      <td>99.84</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>06077000100</td>\n",
       "      <td>4055</td>\n",
       "      <td>San Joaquin</td>\n",
       "      <td>95202</td>\n",
       "      <td>Stockton</td>\n",
       "      <td>-121.29</td>\n",
       "      <td>37.95</td>\n",
       "      <td>85.75</td>\n",
       "      <td>99.97</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>81.80</td>\n",
       "      <td>99.50</td>\n",
       "      <td>17.90</td>\n",
       "      <td>98.17</td>\n",
       "      <td>36.40</td>\n",
       "      <td>96.51</td>\n",
       "      <td>95.71</td>\n",
       "      <td>9.92</td>\n",
       "      <td>99.97</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06071001600</td>\n",
       "      <td>5527</td>\n",
       "      <td>San Bernardino</td>\n",
       "      <td>91761</td>\n",
       "      <td>Ontario</td>\n",
       "      <td>-117.62</td>\n",
       "      <td>34.06</td>\n",
       "      <td>83.56</td>\n",
       "      <td>99.96</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>67.10</td>\n",
       "      <td>94.82</td>\n",
       "      <td>6.70</td>\n",
       "      <td>57.20</td>\n",
       "      <td>32.10</td>\n",
       "      <td>92.65</td>\n",
       "      <td>80.59</td>\n",
       "      <td>8.36</td>\n",
       "      <td>93.06</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>06037204920</td>\n",
       "      <td>2639</td>\n",
       "      <td>Los Angeles</td>\n",
       "      <td>90023</td>\n",
       "      <td>Los Angeles</td>\n",
       "      <td>-118.20</td>\n",
       "      <td>34.02</td>\n",
       "      <td>82.90</td>\n",
       "      <td>99.95</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>64.90</td>\n",
       "      <td>93.51</td>\n",
       "      <td>5.60</td>\n",
       "      <td>43.81</td>\n",
       "      <td>25.00</td>\n",
       "      <td>77.95</td>\n",
       "      <td>83.95</td>\n",
       "      <td>8.70</td>\n",
       "      <td>95.78</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 59 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  GEOID10_TRACT   Total Population California County    ZIP  \\\n",
       "0   06019001100               2760           Fresno   93706   \n",
       "1   06077000700               4177       San Joaquin  95206   \n",
       "2   06077000100               4055       San Joaquin  95202   \n",
       "3   06071001600               5527    San Bernardino  91761   \n",
       "4   06037204920               2639       Los Angeles  90023   \n",
       "\n",
       "  Nearby City \\r\\n(to help approximate location only)  Longitude  Latitude  \\\n",
       "0                                             Fresno     -119.78     36.71   \n",
       "1                                           Stockton     -121.29     37.94   \n",
       "2                                           Stockton     -121.29     37.95   \n",
       "3                                            Ontario     -117.62     34.06   \n",
       "4                                        Los Angeles     -118.20     34.02   \n",
       "\n",
       "   calenviroscreen_score  calenviroscreen_percentile  \\\n",
       "0                  94.61                      100.00   \n",
       "1                  90.83                       99.99   \n",
       "2                  85.75                       99.97   \n",
       "3                  83.56                       99.96   \n",
       "4                  82.90                       99.95   \n",
       "\n",
       "  DRAFT CES 4.0\\r\\nPercentile Range  ...  Poverty  Poverty Pctl  Unemployment  \\\n",
       "0          95-100% (highest scores)  ...    76.60         98.43         16.20   \n",
       "1          95-100% (highest scores)  ...    70.60         96.43         18.50   \n",
       "2          95-100% (highest scores)  ...    81.80         99.50         17.90   \n",
       "3          95-100% (highest scores)  ...    67.10         94.82          6.70   \n",
       "4          95-100% (highest scores)  ...    64.90         93.51          5.60   \n",
       "\n",
       "   Unemployment Pctl  Housing Burden  Housing Burden Pctl  Pop. Char.   \\\n",
       "0              97.15           30.70                90.61        93.73   \n",
       "1              98.45           35.20                95.61        93.40   \n",
       "2              98.17           36.40                96.51        95.71   \n",
       "3              57.20           32.10                92.65        80.59   \n",
       "4              43.81           25.00                77.95        83.95   \n",
       "\n",
       "   Pop. Char. Score  Pop. Char. Pctl  calenviroscreen_priority_community  \n",
       "0              9.72            99.87                                True  \n",
       "1              9.68            99.84                                True  \n",
       "2              9.92            99.97                                True  \n",
       "3              8.36            93.06                                True  \n",
       "4              8.70            95.78                                True  \n",
       "\n",
       "[5 rows x 59 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load CalEnviroScreen 4.0\n",
    "CALENVIROSCREEN_SCORE_FIELD = \"calenviroscreen_score\"\n",
    "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
    "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
    "\n",
    "calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
    "calenviroscreen_df = pd.read_csv(\n",
    "    calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
    ")\n",
    "\n",
    "# Convert priority community field to a bool.\n",
    "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
    "    CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
    "].astype(bool)\n",
    "\n",
    "calenviroscreen_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f612a86a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FID</th>\n",
       "      <th>GEOID10_TRACT</th>\n",
       "      <th>STATE</th>\n",
       "      <th>STUSAB</th>\n",
       "      <th>STATE_NAME</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>COUNTY_NAME</th>\n",
       "      <th>CNTY_FIPS</th>\n",
       "      <th>TRACT</th>\n",
       "      <th>RCAP_90</th>\n",
       "      <th>RCAP_00</th>\n",
       "      <th>RCAP_10</th>\n",
       "      <th>hud_recap_priority_community</th>\n",
       "      <th>SHAPE_Length</th>\n",
       "      <th>SHAPE_Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>29993</td>\n",
       "      <td>01001020100</td>\n",
       "      <td>1</td>\n",
       "      <td>AL</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001</td>\n",
       "      <td>20100</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30627</td>\n",
       "      <td>01001020200</td>\n",
       "      <td>1</td>\n",
       "      <td>AL</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001</td>\n",
       "      <td>20200</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>29992</td>\n",
       "      <td>01001020300</td>\n",
       "      <td>1</td>\n",
       "      <td>AL</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001</td>\n",
       "      <td>20300</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>30079</td>\n",
       "      <td>01001020400</td>\n",
       "      <td>1</td>\n",
       "      <td>AL</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001</td>\n",
       "      <td>20400</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.12</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>30078</td>\n",
       "      <td>01001020500</td>\n",
       "      <td>1</td>\n",
       "      <td>AL</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001</td>\n",
       "      <td>20500</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     FID GEOID10_TRACT  STATE STUSAB STATE_NAME  COUNTY COUNTY_NAME  \\\n",
       "0  29993   01001020100      1     AL    Alabama       1     Autauga   \n",
       "1  30627   01001020200      1     AL    Alabama       1     Autauga   \n",
       "2  29992   01001020300      1     AL    Alabama       1     Autauga   \n",
       "3  30079   01001020400      1     AL    Alabama       1     Autauga   \n",
       "4  30078   01001020500      1     AL    Alabama       1     Autauga   \n",
       "\n",
       "   CNTY_FIPS  TRACT  RCAP_90  RCAP_00  RCAP_10  hud_recap_priority_community  \\\n",
       "0       1001  20100     0.00     0.00     0.00                         False   \n",
       "1       1001  20200     0.00     0.00     0.00                         False   \n",
       "2       1001  20300     0.00     0.00     0.00                         False   \n",
       "3       1001  20400     0.00     0.00     0.00                         False   \n",
       "4       1001  20500     0.00     0.00     0.00                         False   \n",
       "\n",
       "   SHAPE_Length  SHAPE_Area  \n",
       "0          0.15        0.00  \n",
       "1          0.09        0.00  \n",
       "2          0.10        0.00  \n",
       "3          0.12        0.00  \n",
       "4          0.16        0.00  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load HUD data\n",
    "hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n",
    "hud_recap_df = pd.read_csv(\n",
    "    hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
    ")\n",
    "\n",
    "hud_recap_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ee6e6ee",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GEOID10_TRACT</th>\n",
       "      <th>Total Population</th>\n",
       "      <th>California County</th>\n",
       "      <th>ZIP</th>\n",
       "      <th>Nearby City \\r\\n(to help approximate location only)</th>\n",
       "      <th>Longitude</th>\n",
       "      <th>Latitude</th>\n",
       "      <th>calenviroscreen_score</th>\n",
       "      <th>calenviroscreen_percentile</th>\n",
       "      <th>DRAFT CES 4.0\\r\\nPercentile Range</th>\n",
       "      <th>...</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>COUNTY_NAME</th>\n",
       "      <th>CNTY_FIPS</th>\n",
       "      <th>TRACT</th>\n",
       "      <th>RCAP_90</th>\n",
       "      <th>RCAP_00</th>\n",
       "      <th>RCAP_10</th>\n",
       "      <th>hud_recap_priority_community</th>\n",
       "      <th>SHAPE_Length</th>\n",
       "      <th>SHAPE_Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>06019001100</td>\n",
       "      <td>2760.00</td>\n",
       "      <td>Fresno</td>\n",
       "      <td>93706.00</td>\n",
       "      <td>Fresno</td>\n",
       "      <td>-119.78</td>\n",
       "      <td>36.71</td>\n",
       "      <td>94.61</td>\n",
       "      <td>100.00</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>19</td>\n",
       "      <td>Fresno</td>\n",
       "      <td>6019</td>\n",
       "      <td>1100</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>True</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>06077000700</td>\n",
       "      <td>4177.00</td>\n",
       "      <td>San Joaquin</td>\n",
       "      <td>95206.00</td>\n",
       "      <td>Stockton</td>\n",
       "      <td>-121.29</td>\n",
       "      <td>37.94</td>\n",
       "      <td>90.83</td>\n",
       "      <td>99.99</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>77</td>\n",
       "      <td>San Joaquin</td>\n",
       "      <td>6077</td>\n",
       "      <td>700</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>True</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>06077000100</td>\n",
       "      <td>4055.00</td>\n",
       "      <td>San Joaquin</td>\n",
       "      <td>95202.00</td>\n",
       "      <td>Stockton</td>\n",
       "      <td>-121.29</td>\n",
       "      <td>37.95</td>\n",
       "      <td>85.75</td>\n",
       "      <td>99.97</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>77</td>\n",
       "      <td>San Joaquin</td>\n",
       "      <td>6077</td>\n",
       "      <td>100</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.00</td>\n",
       "      <td>True</td>\n",
       "      <td>0.06</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06071001600</td>\n",
       "      <td>5527.00</td>\n",
       "      <td>San Bernardino</td>\n",
       "      <td>91761.00</td>\n",
       "      <td>Ontario</td>\n",
       "      <td>-117.62</td>\n",
       "      <td>34.06</td>\n",
       "      <td>83.56</td>\n",
       "      <td>99.96</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>71</td>\n",
       "      <td>San Bernardino</td>\n",
       "      <td>6071</td>\n",
       "      <td>1600</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>True</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>06037204920</td>\n",
       "      <td>2639.00</td>\n",
       "      <td>Los Angeles</td>\n",
       "      <td>90023.00</td>\n",
       "      <td>Los Angeles</td>\n",
       "      <td>-118.20</td>\n",
       "      <td>34.02</td>\n",
       "      <td>82.90</td>\n",
       "      <td>99.95</td>\n",
       "      <td>95-100% (highest scores)</td>\n",
       "      <td>...</td>\n",
       "      <td>37</td>\n",
       "      <td>Los Angeles</td>\n",
       "      <td>6037</td>\n",
       "      <td>204920</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 73 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  GEOID10_TRACT   Total Population California County      ZIP  \\\n",
       "0   06019001100            2760.00           Fresno  93706.00   \n",
       "1   06077000700            4177.00       San Joaquin 95206.00   \n",
       "2   06077000100            4055.00       San Joaquin 95202.00   \n",
       "3   06071001600            5527.00    San Bernardino 91761.00   \n",
       "4   06037204920            2639.00       Los Angeles 90023.00   \n",
       "\n",
       "  Nearby City \\r\\n(to help approximate location only)  Longitude  Latitude  \\\n",
       "0                                             Fresno     -119.78     36.71   \n",
       "1                                           Stockton     -121.29     37.94   \n",
       "2                                           Stockton     -121.29     37.95   \n",
       "3                                            Ontario     -117.62     34.06   \n",
       "4                                        Los Angeles     -118.20     34.02   \n",
       "\n",
       "   calenviroscreen_score  calenviroscreen_percentile  \\\n",
       "0                  94.61                      100.00   \n",
       "1                  90.83                       99.99   \n",
       "2                  85.75                       99.97   \n",
       "3                  83.56                       99.96   \n",
       "4                  82.90                       99.95   \n",
       "\n",
       "  DRAFT CES 4.0\\r\\nPercentile Range  ...  COUNTY     COUNTY_NAME  CNTY_FIPS  \\\n",
       "0          95-100% (highest scores)  ...      19          Fresno       6019   \n",
       "1          95-100% (highest scores)  ...      77     San Joaquin       6077   \n",
       "2          95-100% (highest scores)  ...      77     San Joaquin       6077   \n",
       "3          95-100% (highest scores)  ...      71  San Bernardino       6071   \n",
       "4          95-100% (highest scores)  ...      37     Los Angeles       6037   \n",
       "\n",
       "    TRACT  RCAP_90  RCAP_00  RCAP_10  hud_recap_priority_community  \\\n",
       "0    1100     0.00     1.00     1.00                          True   \n",
       "1     700     0.00     0.00     0.00                          True   \n",
       "2     100     1.00     1.00     1.00                          True   \n",
       "3    1600     0.00     0.00     0.00                          True   \n",
       "4  204920     0.00     0.00     0.00                         False   \n",
       "\n",
       "   SHAPE_Length  SHAPE_Area  \n",
       "0          0.09        0.00  \n",
       "1          0.07        0.00  \n",
       "2          0.06        0.00  \n",
       "3          0.25        0.00  \n",
       "4          0.04        0.00  \n",
       "\n",
       "[5 rows x 73 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Outer-join every tract-level dataframe on the tract ID column, so that tracts\n",
    "# present in only some of the sources are still kept.\n",
    "census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n",
    "\n",
    "census_tract_df = functools.reduce(\n",
    "    lambda joined, incoming: pd.merge(\n",
    "        joined, incoming, on=GEOID_TRACT_FIELD_NAME, how=\"outer\"\n",
    "    ),\n",
    "    census_tract_dfs,\n",
    ")\n",
    "\n",
    "# Sanity check: every tract ID string must be exactly 11 characters long.\n",
    "tract_values = census_tract_df[GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
    "if any(id_length != 11 for id_length in tract_values):\n",
    "    print(tract_values)\n",
    "    raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
    "\n",
    "# Sanity check: fail if the join yields more than 74134 rows\n",
    "# (presumably the expected number of census tracts -- TODO confirm).\n",
    "if census_tract_df.shape[0] > 74134:\n",
    "    raise ValueError(\"Too many rows in the join.\")\n",
    "\n",
    "census_tract_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70d76fbc",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GEOID10</th>\n",
       "      <th>Housing burden (percent)</th>\n",
       "      <th>Total population</th>\n",
       "      <th>Air toxics cancer risk</th>\n",
       "      <th>Respiratory hazard index</th>\n",
       "      <th>Diesel particulate matter</th>\n",
       "      <th>Particulate matter (PM2.5)</th>\n",
       "      <th>Ozone_x</th>\n",
       "      <th>Traffic proximity and volume</th>\n",
       "      <th>Proximity to RMP sites</th>\n",
       "      <th>...</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>COUNTY_NAME</th>\n",
       "      <th>CNTY_FIPS</th>\n",
       "      <th>TRACT</th>\n",
       "      <th>RCAP_90</th>\n",
       "      <th>RCAP_00</th>\n",
       "      <th>RCAP_10</th>\n",
       "      <th>hud_recap_priority_community</th>\n",
       "      <th>SHAPE_Length</th>\n",
       "      <th>SHAPE_Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>010010201001</td>\n",
       "      <td>0.15</td>\n",
       "      <td>692</td>\n",
       "      <td>49.38</td>\n",
       "      <td>0.79</td>\n",
       "      <td>0.28</td>\n",
       "      <td>10.00</td>\n",
       "      <td>40.12</td>\n",
       "      <td>91.02</td>\n",
       "      <td>0.09</td>\n",
       "      <td>...</td>\n",
       "      <td>1.00</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001.00</td>\n",
       "      <td>20100.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>010010201002</td>\n",
       "      <td>0.15</td>\n",
       "      <td>1153</td>\n",
       "      <td>49.38</td>\n",
       "      <td>0.79</td>\n",
       "      <td>0.28</td>\n",
       "      <td>10.00</td>\n",
       "      <td>40.12</td>\n",
       "      <td>2.62</td>\n",
       "      <td>0.07</td>\n",
       "      <td>...</td>\n",
       "      <td>1.00</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001.00</td>\n",
       "      <td>20100.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>010010202001</td>\n",
       "      <td>0.25</td>\n",
       "      <td>1020</td>\n",
       "      <td>50.32</td>\n",
       "      <td>0.81</td>\n",
       "      <td>0.30</td>\n",
       "      <td>10.07</td>\n",
       "      <td>40.22</td>\n",
       "      <td>4.68</td>\n",
       "      <td>0.08</td>\n",
       "      <td>...</td>\n",
       "      <td>1.00</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001.00</td>\n",
       "      <td>20200.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>010010202002</td>\n",
       "      <td>0.25</td>\n",
       "      <td>1152</td>\n",
       "      <td>50.32</td>\n",
       "      <td>0.81</td>\n",
       "      <td>0.30</td>\n",
       "      <td>10.07</td>\n",
       "      <td>40.22</td>\n",
       "      <td>218.65</td>\n",
       "      <td>0.09</td>\n",
       "      <td>...</td>\n",
       "      <td>1.00</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001.00</td>\n",
       "      <td>20200.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>010010203001</td>\n",
       "      <td>0.21</td>\n",
       "      <td>2555</td>\n",
       "      <td>50.77</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.36</td>\n",
       "      <td>10.12</td>\n",
       "      <td>40.31</td>\n",
       "      <td>69.64</td>\n",
       "      <td>0.08</td>\n",
       "      <td>...</td>\n",
       "      <td>1.00</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>1001.00</td>\n",
       "      <td>20300.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>False</td>\n",
       "      <td>0.10</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 165 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        GEOID10  Housing burden (percent)  Total population  \\\n",
       "0  010010201001                      0.15               692   \n",
       "1  010010201002                      0.15              1153   \n",
       "2  010010202001                      0.25              1020   \n",
       "3  010010202002                      0.25              1152   \n",
       "4  010010203001                      0.21              2555   \n",
       "\n",
       "   Air toxics cancer risk  Respiratory hazard index  \\\n",
       "0                   49.38                      0.79   \n",
       "1                   49.38                      0.79   \n",
       "2                   50.32                      0.81   \n",
       "3                   50.32                      0.81   \n",
       "4                   50.77                      0.82   \n",
       "\n",
       "   Diesel particulate matter  Particulate matter (PM2.5)  Ozone_x  \\\n",
       "0                       0.28                       10.00    40.12   \n",
       "1                       0.28                       10.00    40.12   \n",
       "2                       0.30                       10.07    40.22   \n",
       "3                       0.30                       10.07    40.22   \n",
       "4                       0.36                       10.12    40.31   \n",
       "\n",
       "   Traffic proximity and volume  Proximity to RMP sites  ...  COUNTY  \\\n",
       "0                         91.02                    0.09  ...    1.00   \n",
       "1                          2.62                    0.07  ...    1.00   \n",
       "2                          4.68                    0.08  ...    1.00   \n",
       "3                        218.65                    0.09  ...    1.00   \n",
       "4                         69.64                    0.08  ...    1.00   \n",
       "\n",
       "   COUNTY_NAME  CNTY_FIPS    TRACT  RCAP_90  RCAP_00  RCAP_10  \\\n",
       "0      Autauga    1001.00 20100.00     0.00     0.00     0.00   \n",
       "1      Autauga    1001.00 20100.00     0.00     0.00     0.00   \n",
       "2      Autauga    1001.00 20200.00     0.00     0.00     0.00   \n",
       "3      Autauga    1001.00 20200.00     0.00     0.00     0.00   \n",
       "4      Autauga    1001.00 20300.00     0.00     0.00     0.00   \n",
       "\n",
       "   hud_recap_priority_community  SHAPE_Length  SHAPE_Area  \n",
       "0                         False          0.15        0.00  \n",
       "1                         False          0.15        0.00  \n",
       "2                         False          0.09        0.00  \n",
       "3                         False          0.09        0.00  \n",
       "4                         False          0.10        0.00  \n",
       "\n",
       "[5 rows x 165 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach the tract-level comparison data to the CEJST data.\n",
    "# Note: the join key is the census *tract*, so several CBG entries can match the same\n",
    "# census tract row from CES, which repeats that CES data across multiple rows.\n",
    "merged_df = pd.merge(\n",
    "    left=cejst_df,\n",
    "    right=census_tract_df,\n",
    "    on=GEOID_TRACT_FIELD_NAME,\n",
    "    how=\"left\",\n",
    ")\n",
    "\n",
    "# Sanity check: fail if the join yields more than 220333 rows\n",
    "# (presumably the expected number of rows in `cejst_df` -- TODO confirm).\n",
    "if merged_df.shape[0] > 220333:\n",
    "    raise ValueError(\"Too many rows in the join.\")\n",
    "\n",
    "merged_df.head()\n",
    "\n",
    "# Optional export of the merged data; uncomment to write it to disk.\n",
    "# merged_df.to_csv(\n",
    "#     path_or_buf=COMPARISON_OUTPUTS_DIR / \"merged.csv\", na_rep=\"\", index=False\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "558a2cc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lightweight record describing one index (scoring method) to compare.\n",
    "Index = collections.namedtuple(\n",
    "    \"Index\",\n",
    "    [\n",
    "        \"method_name\",\n",
    "        \"priority_communities_field\",\n",
    "        # Note: this field is only used by indices defined at the census tract level.\n",
    "        \"other_census_tract_fields_to_keep\",\n",
    "    ],\n",
    ")\n",
    "\n",
    "# Indices used for CEJST scoring (`census_block_group_indices`), built from\n",
    "# (method name, priority communities field) pairs. None of them keep extra tract fields,\n",
    "# so each one gets its own empty list.\n",
    "census_block_group_indices = [\n",
    "    Index(\n",
    "        method_name=method_name,\n",
    "        priority_communities_field=priority_communities_field,\n",
    "        other_census_tract_fields_to_keep=[],\n",
    "    )\n",
    "    for method_name, priority_communities_field in [\n",
    "        (\"Score A\", \"Score A (top 25th percentile)\"),\n",
    "        (\"Score B\", \"Score B (top 25th percentile)\"),\n",
    "        (\"Score C\", \"Score C (top 25th percentile)\"),\n",
    "        (\"Score D (25th percentile)\", \"Score D (top 25th percentile)\"),\n",
    "        (\"Score D (30th percentile)\", \"Score D (top 30th percentile)\"),\n",
    "        (\"Score D (35th percentile)\", \"Score D (top 35th percentile)\"),\n",
    "        (\"Score D (40th percentile)\", \"Score D (top 40th percentile)\"),\n",
    "        (\n",
    "            \"Poverty\",\n",
    "            \"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
    "        ),\n",
    "    ]\n",
    "]\n",
    "\n",
    "# Indices used for comparison (`census_tract_indices`).\n",
    "census_tract_indices = [\n",
    "    Index(\n",
    "        \"CalEnviroScreen 4.0\",\n",
    "        \"calenviroscreen_priority_community\",\n",
    "        [CALENVIROSCREEN_SCORE_FIELD, CALENVIROSCREEN_PERCENTILE_FIELD],\n",
    "    ),\n",
    "    Index(\"HUD RECAP\", \"hud_recap_priority_community\", []),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b71b2ab",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converting calenviroscreen_priority_community to boolean.\n",
      "Converting hud_recap_priority_community to boolean.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "223dcb75c0384fd5b93bc2ac3bc07656",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/52 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>GEOID10_STATE</th>\n",
       "      <th>State name</th>\n",
       "      <th>Total CBGs in state</th>\n",
       "      <th>Total population in state</th>\n",
       "      <th>Score A (top 25th percentile) (priority population)</th>\n",
       "      <th>Score A (top 25th percentile) (total CBGs)</th>\n",
       "      <th>Score A (top 25th percentile) (percent CBGs)</th>\n",
       "      <th>Score A (top 25th percentile) (percent population)</th>\n",
       "      <th>Score B (top 25th percentile) (priority population)</th>\n",
       "      <th>Score B (top 25th percentile) (total CBGs)</th>\n",
       "      <th>...</th>\n",
       "      <th>Score E (top 25th percentile) (percent CBGs)</th>\n",
       "      <th>Score E (top 25th percentile) (percent population)</th>\n",
       "      <th>calenviroscreen_priority_community (priority population)</th>\n",
       "      <th>calenviroscreen_priority_community (total CBGs)</th>\n",
       "      <th>calenviroscreen_priority_community (percent CBGs)</th>\n",
       "      <th>calenviroscreen_priority_community (percent population)</th>\n",
       "      <th>hud_recap_priority_community (priority population)</th>\n",
       "      <th>hud_recap_priority_community (total CBGs)</th>\n",
       "      <th>hud_recap_priority_community (percent CBGs)</th>\n",
       "      <th>hud_recap_priority_community (percent population)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GEOID10_STATE</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>01</th>\n",
       "      <th>0</th>\n",
       "      <td>01</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>3438</td>\n",
       "      <td>4850771</td>\n",
       "      <td>1547345</td>\n",
       "      <td>1326</td>\n",
       "      <td>0.39</td>\n",
       "      <td>0.32</td>\n",
       "      <td>1556417</td>\n",
       "      <td>1323</td>\n",
       "      <td>...</td>\n",
       "      <td>0.23</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>235117</td>\n",
       "      <td>258</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>02</th>\n",
       "      <th>0</th>\n",
       "      <td>02</td>\n",
       "      <td>Alaska</td>\n",
       "      <td>534</td>\n",
       "      <td>738565</td>\n",
       "      <td>63868</td>\n",
       "      <td>57</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.09</td>\n",
       "      <td>63868</td>\n",
       "      <td>57</td>\n",
       "      <td>...</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>6536</td>\n",
       "      <td>8</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>04</th>\n",
       "      <th>0</th>\n",
       "      <td>04</td>\n",
       "      <td>Arizona</td>\n",
       "      <td>4178</td>\n",
       "      <td>6809946</td>\n",
       "      <td>1956052</td>\n",
       "      <td>1230</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.29</td>\n",
       "      <td>1960856</td>\n",
       "      <td>1231</td>\n",
       "      <td>...</td>\n",
       "      <td>0.30</td>\n",
       "      <td>0.30</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>560353</td>\n",
       "      <td>378</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>05</th>\n",
       "      <th>0</th>\n",
       "      <td>05</td>\n",
       "      <td>Arkansas</td>\n",
       "      <td>2147</td>\n",
       "      <td>2977944</td>\n",
       "      <td>960799</td>\n",
       "      <td>817</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.32</td>\n",
       "      <td>975780</td>\n",
       "      <td>826</td>\n",
       "      <td>...</td>\n",
       "      <td>0.20</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>101200</td>\n",
       "      <td>106</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>06</th>\n",
       "      <th>0</th>\n",
       "      <td>06</td>\n",
       "      <td>California</td>\n",
       "      <td>23212</td>\n",
       "      <td>38982847</td>\n",
       "      <td>12610810</td>\n",
       "      <td>7102</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.32</td>\n",
       "      <td>12556846</td>\n",
       "      <td>7065</td>\n",
       "      <td>...</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.42</td>\n",
       "      <td>9610287</td>\n",
       "      <td>5690</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.25</td>\n",
       "      <td>1748765</td>\n",
       "      <td>1013</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 32 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                GEOID10_STATE  State name  Total CBGs in state  \\\n",
       "GEOID10_STATE                                                    \n",
       "01            0            01     Alabama                 3438   \n",
       "02            0            02      Alaska                  534   \n",
       "04            0            04     Arizona                 4178   \n",
       "05            0            05    Arkansas                 2147   \n",
       "06            0            06  California                23212   \n",
       "\n",
       "                 Total population in state  \\\n",
       "GEOID10_STATE                                \n",
       "01            0                    4850771   \n",
       "02            0                     738565   \n",
       "04            0                    6809946   \n",
       "05            0                    2977944   \n",
       "06            0                   38982847   \n",
       "\n",
       "                 Score A (top 25th percentile) (priority population)  \\\n",
       "GEOID10_STATE                                                          \n",
       "01            0                                            1547345     \n",
       "02            0                                              63868     \n",
       "04            0                                            1956052     \n",
       "05            0                                             960799     \n",
       "06            0                                           12610810     \n",
       "\n",
       "                 Score A (top 25th percentile) (total CBGs)  \\\n",
       "GEOID10_STATE                                                 \n",
       "01            0                                        1326   \n",
       "02            0                                          57   \n",
       "04            0                                        1230   \n",
       "05            0                                         817   \n",
       "06            0                                        7102   \n",
       "\n",
       "                 Score A (top 25th percentile) (percent CBGs)  \\\n",
       "GEOID10_STATE                                                   \n",
       "01            0                                          0.39   \n",
       "02            0                                          0.11   \n",
       "04            0                                          0.29   \n",
       "05            0                                          0.38   \n",
       "06            0                                          0.31   \n",
       "\n",
       "                 Score A (top 25th percentile) (percent population)  \\\n",
       "GEOID10_STATE                                                         \n",
       "01            0                                               0.32    \n",
       "02            0                                               0.09    \n",
       "04            0                                               0.29    \n",
       "05            0                                               0.32    \n",
       "06            0                                               0.32    \n",
       "\n",
       "                 Score B (top 25th percentile) (priority population)  \\\n",
       "GEOID10_STATE                                                          \n",
       "01            0                                            1556417     \n",
       "02            0                                              63868     \n",
       "04            0                                            1960856     \n",
       "05            0                                             975780     \n",
       "06            0                                           12556846     \n",
       "\n",
       "                 Score B (top 25th percentile) (total CBGs)  ...  \\\n",
       "GEOID10_STATE                                                ...   \n",
       "01            0                                        1323  ...   \n",
       "02            0                                          57  ...   \n",
       "04            0                                        1231  ...   \n",
       "05            0                                         826  ...   \n",
       "06            0                                        7065  ...   \n",
       "\n",
       "                 Score E (top 25th percentile) (percent CBGs)  \\\n",
       "GEOID10_STATE                                                   \n",
       "01            0                                          0.23   \n",
       "02            0                                          0.14   \n",
       "04            0                                          0.30   \n",
       "05            0                                          0.20   \n",
       "06            0                                          0.40   \n",
       "\n",
       "                 Score E (top 25th percentile) (percent population)  \\\n",
       "GEOID10_STATE                                                         \n",
       "01            0                                               0.19    \n",
       "02            0                                               0.12    \n",
       "04            0                                               0.30    \n",
       "05            0                                               0.18    \n",
       "06            0                                               0.42    \n",
       "\n",
       "                 calenviroscreen_priority_community (priority population)  \\\n",
       "GEOID10_STATE                                                               \n",
       "01            0                                                  0          \n",
       "02            0                                                  0          \n",
       "04            0                                                  0          \n",
       "05            0                                                  0          \n",
       "06            0                                            9610287          \n",
       "\n",
       "                 calenviroscreen_priority_community (total CBGs)  \\\n",
       "GEOID10_STATE                                                      \n",
       "01            0                                                0   \n",
       "02            0                                                0   \n",
       "04            0                                                0   \n",
       "05            0                                                0   \n",
       "06            0                                             5690   \n",
       "\n",
       "                 calenviroscreen_priority_community (percent CBGs)  \\\n",
       "GEOID10_STATE                                                        \n",
       "01            0                                               0.00   \n",
       "02            0                                               0.00   \n",
       "04            0                                               0.00   \n",
       "05            0                                               0.00   \n",
       "06            0                                               0.25   \n",
       "\n",
       "                 calenviroscreen_priority_community (percent population)  \\\n",
       "GEOID10_STATE                                                              \n",
       "01            0                                               0.00         \n",
       "02            0                                               0.00         \n",
       "04            0                                               0.00         \n",
       "05            0                                               0.00         \n",
       "06            0                                               0.25         \n",
       "\n",
       "                 hud_recap_priority_community (priority population)  \\\n",
       "GEOID10_STATE                                                         \n",
       "01            0                                             235117    \n",
       "02            0                                               6536    \n",
       "04            0                                             560353    \n",
       "05            0                                             101200    \n",
       "06            0                                            1748765    \n",
       "\n",
       "                 hud_recap_priority_community (total CBGs)  \\\n",
       "GEOID10_STATE                                                \n",
       "01            0                                        258   \n",
       "02            0                                          8   \n",
       "04            0                                        378   \n",
       "05            0                                        106   \n",
       "06            0                                       1013   \n",
       "\n",
       "                 hud_recap_priority_community (percent CBGs)  \\\n",
       "GEOID10_STATE                                                  \n",
       "01            0                                         0.08   \n",
       "02            0                                         0.01   \n",
       "04            0                                         0.09   \n",
       "05            0                                         0.05   \n",
       "06            0                                         0.04   \n",
       "\n",
       "                 hud_recap_priority_community (percent population)  \n",
       "GEOID10_STATE                                                       \n",
       "01            0                                               0.05  \n",
       "02            0                                               0.01  \n",
       "04            0                                               0.08  \n",
       "05            0                                               0.03  \n",
       "06            0                                               0.04  \n",
       "\n",
       "[5 rows x 32 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_state_distributions(\n",
    "    df: pd.DataFrame, priority_communities_fields: typing.List[str]\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"For each boolean field of priority communities, calculate distribution across states and territories.\"\"\"\n",
    "\n",
    "    # Ensure each field is boolean.\n",
    "    for priority_communities_field in priority_communities_fields:\n",
    "        if df[priority_communities_field].dtype != bool:\n",
    "            print(f\"Converting {priority_communities_field} to boolean.\")\n",
    "\n",
    "        # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n",
    "        df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
    "            df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
    "        )\n",
    "\n",
    "    def calculate_state_comparison(\n",
    "        frame: pd.DataFrame, geography_field: str\n",
    "    ) -> pd.DataFrame:\n",
    "        \"\"\"\n",
    "        This method will be applied to a `group_by` object. Inherits some parameters from outer scope.\n",
    "\n",
    "        \"\"\"\n",
    "        summary_dict = {}\n",
    "        summary_dict[COUNTRY_FIELD_NAME] = frame[COUNTRY_FIELD_NAME].unique()[0]\n",
    "\n",
    "        if geography_field == COUNTRY_FIELD_NAME:\n",
    "            summary_dict[GEOID_STATE_FIELD_NAME] = \"00\"\n",
    "            summary_dict[\"Geography name\"] = \"(Entire USA)\"\n",
    "\n",
    "        if geography_field == GEOID_STATE_FIELD_NAME:\n",
    "            state_id = frame[GEOID_STATE_FIELD_NAME].unique()[0]\n",
    "            summary_dict[GEOID_STATE_FIELD_NAME] = state_id\n",
    "            summary_dict[\"Geography name\"] = us.states.lookup(state_id).name\n",
    "\n",
    "            # Also add region information\n",
    "            region_id = frame[\"region\"].unique()[0]\n",
    "            summary_dict[\"region\"] = region_id\n",
    "\n",
    "        if geography_field == \"region\":\n",
    "            region_id = frame[\"region\"].unique()[0]\n",
    "            summary_dict[\"region\"] = region_id\n",
    "            summary_dict[\"Geography name\"] = region_id\n",
    "\n",
    "        if geography_field == \"division\":\n",
    "            division_id = frame[\"division\"].unique()[0]\n",
    "            summary_dict[\"division\"] = division_id\n",
    "            summary_dict[\"Geography name\"] = division_id\n",
    "\n",
    "        summary_dict[\"Total CBGs in geography\"] = len(frame)\n",
    "        summary_dict[\"Total population in geography\"] = frame[\n",
    "            CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
    "        ].sum()\n",
    "\n",
    "        for priority_communities_field in priority_communities_fields:\n",
    "            summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
    "                f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
    "            ].sum()\n",
    "\n",
    "            summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n",
    "                f\"{priority_communities_field}\"\n",
    "            ].sum()\n",
    "\n",
    "            # Calculate some combinations of other variables.\n",
    "            summary_dict[f\"{priority_communities_field} (percent CBGs)\"] = (\n",
    "                summary_dict[f\"{priority_communities_field} (total CBGs)\"]\n",
    "                / summary_dict[\"Total CBGs in geography\"]\n",
    "            )\n",
    "\n",
    "            summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
    "                summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
    "                / summary_dict[\"Total population in geography\"]\n",
    "            )\n",
    "\n",
    "        df = pd.DataFrame(summary_dict, index=[0])\n",
    "\n",
    "        return df\n",
    "\n",
    "    # Add a field for country so we can do aggregations across the entire country.\n",
    "    df[COUNTRY_FIELD_NAME] = \"USA\"\n",
    "\n",
    "    # First, run the comparison by the whole country\n",
    "    usa_grouped_df = df.groupby(COUNTRY_FIELD_NAME)\n",
    "\n",
    "    # Run the comparison function on the groups.\n",
    "    usa_distribution_df = usa_grouped_df.progress_apply(\n",
    "        lambda frame: calculate_state_comparison(\n",
    "            frame, geography_field=COUNTRY_FIELD_NAME\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Next, run the comparison by state\n",
    "    state_grouped_df = df.groupby(GEOID_STATE_FIELD_NAME)\n",
    "\n",
    "    # Run the comparison function on the groups.\n",
    "    state_distribution_df = state_grouped_df.progress_apply(\n",
    "        lambda frame: calculate_state_comparison(\n",
    "            frame, geography_field=GEOID_STATE_FIELD_NAME\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Next, run the comparison by region\n",
    "    region_grouped_df = df.groupby(\"region\")\n",
    "\n",
    "    # Run the comparison function on the groups.\n",
    "    region_distribution_df = region_grouped_df.progress_apply(\n",
    "        lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
    "    )\n",
    "\n",
    "    # Next, run the comparison by division\n",
    "    division_grouped_df = df.groupby(\"division\")\n",
    "\n",
    "    # Run the comparison function on the groups.\n",
    "    division_distribution_df = division_grouped_df.progress_apply(\n",
    "        lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
    "    )\n",
    "\n",
    "    # Combine the three\n",
    "    combined_df = pd.concat(\n",
    "        [\n",
    "            usa_distribution_df,\n",
    "            state_distribution_df,\n",
    "            region_distribution_df,\n",
    "            division_distribution_df,\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    return combined_df\n",
    "\n",
    "\n",
    "def write_state_distribution_excel(\n",
    "    state_distribution_df: pd.DataFrame, file_path: pathlib.PosixPath\n",
    ") -> None:\n",
    "    \"\"\"Write the dataframe to excel with special formatting.\"\"\"\n",
    "    # Create a Pandas Excel writer using XlsxWriter as the engine.\n",
    "    writer = pd.ExcelWriter(file_path, engine=\"xlsxwriter\")\n",
    "\n",
    "    # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n",
    "    # index column at the left of the output dataframe.\n",
    "    state_distribution_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n",
    "\n",
    "    # Get the xlsxwriter workbook and worksheet objects.\n",
    "    workbook = writer.book\n",
    "    worksheet = writer.sheets[\"Sheet1\"]\n",
    "    worksheet.autofilter(\n",
    "        0, 0, state_distribution_df.shape[0], state_distribution_df.shape[1]\n",
    "    )\n",
    "\n",
    "    # Set a width parameter for all columns\n",
    "    # Note: this is parameterized because every call to `set_column` requires setting the width.\n",
    "    column_width = 15\n",
    "\n",
    "    for column in state_distribution_df.columns:\n",
    "        # Turn the column index into excel ranges (e.g., column #95 is \"CR\" and the range may be \"CR2:CR53\").\n",
    "        column_index = state_distribution_df.columns.get_loc(column)\n",
    "        column_character = get_excel_column_name(column_index)\n",
    "\n",
    "        # Set all columns to larger width\n",
    "        worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
    "\n",
    "        # Special formatting for all percent columns\n",
    "        # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
    "        if \"percent \" in column or \"(percent)\" in column:\n",
    "            # Make these columns percentages.\n",
    "            percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
    "            worksheet.set_column(\n",
    "                f\"{column_character}:{column_character}\",\n",
    "                column_width,\n",
    "                percentage_format,\n",
    "            )\n",
    "\n",
    "        # Special formatting for columns that capture the percent of population considered priority.\n",
    "        if \"(percent population)\" in column:\n",
    "            column_ranges = (\n",
    "                f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
    "            )\n",
    "\n",
    "            # Add green to red conditional formatting.\n",
    "            worksheet.conditional_format(\n",
    "                column_ranges,\n",
    "                # Min: green, max: red.\n",
    "                {\n",
    "                    \"type\": \"2_color_scale\",\n",
    "                    \"min_color\": \"#00FF7F\",\n",
    "                    \"max_color\": \"#C82538\",\n",
    "                },\n",
    "            )\n",
    "\n",
    "    header_format = workbook.add_format(\n",
    "        {\"bold\": True, \"text_wrap\": True, \"valign\": \"bottom\"}\n",
    "    )\n",
    "\n",
    "    # Overwrite both the value and the format of each header cell\n",
    "    # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n",
    "    # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n",
    "    for col_num, value in enumerate(state_distribution_df.columns.values):\n",
    "        worksheet.write(0, col_num, value, header_format)\n",
    "\n",
    "    writer.save()\n",
    "\n",
    "\n",
    "fields_to_analyze = [\n",
    "    index.priority_communities_field\n",
    "    for index in census_block_group_indices + census_tract_indices\n",
    "]\n",
    "\n",
    "state_fips_codes = get_state_information(DATA_DIR)\n",
    "\n",
    "merged_with_state_information_df = merged_df.merge(\n",
    "    right=state_fips_codes, left_on=GEOID_STATE_FIELD_NAME, right_on=\"fips\"\n",
    ")\n",
    "\n",
    "state_distribution_df = get_state_distributions(\n",
    "    df=merged_with_state_information_df,\n",
    "    priority_communities_fields=fields_to_analyze,\n",
    ")\n",
    "\n",
    "state_distribution_df.to_csv(\n",
    "    path_or_buf=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.csv\",\n",
    "    na_rep=\"\",\n",
    "    index=False,\n",
    ")\n",
    "\n",
    "write_state_distribution_excel(\n",
    "    state_distribution_df=state_distribution_df,\n",
    "    file_path=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.xlsx\",\n",
    ")\n",
    "\n",
    "state_distribution_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9b9a329",
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_markdown_and_docx_content(\n",
    "    markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n",
    ") -> pathlib.PosixPath:\n",
    "    \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n",
    "    # Set the file paths for both files.\n",
    "    markdown_file_path = file_dir / f\"{file_name_without_extension}.md\"\n",
    "    docx_file_path = file_dir / f\"{file_name_without_extension}.docx\"\n",
    "\n",
    "    # Write the markdown content to file.\n",
    "    with open(markdown_file_path, \"w\") as text_file:\n",
    "        text_file.write(markdown_content)\n",
    "\n",
    "    # Convert markdown file to Word doc.\n",
    "    pypandoc.convert_file(\n",
    "        source_file=str(markdown_file_path),\n",
    "        to=\"docx\",\n",
    "        outputfile=str(docx_file_path),\n",
    "        extra_args=[],\n",
    "    )\n",
    "\n",
    "    return docx_file_path\n",
    "\n",
    "\n",
    "def get_markdown_comparing_census_block_group_indices(\n",
    "    census_block_group_indices=typing.List[Index],\n",
    "    df=pd.DataFrame,\n",
    "    state_field=GEOID_STATE_FIELD_NAME,\n",
    ") -> str:\n",
    "    \"\"\"Generate a Markdown string of analysis of multiple CBG indices.\"\"\"\n",
    "    count_field_name = \"Count of CBGs\"\n",
    "\n",
    "    # List of all states/territories in their FIPS codes:\n",
    "    state_ids = sorted(df[state_field].unique())\n",
    "    state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
    "\n",
    "    # Create markdown content for comparisons.\n",
    "    markdown_content = f\"\"\"\n",
    "# Comparing multiple indices at the census block group level\n",
    "    \n",
    "(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n",
    "\n",
    "This report compares the following indices: {\", \".join([index.method_name for index in census_block_group_indices])}.\n",
    "\n",
    "This report analyzes the following US states and territories: {state_names}.\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "    for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n",
    "        # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n",
    "        count_df = (\n",
    "            df.groupby(\n",
    "                [index1.priority_communities_field, index2.priority_communities_field]\n",
    "            )[GEOID_FIELD_NAME]\n",
    "            .count()\n",
    "            .reset_index(name=count_field_name)\n",
    "        )\n",
    "\n",
    "        total_cbgs = count_df[count_field_name].sum()\n",
    "\n",
    "        # Returns a series\n",
    "        true_true_cbgs_series = count_df.loc[\n",
    "            count_df[index1.priority_communities_field]\n",
    "            & count_df[index2.priority_communities_field],\n",
    "            count_field_name,\n",
    "        ]\n",
    "        true_false_cbgs_series = count_df.loc[\n",
    "            count_df[index1.priority_communities_field]\n",
    "            & ~count_df[index2.priority_communities_field],\n",
    "            count_field_name,\n",
    "        ]\n",
    "        false_true_cbgs_series = count_df.loc[\n",
    "            ~count_df[index1.priority_communities_field]\n",
    "            & count_df[index2.priority_communities_field],\n",
    "            count_field_name,\n",
    "        ]\n",
    "        false_false_cbgs_series = count_df.loc[\n",
    "            ~count_df[index1.priority_communities_field]\n",
    "            & ~count_df[index2.priority_communities_field],\n",
    "            count_field_name,\n",
    "        ]\n",
    "\n",
    "        # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n",
    "        true_true_cbgs = (\n",
    "            true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n",
    "        )\n",
    "        true_false_cbgs = (\n",
    "            true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n",
    "        )\n",
    "        false_true_cbgs = (\n",
    "            false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n",
    "        )\n",
    "        false_false_cbgs = (\n",
    "            false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n",
    "        )\n",
    "\n",
    "        markdown_content += (\n",
    "            \"*** \\n\\n\"\n",
    "            \"There are \"\n",
    "            f\"{true_true_cbgs} ({true_true_cbgs / total_cbgs:.0%}) \"\n",
    "            f\"census block groups that are both {index1.method_name} priority communities and {index2.method_name} priority communities.\\n\\n\"\n",
    "            \"There are \"\n",
    "            f\"{true_false_cbgs} ({true_false_cbgs / total_cbgs:.0%}) \"\n",
    "            f\"census block groups that are {index1.method_name} priority communities but not {index2.method_name} priority communities.\\n\\n\"\n",
    "            \"There are \"\n",
    "            f\"{false_true_cbgs} ({false_true_cbgs / total_cbgs:.0%}) \"\n",
    "            f\"census block groups that are not {index1.method_name} priority communities but are {index2.method_name} priority communities.\\n\\n\"\n",
    "            \"There are \"\n",
    "            f\"{false_false_cbgs} ({false_false_cbgs / total_cbgs:.0%}) \"\n",
    "            f\"census block groups that are neither {index1.method_name} priority communities nor {index2.method_name} priority communities.\\n\\n\"\n",
    "            \"\\n\\n\"\n",
    "        )\n",
    "\n",
    "    return markdown_content\n",
    "\n",
    "\n",
    "def get_comparison_census_block_group_indices(\n",
    "    census_block_group_indices=typing.List[Index],\n",
    "    df=pd.DataFrame,\n",
    "    state_field=GEOID_STATE_FIELD_NAME,\n",
    ") -> pathlib.PosixPath:\n",
    "    markdown_content = get_markdown_comparing_census_block_group_indices(\n",
    "        census_block_group_indices=census_block_group_indices,\n",
    "        df=merged_with_state_information_df,\n",
    "    )\n",
    "\n",
    "    comparison_docx_file_path = write_markdown_and_docx_content(\n",
    "        markdown_content=markdown_content,\n",
    "        file_dir=COMPARISON_OUTPUTS_DIR,\n",
    "        file_name_without_extension=f\"Comparison report - All CBG indices\",\n",
    "    )\n",
    "\n",
    "    return comparison_docx_file_path\n",
    "\n",
    "\n",
    "# Compare multiple scores at the CBG level\n",
    "get_comparison_census_block_group_indices(\n",
    "    census_block_group_indices=census_block_group_indices,\n",
    "    df=merged_with_state_information_df,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25a10027",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This cell defines a variety of comparison functions. It does not run them.\n",
    "\n",
    "# Define a namedtuple for column names, which need to be shared between multiple parts of this comparison pipeline.\n",
    "# Named tuples are useful here because they provide guarantees that for each instance, all properties are defined and\n",
    "# can be accessed as properties (rather than as strings).\n",
    "\n",
    "# Note: if you'd like to add a field used throughout the comparison process, add it in three places.\n",
    "# For an example `new_field`,\n",
    "# 1. in this namedtuple, add the field as a string in `field_names` (e.g., `field_names=[..., \"new_field\"])`)\n",
    "# 2. in the function `get_comparison_field_names`, define how the field name should be created from input data\n",
    "#     (e.g., `...new_field=f\"New field compares {method_a_name} to {method_b_name}\")\n",
    "# 3. In the function `get_comparison_markdown_content`, add some reporting on the new field to the markdown content.\n",
    "#     (e.g., `The statistics indicate that {calculation_based_on_new_field} percent of census tracts are different between scores.`)\n",
    "ComparisonFieldNames = collections.namedtuple(\n",
    "    typename=\"ComparisonFieldNames\",\n",
    "    field_names=[\n",
    "        \"any_tract_has_at_least_one_method_a_cbg\",\n",
    "        \"method_b_tract_has_at_least_one_method_a_cbg\",\n",
    "        \"method_b_tract_has_100_percent_method_a_cbg\",\n",
    "        \"method_b_non_priority_tract_has_at_least_one_method_a_cbg\",\n",
    "        \"method_b_non_priority_tract_has_100_percent_method_a_cbg\",\n",
    "    ],\n",
    ")\n",
    "\n",
    "\n",
    "def get_comparison_field_names(\n",
    "    method_a_name: str,\n",
    "    method_b_name: str,\n",
    ") -> ComparisonFieldNames:\n",
    "    comparison_field_names = ComparisonFieldNames(\n",
    "        any_tract_has_at_least_one_method_a_cbg=(\n",
    "            f\"Any tract has at least one {method_a_name} Priority CBG?\"\n",
    "        ),\n",
    "        method_b_tract_has_at_least_one_method_a_cbg=(\n",
    "            f\"{method_b_name} priority tract has at least one {method_a_name} CBG?\"\n",
    "        ),\n",
    "        method_b_tract_has_100_percent_method_a_cbg=(\n",
    "            f\"{method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n",
    "        ),\n",
    "        method_b_non_priority_tract_has_at_least_one_method_a_cbg=(\n",
    "            f\"Non-priority {method_b_name} tract has at least one {method_a_name} priority CBG?\"\n",
    "        ),\n",
    "        method_b_non_priority_tract_has_100_percent_method_a_cbg=(\n",
    "            f\"Non-priority {method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n",
    "        ),\n",
    "    )\n",
    "    return comparison_field_names\n",
    "\n",
    "\n",
    "def get_df_with_only_shared_states(\n",
    "    df: pd.DataFrame,\n",
    "    field_a: str,\n",
    "    field_b: str,\n",
    "    state_field=GEOID_STATE_FIELD_NAME,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Useful for looking at shared geographies across two fields.\n",
    "\n",
    "    For a data frame and two fields, return a data frame only for states where there are non-null\n",
    "    values for both fields in that state (or territory).\n",
    "\n",
    "    This is useful, for example, when running a comparison of CalEnviroScreen (only in California) against\n",
    "    a draft score that's national, and returning only the data for California for the entire data frame.\n",
    "    \"\"\"\n",
    "    field_a_states = df.loc[df[field_a].notnull(), state_field].unique()\n",
    "    field_b_states = df.loc[df[field_b].notnull(), state_field].unique()\n",
    "\n",
    "    shared_states = list(set(field_a_states) & set(field_b_states))\n",
    "\n",
    "    df = df.loc[df[state_field].isin(shared_states), :]\n",
    "\n",
    "    return df\n",
    "\n",
    "\n",
    "def get_comparison_df(\n",
    "    df: pd.DataFrame,\n",
    "    method_a_priority_census_block_groups_field: str,\n",
    "    method_b_priority_census_tracts_field: str,\n",
    "    other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n",
    "    comparison_field_names: ComparisonFieldNames,\n",
    "    output_dir: pathlib.PosixPath,\n",
    ") -> None:\n",
    "    \"\"\"Produces a comparison report for any two given boolean columns representing priority fields.\n",
    "\n",
    "    Args:\n",
    "      df: a pandas dataframe including the data for this comparison.\n",
    "      method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n",
    "        community field that defines communities at the level of census block groups (CBGs).\n",
    "      method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n",
    "        community field that defines communities at the level of census tracts.\n",
    "      other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n",
    "\n",
    "    Returns:\n",
    "      df: a pandas dataframe with one row with the results of this comparison\n",
    "    \"\"\"\n",
    "\n",
    "    def calculate_comparison(frame: pd.DataFrame) -> pd.DataFrame:\n",
    "        \"\"\"\n",
    "        This method will be applied to a `group_by` object.\n",
    "\n",
    "        Note: It inherits from outer scope `method_a_priority_census_block_groups_field`, `method_b_priority_census_tracts_field`,\n",
    "        and `other_census_tract_fields_to_keep`.\n",
    "        \"\"\"\n",
    "        # Keep all the tract values at the Census Tract Level\n",
    "        for field in other_census_tract_fields_to_keep:\n",
    "            if len(frame[field].unique()) != 1:\n",
    "                raise ValueError(\n",
    "                    f\"There are different values per CBG for field {field}.\"\n",
    "                    \"`other_census_tract_fields_to_keep` can only be used for fields at the census tract level.\"\n",
    "                )\n",
    "\n",
    "        df = frame.loc[\n",
    "            frame.index[0],\n",
    "            [\n",
    "                GEOID_TRACT_FIELD_NAME,\n",
    "                method_b_priority_census_tracts_field,\n",
    "            ]\n",
    "            + other_census_tract_fields_to_keep,\n",
    "        ]\n",
    "\n",
    "        # Convenience constant for whether the tract is or is not a method B priority community.\n",
    "        is_a_method_b_priority_tract = frame.loc[\n",
    "            frame.index[0], [method_b_priority_census_tracts_field]\n",
    "        ][0]\n",
    "\n",
    "        # Recall that NaN values are not falsy, so we need to check if `is_a_method_b_priority_tract` is True.\n",
    "        is_a_method_b_priority_tract = is_a_method_b_priority_tract is True\n",
    "\n",
    "        # Calculate whether the tract (whether or not it is a comparison priority tract) includes CBGs that are priority\n",
    "        # according to the current CBG score.\n",
    "        df[comparison_field_names.any_tract_has_at_least_one_method_a_cbg] = (\n",
    "            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
    "        )\n",
    "\n",
    "        # Calculate comparison\n",
    "        # A comparison priority tract has at least one CBG that is a priority CBG.\n",
    "        df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n",
    "            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
    "            if is_a_method_b_priority_tract\n",
    "            else None\n",
    "        )\n",
    "\n",
    "        # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n",
    "        df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n",
    "            frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
    "            if is_a_method_b_priority_tract\n",
    "            else None\n",
    "        )\n",
    "\n",
    "        # Calculate the inverse\n",
    "        # A tract that is _not_ a comparison priority has at least one CBG priority CBG.\n",
    "        df[\n",
    "            comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg\n",
    "        ] = (\n",
    "            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
    "            if not is_a_method_b_priority_tract\n",
    "            else None\n",
    "        )\n",
    "\n",
    "        # A tract that is _not_ a comparison priority has all of its contained CBGs as CBG priority CBGs.\n",
    "        df[\n",
    "            comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n",
    "        ] = (\n",
    "            frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
    "            if not is_a_method_b_priority_tract\n",
    "            else None\n",
    "        )\n",
    "\n",
    "        # For all remaining fields, calculate the average\n",
    "        # TODO: refactor to vectorize to make faster.\n",
    "        for field in [\n",
    "            \"Poverty (Less than 200% of federal poverty line)\",\n",
    "            \"Percent of households in linguistic isolation\",\n",
    "            \"Percent individuals age 25 or over with less than high school degree\",\n",
    "            \"Unemployed civilians (percent)\",\n",
    "        ]:\n",
    "            df[f\"{field} (average of CBGs)\"] = frame.loc[:, field].mean()\n",
    "\n",
    "        return df\n",
    "\n",
    "    # Group all data by the census tract.\n",
    "    grouped_df = df.groupby(GEOID_TRACT_FIELD_NAME)\n",
    "\n",
    "    # Run the comparison function on the groups.\n",
    "    comparison_df = grouped_df.progress_apply(calculate_comparison)\n",
    "\n",
    "    return comparison_df\n",
    "\n",
    "\n",
    "def get_comparison_markdown_content(\n",
    "    original_df: pd.DataFrame,\n",
    "    comparison_df: pd.DataFrame,\n",
    "    comparison_field_names: ComparisonFieldNames,\n",
    "    method_a_name: str,\n",
    "    method_b_name: str,\n",
    "    method_a_priority_census_block_groups_field: str,\n",
    "    method_b_priority_census_tracts_field: str,\n",
    "    state_field: str = GEOID_STATE_FIELD_NAME,\n",
    ") -> str:\n",
    "    \"\"\"Build the Markdown report comparing Method A (CBG level) to Method B (tract level).\n",
    "\n",
    "    Args:\n",
    "      original_df: the data frame the comparison was computed from. Its length is used as the\n",
    "        total number of census block groups (CBGs).\n",
    "      comparison_df: the comparison data frame. Its length is used as the total number of\n",
    "        census tracts.\n",
    "      comparison_field_names: the names of the comparison columns in `comparison_df`.\n",
    "      method_a_name: display name of Method A, interpolated into the report.\n",
    "      method_b_name: display name of Method B, interpolated into the report.\n",
    "      method_a_priority_census_block_groups_field: column of `original_df` that is summed to\n",
    "        count Method A priority CBGs.\n",
    "      method_b_priority_census_tracts_field: column of `comparison_df` that is summed to count\n",
    "        Method B priority tracts.\n",
    "      state_field: column of `original_df` whose unique values are passed to `us.states.lookup`.\n",
    "\n",
    "    Returns:\n",
    "      The report as a Markdown string.\n",
    "    \"\"\"\n",
    "\n",
    "    def format_percent(numerator, denominator) -> str:\n",
    "        \"\"\"Format `numerator / denominator` as a whole-number percentage.\n",
    "\n",
    "        Returns \"N/A\" when the denominator is zero (e.g., no Method B priority tracts in the\n",
    "        analyzed area) instead of raising or printing `nan%`.\n",
    "        \"\"\"\n",
    "        if denominator == 0:\n",
    "            return \"N/A\"\n",
    "        return f\"{numerator / denominator:.0%}\"\n",
    "\n",
    "    # Prepare some constants for use in the following Markdown content.\n",
    "    total_cbgs = len(original_df)\n",
    "\n",
    "    # Names of all states/territories present in the data, ordered by their sorted identifiers.\n",
    "    state_ids = sorted(original_df[state_field].unique())\n",
    "    state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
    "\n",
    "    # Note: using squeeze throughout to reduce result of `sum()` to a scalar.\n",
    "    # TODO: investigate why sums are sometimes series and sometimes scalar.\n",
    "    method_a_priority_cbgs = (\n",
    "        original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n",
    "    )\n",
    "    method_a_priority_cbgs_percent = format_percent(method_a_priority_cbgs, total_cbgs)\n",
    "\n",
    "    total_tracts_count = len(comparison_df)\n",
    "\n",
    "    method_b_priority_tracts_count = comparison_df.loc[\n",
    "        :, method_b_priority_census_tracts_field\n",
    "    ].sum()\n",
    "    method_b_priority_tracts_count_percent = format_percent(\n",
    "        method_b_priority_tracts_count, total_tracts_count\n",
    "    )\n",
    "    method_b_non_priority_tracts_count = (\n",
    "        total_tracts_count - method_b_priority_tracts_count\n",
    "    )\n",
    "\n",
    "    # Tracts containing at least one Method A priority CBG, regardless of Method B status.\n",
    "    method_a_tracts_count = (\n",
    "        comparison_df.loc[\n",
    "            :, comparison_field_names.any_tract_has_at_least_one_method_a_cbg\n",
    "        ]\n",
    "        .sum()\n",
    "        .squeeze()\n",
    "    )\n",
    "    method_a_tracts_count_percent = format_percent(\n",
    "        method_a_tracts_count, total_tracts_count\n",
    "    )\n",
    "\n",
    "    # Stats for tracts that Method B marks as priority.\n",
    "    method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
    "        :, comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n",
    "    ].sum()\n",
    "    method_b_tracts_with_at_least_one_method_a_cbg_percent = format_percent(\n",
    "        method_b_tracts_with_at_least_one_method_a_cbg, method_b_priority_tracts_count\n",
    "    )\n",
    "\n",
    "    method_b_tracts_with_at_100_percent_method_a_cbg = comparison_df.loc[\n",
    "        :, comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n",
    "    ].sum()\n",
    "    method_b_tracts_with_at_100_percent_method_a_cbg_percent = format_percent(\n",
    "        method_b_tracts_with_at_100_percent_method_a_cbg, method_b_priority_tracts_count\n",
    "    )\n",
    "\n",
    "    # Stats for tracts that Method B does _not_ mark as priority.\n",
    "    method_b_non_priority_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
    "        :,\n",
    "        comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n",
    "    ].sum()\n",
    "    method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent = format_percent(\n",
    "        method_b_non_priority_tracts_with_at_least_one_method_a_cbg,\n",
    "        method_b_non_priority_tracts_count,\n",
    "    )\n",
    "\n",
    "    method_b_non_priority_tracts_with_100_percent_method_a_cbg = comparison_df.loc[\n",
    "        :,\n",
    "        comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg,\n",
    "    ].sum()\n",
    "    method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent = format_percent(\n",
    "        method_b_non_priority_tracts_with_100_percent_method_a_cbg,\n",
    "        method_b_non_priority_tracts_count,\n",
    "    )\n",
    "\n",
    "    # Create markdown content for comparisons.\n",
    "    # The non-priority paragraphs name Method A via `method_a_name` (not a hard-coded score name)\n",
    "    # so the report stays accurate for whichever Method A is being compared.\n",
    "    markdown_content = f\"\"\"\n",
    "# {method_a_name} compared to {method_b_name}\n",
    "\n",
    "(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n",
    "\n",
    "This report analyzes the following US states and territories: {state_names}.\n",
    "\n",
    "Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n",
    "\n",
    "Within the geographic area analyzed, there are {method_b_priority_tracts_count} census tracts designated as priority communities by {method_b_name}, out of {total_tracts_count} total tracts ({method_b_priority_tracts_count_percent}). \n",
    "\n",
    "Within the geographic region analyzed, there are {method_a_priority_cbgs} census block groups considered as priority communities by {method_a_name}, out of {total_cbgs} CBGs ({method_a_priority_cbgs_percent}). They occupy {method_a_tracts_count} census tracts ({method_a_tracts_count_percent}) of the geographic area analyzed.\n",
    "\n",
    "Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_least_one_method_a_cbg} ({method_b_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.\n",
    "\n",
    "Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_100_percent_method_a_cbg} ({method_b_tracts_with_at_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.\n",
    "\n",
    "Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_at_least_one_method_a_cbg} ({method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.\n",
    "\n",
    "Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_100_percent_method_a_cbg} ({method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.\n",
    "\"\"\"\n",
    "\n",
    "    return markdown_content\n",
    "\n",
    "\n",
    "def get_secondary_comparison_df(\n",
    "    comparison_df: pd.DataFrame,\n",
    "    comparison_field_names: ComparisonFieldNames,\n",
    "    method_b_priority_census_tracts_field: str,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"A secondary level of comparison.\n",
    "\n",
    "    The first level of comparison identifies census tracts prioritized by Method A,\n",
    "    compared to whether or not they're prioritized by Method B.\n",
    "\n",
    "    This comparison analyzes characteristics of those census tracts, based on whether or not\n",
    "    they are prioritized by Method A and/or Method B.\n",
    "\n",
    "    E.g., it might show that tracts prioritized by A but not B have a higher average income,\n",
    "    or that tracts prioritized by B but not A have a lower percent of unemployed people.\n",
    "\n",
    "    Args:\n",
    "      comparison_df: the tract-level comparison data frame to summarize.\n",
    "      comparison_field_names: the names of the comparison columns in `comparison_df`; two of\n",
    "        them are used as grouping keys.\n",
    "      method_b_priority_census_tracts_field: the Method B priority column, used as the first\n",
    "        grouping key.\n",
    "\n",
    "    Returns:\n",
    "      A data frame with one row per combination of the three grouping keys, holding the mean\n",
    "      of every numeric column of `comparison_df` within that group.\n",
    "    \"\"\"\n",
    "    grouped_df = comparison_df.groupby(\n",
    "        [\n",
    "            method_b_priority_census_tracts_field,\n",
    "            comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg,\n",
    "            comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n",
    "        ],\n",
    "        # Keep groups whose key is missing rather than silently dropping those rows.\n",
    "        dropna=False,\n",
    "    )\n",
    "\n",
    "    # Average the numeric columns within each group. `numeric_only=True` is passed explicitly:\n",
    "    # older pandas dropped non-numeric columns implicitly, while pandas 2.x raises a `TypeError`\n",
    "    # when asked to average non-numeric columns.\n",
    "    secondary_comparison_df = grouped_df.mean(numeric_only=True).reset_index()\n",
    "\n",
    "    return secondary_comparison_df\n",
    "\n",
    "\n",
    "def execute_comparison(\n",
    "    df: pd.DataFrame,\n",
    "    method_a_name: str,\n",
    "    method_b_name: str,\n",
    "    method_a_priority_census_block_groups_field: str,\n",
    "    method_b_priority_census_tracts_field: str,\n",
    "    other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n",
    ") -> pathlib.Path:\n",
    "    \"\"\"Execute an individual comparison by creating the data frames and writing the report.\n",
    "\n",
    "    Writes three outputs into `COMPARISON_OUTPUTS_DIR / method_a_name`: the comparison CSV,\n",
    "    the secondary comparison CSV, and the report produced by `write_markdown_and_docx_content`.\n",
    "\n",
    "    Args:\n",
    "      df: a pandas dataframe including the data for this comparison.\n",
    "      method_a_name: display name of Method A; also used as the output subdirectory name and\n",
    "        in the output file names.\n",
    "      method_b_name: display name of Method B; also used in the output file names.\n",
    "      method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n",
    "        community field that defines communities at the level of census block groups (CBGs).\n",
    "      method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n",
    "        community field that defines communities at the level of census tracts.\n",
    "      other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n",
    "\n",
    "    Returns:\n",
    "      The value returned by `write_markdown_and_docx_content` for the comparison report\n",
    "      (presumably the path of the generated DOCX file; verify against that helper).\n",
    "\n",
    "    \"\"\"\n",
    "    comparison_field_names = get_comparison_field_names(\n",
    "        method_a_name=method_a_name, method_b_name=method_b_name\n",
    "    )\n",
    "\n",
    "    # Create or use a directory for outputs grouped by Method A.\n",
    "    output_dir = COMPARISON_OUTPUTS_DIR / method_a_name\n",
    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    df_with_only_shared_states = get_df_with_only_shared_states(\n",
    "        df=df,\n",
    "        field_a=method_a_priority_census_block_groups_field,\n",
    "        field_b=method_b_priority_census_tracts_field,\n",
    "    )\n",
    "\n",
    "    comparison_df = get_comparison_df(\n",
    "        df=df_with_only_shared_states,\n",
    "        method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n",
    "        method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n",
    "        comparison_field_names=comparison_field_names,\n",
    "        other_census_tract_fields_to_keep=other_census_tract_fields_to_keep,\n",
    "        output_dir=output_dir,\n",
    "    )\n",
    "\n",
    "    # Write comparison to CSV.\n",
    "    comparison_csv_path = (\n",
    "        output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
    "    )\n",
    "    comparison_df.to_csv(\n",
    "        path_or_buf=comparison_csv_path,\n",
    "        na_rep=\"\",\n",
    "        index=False,\n",
    "    )\n",
    "\n",
    "    # Secondary comparison DF\n",
    "    secondary_comparison_df = get_secondary_comparison_df(\n",
    "        comparison_df=comparison_df,\n",
    "        comparison_field_names=comparison_field_names,\n",
    "        method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n",
    "    )\n",
    "\n",
    "    # Write secondary comparison to CSV.\n",
    "    secondary_comparison_csv_path = (\n",
    "        output_dir\n",
    "        / f\"Secondary Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
    "    )\n",
    "    secondary_comparison_df.to_csv(\n",
    "        path_or_buf=secondary_comparison_csv_path,\n",
    "        na_rep=\"\",\n",
    "        index=False,\n",
    "    )\n",
    "\n",
    "    # The report text is computed from the state-filtered input (CBG totals) and the\n",
    "    # tract-level comparison frame (tract totals).\n",
    "    markdown_content = get_comparison_markdown_content(\n",
    "        original_df=df_with_only_shared_states,\n",
    "        comparison_df=comparison_df,\n",
    "        comparison_field_names=comparison_field_names,\n",
    "        method_a_name=method_a_name,\n",
    "        method_b_name=method_b_name,\n",
    "        method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n",
    "        method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n",
    "    )\n",
    "\n",
    "    comparison_docx_file_path = write_markdown_and_docx_content(\n",
    "        markdown_content=markdown_content,\n",
    "        file_dir=output_dir,\n",
    "        file_name_without_extension=f\"Comparison report - {method_a_name} and {method_b_name}\",\n",
    "    )\n",
    "\n",
    "    return comparison_docx_file_path\n",
    "\n",
    "\n",
    "def execute_comparisons(\n",
    "    df: pd.DataFrame,\n",
    "    census_block_group_indices: typing.List[Index],\n",
    "    census_tract_indices: typing.List[Index],\n",
    "):\n",
    "    \"\"\"Run `execute_comparison` for every pairing of a CBG index with a census tract index.\n",
    "\n",
    "    Pairs are processed with the CBG indices as the outer dimension and the census tract\n",
    "    indices as the inner one. A progress line is printed before each comparison.\n",
    "\n",
    "    Returns the list of values returned by `execute_comparison`, in processing order.\n",
    "    \"\"\"\n",
    "    docx_paths = []\n",
    "    index_pairs = itertools.product(census_block_group_indices, census_tract_indices)\n",
    "\n",
    "    for cbg_method, tract_method in index_pairs:\n",
    "        print(\n",
    "            f\"Running comparisons for {cbg_method.method_name} against {tract_method.method_name}...\"\n",
    "        )\n",
    "\n",
    "        docx_paths.append(\n",
    "            execute_comparison(\n",
    "                df=df,\n",
    "                method_a_name=cbg_method.method_name,\n",
    "                method_b_name=tract_method.method_name,\n",
    "                method_a_priority_census_block_groups_field=cbg_method.priority_communities_field,\n",
    "                method_b_priority_census_tracts_field=tract_method.priority_communities_field,\n",
    "                other_census_tract_fields_to_keep=tract_method.other_census_tract_fields_to_keep,\n",
    "            )\n",
    "        )\n",
    "\n",
    "    return docx_paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b8b6d1e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running comparisons for Score A against CalEnviroScreen 4.0...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "531ec4deb2f54c26ad0f5311fdea0e60",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/8057 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running comparisons for Score A against HUD RECAP...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "170da68ae0734892bef4a452b5de45f7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/74001 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Actually execute the functions: compare every CBG-level index against every tract-level index.\n",
    "# NOTE: `merged_df`, `census_block_group_indices`, and `census_tract_indices` are not defined in\n",
    "# this cell; they must already exist in the kernel before it is run.\n",
    "file_paths = execute_comparisons(\n",
    "    df=merged_df,\n",
    "    census_block_group_indices=census_block_group_indices,\n",
    "    census_tract_indices=census_tract_indices,\n",
    ")\n",
    "\n",
    "# Show the values returned for each comparison that was run.\n",
    "print(file_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "887ee948",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}