diff --git a/score/ipython/calenviroscreen_etl.ipynb b/score/ipython/calenviroscreen_etl.ipynb new file mode 100644 index 00000000..0333deef --- /dev/null +++ b/score/ipython/calenviroscreen_etl.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "20aa3891", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import numpy as np\n", + "import pandas as pd\n", + "import csv\n", + "import sys\n", + "import os\n", + "\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from etl.sources.census.etl_utils import get_state_fips_codes\n", + "from utils import unzip_file_from_url, remove_all_from_dir\n", + "\n", + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "TMP_PATH = DATA_PATH / \"tmp\"\n", + "CALENVIROSCREEN_FTP_URL = \"https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip\"\n", + "CSV_PATH = DATA_PATH / \"dataset\" / \"calenviroscreen4\"\n", + "\n", + "# Defining some variable names\n", + "CALENVIROSCREEN_SCORE_FIELD_NAME = \"calenviroscreen_score\"\n", + "CALENVIROSCREEN_PERCENTILE_FIELD_NAME = \"calenviroscreen_percentile\"\n", + "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = \"calenviroscreen_priority_community\"\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", + "\n", + "# Choosing constants.\n", + "# None of these numbers are final, but just for the purposes of comparison.\n", + "CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75\n", + "\n", + "print(DATA_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc3fb9ec", + "metadata": {}, + "outputs": [], + "source": [ + "# Download and unzip the CalEnviroScreen archive\n", + "unzip_file_from_url(CALENVIROSCREEN_FTP_URL, TMP_PATH, TMP_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15f66756", + "metadata": {}, + "outputs": [], + "source": [ + "# Data from 
https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:\n", + "# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip\n", + "calenviroscreen_4_csv_name = \"CalEnviroScreen_4.0_2021.csv\"\n", + "calenviroscreen_data_path = TMP_PATH.joinpath(calenviroscreen_4_csv_name)\n", + "\n", + "# Load comparison index (CalEnviroScreen 4)\n", + "calenviroscreen_df = pd.read_csv(\n", + " calenviroscreen_data_path, dtype={\"Census Tract\": \"string\"}\n", + ")\n", + "\n", + "calenviroscreen_df.rename(\n", + " columns={\n", + " \"Census Tract\": GEOID_TRACT_FIELD_NAME,\n", + " \"DRAFT CES 4.0 Score\": CALENVIROSCREEN_SCORE_FIELD_NAME,\n", + " \"DRAFT CES 4.0 Percentile\": CALENVIROSCREEN_PERCENTILE_FIELD_NAME,\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# Add a leading \"0\" to the Census Tract to match our format in other data frames.\n", + "\n", + "calenviroscreen_df[GEOID_TRACT_FIELD_NAME] = (\n", + " \"0\" + calenviroscreen_df[GEOID_TRACT_FIELD_NAME]\n", + ")\n", + "\n", + "# Calculate the top K% of prioritized communities\n", + "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME] = (\n", + " calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD_NAME]\n", + " >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD\n", + ")\n", + "\n", + "calenviroscreen_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fa2077a", + "metadata": {}, + "outputs": [], + "source": [ + "# write csv\n", + "CSV_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Matching other conventions in the ETL scripts, write only for the state (FIPS code 06).\n", + "calenviroscreen_df.to_csv(CSV_PATH / \"data06.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81b977f8", + "metadata": {}, + "outputs": [], + "source": [ + "# cleanup\n", + "remove_all_from_dir(TMP_PATH)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/ipython/hud_recap_etl.ipynb b/score/ipython/hud_recap_etl.ipynb new file mode 100644 index 00000000..7d4df434 --- /dev/null +++ b/score/ipython/hud_recap_etl.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "20aa3891", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import numpy as np\n", + "import pandas as pd\n", + "import csv\n", + "import sys\n", + "import os\n", + "\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from etl.sources.census.etl_utils import get_state_fips_codes\n", + "from utils import unzip_file_from_url, remove_all_from_dir\n", + "\n", + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "TMP_PATH = DATA_PATH / \"tmp\"\n", + "HUD_RECAP_CSV_URL = \"https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326\"\n", + "CSV_PATH = DATA_PATH / \"dataset\" / \"hud_recap\"\n", + "\n", + "# Definining some variable names\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", + "HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = \"hud_recap_priority_community\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9455da5", + "metadata": {}, + "outputs": [], + "source": [ + "# Data from https://hudgis-hud.opendata.arcgis.com/datasets/HUD::racially-or-ethnically-concentrated-areas-of-poverty-r-ecaps/about\n", + "df = pd.read_csv(HUD_RECAP_CSV_URL, dtype={\"GEOID\": \"string\"})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "ca63e66c", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename some fields\n", + "df.rename(\n", + " columns={\n", + " \"GEOID\": GEOID_TRACT_FIELD_NAME,\n", + " # Interestingly, there's no data dictionary for the RECAP data that I could find.\n", + " # However, this site (http://www.schousing.com/library/Tax%20Credit/2020/QAP%20Instructions%20(2).pdf)\n", + " # suggests:\n", + " # \"If RCAP_Current for the tract in which the site is located is 1, the tract is an R/ECAP. If RCAP_Current is 0, it is not.\"\n", + " \"RCAP_Current\": HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME,\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "# Convert to boolean\n", + "df[HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME] = df[\n", + " HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME\n", + "].astype(\"bool\")\n", + "\n", + "df[HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME].value_counts()\n", + "\n", + "df.sort_values(by=GEOID_TRACT_FIELD_NAME, inplace=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fa2077a", + "metadata": {}, + "outputs": [], + "source": [ + "# write csv\n", + "CSV_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Drop unnecessary columns.\n", + "df[[GEOID_TRACT_FIELD_NAME, HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME]].to_csv(\n", + " CSV_PATH / \"usa.csv\", index=False\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/ipython/score_calc.ipynb b/score/ipython/score_calc.ipynb index 39424812..e1eec406 100644 --- a/score/ipython/score_calc.ipynb +++ b/score/ipython/score_calc.ipynb @@ -16,6 +16,7 
@@ "import collections\n", "import functools\n", "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import csv\n", "import os\n", @@ -363,7 +364,7 @@ }, "outputs": [], "source": [ - "# calculate percentiles\n", + "# Calculate percentiles for each data set.\n", "for data_set in data_sets:\n", " df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n", " data_set.renamed_field\n", @@ -379,7 +380,7 @@ "metadata": {}, "outputs": [], "source": [ - "# calculate min max\n", + "# Calculate min-max for each data set.\n", "# Math:\n", "# (\n", "# Observed value\n", @@ -410,6 +411,28 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4eec326", + "metadata": {}, + "outputs": [], + "source": [ + "# Graph distributions and correlations.\n", + "min_max_fields = [\n", + " f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"\n", + " for data_set in data_sets\n", + " if data_set.renamed_field != GEOID_FIELD_NAME\n", + "]\n", + "df.hist(\n", + " column=min_max_fields, layout=(len(min_max_fields), 1), figsize=(10, 30), bins=30\n", + ")\n", + "\n", + "plt.tight_layout()\n", + "\n", + "plt.show()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -476,7 +499,8 @@ "metadata": {}, "outputs": [], "source": [ - "fields_to_use_in_score = [\n", + "# Calculate scores D and E.\n", + "fields_to_use_in_score_d_and_e = [\n", " UNEMPLOYED_FIELD_NAME,\n", " LINGUISTIC_ISOLATION_FIELD_NAME,\n", " HOUSING_BURDEN_FIELD_NAME,\n", @@ -484,9 +508,11 @@ " HIGH_SCHOOL_FIELD_NAME,\n", "]\n", "\n", - "fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n", + "fields_min_max = [\n", + " f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n", + "]\n", "fields_percentile = [\n", - " f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n", + " f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n", 
"]\n", "\n", "# Calculate \"Score D\", which uses min-max normalization\n", @@ -498,6 +524,32 @@ "print(df[\"Score E\"].describe())" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a02e5bac", + "metadata": {}, + "outputs": [], + "source": [ + "# Graph distributions\n", + "df.hist(\n", + " column=fields_min_max, layout=(len(fields_min_max), 1), figsize=(10, 30), bins=30\n", + ")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0e608c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate correlations\n", + "df[fields_min_max].corr()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb index fc315009..64733e10 100644 --- a/score/ipython/scoring_comparison.ipynb +++ b/score/ipython/scoring_comparison.ipynb @@ -4,21 +4,40 @@ "cell_type": "code", "execution_count": null, "id": "54615cef", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "# Before running this script as it currently stands, you'll need to run two notebooks:\n", - "# 1. ejscreen_etl.ipynb\n", - "# 2. 
score_calc_0.1.ipynb\n", + "# Before running this script as it currently stands, you'll need to run these notebooks (in any order):\n", + "# * score_calc.ipynb\n", + "# * calenviroscreen_etl.ipynb\n", + "# * hud_recap_etl.ipynb\n", "\n", + "import collections\n", + "import functools\n", + "import IPython\n", "import numpy as np\n", + "import os\n", "import pandas as pd\n", - "from pathlib import Path\n", + "import pathlib\n", + "import pypandoc\n", "import requests\n", + "import string\n", + "import sys\n", + "import typing\n", + "import us\n", "import zipfile\n", + "\n", "from datetime import datetime\n", "from tqdm.notebook import tqdm_notebook\n", "\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from utils import remove_all_from_dir, get_excel_column_name\n", + "\n", "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", "tqdm_notebook.pandas()" ] @@ -27,86 +46,77 @@ "cell_type": "code", "execution_count": null, "id": "49a63129", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n", "pd.options.display.float_format = \"{:.2f}\".format\n", "\n", "# Set some global parameters\n", - "DATA_DIR = Path.cwd().parent / \"data\"\n", - "TEMP_DATA_DIR = Path.cwd().parent / \"data\" / \"tmp\"\n", - "# None of these numbers are final, but just for the purposes of comparison.\n", - "CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75\n", + "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", + "TEMP_DATA_DIR = pathlib.Path.cwd().parent / \"data\" / \"tmp\"\n", + "COMPARISON_OUTPUTS_DIR = TEMP_DATA_DIR / \"comparison_outputs\"\n", + "\n", + "# Make the dirs if they don't exist\n", + "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", 
"CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n", "\n", "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n", "# and introducing the risk of misspelling the field name.)\n", - "CENSUS_BLOCK_GROUP_ID_FIELD = \"census_block_group_id\"\n", - "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"census_block_group_population\"\n", - "CENSUS_TRACT_ID_FIELD = \"census_tract_id\"\n", - "CALENVIROSCREEN_SCORE_FIELD = \"calenviroscreen_score\"\n", - "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n", - "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n", "\n", - "# Note: we are pretending the EJSCREEN's low income percent is the actual score for now as a placeholder.\n", + "GEOID_FIELD_NAME = \"GEOID10\"\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", + "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", + "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", + "\n", "CEJST_SCORE_FIELD = \"cejst_score\"\n", "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", "\n", - "# Comparison field names\n", - "any_tract_has_at_least_one_cbg = \"Tract has at least one CEJST CBG?\"\n", - "tract_has_at_least_one_cbg = \"CES Tract has at least one CEJST CBG?\"\n", - "tract_has_100_percent_cbg = \"CES Tract has 100% CEJST CBGs?\"\n", - "non_ces_tract_has_at_least_one_cbg = \"Non-CES Tract has at least one CEJST CBG?\"\n", - "non_ces_tract_has_100_percent_cbg = \"Non-CES Tract has 100% CEJST CBGs?\"" + "# Define some suffixes\n", + "POPULATION_SUFFIX = \" (priority population)\"" ] }, { "cell_type": "code", "execution_count": null, "id": "2b26dccf", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Load CEJST score data\n", "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n", + "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n", 
"\n", - "cejst_df = pd.read_csv(cejst_data_path)\n", + "# score_used = \"Score A\"\n", "\n", - "cejst_df.head()\n", - "\n", - "# Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n", - "\n", - "score_used = \"Score A\"\n", - "\n", - "cejst_df.rename(\n", - " columns={\n", - " \"GEOID10\": CENSUS_BLOCK_GROUP_ID_FIELD,\n", - " \"Total population\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", - " score_used: CEJST_SCORE_FIELD,\n", - " f\"{score_used} (percentile)\": CEJST_PERCENTILE_FIELD,\n", - " },\n", - " inplace=True,\n", - " errors=\"raise\",\n", - ")\n", - "\n", - "# Calculate the top K% of prioritized communities\n", - "cejst_df[CEJST_PRIORITY_COMMUNITY_FIELD] = (\n", - " cejst_df[CEJST_PERCENTILE_FIELD] >= CEJST_PRIORITY_COMMUNITY_THRESHOLD\n", - ")\n", + "# # Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n", + "# cejst_df.rename(\n", + "# columns={\n", + "# \"Total population\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", + "# score_used: CEJST_SCORE_FIELD,\n", + "# f\"{score_used} (percentile)\": CEJST_PERCENTILE_FIELD,\n", + "# },\n", + "# inplace=True,\n", + "# errors=\"raise\",\n", + "# )\n", "\n", "# Create the CBG's Census Tract ID by dropping the last number from the FIPS CODE of the CBG.\n", "# The CBG ID is the last one character.\n", "# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n", - "cejst_df.loc[:, CENSUS_TRACT_ID_FIELD] = (\n", - " cejst_df.loc[:, CENSUS_BLOCK_GROUP_ID_FIELD].astype(str).str[:-1].astype(np.int64)\n", + "cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (\n", + " cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[:-1]\n", ")\n", "\n", - "# Remove all non-California data\n", - "cejst_df = cejst_df.loc[\n", - " cejst_df[CENSUS_BLOCK_GROUP_ID_FIELD].astype(str).str[0] == \"6\", :\n", - "]\n", + "cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n", + " cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[0:2]\n", + 
")\n", "\n", "cejst_df.head()" ] @@ -114,65 +124,26 @@ { "cell_type": "code", "execution_count": null, - "id": "ec6b27e3", - "metadata": {}, + "id": "08962382", + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:\n", - "# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip\n", + "# Load CalEnviroScreen 4.0\n", + "CALENVIROSCREEN_SCORE_FIELD = \"calenviroscreen_score\"\n", + "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n", + "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n", "\n", - "download = requests.get(\n", - " \"https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip\",\n", - " verify=False,\n", - ")\n", - "file_contents = download.content\n", - "zip_file_path = TEMP_DATA_DIR\n", - "zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n", - "zip_file.write(file_contents)\n", - "zip_file.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdf08971", - "metadata": {}, - "outputs": [], - "source": [ - "# Extract zip\n", - "print(zip_file_path)\n", - "with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n", - " zip_ref.extractall(zip_file_path)\n", - "calenviroscreen_4_csv_name = \"CalEnviroScreen_4.0_2021.csv\"\n", - "calenviroscreen_data_path = TEMP_DATA_DIR.joinpath(calenviroscreen_4_csv_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29c14b29", - "metadata": {}, - "outputs": [], - "source": [ - "# Load comparison index (CalEnviroScreen 4)\n", - "\n", - "calenviroscreen_df = pd.read_csv(calenviroscreen_data_path)\n", - "\n", - "calenviroscreen_df.rename(\n", - " columns={\n", - " \"Census Tract\": CENSUS_TRACT_ID_FIELD,\n", - " \"DRAFT CES 4.0 Score\": CALENVIROSCREEN_SCORE_FIELD,\n", - " \"DRAFT CES 4.0 Percentile\": 
CALENVIROSCREEN_PERCENTILE_FIELD,\n", - " },\n", - " inplace=True,\n", + "calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n", + "calenviroscreen_df = pd.read_csv(\n", + " calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", ")\n", "\n", - "\n", - "# Calculate the top K% of prioritized communities\n", - "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = (\n", - " calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD]\n", - " >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD\n", - ")\n", + "# Convert priority community field to a bool.\n", + "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n", + " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n", + "].astype(bool)\n", "\n", "calenviroscreen_df.head()" ] @@ -180,222 +151,729 @@ { "cell_type": "code", "execution_count": null, - "id": "813e5656", - "metadata": {}, + "id": "42bd28d4", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "# Join CalEnviroScreen and CEJST data.\n", + "# Load HUD data\n", + "hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n", + "hud_recap_df = pd.read_csv(\n", + " hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", + ")\n", + "\n", + "hud_recap_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d77cd872", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Join all dataframes that use tracts\n", + "census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n", + "\n", + "census_tract_df = functools.reduce(\n", + " lambda left, right: pd.merge(\n", + " left=left, right=right, on=GEOID_TRACT_FIELD_NAME, how=\"outer\"\n", + " ),\n", + " census_tract_dfs,\n", + ")\n", + "\n", + "if census_tract_df[GEOID_TRACT_FIELD_NAME].str.len().unique() != [11]:\n", + " raise ValueError(\"Some of the census tract data has the wrong length.\")\n", + "\n", + "if len(census_tract_df) > 74134:\n", + " 
raise ValueError(\"Too many rows in the join.\")\n", + "\n", + "census_tract_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813e5656", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Join tract indices and CEJST data.\n", "# Note: we're joining on the census *tract*, so there will be multiple CBG entries joined to the same census tract row from CES,\n", "# creating multiple rows of the same CES data.\n", - "\n", - "# For simplicity, we'll only keep certain columns from each data frame.\n", - "cejst_columns_to_keep = [\n", - " CENSUS_BLOCK_GROUP_ID_FIELD,\n", - " CENSUS_TRACT_ID_FIELD,\n", - " CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", - " CEJST_SCORE_FIELD,\n", - " CEJST_PERCENTILE_FIELD,\n", - " CEJST_PRIORITY_COMMUNITY_FIELD,\n", - "]\n", - "\n", - "calenviroscreen_columns_to_keep = [\n", - " CENSUS_TRACT_ID_FIELD,\n", - " CALENVIROSCREEN_SCORE_FIELD,\n", - " CALENVIROSCREEN_PERCENTILE_FIELD,\n", - " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD,\n", - "]\n", - "\n", - "merged_df = cejst_df.loc[:, cejst_columns_to_keep].merge(\n", - " calenviroscreen_df.loc[:, calenviroscreen_columns_to_keep],\n", + "merged_df = cejst_df.merge(\n", + " census_tract_df,\n", " how=\"left\",\n", - " on=CENSUS_TRACT_ID_FIELD,\n", + " on=GEOID_TRACT_FIELD_NAME,\n", ")\n", "\n", + "\n", + "if len(merged_df) > 220333:\n", + " raise ValueError(\"Too many rows in the join.\")\n", + "\n", "merged_df.head()\n", "\n", + "\n", "# merged_df.to_csv(\n", - "# path_or_buf=TEMP_DATA_DIR / \"merged.csv\",\n", - "# na_rep=\"\",\n", - "# index=False\n", + "# path_or_buf=COMPARISON_OUTPUTS_DIR / \"merged.csv\", na_rep=\"\", index=False\n", "# )" ] }, { "cell_type": "code", "execution_count": null, - "id": "939baea4", + "id": "8a801121", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# Create analysis\n", - "def calculate_comparison(frame):\n", - " # Keep all the CES values at the Census Tract Level\n", - " df = frame.loc[\n", - 
" frame.index[0],\n", - " [\n", - " CENSUS_TRACT_ID_FIELD,\n", - " CALENVIROSCREEN_SCORE_FIELD,\n", - " CALENVIROSCREEN_PERCENTILE_FIELD,\n", - " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD,\n", - " ],\n", - " ]\n", + "cejst_priority_communities_fields = [\n", + " \"Score A (top 25th percentile)\",\n", + " \"Score B (top 25th percentile)\",\n", + " \"Score C (top 25th percentile)\",\n", + " \"Score D (top 25th percentile)\",\n", + " \"Score E (top 25th percentile)\",\n", + "]\n", "\n", - " # Convenience constant for whether the tract is or is not a CalEnviroScreen priority community.\n", - " is_a_ces_priority_tract = frame.loc[\n", - " frame.index[0], [CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD]\n", - " ][0]\n", - "\n", - " # Recall that NaN values are not falsy, so we need to check if `is_a_ces_priority_tract` is True.\n", - " is_a_ces_priority_tract = is_a_ces_priority_tract is True\n", - "\n", - " # Calculate whether the tract (whether or not it is a CES priority tract) includes CBGs that are priority\n", - " # according to the current CEJST score.\n", - " df[any_tract_has_at_least_one_cbg] = (\n", - " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", - " )\n", - "\n", - " # Calculate comparison\n", - " # A CES priority tract has at least one CEJST priority CBG.\n", - " df[tract_has_at_least_one_cbg] = (\n", - " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", - " if is_a_ces_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # A CES priority tract has all of its contained CBGs as CEJST priority CBGs.\n", - " df[tract_has_100_percent_cbg] = (\n", - " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n", - " if is_a_ces_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # Calculate the inverse\n", - " # A tract that is _not_ a CES priority has at least one CEJST priority CBG.\n", - " df[non_ces_tract_has_at_least_one_cbg] = (\n", - " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", - " if not 
is_a_ces_priority_tract\n", - " else None\n", - " )\n", - "\n", - " # A tract that is _not_ a CES priority has all of its contained CBGs as CEJST priority CBGs.\n", - " df[non_ces_tract_has_100_percent_cbg] = (\n", - " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n", - " if not is_a_ces_priority_tract\n", - " else None\n", - " )\n", - "\n", - " return df\n", - "\n", - "\n", - "# Group all data by the census tract.\n", - "grouped_df = merged_df.groupby(CENSUS_TRACT_ID_FIELD)\n", - "\n", - "# Run the comparison function on the groups.\n", - "comparison_df = grouped_df.progress_apply(calculate_comparison)\n", - "\n", - "# Sort descending by highest CES Score for convenience when viewing output file\n", - "comparison_df.sort_values(\n", - " by=[CALENVIROSCREEN_PERCENTILE_FIELD], ascending=False, inplace=True\n", - ")\n", - "\n", - "# Write comparison to CSV.\n", - "comparison_df.to_csv(\n", - " path_or_buf=TEMP_DATA_DIR / \"Comparison Output.csv\", na_rep=\"\", index=False\n", - ")\n", - "\n", - "print(comparison_df.head())" + "comparison_priority_communities_fields = [\n", + " \"calenviroscreen_priority_community\",\n", + " \"hud_recap_priority_community\",\n", + "]" ] }, { "cell_type": "code", "execution_count": null, - "id": "85709225", + "id": "9fef0da9", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# Prepare some constants for use in the following Markdown cell.\n", - "total_cbgs_ca_only = len(cejst_df)\n", - "cejst_cbgs_ca_only = cejst_df.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum()\n", - "cejst_cbgs_ca_only_percent = f\"{cejst_cbgs_ca_only / total_cbgs_ca_only:.0%}\"\n", + "def get_state_distributions(\n", + " df: pd.DataFrame, priority_communities_fields: typing.List[str]\n", + ") -> pd.DataFrame:\n", + " \"\"\"For each boolean field of priority communities, calculate distribution across states and territories.\"\"\"\n", "\n", - "total_tracts_count = len(comparison_df)\n", - "ces_tracts_count = comparison_df.loc[:, 
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].sum()\n", - "ces_tracts_count_percent = f\"{ces_tracts_count / total_tracts_count:.0%}\"\n", - "non_ces_tracts_count = total_tracts_count - ces_tracts_count\n", + " # Ensure each field is boolean.\n", + " for priority_communities_field in priority_communities_fields:\n", + " if df[priority_communities_field].dtype != bool:\n", + " print(f\"Converting {priority_communities_field} to boolean.\")\n", "\n", - "total_tracts_count = len(comparison_df[CENSUS_TRACT_ID_FIELD])\n", - "cejst_tracts_count = comparison_df.loc[:, any_tract_has_at_least_one_cbg].sum()\n", - "cejst_tracts_count_percent = f\"{cejst_tracts_count / total_tracts_count:.0%}\"\n", + " # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n", + " df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n", + " df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n", + " )\n", "\n", - "# CES stats\n", - "at_least_one_sum = comparison_df.loc[:, tract_has_at_least_one_cbg].sum()\n", - "at_least_one_sum_percent = f\"{at_least_one_sum / ces_tracts_count:.0%}\"\n", + " def calculate_state_comparison(frame: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " This method will be applied to a `group_by` object. 
Inherits some parameters from outer scope.\n", + " \"\"\"\n", + " state_id = frame[GEOID_STATE_FIELD_NAME].unique()[0]\n", "\n", - "all_100_sum = comparison_df.loc[:, tract_has_100_percent_cbg].sum()\n", - "all_100_sum_percent = f\"{all_100_sum / ces_tracts_count:.0%}\"\n", + " summary_dict = {}\n", + " summary_dict[GEOID_STATE_FIELD_NAME] = state_id\n", + " summary_dict[\"State name\"] = us.states.lookup(state_id).name\n", + " summary_dict[\"Total CBGs in state\"] = len(frame)\n", + " summary_dict[\"Total population in state\"] = frame[\n", + " CENSUS_BLOCK_GROUP_POPULATION_FIELD\n", + " ].sum()\n", "\n", - "# Non-CES stats:\n", - "non_ces_at_least_one_sum = comparison_df.loc[\n", - " :, non_ces_tract_has_at_least_one_cbg\n", - "].sum()\n", - "non_ces_at_least_one_sum_percent = (\n", - " f\"{non_ces_at_least_one_sum / non_ces_tracts_count:.0%}\"\n", + " for priority_communities_field in priority_communities_fields:\n", + " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n", + " f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n", + " ].sum()\n", + "\n", + " summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n", + " f\"{priority_communities_field}\"\n", + " ].sum()\n", + "\n", + " # Calculate some combinations of other variables.\n", + " summary_dict[f\"{priority_communities_field} (percent CBGs)\"] = (\n", + " summary_dict[f\"{priority_communities_field} (total CBGs)\"]\n", + " / summary_dict[\"Total CBGs in state\"]\n", + " )\n", + "\n", + " summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n", + " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n", + " / summary_dict[\"Total population in state\"]\n", + " )\n", + "\n", + " df = pd.DataFrame(summary_dict, index=[0])\n", + "\n", + " return df\n", + "\n", + " grouped_df = df.groupby(GEOID_STATE_FIELD_NAME)\n", + "\n", + " # Run the comparison function on the groups.\n", + " state_distribution_df = 
grouped_df.progress_apply(calculate_state_comparison)\n", + "\n", + " return state_distribution_df\n", + "\n", + "\n", + "def write_state_distribution_excel(\n", + " state_distribution_df: pd.DataFrame, file_path: pathlib.PosixPath\n", + ") -> None:\n", + " \"\"\"Write the dataframe to excel with special formatting.\"\"\"\n", + " # Create a Pandas Excel writer using XlsxWriter as the engine.\n", + " writer = pd.ExcelWriter(file_path, engine=\"xlsxwriter\")\n", + "\n", + " # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n", + " # index column at the left of the output dataframe.\n", + " state_distribution_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n", + "\n", + " # Get the xlsxwriter workbook and worksheet objects.\n", + " workbook = writer.book\n", + " worksheet = writer.sheets[\"Sheet1\"]\n", + " worksheet.autofilter(\n", + " 0, 0, state_distribution_df.shape[0], state_distribution_df.shape[1]\n", + " )\n", + "\n", + " for column in state_distribution_df.columns:\n", + " # Special formatting for columns that capture the percent of population considered priority.\n", + " if \"(percent population)\" in column:\n", + " # Turn the column index into excel ranges (e.g., column #95 is \"CR\" and the range may be \"CR2:CR53\").\n", + " column_index = state_distribution_df.columns.get_loc(column)\n", + " column_character = get_excel_column_name(column_index)\n", + " column_ranges = (\n", + " f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n", + " )\n", + "\n", + " # Add green to red conditional formatting.\n", + " worksheet.conditional_format(\n", + " column_ranges,\n", + " # Min: green, max: red.\n", + " {\n", + " \"type\": \"2_color_scale\",\n", + " \"min_color\": \"#00FF7F\",\n", + " \"max_color\": \"#C82538\",\n", + " },\n", + " )\n", + "\n", + " # TODO: text wrapping not working, fix.\n", + " text_wrap = workbook.add_format({\"text_wrap\": True})\n", + "\n", + " # Make these columns wide enough that 
you can read them.\n", + " worksheet.set_column(\n", + " f\"{column_character}:{column_character}\", 40, text_wrap\n", + " )\n", + "\n", + " writer.save()\n", + "\n", + "\n", + "state_distribution_df = get_state_distributions(\n", + " df=merged_df,\n", + " priority_communities_fields=cejst_priority_communities_fields\n", + " + comparison_priority_communities_fields,\n", ")\n", "\n", - "non_ces_all_100_sum = comparison_df.loc[:, non_ces_tract_has_100_percent_cbg].sum()\n", - "non_ces_all_100_sum_percent = f\"{non_ces_all_100_sum / non_ces_tracts_count:.0%}\"\n", + "state_distribution_df.to_csv(\n", + " path_or_buf=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.csv\",\n", + " na_rep=\"\",\n", + " index=False,\n", + ")\n", "\n", - "# Note, for the following Markdown cell to render the variables properly, follow the steps at\n", - "# \"Activating variable-enabled Markdown for Jupyter notebooks\" within `score/README.md`." + "write_state_distribution_excel(\n", + " state_distribution_df=state_distribution_df,\n", + " file_path=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.xlsx\",\n", + ")\n", + "\n", + "state_distribution_df.head()" ] }, { - "cell_type": "markdown", - "id": "0c534966", - "metadata": { - "variables": { - " total_tracts_count": "8057", - "all_100_sum": "1168", - "all_100_sum_percent": "59%", - "at_least_one_sum": "1817", - "at_least_one_sum_percent": "92%", - "cejst_cbgs_ca_only": "6987", - "cejst_cbgs_ca_only_percent": "30%", - "cejst_tracts_count": "3516", - "cejst_tracts_count_percent": "44%", - "ces_tracts_count": "1983", - "ces_tracts_count_percent": "25%", - "datetime.today().strftime('%Y-%m-%d')": "2021-06-28", - "non_ces_all_100_sum": "438", - "non_ces_all_100_sum_percent": "7%", - "non_ces_at_least_one_sum": "1699", - "non_ces_at_least_one_sum_percent": "28%", - "score_used": "Score A", - "total_cbgs_ca_only": "23212" - } - }, + "cell_type": "code", + "execution_count": null, + "id": "d46667cf", + "metadata": {}, + "outputs": [], "source": 
[ - "# Summary of findings for {{score_used}}\n", + "# This cell defines a couple of comparison functions. It does not run them.\n", "\n", - "(Calculated on {{datetime.today().strftime('%Y-%m-%d')}})\n", + "# Define a namedtuple for column names, which need to be shared between multiple parts of this comparison pipeline.\n", + "# Named tuples are useful here because they provide guarantees that for each instance, all properties are defined and\n", + "# can be accessed as properties (rather than as strings).\n", + "\n", + "# Note: if you'd like to add a field used throughout the comparison process, add it in three places.\n", + "# For an example `new_field`,\n", + "# 1. in this namedtuple, add the field as a string in `field_names` (e.g., `field_names=[..., \"new_field\"])`)\n", + "# 2. in the function `get_comparison_field_names`, define how the field name should be created from input data\n", + "# (e.g., `...new_field=f\"New field compares {method_a_name} to {method_b_name}\")\n", + "# 3. 
In the function `get_comparison_markdown_content`, add some reporting on the new field to the markdown content.\n", + "# (e.g., `The statistics indicate that {calculation_based_on_new_field} percent of census tracts are different between scores.`)\n", + "ComparisonFieldNames = collections.namedtuple(\n", + " typename=\"ComparisonFieldNames\",\n", + " field_names=[\n", + " \"any_tract_has_at_least_one_method_a_cbg\",\n", + " \"method_b_tract_has_at_least_one_method_a_cbg\",\n", + " \"method_b_tract_has_100_percent_method_a_cbg\",\n", + " \"method_b_non_priority_tract_has_at_least_one_method_a_cbg\",\n", + " \"method_b_non_priority_tract_has_100_percent_method_a_cbg\",\n", + " ],\n", + ")\n", + "\n", + "# Define a namedtuple for indices.\n", + "Index = collections.namedtuple(\n", + " typename=\"Index\",\n", + " field_names=[\n", + " \"method_name\",\n", + " \"priority_communities_field\",\n", + " # Note: this field only used by indices defined at the census tract level.\n", + " \"other_census_tract_fields_to_keep\",\n", + " ],\n", + ")\n", + "\n", + "\n", + "def get_comparison_field_names(\n", + " method_a_name: str,\n", + " method_b_name: str,\n", + ") -> ComparisonFieldNames:\n", + " comparison_field_names = ComparisonFieldNames(\n", + " any_tract_has_at_least_one_method_a_cbg=(\n", + " f\"Any tract has at least one {method_a_name} Priority CBG?\"\n", + " ),\n", + " method_b_tract_has_at_least_one_method_a_cbg=(\n", + " f\"{method_b_name} priority tract has at least one {method_a_name} CBG?\"\n", + " ),\n", + " method_b_tract_has_100_percent_method_a_cbg=(\n", + " f\"{method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n", + " ),\n", + " method_b_non_priority_tract_has_at_least_one_method_a_cbg=(\n", + " f\"Non-priority {method_b_name} tract has at least one {method_a_name} priority CBG?\"\n", + " ),\n", + " method_b_non_priority_tract_has_100_percent_method_a_cbg=(\n", + " f\"Non-priority {method_b_name} tract has 100% {method_a_name} priority 
CBGs?\"\n", + " ),\n", + " )\n", + " return comparison_field_names\n", + "\n", + "\n", + "def get_df_with_only_shared_states(\n", + " df: pd.DataFrame,\n", + " field_a: str,\n", + " field_b: str,\n", + " state_field=GEOID_STATE_FIELD_NAME,\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " Useful for looking at shared geographies across two fields.\n", + "\n", + " For a data frame and two fields, return a data frame only for states where there are non-null\n", + " values for both fields in that state (or territory).\n", + "\n", + " This is useful, for example, when running a comparison of CalEnviroScreen (only in California) against\n", + " a draft score that's national, and returning only the data for California for the entire data frame.\n", + " \"\"\"\n", + " field_a_states = df.loc[df[field_a].notnull(), state_field].unique()\n", + " field_b_states = df.loc[df[field_b].notnull(), state_field].unique()\n", + "\n", + " shared_states = list(set(field_a_states) & set(field_b_states))\n", + "\n", + " df = df.loc[df[state_field].isin(shared_states), :]\n", + "\n", + " return df\n", + "\n", + "\n", + "def get_comparison_df(\n", + " df: pd.DataFrame,\n", + " method_a_priority_census_block_groups_field: str,\n", + " method_b_priority_census_tracts_field: str,\n", + " other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n", + " comparison_field_names: ComparisonFieldNames,\n", + " output_dir: pathlib.PosixPath,\n", + ") -> None:\n", + " \"\"\"Produces a comparison report for any two given boolean columns representing priority fields.\n", + "\n", + " Args:\n", + " df: a pandas dataframe including the data for this comparison.\n", + " method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n", + " community field that defines communities at the level of census block groups (CBGs).\n", + " method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n", 
+ " community field that defines communities at the level of census tracts.\n", + " other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n", + "\n", + " Returns:\n", + " df: a pandas dataframe with one row with the results of this comparison\n", + " \"\"\"\n", + "\n", + " def calculate_comparison(frame: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " This method will be applied to a `group_by` object.\n", + "\n", + " Note: It inherits from outer scope `method_a_priority_census_block_groups_field`, `method_b_priority_census_tracts_field`,\n", + " and `other_census_tract_fields_to_keep`.\n", + " \"\"\"\n", + " # Keep all the tract values at the Census Tract Level\n", + " for field in other_census_tract_fields_to_keep:\n", + " if len(frame[field].unique()) != 1:\n", + " raise ValueError(\n", + " f\"There are different values per CBG for field {field}.\"\n", + " \"`other_census_tract_fields_to_keep` can only be used for fields at the census tract level.\"\n", + " )\n", + "\n", + " df = frame.loc[\n", + " frame.index[0],\n", + " [\n", + " GEOID_TRACT_FIELD_NAME,\n", + " method_b_priority_census_tracts_field,\n", + " ]\n", + " + other_census_tract_fields_to_keep,\n", + " ]\n", + "\n", + " # Convenience constant for whether the tract is or is not a method B priority community.\n", + " is_a_method_b_priority_tract = frame.loc[\n", + " frame.index[0], [method_b_priority_census_tracts_field]\n", + " ][0]\n", + "\n", + " # Recall that NaN values are not falsy, so we need to check if `is_a_method_b_priority_tract` is True.\n", + " is_a_method_b_priority_tract = is_a_method_b_priority_tract is True\n", + "\n", + " # Calculate whether the tract (whether or not it is a comparison priority tract) includes CBGs that are priority\n", + " # according to the current CBG score.\n", + " df[comparison_field_names.any_tract_has_at_least_one_method_a_cbg] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 
0\n", + " )\n", + "\n", + " # Calculate comparison\n", + " # A comparison priority tract has at least one CBG that is a priority CBG.\n", + " df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", + " if is_a_method_b_priority_tract\n", + " else None\n", + " )\n", + "\n", + " # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n", + " df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n", + " if is_a_method_b_priority_tract\n", + " else None\n", + " )\n", + "\n", + " # Calculate the inverse\n", + " # A tract that is _not_ a comparison priority has at least one CBG priority CBG.\n", + " df[\n", + " comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg\n", + " ] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", + " if not is_a_method_b_priority_tract\n", + " else None\n", + " )\n", + "\n", + " # A tract that is _not_ a comparison priority has all of its contained CBGs as CBG priority CBGs.\n", + " df[\n", + " comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n", + " ] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n", + " if not is_a_method_b_priority_tract\n", + " else None\n", + " )\n", + "\n", + " return df\n", + "\n", + " # Group all data by the census tract.\n", + " grouped_df = df.groupby(GEOID_TRACT_FIELD_NAME)\n", + "\n", + " # Run the comparison function on the groups.\n", + " comparison_df = grouped_df.progress_apply(calculate_comparison)\n", + "\n", + " return comparison_df\n", + "\n", + "\n", + "def get_comparison_markdown_content(\n", + " original_df: pd.DataFrame,\n", + " comparison_df: pd.DataFrame,\n", + " comparison_field_names: ComparisonFieldNames,\n", + " method_a_name: str,\n", + " 
method_b_name: str,\n", + " method_a_priority_census_block_groups_field: str,\n", + " method_b_priority_census_tracts_field: str,\n", + " state_field: str = GEOID_STATE_FIELD_NAME,\n", + ") -> str:\n", + " # Prepare some constants for use in the following Markdown content.\n", + " total_cbgs = len(original_df)\n", + "\n", + " # List of all states/territories in their FIPS codes:\n", + " state_ids = sorted(original_df[state_field].unique())\n", + " state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n", + "\n", + " # Note: using squeeze throughout to reduce result of `sum()` to a scalar.\n", + " # TODO: investigate why sums are sometimes series and sometimes scalar.\n", + " method_a_priority_cbgs = (\n", + " original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n", + " )\n", + " method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n", + "\n", + " total_tracts_count = len(comparison_df)\n", + "\n", + " method_b_priority_tracts_count = comparison_df.loc[\n", + " :, method_b_priority_census_tracts_field\n", + " ].sum()\n", + "\n", + " method_b_priority_tracts_count_percent = (\n", + " f\"{method_b_priority_tracts_count / total_tracts_count:.0%}\"\n", + " )\n", + " method_b_non_priority_tracts_count = (\n", + " total_tracts_count - method_b_priority_tracts_count\n", + " )\n", + "\n", + " method_a_tracts_count = (\n", + " comparison_df.loc[\n", + " :, comparison_field_names.any_tract_has_at_least_one_method_a_cbg\n", + " ]\n", + " .sum()\n", + " .squeeze()\n", + " )\n", + " method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n", + "\n", + " # Stats for Method B priority tracts (how many contain Method A priority CBGs)\n", + " method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n", + " :, comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n", + " ].sum()\n", + " method_b_tracts_with_at_least_one_method_a_cbg_percent = 
f\"{method_b_tracts_with_at_least_one_method_a_cbg / method_b_priority_tracts_count:.0%}\"\n", + "\n", + " method_b_tracts_with_at_100_percent_method_a_cbg = comparison_df.loc[\n", + " :, comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n", + " ].sum()\n", + " method_b_tracts_with_at_100_percent_method_a_cbg_percent = f\"{method_b_tracts_with_at_100_percent_method_a_cbg / method_b_priority_tracts_count:.0%}\"\n", + "\n", + " # Stats for Method B non-priority tracts (how many contain Method A priority CBGs)\n", + " method_b_non_priority_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n", + " :,\n", + " comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n", + " ].sum()\n", + "\n", + " method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent = f\"{method_b_non_priority_tracts_with_at_least_one_method_a_cbg / method_b_non_priority_tracts_count:.0%}\"\n", + "\n", + " method_b_non_priority_tracts_with_100_percent_method_a_cbg = comparison_df.loc[\n", + " :,\n", + " comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg,\n", + " ].sum()\n", + " method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent = f\"{method_b_non_priority_tracts_with_100_percent_method_a_cbg / method_b_non_priority_tracts_count:.0%}\"\n", + "\n", + " # Create markdown content for comparisons.\n", + " markdown_content = f\"\"\"\n", + "# {method_a_name} compared to {method_b_name}\n", + "\n", + "(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n", + "\n", + "This report analyzes the following US states and territories: {state_names}.\n", "\n", "Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n", "\n", - "There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0, out of {{total_tracts_count}} total tracts ({{ces_tracts_count_percent}}). 
\n", + "Within the geographic area analyzed, there are {method_b_priority_tracts_count} census tracts designated as priority communities by {method_b_name}, out of {total_tracts_count} total tracts ({method_b_priority_tracts_count_percent}). \n", "\n", - "Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis, out of {{total_cbgs_ca_only}} CBGs in the state ({{cejst_cbgs_ca_only_percent}}). They occupy {{cejst_tracts_count}} ({{cejst_tracts_count_percent}}) of all the census tracts in California.\n", + "Within the geographic region analyzed, there are {method_a_priority_cbgs} census block groups considered as priority communities by {method_a_name}, out of {total_cbgs} CBGs ({method_a_priority_cbgs_percent}). They occupy {method_a_tracts_count} census tracts ({method_a_tracts_count_percent}) of the geographic area analyzed.\n", "\n", - "Out of every CalEnviroScreen Disadvantaged Community census tract, {{at_least_one_sum}} ({{at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", + "Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_least_one_method_a_cbg} ({method_b_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.\n", "\n", - "Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score.\n", + "Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_100_percent_method_a_cbg} ({method_b_tracts_with_at_100_percent_method_a_cbg_percent}) of these 
census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.\n", "\n", - "Out of every census tract in California that is __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_at_least_one_sum}} ({{non_ces_at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", + "Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_at_least_one_method_a_cbg} ({method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.\n", "\n", - "Out of every census tract in California that is __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_all_100_sum}} ({{non_ces_all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score." 
+ "Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_100_percent_method_a_cbg} ({method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.\n", + "\"\"\"\n", + "\n", + " return markdown_content\n", + "\n", + "\n", + "def write_markdown_and_docx_content(\n", + " markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n", + ") -> pathlib.PosixPath:\n", + " \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n", + " # Set the file paths for both files.\n", + " markdown_file_path = file_dir / f\"{file_name_without_extension}.md\"\n", + " docx_file_path = file_dir / f\"{file_name_without_extension}.docx\"\n", + "\n", + " # Write the markdown content to file.\n", + " with open(markdown_file_path, \"w\") as text_file:\n", + " text_file.write(markdown_content)\n", + "\n", + " # Convert markdown file to Word doc.\n", + " pypandoc.convert_file(\n", + " source_file=str(markdown_file_path),\n", + " to=\"docx\",\n", + " outputfile=str(docx_file_path),\n", + " extra_args=[],\n", + " )\n", + "\n", + " return docx_file_path\n", + "\n", + "\n", + "def execute_comparison(\n", + " df: pd.DataFrame,\n", + " method_a_name: str,\n", + " method_b_name: str,\n", + " method_a_priority_census_block_groups_field: str,\n", + " method_b_priority_census_tracts_field: str,\n", + " other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n", + ") -> pathlib.PosixPath:\n", + " \"\"\"Execute an individual comparison by creating the data frame and writing the report.\n", + "\n", + " Args:\n", + " df: a pandas dataframe including the data for this comparison.\n", + " method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n", + " community field that 
defines communities at the level of census block groups (CBGs).\n", + " method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n", + " community field that defines communities at the level of census tracts.\n", + " other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n", + "\n", + " Returns:\n", + " the path of the Word (.docx) comparison report written by this function\n", + "\n", + " \"\"\"\n", + " comparison_field_names = get_comparison_field_names(\n", + " method_a_name=method_a_name, method_b_name=method_b_name\n", + " )\n", + "\n", + " # Create or use a directory for outputs grouped by Method A.\n", + " output_dir = COMPARISON_OUTPUTS_DIR / method_a_name\n", + " output_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " df_with_only_shared_states = get_df_with_only_shared_states(\n", + " df=df,\n", + " field_a=method_a_priority_census_block_groups_field,\n", + " field_b=method_b_priority_census_tracts_field,\n", + " )\n", + "\n", + " comparison_df = get_comparison_df(\n", + " df=df_with_only_shared_states,\n", + " method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n", + " method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n", + " comparison_field_names=comparison_field_names,\n", + " other_census_tract_fields_to_keep=other_census_tract_fields_to_keep,\n", + " output_dir=output_dir,\n", + " )\n", + "\n", + " # Choose output path\n", + " file_path = (\n", + " output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", + " )\n", + "\n", + " # Write comparison to CSV.\n", + " comparison_df.to_csv(\n", + " path_or_buf=file_path,\n", + " na_rep=\"\",\n", + " index=False,\n", + " )\n", + "\n", + " markdown_content = get_comparison_markdown_content(\n", + " original_df=df_with_only_shared_states,\n", + " comparison_df=comparison_df,\n", + " 
comparison_field_names=comparison_field_names,\n", + " method_a_name=method_a_name,\n", + " method_b_name=method_b_name,\n", + " method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n", + " method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n", + " )\n", + "\n", + " comparison_docx_file_path = write_markdown_and_docx_content(\n", + " markdown_content=markdown_content,\n", + " file_dir=output_dir,\n", + " file_name_without_extension=f\"Comparison report - {method_a_name} and {method_b_name}\",\n", + " )\n", + "\n", + " return comparison_docx_file_path\n", + "\n", + "\n", + "def execute_comparisons(\n", + " df: pd.DataFrame,\n", + " census_block_group_indices: typing.List[Index],\n", + " census_tract_indices: typing.List[Index],\n", + "):\n", + " \"\"\"Create multiple comparison reports.\"\"\"\n", + " comparison_docx_file_paths = []\n", + " for cbg_index in census_block_group_indices:\n", + " for census_tract_index in census_tract_indices:\n", + " print(\n", + " f\"Running comparisons for {cbg_index.method_name} against {census_tract_index.method_name}...\"\n", + " )\n", + "\n", + " comparison_docx_file_path = execute_comparison(\n", + " df=df,\n", + " method_a_name=cbg_index.method_name,\n", + " method_b_name=census_tract_index.method_name,\n", + " method_a_priority_census_block_groups_field=cbg_index.priority_communities_field,\n", + " method_b_priority_census_tracts_field=census_tract_index.priority_communities_field,\n", + " other_census_tract_fields_to_keep=census_tract_index.other_census_tract_fields_to_keep,\n", + " )\n", + "\n", + " comparison_docx_file_paths.append(comparison_docx_file_path)\n", + "\n", + " return comparison_docx_file_paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48d9bf6b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Actually execute the functions\n", + "\n", + "# # California only\n", + "# cal_df = 
merged_df[merged_df[GEOID_TRACT_FIELD_NAME].astype(str).str[0:2] == \"06\"]\n", + "# # cal_df = cal_df[0:1000]\n", + "# print(len(cal_df))\n", + "\n", + "census_block_group_indices = [\n", + " Index(\n", + " method_name=\"Score A\",\n", + " priority_communities_field=\"Score A (top 25th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " # Index(\n", + " # method_name=\"Score B\",\n", + " # priority_communities_field=\"Score B (top 25th percentile)\",\n", + " # other_census_tract_fields_to_keep=[],\n", + " # ),\n", + " Index(\n", + " method_name=\"Score C\",\n", + " priority_communities_field=\"Score C (top 25th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Score D\",\n", + " priority_communities_field=\"Score D (top 25th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " # Index(\n", + " # method_name=\"Score E\",\n", + " # priority_communities_field=\"Score E (top 25th percentile)\",\n", + " # other_census_tract_fields_to_keep=[],\n", + " # ),\n", + "]\n", + "\n", + "census_tract_indices = [\n", + " Index(\n", + " method_name=\"CalEnviroScreen 4.0\",\n", + " priority_communities_field=\"calenviroscreen_priority_community\",\n", + " other_census_tract_fields_to_keep=[\n", + " CALENVIROSCREEN_SCORE_FIELD,\n", + " CALENVIROSCREEN_PERCENTILE_FIELD,\n", + " ],\n", + " ),\n", + " Index(\n", + " method_name=\"HUD RECAP\",\n", + " priority_communities_field=\"hud_recap_priority_community\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + "]\n", + "\n", + "file_paths = execute_comparisons(\n", + " df=merged_df,\n", + " census_block_group_indices=census_block_group_indices,\n", + " census_tract_indices=census_tract_indices,\n", + ")\n", + "\n", + "print(file_paths)" ] } ], diff --git a/score/requirements.txt b/score/requirements.txt index a3e543b4..3fd8170b 100644 Binary files a/score/requirements.txt and b/score/requirements.txt differ 
diff --git a/score/utils.py b/score/utils.py index 34b4beaa..dea5a3a6 100644 --- a/score/utils.py +++ b/score/utils.py @@ -74,3 +74,1014 @@ def unzip_file_from_url( # cleanup temporary file os.remove(zip_file_path) + +def get_excel_column_name(index: int) -> str: + """This is used to map a numeric index to the appropriate column in Excel. + + E.g., column #95 is "CR". + + Only works for the first 1000 columns. + """ + excel_column_names = [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "AA", + "AB", + "AC", + "AD", + "AE", + "AF", + "AG", + "AH", + "AI", + "AJ", + "AK", + "AL", + "AM", + "AN", + "AO", + "AP", + "AQ", + "AR", + "AS", + "AT", + "AU", + "AV", + "AW", + "AX", + "AY", + "AZ", + "BA", + "BB", + "BC", + "BD", + "BE", + "BF", + "BG", + "BH", + "BI", + "BJ", + "BK", + "BL", + "BM", + "BN", + "BO", + "BP", + "BQ", + "BR", + "BS", + "BT", + "BU", + "BV", + "BW", + "BX", + "BY", + "BZ", + "CA", + "CB", + "CC", + "CD", + "CE", + "CF", + "CG", + "CH", + "CI", + "CJ", + "CK", + "CL", + "CM", + "CN", + "CO", + "CP", + "CQ", + "CR", + "CS", + "CT", + "CU", + "CV", + "CW", + "CX", + "CY", + "CZ", + "DA", + "DB", + "DC", + "DD", + "DE", + "DF", + "DG", + "DH", + "DI", + "DJ", + "DK", + "DL", + "DM", + "DN", + "DO", + "DP", + "DQ", + "DR", + "DS", + "DT", + "DU", + "DV", + "DW", + "DX", + "DY", + "DZ", + "EA", + "EB", + "EC", + "ED", + "EE", + "EF", + "EG", + "EH", + "EI", + "EJ", + "EK", + "EL", + "EM", + "EN", + "EO", + "EP", + "EQ", + "ER", + "ES", + "ET", + "EU", + "EV", + "EW", + "EX", + "EY", + "EZ", + "FA", + "FB", + "FC", + "FD", + "FE", + "FF", + "FG", + "FH", + "FI", + "FJ", + "FK", + "FL", + "FM", + "FN", + "FO", + "FP", + "FQ", + "FR", + "FS", + "FT", + "FU", + "FV", + "FW", + "FX", + "FY", + "FZ", + "GA", + "GB", + "GC", + "GD", + "GE", + "GF", + "GG", + "GH", + "GI", + "GJ", + "GK", + "GL", + "GM", + "GN", + "GO", + 
"GP", + "GQ", + "GR", + "GS", + "GT", + "GU", + "GV", + "GW", + "GX", + "GY", + "GZ", + "HA", + "HB", + "HC", + "HD", + "HE", + "HF", + "HG", + "HH", + "HI", + "HJ", + "HK", + "HL", + "HM", + "HN", + "HO", + "HP", + "HQ", + "HR", + "HS", + "HT", + "HU", + "HV", + "HW", + "HX", + "HY", + "HZ", + "IA", + "IB", + "IC", + "ID", + "IE", + "IF", + "IG", + "IH", + "II", + "IJ", + "IK", + "IL", + "IM", + "IN", + "IO", + "IP", + "IQ", + "IR", + "IS", + "IT", + "IU", + "IV", + "IW", + "IX", + "IY", + "IZ", + "JA", + "JB", + "JC", + "JD", + "JE", + "JF", + "JG", + "JH", + "JI", + "JJ", + "JK", + "JL", + "JM", + "JN", + "JO", + "JP", + "JQ", + "JR", + "JS", + "JT", + "JU", + "JV", + "JW", + "JX", + "JY", + "JZ", + "KA", + "KB", + "KC", + "KD", + "KE", + "KF", + "KG", + "KH", + "KI", + "KJ", + "KK", + "KL", + "KM", + "KN", + "KO", + "KP", + "KQ", + "KR", + "KS", + "KT", + "KU", + "KV", + "KW", + "KX", + "KY", + "KZ", + "LA", + "LB", + "LC", + "LD", + "LE", + "LF", + "LG", + "LH", + "LI", + "LJ", + "LK", + "LL", + "LM", + "LN", + "LO", + "LP", + "LQ", + "LR", + "LS", + "LT", + "LU", + "LV", + "LW", + "LX", + "LY", + "LZ", + "MA", + "MB", + "MC", + "MD", + "ME", + "MF", + "MG", + "MH", + "MI", + "MJ", + "MK", + "ML", + "MM", + "MN", + "MO", + "MP", + "MQ", + "MR", + "MS", + "MT", + "MU", + "MV", + "MW", + "MX", + "MY", + "MZ", + "NA", + "NB", + "NC", + "ND", + "NE", + "NF", + "NG", + "NH", + "NI", + "NJ", + "NK", + "NL", + "NM", + "NN", + "NO", + "NP", + "NQ", + "NR", + "NS", + "NT", + "NU", + "NV", + "NW", + "NX", + "NY", + "NZ", + "OA", + "OB", + "OC", + "OD", + "OE", + "OF", + "OG", + "OH", + "OI", + "OJ", + "OK", + "OL", + "OM", + "ON", + "OO", + "OP", + "OQ", + "OR", + "OS", + "OT", + "OU", + "OV", + "OW", + "OX", + "OY", + "OZ", + "PA", + "PB", + "PC", + "PD", + "PE", + "PF", + "PG", + "PH", + "PI", + "PJ", + "PK", + "PL", + "PM", + "PN", + "PO", + "PP", + "PQ", + "PR", + "PS", + "PT", + "PU", + "PV", + "PW", + "PX", + "PY", + "PZ", + "QA", + "QB", + "QC", + "QD", + "QE", + 
"QF", + "QG", + "QH", + "QI", + "QJ", + "QK", + "QL", + "QM", + "QN", + "QO", + "QP", + "QQ", + "QR", + "QS", + "QT", + "QU", + "QV", + "QW", + "QX", + "QY", + "QZ", + "RA", + "RB", + "RC", + "RD", + "RE", + "RF", + "RG", + "RH", + "RI", + "RJ", + "RK", + "RL", + "RM", + "RN", + "RO", + "RP", + "RQ", + "RR", + "RS", + "RT", + "RU", + "RV", + "RW", + "RX", + "RY", + "RZ", + "SA", + "SB", + "SC", + "SD", + "SE", + "SF", + "SG", + "SH", + "SI", + "SJ", + "SK", + "SL", + "SM", + "SN", + "SO", + "SP", + "SQ", + "SR", + "SS", + "ST", + "SU", + "SV", + "SW", + "SX", + "SY", + "SZ", + "TA", + "TB", + "TC", + "TD", + "TE", + "TF", + "TG", + "TH", + "TI", + "TJ", + "TK", + "TL", + "TM", + "TN", + "TO", + "TP", + "TQ", + "TR", + "TS", + "TT", + "TU", + "TV", + "TW", + "TX", + "TY", + "TZ", + "UA", + "UB", + "UC", + "UD", + "UE", + "UF", + "UG", + "UH", + "UI", + "UJ", + "UK", + "UL", + "UM", + "UN", + "UO", + "UP", + "UQ", + "UR", + "US", + "UT", + "UU", + "UV", + "UW", + "UX", + "UY", + "UZ", + "VA", + "VB", + "VC", + "VD", + "VE", + "VF", + "VG", + "VH", + "VI", + "VJ", + "VK", + "VL", + "VM", + "VN", + "VO", + "VP", + "VQ", + "VR", + "VS", + "VT", + "VU", + "VV", + "VW", + "VX", + "VY", + "VZ", + "WA", + "WB", + "WC", + "WD", + "WE", + "WF", + "WG", + "WH", + "WI", + "WJ", + "WK", + "WL", + "WM", + "WN", + "WO", + "WP", + "WQ", + "WR", + "WS", + "WT", + "WU", + "WV", + "WW", + "WX", + "WY", + "WZ", + "XA", + "XB", + "XC", + "XD", + "XE", + "XF", + "XG", + "XH", + "XI", + "XJ", + "XK", + "XL", + "XM", + "XN", + "XO", + "XP", + "XQ", + "XR", + "XS", + "XT", + "XU", + "XV", + "XW", + "XX", + "XY", + "XZ", + "YA", + "YB", + "YC", + "YD", + "YE", + "YF", + "YG", + "YH", + "YI", + "YJ", + "YK", + "YL", + "YM", + "YN", + "YO", + "YP", + "YQ", + "YR", + "YS", + "YT", + "YU", + "YV", + "YW", + "YX", + "YY", + "YZ", + "ZA", + "ZB", + "ZC", + "ZD", + "ZE", + "ZF", + "ZG", + "ZH", + "ZI", + "ZJ", + "ZK", + "ZL", + "ZM", + "ZN", + "ZO", + "ZP", + "ZQ", + "ZR", + "ZS", + "ZT", + "ZU", + 
"ZV", + "ZW", + "ZX", + "ZY", + "ZZ", + "AAA", + "AAB", + "AAC", + "AAD", + "AAE", + "AAF", + "AAG", + "AAH", + "AAI", + "AAJ", + "AAK", + "AAL", + "AAM", + "AAN", + "AAO", + "AAP", + "AAQ", + "AAR", + "AAS", + "AAT", + "AAU", + "AAV", + "AAW", + "AAX", + "AAY", + "AAZ", + "ABA", + "ABB", + "ABC", + "ABD", + "ABE", + "ABF", + "ABG", + "ABH", + "ABI", + "ABJ", + "ABK", + "ABL", + "ABM", + "ABN", + "ABO", + "ABP", + "ABQ", + "ABR", + "ABS", + "ABT", + "ABU", + "ABV", + "ABW", + "ABX", + "ABY", + "ABZ", + "ACA", + "ACB", + "ACC", + "ACD", + "ACE", + "ACF", + "ACG", + "ACH", + "ACI", + "ACJ", + "ACK", + "ACL", + "ACM", + "ACN", + "ACO", + "ACP", + "ACQ", + "ACR", + "ACS", + "ACT", + "ACU", + "ACV", + "ACW", + "ACX", + "ACY", + "ACZ", + "ADA", + "ADB", + "ADC", + "ADD", + "ADE", + "ADF", + "ADG", + "ADH", + "ADI", + "ADJ", + "ADK", + "ADL", + "ADM", + "ADN", + "ADO", + "ADP", + "ADQ", + "ADR", + "ADS", + "ADT", + "ADU", + "ADV", + "ADW", + "ADX", + "ADY", + "ADZ", + "AEA", + "AEB", + "AEC", + "AED", + "AEE", + "AEF", + "AEG", + "AEH", + "AEI", + "AEJ", + "AEK", + "AEL", + "AEM", + "AEN", + "AEO", + "AEP", + "AEQ", + "AER", + "AES", + "AET", + "AEU", + "AEV", + "AEW", + "AEX", + "AEY", + "AEZ", + "AFA", + "AFB", + "AFC", + "AFD", + "AFE", + "AFF", + "AFG", + "AFH", + "AFI", + "AFJ", + "AFK", + "AFL", + "AFM", + "AFN", + "AFO", + "AFP", + "AFQ", + "AFR", + "AFS", + "AFT", + "AFU", + "AFV", + "AFW", + "AFX", + "AFY", + "AFZ", + "AGA", + "AGB", + "AGC", + "AGD", + "AGE", + "AGF", + "AGG", + "AGH", + "AGI", + "AGJ", + "AGK", + "AGL", + "AGM", + "AGN", + "AGO", + "AGP", + "AGQ", + "AGR", + "AGS", + "AGT", + "AGU", + "AGV", + "AGW", + "AGX", + "AGY", + "AGZ", + "AHA", + "AHB", + "AHC", + "AHD", + "AHE", + "AHF", + "AHG", + "AHH", + "AHI", + "AHJ", + "AHK", + "AHL", + "AHM", + "AHN", + "AHO", + "AHP", + "AHQ", + "AHR", + "AHS", + "AHT", + "AHU", + "AHV", + "AHW", + "AHX", + "AHY", + "AHZ", + "AIA", + "AIB", + "AIC", + "AID", + "AIE", + "AIF", + "AIG", + "AIH", + "AII", + "AIJ", 
+ "AIK", + "AIL", + "AIM", + "AIN", + "AIO", + "AIP", + "AIQ", + "AIR", + "AIS", + "AIT", + "AIU", + "AIV", + "AIW", + "AIX", + "AIY", + "AIZ", + "AJA", + "AJB", + "AJC", + "AJD", + "AJE", + "AJF", + "AJG", + "AJH", + "AJI", + "AJJ", + "AJK", + "AJL", + "AJM", + "AJN", + "AJO", + "AJP", + "AJQ", + "AJR", + "AJS", + "AJT", + "AJU", + "AJV", + "AJW", + "AJX", + "AJY", + "AJZ", + "AKA", + "AKB", + "AKC", + "AKD", + "AKE", + "AKF", + "AKG", + "AKH", + "AKI", + "AKJ", + "AKK", + "AKL", + "AKM", + "AKN", + "AKO", + "AKP", + "AKQ", + "AKR", + "AKS", + "AKT", + "AKU", + "AKV", + "AKW", + "AKX", + "AKY", + "AKZ", + "ALA", + "ALB", + "ALC", + "ALD", + "ALE", + "ALF", + "ALG", + "ALH", + "ALI", + "ALJ", + "ALK", + ] + + return excel_column_names[index]