From a2a321d93dacfe4ad5b5f00bf23a44f5afe6562f Mon Sep 17 00:00:00 2001 From: Lucas Merrill Brown Date: Tue, 22 Jun 2021 08:57:59 -0700 Subject: [PATCH] Score comparison tool, first draft (#140) --- .gitattributes | 3 - score/.vscode/settings.json | 3 + score/README.md | 11 + score/ipython/score_calc_0.1.ipynb | 324 ++++++++- score/ipython/scoring_comparison.ipynb | 907 +++++++++++++++++++++++++ score/ipython/test.ipynb | 133 ---- score/requirements.txt | 1 + 7 files changed, 1226 insertions(+), 156 deletions(-) delete mode 100644 .gitattributes create mode 100644 score/.vscode/settings.json create mode 100644 score/ipython/scoring_comparison.ipynb delete mode 100644 score/ipython/test.ipynb diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 9512cafa..00000000 --- a/.gitattributes +++ /dev/null @@ -1,3 +0,0 @@ -* text=auto -*.sh text eol=lf -*.conf text eol=lf diff --git a/score/.vscode/settings.json b/score/.vscode/settings.json new file mode 100644 index 00000000..84a192ce --- /dev/null +++ b/score/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "venv\\Scripts\\python.exe" +} \ No newline at end of file diff --git a/score/README.md b/score/README.md index 1a67e3e0..2c945617 100644 --- a/score/README.md +++ b/score/README.md @@ -18,6 +18,17 @@ - Activate your virtualenv (see above) - Type `jupyter notebook`. Your browser should open with a Jupyter Notebook tab +## Activating variable-enabled Markdown for Jupyter notebooks + +- Change to this directory (i.e. `cd score`) +- Run `jupyter contrib nbextension install --user` +- Run `jupyter nbextension enable python-markdown/main` +- Make sure you've loaded the Jupyter notebook in a "Trusted" state. (See button near + top right of Notebook screen.) + +For more information, see [nbextensions docs](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html) and +see [python-markdown docs](https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tree/master/src/jupyter_contrib_nbextensions/nbextensions/python-markdown). + ## Downloading Census Block Groups GeoJSON and Generating CBG CSVs - Make sure you have Docker running in your machine diff --git a/score/ipython/score_calc_0.1.ipynb b/score/ipython/score_calc_0.1.ipynb index 7e722062..01cd19b2 100644 --- a/score/ipython/score_calc_0.1.ipynb +++ b/score/ipython/score_calc_0.1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "a664f981", "metadata": {}, "outputs": [], @@ -18,10 +18,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "7df430cb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDACSTOTPOPLESSHSPCTLOWINCPCT
00100102010016360.2081340.385220
101001020100212870.0406780.163170
20100102020018100.1355630.501247
301001020200212180.1920000.393701
401001020300126410.1254730.308217
\n", + "
" + ], + "text/plain": [ + " ID ACSTOTPOP LESSHSPCT LOWINCPCT\n", + "0 010010201001 636 0.208134 0.385220\n", + "1 010010201002 1287 0.040678 0.163170\n", + "2 010010202001 810 0.135563 0.501247\n", + "3 010010202002 1218 0.192000 0.393701\n", + "4 010010203001 2641 0.125473 0.308217" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# EJSCreen csv Load\n", "ejscreen_csv = data_path / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n", @@ -31,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "27677132", "metadata": { "scrolled": true @@ -45,37 +126,182 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "1f7b864f", - "metadata": {}, - "outputs": [], + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDACSTOTPOPLESSHSPCTLOWINCPCTlesshs_percentilelowin_percentilescore_ascore_bscore_a_percentilescore_b_percentilescore_a_top_percentile_25score_b_top_percentile_25
00100102010016360.2081340.3852200.7932920.6250150.7091540.4958200.7395400.743311FalseFalse
101001020100212870.0406780.1631700.2385500.2467220.2426360.0588560.2068050.249590FalseFalse
20100102020018100.1355630.5012470.6343900.7720020.7031960.4897500.7330090.738859FalseFalse
301001020200212180.1920000.3937010.7651260.6371580.7011420.4875060.7308480.737357FalseFalse
401001020300126410.1254730.3082170.6038410.5049770.5544090.3049250.5685710.586058FalseFalse
\n", + "
" + ], + "text/plain": [ + " ID ACSTOTPOP LESSHSPCT LOWINCPCT lesshs_percentile \\\n", + "0 010010201001 636 0.208134 0.385220 0.793292 \n", + "1 010010201002 1287 0.040678 0.163170 0.238550 \n", + "2 010010202001 810 0.135563 0.501247 0.634390 \n", + "3 010010202002 1218 0.192000 0.393701 0.765126 \n", + "4 010010203001 2641 0.125473 0.308217 0.603841 \n", + "\n", + " lowin_percentile score_a score_b score_a_percentile \\\n", + "0 0.625015 0.709154 0.495820 0.739540 \n", + "1 0.246722 0.242636 0.058856 0.206805 \n", + "2 0.772002 0.703196 0.489750 0.733009 \n", + "3 0.637158 0.701142 0.487506 0.730848 \n", + "4 0.504977 0.554409 0.304925 0.568571 \n", + "\n", + " score_b_percentile score_a_top_percentile_25 score_b_top_percentile_25 \n", + "0 0.743311 False False \n", + "1 0.249590 False False \n", + "2 0.738859 False False \n", + "3 0.737357 False False \n", + "4 0.586058 False False " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# calculate scores\n", - "df['score_a'] = df[['lesshs_percentile', 'lowin_percentile']].mean(axis=1)\n", - "df['score_b'] = df.lesshs_percentile * df.lowin_percentile\n", + "df[\"score_a\"] = df[[\"lesshs_percentile\", \"lowin_percentile\"]].mean(axis=1)\n", + "df[\"score_b\"] = df.lesshs_percentile * df.lowin_percentile\n", "\n", "# Create percentiles for the scores \n", - "df['score_a_percentile'] = df.score_a.rank(pct = True)\n", - "df['score_b_percentile'] = df.score_b.rank(pct = True)\n", - "df['score_a_top_percentile_25'] = df['score_a_percentile'] >= 0.75\n", - "df['score_b_top_percentile_25'] = df['score_b_percentile'] >= 0.75\n", + "df[\"score_a_percentile\"] = df.score_a.rank(pct = True)\n", + "df[\"score_b_percentile\"] = df.score_b.rank(pct = True)\n", + "df[\"score_a_top_percentile_25\"] = df[\"score_a_percentile\"] >= 0.75\n", + "df[\"score_b_top_percentile_25\"] = df[\"score_b_percentile\"] >= 0.75\n", "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "91755bcf", "metadata": {}, "outputs": [], "source": [ "# strip calculations\n", - "df = df[[\"ID\", \"score_a_percentile\", \"score_b_percentile\",\"score_a_top_percentile_25\",\"score_b_top_percentile_25\"]]" + "df = df[[\"ID\", \"ACSTOTPOP\", \"score_a\",\"score_b\", \"score_a_percentile\", \"score_b_percentile\",\"score_a_top_percentile_25\",\"score_b_top_percentile_25\"]]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "b3a65af4", "metadata": {}, "outputs": [], @@ -86,10 +312,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "58ddd8b3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating data01 csv\n", + "Generating data02 csv\n", + "Generating data04 csv\n", + "Generating data05 csv\n", + "Generating data06 csv\n", + "Generating data08 csv\n", + "Generating data09 csv\n", + "Generating data10 csv\n", + "Generating data11 csv\n", + "Generating data12 csv\n", + "Generating data13 csv\n", + "Generating data15 csv\n", + "Generating data16 csv\n", + "Generating data17 csv\n", + "Generating data18 csv\n", + "Generating data19 csv\n", + "Generating data20 csv\n", + "Generating data21 csv\n", + "Generating data22 csv\n", + "Generating data23 csv\n", + "Generating data24 csv\n", + "Generating data25 csv\n", + "Generating data26 csv\n", + "Generating data27 csv\n", + "Generating data28 csv\n", + "Generating data29 csv\n", + "Generating data30 csv\n", + "Generating data31 csv\n", + "Generating data32 csv\n", + "Generating data33 csv\n", + "Generating data34 csv\n", + "Generating data35 csv\n", + "Generating data36 csv\n", + "Generating data37 csv\n", + "Generating data38 csv\n", + "Generating data39 csv\n", + "Generating data40 csv\n", + "Generating data41 csv\n", + "Generating data42 csv\n", + "Generating data44 csv\n", + "Generating data45 csv\n", + "Generating data46 csv\n", + "Generating data47 csv\n", + "Generating data48 csv\n", + "Generating data49 csv\n", + "Generating data50 csv\n", + "Generating data51 csv\n", + "Generating data53 csv\n", + "Generating data54 csv\n", + "Generating data55 csv\n", + "Generating data56 csv\n" + ] + } + ], "source": [ "# write per state csvs\n", "with open(fips_csv_path) as csv_file:\n", @@ -110,7 +394,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bce50823", + "id": "e545623b", "metadata": {}, "outputs": [], "source": [] @@ -132,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.9.0" } }, "nbformat": 4, diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb new file mode 100644 index 00000000..1b6b401c --- /dev/null +++ b/score/ipython/scoring_comparison.ipynb @@ -0,0 +1,907 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "54615cef", + "metadata": {}, + "outputs": [], + "source": [ + "# Before running this script as it currently stands, you'll need to run two notebooks:\n", + "# 1. ejscreen_etl.ipynb\n", + "# 2. score_calc_0.1.ipynb\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import requests\n", + "import zipfile" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "49a63129", + "metadata": {}, + "outputs": [], + "source": [ + "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "\n", + "# Set some global parameters\n", + "DATA_DIR = Path.cwd().parent / \"data\"\n", + "TEMP_DATA_DIR = Path.cwd().parent / \"data\" / \"tmp\"\n", + "# None of these numbers are final, but just for the purposes of comparison.\n", + "CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75\n", + "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n", + "\n", + "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n", + "# and introducing the risk of misspelling the field name.)\n", + "CENSUS_BLOCK_GROUP_ID_FIELD = \"census_block_group_id\"\n", + "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"census_block_group_population\"\n", + "CENSUS_TRACT_ID_FIELD = \"census_tract_id\"\n", + "CALENVIROSCREEN_SCORE_FIELD = \"calenviroscreen_score\"\n", + "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n", + "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n", + "\n", + "# Note: we are pretending the EJSCREEN's low income percent is the actual score for now as a placeholder.\n", + "CEJST_SCORE_FIELD = \"cejst_score\"\n", + "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", + "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", + "\n", + "# Comparison field names\n", + "tract_has_at_least_one_cbg = \"CES Tract has at least one CEJST CBG?\"\n", + "tract_has_100_percent_cbg = \"CES Tract has 100% CEJST CBGs?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2b26dccf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
census_block_group_idcensus_block_group_populationcejst_scorescore_bcejst_percentilescore_b_percentilescore_a_top_percentile_25score_b_top_percentile_25cejst_priority_communitycensus_tract_id
102976001400100131150.140.020.100.14FalseFalseFalse6001400100
102986001400200110370.090.010.050.07FalseFalseFalse6001400200
10299600140020029880.150.020.110.12FalseFalseFalse6001400200
103006001400300111370.030.000.010.02FalseFalseFalse6001400300
103016001400300214040.340.090.310.31FalseFalseFalse6001400300
\n", + "
" + ], + "text/plain": [ + " census_block_group_id census_block_group_population cejst_score \\\n", + "10297 60014001001 3115 0.14 \n", + "10298 60014002001 1037 0.09 \n", + "10299 60014002002 988 0.15 \n", + "10300 60014003001 1137 0.03 \n", + "10301 60014003002 1404 0.34 \n", + "\n", + " score_b cejst_percentile score_b_percentile \\\n", + "10297 0.02 0.10 0.14 \n", + "10298 0.01 0.05 0.07 \n", + "10299 0.02 0.11 0.12 \n", + "10300 0.00 0.01 0.02 \n", + "10301 0.09 0.31 0.31 \n", + "\n", + " score_a_top_percentile_25 score_b_top_percentile_25 \\\n", + "10297 False False \n", + "10298 False False \n", + "10299 False False \n", + "10300 False False \n", + "10301 False False \n", + "\n", + " cejst_priority_community census_tract_id \n", + "10297 False 6001400100 \n", + "10298 False 6001400200 \n", + "10299 False 6001400200 \n", + "10300 False 6001400300 \n", + "10301 False 6001400300 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load CEJST score data\n", + "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n", + "\n", + "cejst_df = pd.read_csv(cejst_data_path)\n", + "\n", + "cejst_df.head()\n", + "\n", + "# Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n", + "cejst_df.rename(\n", + " columns={\n", + " \"ID\": CENSUS_BLOCK_GROUP_ID_FIELD,\n", + " \"ACSTOTPOP\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", + " \"score_a\": CEJST_SCORE_FIELD,\n", + " \"score_a_percentile\": CEJST_PERCENTILE_FIELD,\n", + " },\n", + " inplace=True,\n", + " errors=\"raise\",\n", + ")\n", + "\n", + "# Calculate the top K% of prioritized communities\n", + "cejst_df[CEJST_PRIORITY_COMMUNITY_FIELD] = (\n", + " cejst_df[CEJST_PERCENTILE_FIELD] >= CEJST_PRIORITY_COMMUNITY_THRESHOLD\n", + ")\n", + "\n", + "# Create the CBG's Census Tract ID by dropping the last number from the FIPS CODE of the CBG.\n", + "# The CBG ID is the last one character.\n", + "# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n", + "cejst_df.loc[:, CENSUS_TRACT_ID_FIELD] = (\n", + " cejst_df.loc[:, CENSUS_BLOCK_GROUP_ID_FIELD].astype(str).str[:-1].astype(np.int64)\n", + ")\n", + "\n", + "# Remove all non-California data\n", + "cejst_df = cejst_df.loc[\n", + " cejst_df[CENSUS_BLOCK_GROUP_ID_FIELD].astype(str).str[0] == \"6\", :\n", + "]\n", + "\n", + "cejst_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ec6b27e3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'justice40-data.s3.amazonaws.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:\n", + "# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip\n", + "\n", + "download = requests.get(\"https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip\", verify=False)\n", + "file_contents = download.content\n", + "zip_file_path = TEMP_DATA_DIR\n", + "zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n", + "zip_file.write(file_contents)\n", + "zip_file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "bdf08971", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C:\\opt\\justice40-tool\\score\\data\\tmp\n" + ] + } + ], + "source": [ + "# Extract zip\n", + "print(zip_file_path)\n", + "with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n", + " zip_ref.extractall(zip_file_path)\n", + "calenviroscreen_4_csv_name = \"CalEnviroScreen_4.0_2021.csv\"\n", + "calenviroscreen_data_path = TEMP_DATA_DIR.joinpath(calenviroscreen_4_csv_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "29c14b29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
census_tract_idTotal PopulationCalifornia CountyZIPNearby City \\r\\n(to help approximate location only)LongitudeLatitudecalenviroscreen_scorecalenviroscreen_percentileDRAFT CES 4.0\\r\\nPercentile Range...PovertyPoverty PctlUnemploymentUnemployment PctlHousing BurdenHousing Burden PctlPop. Char.Pop. Char. ScorePop. Char. Pctlcalenviroscreen_priority_community
060190011002760Fresno93706Fresno-119.7836.7194.61100.0095-100% (highest scores)...76.6098.4316.2097.1530.7090.6193.739.7299.87True
160770007004177San Joaquin95206Stockton-121.2937.9490.8399.9995-100% (highest scores)...70.6096.4318.5098.4535.2095.6193.409.6899.84True
260770001004055San Joaquin95202Stockton-121.2937.9585.7599.9795-100% (highest scores)...81.8099.5017.9098.1736.4096.5195.719.9299.97True
360710016005527San Bernardino91761Ontario-117.6234.0683.5699.9695-100% (highest scores)...67.1094.826.7057.2032.1092.6580.598.3693.06True
460372049202639Los Angeles90023Los Angeles-118.2034.0282.9099.9595-100% (highest scores)...64.9093.515.6043.8125.0077.9583.958.7095.78True
\n", + "

5 rows × 59 columns

\n", + "
" + ], + "text/plain": [ + " census_tract_id Total Population California County ZIP \\\n", + "0 6019001100 2760 Fresno 93706 \n", + "1 6077000700 4177 San Joaquin 95206 \n", + "2 6077000100 4055 San Joaquin 95202 \n", + "3 6071001600 5527 San Bernardino 91761 \n", + "4 6037204920 2639 Los Angeles 90023 \n", + "\n", + " Nearby City \\r\\n(to help approximate location only) Longitude Latitude \\\n", + "0 Fresno -119.78 36.71 \n", + "1 Stockton -121.29 37.94 \n", + "2 Stockton -121.29 37.95 \n", + "3 Ontario -117.62 34.06 \n", + "4 Los Angeles -118.20 34.02 \n", + "\n", + " calenviroscreen_score calenviroscreen_percentile \\\n", + "0 94.61 100.00 \n", + "1 90.83 99.99 \n", + "2 85.75 99.97 \n", + "3 83.56 99.96 \n", + "4 82.90 99.95 \n", + "\n", + " DRAFT CES 4.0\\r\\nPercentile Range ... Poverty Poverty Pctl Unemployment \\\n", + "0 95-100% (highest scores) ... 76.60 98.43 16.20 \n", + "1 95-100% (highest scores) ... 70.60 96.43 18.50 \n", + "2 95-100% (highest scores) ... 81.80 99.50 17.90 \n", + "3 95-100% (highest scores) ... 67.10 94.82 6.70 \n", + "4 95-100% (highest scores) ... 64.90 93.51 5.60 \n", + "\n", + " Unemployment Pctl Housing Burden Housing Burden Pctl Pop. Char. \\\n", + "0 97.15 30.70 90.61 93.73 \n", + "1 98.45 35.20 95.61 93.40 \n", + "2 98.17 36.40 96.51 95.71 \n", + "3 57.20 32.10 92.65 80.59 \n", + "4 43.81 25.00 77.95 83.95 \n", + "\n", + " Pop. Char. Score Pop. Char. Pctl calenviroscreen_priority_community \n", + "0 9.72 99.87 True \n", + "1 9.68 99.84 True \n", + "2 9.92 99.97 True \n", + "3 8.36 93.06 True \n", + "4 8.70 95.78 True \n", + "\n", + "[5 rows x 59 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load comparison index (CalEnviroScreen 4)\n", + "\n", + "calenviroscreen_df = pd.read_csv(calenviroscreen_data_path)\n", + "\n", + "calenviroscreen_df.rename(\n", + " columns={\n", + " \"Census Tract\": CENSUS_TRACT_ID_FIELD,\n", + " \"DRAFT CES 4.0 Score\": CALENVIROSCREEN_SCORE_FIELD,\n", + " \"DRAFT CES 4.0 Percentile\": CALENVIROSCREEN_PERCENTILE_FIELD,\n", + " },\n", + " inplace=True,\n", + ")\n", + "\n", + "\n", + "# Calculate the top K% of prioritized communities\n", + "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = (\n", + " calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD]\n", + " >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD\n", + ")\n", + "\n", + "calenviroscreen_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "813e5656", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
census_block_group_idcensus_tract_idcensus_block_group_populationcejst_scorecejst_percentilecejst_priority_communitycalenviroscreen_scorecalenviroscreen_percentilecalenviroscreen_priority_community
060014001001600140010031150.140.10False4.402.38False
160014002001600140020010370.090.05False5.053.48False
26001400200260014002009880.150.11False5.053.48False
360014003001600140030011370.030.01False9.9213.44False
460014003002600140030014040.340.31False9.9213.44False
\n", + "
" + ], + "text/plain": [ + " census_block_group_id census_tract_id census_block_group_population \\\n", + "0 60014001001 6001400100 3115 \n", + "1 60014002001 6001400200 1037 \n", + "2 60014002002 6001400200 988 \n", + "3 60014003001 6001400300 1137 \n", + "4 60014003002 6001400300 1404 \n", + "\n", + " cejst_score cejst_percentile cejst_priority_community \\\n", + "0 0.14 0.10 False \n", + "1 0.09 0.05 False \n", + "2 0.15 0.11 False \n", + "3 0.03 0.01 False \n", + "4 0.34 0.31 False \n", + "\n", + " calenviroscreen_score calenviroscreen_percentile \\\n", + "0 4.40 2.38 \n", + "1 5.05 3.48 \n", + "2 5.05 3.48 \n", + "3 9.92 13.44 \n", + "4 9.92 13.44 \n", + "\n", + " calenviroscreen_priority_community \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Join CalEnviroScreen and CEJST data.\n", + "# Note: we're joining on the census *tract*, so there will be multiple CBG entries joined to the same census tract row from CES,\n", + "# creating multiple rows of the same CES data.\n", + "\n", + "# For simplicity, we'll only keep certain columns from each data frame.\n", + "cejst_columns_to_keep = [\n", + " CENSUS_BLOCK_GROUP_ID_FIELD,\n", + " CENSUS_TRACT_ID_FIELD,\n", + " CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", + " CEJST_SCORE_FIELD,\n", + " CEJST_PERCENTILE_FIELD,\n", + " CEJST_PRIORITY_COMMUNITY_FIELD,\n", + "]\n", + "\n", + "calenviroscreen_columns_to_keep = [\n", + " CENSUS_TRACT_ID_FIELD,\n", + " CALENVIROSCREEN_SCORE_FIELD,\n", + " CALENVIROSCREEN_PERCENTILE_FIELD,\n", + " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD,\n", + "]\n", + "\n", + "merged_df = cejst_df.loc[:, cejst_columns_to_keep].merge(\n", + " calenviroscreen_df.loc[:, calenviroscreen_columns_to_keep],\n", + " how=\"left\",\n", + " on=CENSUS_TRACT_ID_FIELD,\n", + ")\n", + "\n", + "merged_df.head()\n", + "\n", + "# merged_df.to_csv(\n", + "# path_or_buf=TEMP_DATA_DIR / \"merged.csv\",\n", + "# na_rep=\"\",\n", + "# index=False\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "939baea4", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " census_tract_id calenviroscreen_score \\\n", + "census_tract_id \n", + "6019001100 6019001100 94.61 \n", + "6077000700 6077000700 90.83 \n", + "6077000100 6077000100 85.75 \n", + "6071001600 6071001600 83.56 \n", + "6037204920 6037204920 82.90 \n", + "\n", + " calenviroscreen_percentile \\\n", + "census_tract_id \n", + "6019001100 100.00 \n", + "6077000700 99.99 \n", + "6077000100 99.97 \n", + "6071001600 99.96 \n", + "6037204920 99.95 \n", + "\n", + " calenviroscreen_priority_community \\\n", + "census_tract_id \n", + "6019001100 True \n", + "6077000700 True \n", + "6077000100 True \n", + "6071001600 True \n", + "6037204920 True \n", + "\n", + " CES Tract has at least one CEJST CBG? \\\n", + "census_tract_id \n", + "6019001100 True \n", + "6077000700 True \n", + "6077000100 True \n", + "6071001600 True \n", + "6037204920 True \n", + "\n", + " CES Tract has 100% CEJST CBGs? \n", + "census_tract_id \n", + "6019001100 True \n", + "6077000700 True \n", + "6077000100 True \n", + "6071001600 False \n", + "6037204920 True \n" + ] + } + ], + "source": [ + "# Create analysis\n", + "def calculate_comparison(frame):\n", + " # Keep all the CES values at the Census Tract Level\n", + " df = frame.loc[\n", + " frame.index[0],\n", + " [\n", + " CENSUS_TRACT_ID_FIELD,\n", + " CALENVIROSCREEN_SCORE_FIELD,\n", + " CALENVIROSCREEN_PERCENTILE_FIELD,\n", + " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD,\n", + " ],\n", + " ]\n", + "\n", + " # Convenience constant for whether the tract is or is not a CalEnviroScreen priority community.\n", + " is_a_ces_priority_tract = frame.loc[\n", + " frame.index[0], [CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD]\n", + " ][0]\n", + "\n", + " # Recall that NaN values are not falsy, so we need to check if `is_a_ces_priority_tract` is True.\n", + " is_a_ces_priority_tract = is_a_ces_priority_tract is True\n", + "\n", + " # Calculate comparison\n", + " df[tract_has_at_least_one_cbg] = (\n", + " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", + " if is_a_ces_priority_tract\n", + " else None\n", + " )\n", + " df[tract_has_100_percent_cbg] = (\n", + " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n", + " if is_a_ces_priority_tract\n", + " else None\n", + " )\n", + "\n", + " return df\n", + "\n", + "\n", + "# Group all data by the census tract.\n", + "grouped_df = merged_df.groupby(CENSUS_TRACT_ID_FIELD)\n", + "\n", + "# Run the comparison function on the groups.\n", + "comparison_df = grouped_df.apply(calculate_comparison)\n", + "\n", + "# Sort descending by highest CES Score for convenience when viewing output file\n", + "comparison_df.sort_values(\n", + " by=[CALENVIROSCREEN_PERCENTILE_FIELD], ascending=False, inplace=True\n", + ")\n", + "\n", + "# Write comparison to CSV.\n", + "comparison_df.to_csv(\n", + " path_or_buf=TEMP_DATA_DIR / \"Comparison Output.csv\", na_rep=\"\", index=False\n", + ")\n", + "\n", + "print(comparison_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "85709225", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Prepare some constants for use in the following Markdown cell.\n", + "\n", + "cejst_cbgs_ca_only = cejst_df.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum()\n", + "ces_tracts_count = comparison_df.loc[:, CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].sum()\n", + "at_least_one_sum = comparison_df.loc[:, tract_has_at_least_one_cbg].sum()\n", + "at_least_one_sum_percent = f\"{at_least_one_sum / ces_tracts_count:.0%}\"\n", + "\n", + "all_100_sum = comparison_df.loc[:, tract_has_100_percent_cbg].sum()\n", + "all_100_sum_percent = f\"{all_100_sum / ces_tracts_count:.0%}\"\n", + "\n", + "# Note, for the following Markdown cell to render the variables properly, follow the steps at\n", + "# \"Activating variable-enabled Markdown for Jupyter notebooks\" within `score/README.md`." + ] + }, + { + "cell_type": "markdown", + "id": "0c534966", + "metadata": { + "variables": { + "all_100_sum": "1168", + "all_100_sum_percent": "59%", + "at_least_one_sum": "1817", + "at_least_one_sum_percent": "92%", + "cejst_cbgs_ca_only": "6987", + "ces_tracts_count": "1983" + } + }, + "source": [ + "# Summary of findings\n", + "\n", + "Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n", + "\n", + "There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0. \n", + "\n", + "Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis.\n", + "\n", + "Out of every CalEnviroScreen Disadvantaged Community census tract, {{at_least_one_sum}} ({{at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", + "\n", + "Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db3c7d38", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/ipython/test.ipynb b/score/ipython/test.ipynb deleted file mode 100644 index da17a9c4..00000000 --- a/score/ipython/test.ipynb +++ /dev/null @@ -1,133 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "1a4c0c68", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "70b3a793", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('data/fips_states.csv') " - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "c514aad8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
fipsstate_name
01Alabama
12Alaska
24Arizona
35Arkansas
46California
\n", - "
" - ], - "text/plain": [ - " fips state_name\n", - "0 1 Alabama \n", - "1 2 Alaska \n", - "2 4 Arizona \n", - "3 5 Arkansas \n", - "4 6 California" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9ee44d9", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/score/requirements.txt b/score/requirements.txt index f3f609d7..87d00c77 100644 --- a/score/requirements.txt +++ b/score/requirements.txt @@ -1,5 +1,6 @@ ipython jupyter +jupyter_contrib_nbextensions numpy pandas requests