In [None]:
# Before running this notebook as it currently stands, you'll need to run two other notebooks first:
# 1. ejscreen_etl.ipynb
# 2. score_calc_0.1.ipynb

import numpy as np
import pandas as pd
from pathlib import Path
import requests
import zipfile
from datetime import datetime
from tqdm.notebook import tqdm_notebook

# Turn on TQDM for pandas so that we can have progress bars when running `apply`.
tqdm_notebook.pandas()

In [None]:
# Display floats with two fixed decimals rather than scientific notation
# (scientific notation otherwise shows up for census tract IDs).
pd.set_option("display.float_format", "{:.2f}".format)

# Global parameters: data locations, resolved relative to the parent of the working directory.
DATA_DIR = Path.cwd().parent / "data"
TEMP_DATA_DIR = DATA_DIR / "tmp"

# Priority-community cutoffs. Neither number is final; they exist only for the purposes of comparison.
# Each threshold is compared against its own index's percentile column.
CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75
CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75

# Field names are held in variables so that frequently used columns can be referenced
# without retyping string literals (and risking a misspelled field name).

# Census geography fields.
CENSUS_BLOCK_GROUP_ID_FIELD = "census_block_group_id"
CENSUS_BLOCK_GROUP_POPULATION_FIELD = "census_block_group_population"
CENSUS_TRACT_ID_FIELD = "census_tract_id"

# CalEnviroScreen fields.
CALENVIROSCREEN_SCORE_FIELD = "calenviroscreen_score"
CALENVIROSCREEN_PERCENTILE_FIELD = "calenviroscreen_percentile"
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = "calenviroscreen_priority_community"

# CEJST fields.
# Note: for now, EJSCREEN's low income percent stands in for the actual score as a placeholder.
CEJST_SCORE_FIELD = "cejst_score"
CEJST_PERCENTILE_FIELD = "cejst_percentile"
CEJST_PRIORITY_COMMUNITY_FIELD = "cejst_priority_community"

# Column names of the tract-level comparison output.
any_tract_has_at_least_one_cbg = "Tract has at least one CEJST CBG?"
tract_has_at_least_one_cbg = "CES Tract has at least one CEJST CBG?"
tract_has_100_percent_cbg = "CES Tract has 100% CEJST CBGs?"
non_ces_tract_has_at_least_one_cbg = "Non-CES Tract has at least one CEJST CBG?"
non_ces_tract_has_100_percent_cbg = "Non-CES Tract has 100% CEJST CBGs?"

In [None]:
# Load CEJST score data
cejst_data_path = DATA_DIR / "score" / "csv" / "usa.csv"

cejst_df = pd.read_csv(cejst_data_path)

# Rename the unclear column name "GEOID10" to "census_block_group_id", and give the
# population, score and percentile columns stable names as well.

score_used = "Score A"

cejst_df.rename(
    columns={
        "GEOID10": CENSUS_BLOCK_GROUP_ID_FIELD,
        "Total population": CENSUS_BLOCK_GROUP_POPULATION_FIELD,
        score_used: CEJST_SCORE_FIELD,
        f"{score_used} (percentile)": CEJST_PERCENTILE_FIELD,
    },
    inplace=True,
    # Fail immediately if the score file does not contain one of the expected columns.
    errors="raise",
)

# Calculate the top K% of prioritized communities
cejst_df[CEJST_PRIORITY_COMMUNITY_FIELD] = (
    cejst_df[CEJST_PERCENTILE_FIELD] >= CEJST_PRIORITY_COMMUNITY_THRESHOLD
)

# A census block group ID is 12 digits: 2 (state) + 3 (county) + 6 (tract) + 1 (block group).
# When the ID column is parsed as an integer, the leading zero of single-digit state codes
# (such as California's "06") is lost, so restore the full width before slicing the ID apart.
# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.
cbg_id_text = cejst_df[CENSUS_BLOCK_GROUP_ID_FIELD].astype(str).str.zfill(12)

# Create the CBG's Census Tract ID by dropping the last digit (the block group number) from the
# CBG ID. Stored as an integer, so the leading zero is dropped again here.
cejst_df[CENSUS_TRACT_ID_FIELD] = cbg_id_text.str[:-1].astype(np.int64)

# Remove all non-California data.
# Matching the full two-digit state code "06" (rather than a first character of "6" on the
# unpadded ID) ensures that IDs from state codes 60-69 can never be mistaken for California.
cejst_df = cejst_df.loc[cbg_id_text.str.startswith("06"), :].copy()

cejst_df.head()

In [None]:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip

# The archive is written into the temporary data directory; create it if it is missing.
zip_file_path = TEMP_DATA_DIR
zip_file_path.mkdir(parents=True, exist_ok=True)

# TLS certificate verification is left at its default (enabled), and a timeout keeps the cell
# from hanging forever if the server stops responding.
download = requests.get(
    "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip",
    timeout=60,
)
# Raise on HTTP errors so that an error page is never saved to disk as if it were the zip file.
download.raise_for_status()
file_contents = download.content

# The context manager closes the file even if the write fails.
with open(zip_file_path / "downloaded.zip", "wb") as zip_file:
    zip_file.write(file_contents)

In [None]:
# Unpack the downloaded archive next to it, then record where the extracted CSV is expected.
print(zip_file_path)
downloaded_archive_path = zip_file_path / "downloaded.zip"
with zipfile.ZipFile(downloaded_archive_path, mode="r") as zip_ref:
    zip_ref.extractall(path=zip_file_path)

calenviroscreen_4_csv_name = "CalEnviroScreen_4.0_2021.csv"
calenviroscreen_data_path = TEMP_DATA_DIR / calenviroscreen_4_csv_name

In [None]:
# Load comparison index (CalEnviroScreen 4)

calenviroscreen_df = pd.read_csv(calenviroscreen_data_path)

calenviroscreen_df.rename(
    columns={
        "Census Tract": CENSUS_TRACT_ID_FIELD,
        "DRAFT CES 4.0 Score": CALENVIROSCREEN_SCORE_FIELD,
        "DRAFT CES 4.0 Percentile": CALENVIROSCREEN_PERCENTILE_FIELD,
    },
    inplace=True,
    # Same policy as the CEJST rename: if the file's headers ever change, fail right here
    # with a clear message instead of with a KeyError further down the notebook.
    errors="raise",
)


# Calculate the top K% of prioritized communities
calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = (
    calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD]
    >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD
)

calenviroscreen_df.head()

In [None]:
# Join CalEnviroScreen and CEJST data.
# The join key is the census *tract*. CEJST has one row per CBG, so every CBG in a tract is
# matched to the same CES tract row and that tract's CES values are repeated on each CBG row.

# For simplicity, only a handful of columns are carried over from each data frame.
cejst_columns_to_keep = [
    CENSUS_BLOCK_GROUP_ID_FIELD,
    CENSUS_TRACT_ID_FIELD,
    CENSUS_BLOCK_GROUP_POPULATION_FIELD,
    CEJST_SCORE_FIELD,
    CEJST_PERCENTILE_FIELD,
    CEJST_PRIORITY_COMMUNITY_FIELD,
]

calenviroscreen_columns_to_keep = [
    CENSUS_TRACT_ID_FIELD,
    CALENVIROSCREEN_SCORE_FIELD,
    CALENVIROSCREEN_PERCENTILE_FIELD,
    CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD,
]

cejst_subset_df = cejst_df[cejst_columns_to_keep]
calenviroscreen_subset_df = calenviroscreen_df[calenviroscreen_columns_to_keep]

# Left join: every CEJST CBG row is kept, whether or not CES has a row for its tract.
merged_df = pd.merge(
    cejst_subset_df,
    calenviroscreen_subset_df,
    how="left",
    on=CENSUS_TRACT_ID_FIELD,
)

merged_df.head()

# merged_df.to_csv(
# path_or_buf=TEMP_DATA_DIR / "merged.csv",
# na_rep="",
# index=False
# )

In [None]:
# Create analysis
def calculate_comparison(frame):
    """Collapse the CBG rows of one census tract into a single tract-level comparison row.

    Args:
        frame: The rows of the merged CES/CEJST data frame for one census tract (as produced by
            grouping on the tract ID field). The CES columns are read from the first row only.

    Returns:
        A pandas Series holding the tract ID, the tract's CES score, percentile and priority
        flag, plus five comparison values:
          * whether the tract has at least one CEJST priority CBG (always set);
          * the "at least one" / "100%" answers for CES priority tracts (None otherwise);
          * the "at least one" / "100%" answers for non-CES-priority tracts (None otherwise).
    """
    first_row_label = frame.index[0]

    # Keep all the CES values at the Census Tract Level.
    tract_summary = frame.loc[
        first_row_label,
        [
            CENSUS_TRACT_ID_FIELD,
            CALENVIROSCREEN_SCORE_FIELD,
            CALENVIROSCREEN_PERCENTILE_FIELD,
            CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD,
        ],
    ].copy()

    # Whether the tract is a CalEnviroScreen priority community.
    # The value is read by label (not by position) and may be a Python bool, a NumPy bool, or
    # NaN when the tract had no CES match. An identity check (`is True`) is False for NumPy
    # booleans, and NaN is truthy, so rule out missing values first and then use truthiness.
    ces_priority_value = frame.loc[
        first_row_label, CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD
    ]
    is_a_ces_priority_tract = bool(
        pd.notna(ces_priority_value) and ces_priority_value
    )

    # Aggregate the CEJST priority flags of the tract's CBGs once and reuse the results below.
    cejst_priority_flags = frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD]
    has_at_least_one_cejst_cbg = cejst_priority_flags.sum() > 0
    has_only_cejst_cbgs = cejst_priority_flags.mean() == 1

    # Whether the tract (CES priority tract or not) includes CBGs that are priority
    # according to the current CEJST score.
    tract_summary[any_tract_has_at_least_one_cbg] = has_at_least_one_cejst_cbg

    # Calculate comparison
    # A CES priority tract has at least one CEJST priority CBG.
    tract_summary[tract_has_at_least_one_cbg] = (
        has_at_least_one_cejst_cbg if is_a_ces_priority_tract else None
    )

    # A CES priority tract has all of its contained CBGs as CEJST priority CBGs.
    tract_summary[tract_has_100_percent_cbg] = (
        has_only_cejst_cbgs if is_a_ces_priority_tract else None
    )

    # Calculate the inverse
    # A tract that is _not_ a CES priority has at least one CEJST priority CBG.
    tract_summary[non_ces_tract_has_at_least_one_cbg] = (
        has_at_least_one_cejst_cbg if not is_a_ces_priority_tract else None
    )

    # A tract that is _not_ a CES priority has all of its contained CBGs as CEJST priority CBGs.
    tract_summary[non_ces_tract_has_100_percent_cbg] = (
        has_only_cejst_cbgs if not is_a_ces_priority_tract else None
    )

    return tract_summary


# Bucket the merged CBG rows by the census tract they belong to.
grouped_df = merged_df.groupby(by=CENSUS_TRACT_ID_FIELD)

# Produce one comparison row per tract (with a progress bar while it runs).
comparison_df = grouped_df.progress_apply(calculate_comparison)

# Highest CES percentile first, which is convenient when viewing the output file.
comparison_df = comparison_df.sort_values(
    by=CALENVIROSCREEN_PERCENTILE_FIELD, ascending=False
)

# Save the comparison as a CSV in the temporary data directory.
comparison_output_path = TEMP_DATA_DIR / "Comparison Output.csv"
comparison_df.to_csv(comparison_output_path, na_rep="", index=False)

print(comparison_df.head())

In [None]:
# Prepare some constants for use in the following Markdown cell.


def _format_percent(numerator, denominator):
    """Return numerator / denominator as a whole-number percentage string.

    Returns "n/a" when the denominator is 0, so that an empty category (for example, no CES
    priority tracts at all) does not abort the summary with a division error.
    """
    if denominator == 0:
        return "n/a"
    return f"{numerator / denominator:.0%}"


# CEJST priority CBGs among all California CBGs.
total_cbgs_ca_only = len(cejst_df)
cejst_cbgs_ca_only = cejst_df.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum()
cejst_cbgs_ca_only_percent = _format_percent(cejst_cbgs_ca_only, total_cbgs_ca_only)

# CES priority tracts among all tracts in the comparison (one row per tract).
total_tracts_count = len(comparison_df)
ces_tracts_count = comparison_df.loc[:, CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].sum()
ces_tracts_count_percent = _format_percent(ces_tracts_count, total_tracts_count)
non_ces_tracts_count = total_tracts_count - ces_tracts_count

# Tracts containing at least one CEJST priority CBG.
cejst_tracts_count = comparison_df.loc[:, any_tract_has_at_least_one_cbg].sum()
cejst_tracts_count_percent = _format_percent(cejst_tracts_count, total_tracts_count)

# CES stats
at_least_one_sum = comparison_df.loc[:, tract_has_at_least_one_cbg].sum()
at_least_one_sum_percent = _format_percent(at_least_one_sum, ces_tracts_count)

all_100_sum = comparison_df.loc[:, tract_has_100_percent_cbg].sum()
all_100_sum_percent = _format_percent(all_100_sum, ces_tracts_count)

# Non-CES stats:
non_ces_at_least_one_sum = comparison_df.loc[
    :, non_ces_tract_has_at_least_one_cbg
].sum()
non_ces_at_least_one_sum_percent = _format_percent(
    non_ces_at_least_one_sum, non_ces_tracts_count
)

non_ces_all_100_sum = comparison_df.loc[:, non_ces_tract_has_100_percent_cbg].sum()
non_ces_all_100_sum_percent = _format_percent(
    non_ces_all_100_sum, non_ces_tracts_count
)

# Note, for the following Markdown cell to render the variables properly, follow the steps at
# "Activating variable-enabled Markdown for Jupyter notebooks" within `score/README.md`.

# Summary of findings for {{score_used}}

(Calculated on {{datetime.today().strftime('%Y-%m-%d')}})

Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.

There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0, out of {{total_tracts_count}} total tracts ({{ces_tracts_count_percent}}). 

Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis, out of {{total_cbgs_ca_only}} CBGs in the state ({{cejst_cbgs_ca_only_percent}}). These block groups fall within {{cejst_tracts_count}} ({{cejst_tracts_count_percent}}) of all the census tracts in California.

Of all CalEnviroScreen Disadvantaged Community census tracts, {{at_least_one_sum}} ({{at_least_one_sum_percent}}) have at least one census block group within them that is considered a priority community by the current version of the CEJST score.

Of all CalEnviroScreen Disadvantaged Community census tracts, {{all_100_sum}} ({{all_100_sum_percent}}) have 100% of their census block groups considered priority communities by the current version of the CEJST score.

Of all census tracts in California that are __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_at_least_one_sum}} ({{non_ces_at_least_one_sum_percent}}) have at least one census block group within them that is considered a priority community by the current version of the CEJST score.

Of all census tracts in California that are __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_all_100_sum}} ({{non_ces_all_100_sum_percent}}) have 100% of their census block groups considered priority communities by the current version of the CEJST score.