In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import csv
import sys
import os

# Put the parent of the working directory on the import path (once), so that
# modules located one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from etl.sources.census.etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, remove_all_from_dir

# Filesystem layout: everything lives under the `data` directory that sits next
# to the current working directory.
DATA_PATH = Path.cwd().parent.joinpath("data")
TMP_PATH = DATA_PATH.joinpath("tmp")
CSV_PATH = DATA_PATH.joinpath("dataset", "calenviroscreen4")

# Location of the zipped CalEnviroScreen 4.0 data. Despite the "FTP" in the
# constant's name, this is an HTTPS URL pointing at an S3 bucket.
CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip"

# Column names used in the processed data frame.
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = "calenviroscreen_priority_community"

# Percentile cutoff at or above which a tract is flagged as a priority community.
# This value is not final; it is chosen only for the purposes of comparison.
CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75

print(DATA_PATH)

In [None]:
# Fetch the zipped CalEnviroScreen 4.0 data from the S3 URL configured above.
# TMP_PATH is passed for both path arguments — presumably the download location
# and the extraction directory; confirm against `utils.unzip_file_from_url`.
unzip_file_from_url(CALENVIROSCREEN_FTP_URL, TMP_PATH, TMP_PATH)

In [None]:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
calenviroscreen_4_csv_name = "CalEnviroScreen_4.0_2021.csv"
calenviroscreen_data_path = TMP_PATH.joinpath(calenviroscreen_4_csv_name)

# A census tract GEOID is 11 digits: 2 (state) + 3 (county) + 6 (tract).
TRACT_GEOID_LENGTH = 11

# Load the comparison index (CalEnviroScreen 4) and rename the source columns to
# our field names. The tract ID is read as a string so it is never parsed as a
# number. `errors="raise"` makes the rename fail with a KeyError if an expected
# source column is absent, rather than silently writing out a frame that lacks
# the score or percentile column.
calenviroscreen_df = pd.read_csv(
    calenviroscreen_data_path, dtype={"Census Tract": "string"}
).rename(
    columns={
        "Census Tract": GEOID_TRACT_FIELD_NAME,
        "DRAFT CES 4.0 Score": CALENVIROSCREEN_SCORE_FIELD_NAME,
        "DRAFT CES 4.0 Percentile": CALENVIROSCREEN_PERCENTILE_FIELD_NAME,
    },
    errors="raise",
)

# Left-pad the tract ID with "0" to the full GEOID length so it matches the
# format used in our other data frames. Padding to a fixed width (instead of
# unconditionally prepending "0") gives the same result for 10-character IDs
# and leaves IDs that are already 11 characters long untouched.
calenviroscreen_df[GEOID_TRACT_FIELD_NAME] = calenviroscreen_df[
    GEOID_TRACT_FIELD_NAME
].str.zfill(TRACT_GEOID_LENGTH)

# Flag the prioritized communities: tracts whose percentile is at or above the
# configured threshold. Tracts with a missing percentile compare as False.
calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME] = (
    calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD_NAME]
    >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD
)

calenviroscreen_df.head()

In [None]:
# Persist the processed table as CSV.
# Make sure the output directory (and any missing parents) exists first.
CSV_PATH.mkdir(exist_ok=True, parents=True)

# The data covers a single state, so — matching the conventions of the other
# ETL scripts — the file is named after that state's FIPS code (06).
calenviroscreen_df.to_csv(CSV_PATH.joinpath("data06.csv"), index=False)

In [None]:
# Cleanup: clear out the temporary working directory used for the download.
# NOTE(review): presumably removes everything under TMP_PATH — confirm against
# `utils.remove_all_from_dir`.
remove_all_from_dir(TMP_PATH)