In [None]:
# Before running this notebook, you must run the notebook `ejscreen_etl.ipynb`.

import collections
from pathlib import Path
import pandas as pd
import csv

# Notebook-wide configuration.

# Names of the four first-level buckets that scored indicators are grouped into.
BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
BUCKET_SENSITIVE = "Sensitive populations"
BUCKET_ENVIRONMENTAL = "Environmental effects"
BUCKET_EXPOSURES = "Exposures"
BUCKETS = [BUCKET_SOCIOECONOMIC, BUCKET_SENSITIVE, BUCKET_ENVIRONMENTAL, BUCKET_EXPOSURES]

# Second aggregation level: the buckets above are themselves combined into
# these two groups.
AGGREGATION_POLLUTION = "Pollution Burden"
AGGREGATION_POPULATION = "Population Characteristics"

# Appended to a column name to form the name of its percentile-rank column.
PERCENTILE_FIELD_SUFFIX = " (percentile)"

# Input/output locations, resolved relative to the parent of the current
# working directory.
data_path = Path.cwd().parent.joinpath("data")
fips_csv_path = data_path.joinpath("fips_states_2010.csv")
score_csv_path = data_path.joinpath("score", "csv")

# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [None]:
# Load the nationwide EJSCREEN 2020 CSV.
ejscreen_csv = data_path.joinpath("dataset", "ejscreen_2020", "usa.csv")
df = pd.read_csv(
    ejscreen_csv,
    # Read the ID column as text rather than a number; later cells slice its
    # first two characters (presumably to preserve leading zeros -- confirm).
    dtype={"ID": "string"},
    low_memory=False,
)
df.head()

In [None]:
# Record type describing one input column:
#   input_field   -- column name in the EJSCREEN CSV
#   renamed_field -- human-readable name the column is renamed to
#   bucket        -- bucket the column contributes to, or None when the column
#                    is not used in the score
DataSet = collections.namedtuple("DataSet", ["input_field", "renamed_field", "bucket"])

data_sets = [
    DataSet(input_field, renamed_field, bucket)
    for input_field, renamed_field, bucket in [
        # Not used in the score, hence no bucket.
        # `ID` is renamed to `GEOID10` to enable geoplatform.gov's workflow.
        ("ID", "GEOID10", None),
        ("ACSTOTPOP", "Total population", None),
        # Exposures
        ("CANCER", "Air toxics cancer risk", BUCKET_EXPOSURES),
        ("RESP", "Respiratory hazard index", BUCKET_EXPOSURES),
        ("DSLPM", "Diesel particulate matter", BUCKET_EXPOSURES),
        ("PM25", "Particulate matter (PM2.5)", BUCKET_EXPOSURES),
        ("OZONE", "Ozone", BUCKET_EXPOSURES),
        ("PTRAF", "Traffic proximity and volume", BUCKET_EXPOSURES),
        # Environmental effects
        ("PRMP", "Proximity to RMP sites", BUCKET_ENVIRONMENTAL),
        ("PTSDF", "Proximity to TSDF sites", BUCKET_ENVIRONMENTAL),
        ("PNPL", "Proximity to NPL sites", BUCKET_ENVIRONMENTAL),
        ("PWDIS", "Wastewater discharge", BUCKET_ENVIRONMENTAL),
        (
            "PRE1960PCT",
            "Percent pre-1960s housing (lead paint indicator)",
            BUCKET_ENVIRONMENTAL,
        ),
        # Sensitive populations
        ("UNDER5PCT", "Individuals under 5 years old", BUCKET_SENSITIVE),
        ("OVER64PCT", "Individuals over 64 years old", BUCKET_SENSITIVE),
        # Socioeconomic factors
        (
            "LINGISOPCT",
            "Percent of households in linguistic isolation",
            BUCKET_SOCIOECONOMIC,
        ),
        (
            "LOWINCPCT",
            "Poverty (Less than 200% of federal poverty line)",
            BUCKET_SOCIOECONOMIC,
        ),
        (
            "LESSHSPCT",
            "Percent individuals age 25 or over with less than high school degree",
            BUCKET_SOCIOECONOMIC,
        ),
    ]
]

In [None]:
# Keep only the columns listed in `data_sets` and give them their readable names.
renaming_dict = {data_set.input_field: data_set.renamed_field for data_set in data_sets}
columns_to_keep = [data_set.renamed_field for data_set in data_sets]

# Select the input columns first (a missing column raises KeyError), then
# rename. `rename` without `inplace` returns a new, independent DataFrame, so
# the later cells that add columns to `df` are not writing into a slice of the
# raw frame (which would emit SettingWithCopyWarning), and the full raw frame
# is never mutated in place.
# The resulting column order is the order of `data_sets`, i.e. `columns_to_keep`.
df = df[list(renaming_dict)].rename(columns=renaming_dict, errors="raise")

df.head()

In [None]:
# Add a percentile-rank column next to every column listed in `data_sets`.
for data_set in data_sets:
    source_column = data_set.renamed_field
    percentile_column = f"{source_column}{PERCENTILE_FIELD_SUFFIX}"
    # rank(pct=True) expresses each row's rank as a fraction of the rows.
    df[percentile_column] = df[source_column].rank(pct=True)

df.head()

In [None]:
# Score "A" and score "B" are both built from the same two percentile columns:
# A is their row-wise mean, B is their product.
poverty_percentile = "Poverty (Less than 200% of federal poverty line) (percentile)"
education_percentile = (
    "Percent individuals age 25 or over with less than high school degree (percentile)"
)

df["Score A"] = df[[poverty_percentile, education_percentile]].mean(axis=1)
df["Score B"] = df[poverty_percentile] * df[education_percentile]

In [None]:
# "CalEnviroScreen for the US" score (Score C).

# Step 1: for each of the four buckets, average the percentile columns of the
# data sets assigned to that bucket into one per-row bucket score.
for bucket in BUCKETS:
    bucket_percentile_columns = []
    for data_set in data_sets:
        if data_set.bucket == bucket:
            bucket_percentile_columns.append(
                f"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}"
            )
    df[bucket] = df[bucket_percentile_columns].mean(axis=1)

# Step 2: "Pollution Burden" is the weighted average of the Exposures and
# Environmental effects buckets:
# (1.0 * Exposures + 0.5 * Environmental effects) / 1.5.
exposures_weight = 1.0
environmental_weight = 0.5
weighted_pollution_sum = (
    exposures_weight * df[BUCKET_EXPOSURES]
    + environmental_weight * df[BUCKET_ENVIRONMENTAL]
)
df[AGGREGATION_POLLUTION] = weighted_pollution_sum / (
    exposures_weight + environmental_weight
)

# Step 3: "Population Characteristics" is the plain average of the Sensitive
# populations and Socioeconomic factors buckets.
population_buckets = [BUCKET_SENSITIVE, BUCKET_SOCIOECONOMIC]
df[AGGREGATION_POPULATION] = df[population_buckets].mean(axis=1)

# Step 4: the cumulative impact score is the product of the two aggregates.
df["Score C"] = df[AGGREGATION_POLLUTION] * df[AGGREGATION_POPULATION]

df.head()

In [None]:
# For each score, add its percentile rank plus a boolean flag marking rows
# whose percentile is at or above 0.75.
for score_field in ("Score A", "Score B", "Score C"):
    score_percentile = df[score_field].rank(pct=True)
    df[f"{score_field}{PERCENTILE_FIELD_SUFFIX}"] = score_percentile
    df[f"{score_field} (top 25th percentile)"] = score_percentile >= 0.75
df.head()

In [None]:
# Write the nationwide score CSV.
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
score_csv_path.mkdir(parents=True, exist_ok=True)
df.to_csv(score_csv_path / "usa.csv", index=False)

In [None]:
# Write one score CSV per state listed in the FIPS file.

# Make sure the output folder exists; `to_csv` will not create it.
score_csv_path.mkdir(parents=True, exist_ok=True)

# The two-character GEOID10 prefix is the same on every loop iteration, so
# compute it once instead of once per state.
geoid_state_prefix = df["GEOID10"].str[:2]

# `newline=""` is what the csv module asks for when opening files for reading.
with open(fips_csv_path, newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=",")
    # Skip the header row (the default avoids StopIteration on an empty file).
    next(csv_reader, None)

    for row in csv_reader:
        # Ignore blank lines / rows without a FIPS code instead of failing on
        # `row[0]` or writing an empty, misnamed file.
        if not row or not row[0].strip():
            continue
        states_fips = row[0].strip()
        print(f"Generating data{states_fips} csv")
        df1 = df[geoid_state_prefix == states_fips]
        # we need to name the file data01.csv for ogr2ogr csv merge to work
        df1.to_csv(score_csv_path / f"data{states_fips}.csv", index=False)