In [None]:
import collections
from datetime import datetime
import functools
import glob
import itertools
import os
import pathlib
import requests
import string
import sys
import typing
import zipfile

import IPython
import numpy as np
import pandas as pd
import pypandoc

from tqdm.notebook import tqdm_notebook

# Make the directory two levels above the working directory importable, so that the
# project-local imports below (e.g. `data_pipeline`) can be resolved.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_pipeline.utils import remove_all_from_dir, get_excel_column_name
from data_pipeline.etl.sources.census.etl_utils import get_state_information

# Turn on TQDM for pandas so that progress bars are available when running `apply`.
# This is a one-time, notebook-wide registration; it has to run before any cell that
# relies on the progress-bar variants.
tqdm_notebook.pandas()

In [None]:
# Suppress scientific notation in pandas (this shows up for census tract IDs) by
# rendering floats with two fixed decimals.
pd.options.display.float_format = "{:.2f}".format

# Show every column when displaying wide frames. The fully qualified option name is used
# on purpose: the short pattern "max_columns" also matches "styler.render.max_columns" on
# newer pandas releases, which makes `set_option` raise an OptionError for ambiguity.
pd.set_option("display.max_columns", None)

# Global directory layout: everything lives under `data`, a sibling of the current
# working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")
TEMP_DATA_DIR = DATA_DIR.joinpath("tmp")
COMPARISON_OUTPUTS_DIR = DATA_DIR.joinpath("comparison_outputs")

# NOTE (Vincent): the EJSCREEN folders below were created manually on a local machine.
# These locations may need to change when this moves into the official ETL scripts.
EJSCREEN_DATA_DIR = DATA_DIR.joinpath("ejscreen")
EJSCREEN_CEQ_NAT_DIR = EJSCREEN_DATA_DIR.joinpath("CEQ_NationalExports")
EJSCREEN_CEQ_STA_DIR = EJSCREEN_DATA_DIR.joinpath("CEQ_StateExports")

# Create the scratch and output directories (and any missing parents); already-existing
# directories are left alone.
TEMP_DATA_DIR.mkdir(exist_ok=True, parents=True)
COMPARISON_OUTPUTS_DIR.mkdir(exist_ok=True, parents=True)

# Field names are defined once as constants so the same field can be referenced many
# times without retyping the string (and without the risk of misspelling it).

# Identifier / descriptive columns. In the cells visible in this notebook only
# GEOID_CBG_FIELD_NAME is used: the `ID` column of each EJSCREEN export is renamed to it,
# and it is the key for the pivots and the final merge.
GEOID_FIELD_NAME = "GEOID10"
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
GEOID_STATE_FIELD_NAME = "GEOID10_STATE"
GEOID_CBG_FIELD_NAME = "GEOID10_CBG"
COUNTRY_FIELD_NAME = "Country"
CENSUS_BLOCK_GROUP_POPULATION_FIELD = "Total population"

# CEJST score column names (not referenced in the cells visible here).
CEJST_SCORE_FIELD = "cejst_score"
CEJST_PERCENTILE_FIELD = "cejst_percentile"
CEJST_PRIORITY_COMMUNITY_FIELD = "cejst_priority_community"

# Suffixes appended to column names. Note the leading space in the value.
POPULATION_SUFFIX = " (priority population)"

## Loading EJ Screen CEQ Data

### National

In [None]:
# National CEQ exports: one CSV per percentile threshold, 70 through 95 in steps of 5.
# The names are still spelled out rather than discovered on disk; the plan is to replace
# this with something like glob later.
filenames = [
    'CEQ_EJSCREEN_National_{}.csv'.format(threshold) for threshold in range(70, 100, 5)
]

In [None]:
# Load every national export into one long table with one row per
# (block group, percentile threshold) and its EXCEED_COUNT.
dfs = []
for f in filenames:
    # The percentile is the text after the last "_" in the file stem
    # (e.g. "CEQ_EJSCREEN_National_70.csv" -> "70"). Parsing it this way, rather than by
    # fixed character offsets, keeps working for thresholds that are not two digits.
    percentile = pathlib.Path(f).stem.rsplit("_", 1)[-1]
    print(percentile)

    # Read exactly the file named in `filenames` instead of rebuilding its name from the
    # parsed percentile, so the list above is the single source of truth.
    # All columns are read as strings (presumably so the IDs keep any leading zeros);
    # EXCEED_COUNT is converted to a number right below.
    export = pd.read_csv(
        EJSCREEN_CEQ_NAT_DIR / f,
        encoding="ISO-8859-1",
        dtype='str',
    )
    export = export.rename(columns={'ID': GEOID_CBG_FIELD_NAME}).assign(
        EXCEED_COUNT=lambda frame: pd.to_numeric(frame['EXCEED_COUNT']),
        # Kept as a string: it becomes part of the column labels after the pivot.
        percentile=percentile,
    )
    dfs.append(export[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']])

# `df` is consumed by the following cells (preview and pivot).
df = pd.concat(dfs)

In [None]:
df.head()

In [None]:
# Go from long to wide: one row per block group, one EXCEED_COUNT column per percentile.
df_reshaped_nat = df.pivot(values='EXCEED_COUNT', index=GEOID_CBG_FIELD_NAME, columns='percentile')
df_reshaped_nat.columns = [
    'EJSCREEN Areas of Concern, National, {}th percentile'.format(p)
    for p in df_reshaped_nat.columns
]

# A block group that is missing from a given percentile's export gets a count of 0 there.
df_reshaped_nat = df_reshaped_nat.fillna(0)

# For each count column, add a 0/1 companion column flagging whether the count is
# positive, then turn the block-group index back into an ordinary column.
df_reshaped_nat = pd.concat(
    [df_reshaped_nat, ((df_reshaped_nat > 0) * 1).add_suffix(',  (communities)')],
    axis=1,
).reset_index()

In [None]:
df_reshaped_nat.head()

In [None]:
df_reshaped_nat.describe()

In [None]:
pd.isnull(df_reshaped_nat).describe()

### State

In [None]:
# State CEQ exports: one CSV per percentile threshold, 70 through 95 in steps of 5.
# The names are still spelled out rather than discovered on disk; the plan is to replace
# this with something like glob later.
filenames = [
    'CEQ_EJSCREEN_State_{}.csv'.format(threshold) for threshold in range(70, 100, 5)
]

In [None]:
# Load every state export into one long table with one row per
# (block group, percentile threshold) and its EXCEED_COUNT.
dfs = []
for f in filenames:
    # The percentile is the text after the last "_" in the file stem
    # (e.g. "CEQ_EJSCREEN_State_70.csv" -> "70"). Parsing it this way, rather than by
    # fixed character offsets, keeps working for thresholds that are not two digits.
    percentile = pathlib.Path(f).stem.rsplit("_", 1)[-1]
    print(percentile)

    # Read exactly the file named in `filenames` instead of rebuilding its name from the
    # parsed percentile, so the list above is the single source of truth.
    # All columns are read as strings (presumably so the IDs keep any leading zeros);
    # EXCEED_COUNT is converted to a number right below.
    export = pd.read_csv(
        EJSCREEN_CEQ_STA_DIR / f,
        encoding="ISO-8859-1",
        dtype='str',
    )
    export = export.rename(columns={'ID': GEOID_CBG_FIELD_NAME}).assign(
        EXCEED_COUNT=lambda frame: pd.to_numeric(frame['EXCEED_COUNT']),
        # Kept as a string: it becomes part of the column labels after the pivot.
        percentile=percentile,
    )
    dfs.append(export[[GEOID_CBG_FIELD_NAME, 'percentile', 'EXCEED_COUNT']])

# `df` is consumed by the following cells (preview and pivot).
df = pd.concat(dfs)

In [None]:
df.head()

In [None]:
# Go from long to wide: one row per block group, one EXCEED_COUNT column per percentile.
df_reshaped_sta = df.pivot(values='EXCEED_COUNT', index=GEOID_CBG_FIELD_NAME, columns='percentile')
df_reshaped_sta.columns = [
    'EJSCREEN Areas of Concern, State, {}th percentile'.format(p)
    for p in df_reshaped_sta.columns
]

# A block group that is missing from a given percentile's export gets a count of 0 there.
df_reshaped_sta = df_reshaped_sta.fillna(0)

# For each count column, add a 0/1 companion column flagging whether the count is
# positive, then turn the block-group index back into an ordinary column.
df_reshaped_sta = pd.concat(
    [df_reshaped_sta, ((df_reshaped_sta > 0) * 1).add_suffix(',  (communities)')],
    axis=1,
).reset_index()

In [None]:
df_reshaped_sta.head()

In [None]:
# Summary statistics for the state-level table. (The national table is summarized in the
# National section; describing it again here was a copy-paste slip.)
df_reshaped_sta.describe()

In [None]:
pd.isnull(df_reshaped_sta).describe()

In [None]:
# Combine the national and state tables into one row per census block group.
# An outer join keeps block groups that appear in only one of the two exports; the
# default inner join would silently drop them from the output.
df_reshaped = df_reshaped_nat.merge(
    df_reshaped_sta,
    on=GEOID_CBG_FIELD_NAME,
    how="outer",
)

# A block group missing from one side has no exceedances there, so fill those cells with 0
# (the same convention as the zero-fill applied after each pivot). Then restore the
# pre-merge dtypes, which the NaNs introduced by the join would otherwise widen to float.
value_dtypes = {
    column: dtype
    for source in (df_reshaped_nat, df_reshaped_sta)
    for column, dtype in source.dtypes.items()
    if column != GEOID_CBG_FIELD_NAME
}
df_reshaped[list(value_dtypes)] = df_reshaped[list(value_dtypes)].fillna(0)
df_reshaped = df_reshaped.astype(value_dtypes)

In [None]:
df_reshaped.head()

In [None]:
# Write the combined national + state indicators next to the EJSCREEN source data.
# Missing values are written as empty strings and the row index is left out.
df_reshaped.to_csv(
    EJSCREEN_DATA_DIR / "ejscreen_areas_of_concerns_indicators.csv",
    index=False,
    na_rep="",
)

# Next Steps / Questions
Lucas, here's what the output file looks like. For each CBG there are new columns for each percentile, for both State and National. Each percentile has two columns: one holding the `EXCEED_COUNT` value and one boolean (0/1) indicator for whether `EXCEED_COUNT > 0` at that percentile. I think that's what we wanted, right?

1. Do we have a list of all CBGs? I ask because the CSV I created lists each CBG and its number of EJSCREEN Areas of Concern at each percentile, but it won't contain every CBG: a CBG with no area of concern at even the 70th percentile never appears in the source data set. Do we want to add all the remaining CBGs with 0's across the board?
1. Definitely need to clean up the code, at least so that it is not so duplicative across national and state.