In [1]:
import pandas as pd
import csv
from pathlib import Path
import os
import sys

In [2]:
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
DATA_PATH = Path.cwd().parent / "data"
TMP_PATH: Path = DATA_PATH / "tmp"
OUTPUT_PATH = (
            DATA_PATH / "dataset" / "ejscreen_2019"
        )
CENSUS_USA_CSV = (
            DATA_PATH / "census" / "csv" / "us.csv"
        )

In [5]:
cbg_usa_df = pd.read_csv(
            CENSUS_USA_CSV,
            names=['GEOID10'],
            dtype={"GEOID10": "string"},
            low_memory=False,
            header=None
        )

In [6]:
cbg_usa_df.head()

Unnamed: 0,GEOID10
0,100010414002
1,100010415002
2,100010417011
3,100010417012
4,100010422011


In [7]:
cbg_usa_df.dtypes

GEOID10    string
dtype: object

In [11]:
ejscreen_df = pd.read_csv(
            OUTPUT_PATH / "usa.csv",
            dtype={"ID": "string"},
            low_memory=False,
        )

In [15]:
ejscreen_df.rename(
            columns={"ID": "GEOID10"},
            inplace=True,
        )

In [16]:
ejscreen_df.head()

Unnamed: 0,OBJECTID,GEOID10,STATE_NAME,ST_ABBREV,REGION,ACSTOTPOP,D_PM25_2,B_PM25_D2,P_PM25_D2,D_OZONE_2,...,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_PWDIS,T_PWDIS_D2,Shape_Length,Shape_Area
0,1,10010201001,Alabama,AL,4,692,-1161.544049,5,43.0,-4661.186378,...,0.071 facilities/km distance (79%ile),26%ile,0.085 facilities/km distance (24%ile),47%ile,0.066 facilities/km distance (21%ile),48%ile,0 toxicity-weighted concentration/meters dista...,62%ile,13435.97556,6026828.0
1,2,10010201002,Alabama,AL,4,1153,-2084.690717,4,31.0,-8365.702519,...,0.064 facilities/km distance (76%ile),19%ile,0.074 facilities/km distance (18%ile),41%ile,0.06 facilities/km distance (18%ile),42%ile,0 toxicity-weighted concentration/meters dista...,62%ile,11945.584679,7848121.0
2,3,10010202001,Alabama,AL,4,1020,2641.389659,9,81.0,10550.793324,...,0.069 facilities/km distance (78%ile),87%ile,0.078 facilities/km distance (20%ile),71%ile,0.065 facilities/km distance (20%ile),71%ile,0 toxicity-weighted concentration/meters dista...,62%ile,7770.915121,2900774.0
3,4,10010202002,Alabama,AL,4,1152,693.118534,7,65.0,2768.599617,...,0.076 facilities/km distance (81%ile),75%ile,0.087 facilities/km distance (25%ile),63%ile,0.07 facilities/km distance (23%ile),63%ile,0 toxicity-weighted concentration/meters dista...,62%ile,6506.804784,1793332.0
4,5,10010203001,Alabama,AL,4,2555,1034.343525,7,68.0,4120.531837,...,0.074 facilities/km distance (80%ile),79%ile,0.08 facilities/km distance (21%ile),64%ile,0.07 facilities/km distance (23%ile),65%ile,0 toxicity-weighted concentration/meters dista...,62%ile,11070.367848,5461602.0


In [17]:
ejscreen_df.dtypes

OBJECTID          int64
GEOID10          string
STATE_NAME       object
ST_ABBREV        object
REGION            int64
                 ...   
T_PTSDF_D2       object
T_PWDIS          object
T_PWDIS_D2       object
Shape_Length    float64
Shape_Area      float64
Length: 128, dtype: object

In [18]:
merged_df = cbg_usa_df.merge(
            ejscreen_df, on="GEOID10", how="left"
        )

In [19]:
merged_df.head()

Unnamed: 0,GEOID10,OBJECTID,STATE_NAME,ST_ABBREV,REGION,ACSTOTPOP,D_PM25_2,B_PM25_D2,P_PM25_D2,D_OZONE_2,...,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_PWDIS,T_PWDIS_D2,Shape_Length,Shape_Area
0,100010414002,39652.0,Delaware,DE,3.0,1187.0,3655.279721,10.0,90.0,22778.314495,...,1.7 facilities/km distance (99%ile),100%ile,0.23 facilities/km distance (40%ile),80%ile,1.6 facilities/km distance (63%ile),87%ile,0 toxicity-weighted concentration/meters dista...,69%ile,4866.135943,1156165.0
1,100010415002,39654.0,Delaware,DE,3.0,1088.0,100.877666,7.0,65.0,629.604923,...,0.32 facilities/km distance (69%ile),66%ile,0.14 facilities/km distance (20%ile),64%ile,1 facilities/km distance (52%ile),66%ile,0 toxicity-weighted concentration/meters dista...,69%ile,7972.275657,2821805.0
2,100010417011,39656.0,Delaware,DE,3.0,1554.0,-1256.221548,5.0,45.0,-7833.701886,...,0.21 facilities/km distance (52%ile),31%ile,0.11 facilities/km distance (11%ile),53%ile,1.3 facilities/km distance (58%ile),22%ile,0 toxicity-weighted concentration/meters dista...,69%ile,17643.717513,8143206.0
3,100010417012,39657.0,Delaware,DE,3.0,4543.0,-2095.065215,4.0,32.0,-13064.667094,...,0.17 facilities/km distance (43%ile),25%ile,0.1 facilities/km distance (7%ile),48%ile,1.1 facilities/km distance (54%ile),18%ile,0 toxicity-weighted concentration/meters dista...,69%ile,15645.341219,9723460.0
4,100010422011,39671.0,Delaware,DE,3.0,5153.0,-723.497337,6.0,53.0,-4534.212814,...,0.24 facilities/km distance (58%ile),41%ile,0.11 facilities/km distance (8%ile),58%ile,0.3 facilities/km distance (33%ile),50%ile,0 toxicity-weighted concentration/meters dista...,69%ile,20959.959236,20661920.0


In [21]:
merged_df[merged_df["Shape_Area"].isnull()]

Unnamed: 0,GEOID10,OBJECTID,STATE_NAME,ST_ABBREV,REGION,ACSTOTPOP,D_PM25_2,B_PM25_D2,P_PM25_D2,D_OZONE_2,...,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_PWDIS,T_PWDIS_D2,Shape_Length,Shape_Area
10614,515150501002,,,,,,,,,,...,,,,,,,,,,
10615,515150501003,,,,,,,,,,...,,,,,,,,,,
10627,515150501001,,,,,,,,,,...,,,,,,,,,,
10628,515150501005,,,,,,,,,,...,,,,,,,,,,
10629,515150501004,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174140,040190029031,,,,,,,,,,...,,,,,,,,,,
174143,040190027012,,,,,,,,,,...,,,,,,,,,,
174184,040190027011,,,,,,,,,,...,,,,,,,,,,
174242,040194105021,,,,,,,,,,...,,,,,,,,,,
