j40-cejst-2/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
Lucas Merrill Brown 1d541be447
Add EJSCREEN Areas of Concern (#843)
* Adding ej screen areas of concern

* Uses it where user has local files, but not otherwise

Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
2021-11-02 15:38:42 -04:00

2308 lines
99 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "51412a14",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import collections\n",
"from datetime import datetime\n",
"import functools\n",
"import glob\n",
"import itertools\n",
"import os\n",
"import pathlib\n",
"import requests\n",
"import string\n",
"import sys\n",
"import typing\n",
"import zipfile\n",
"\n",
"import IPython\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pypandoc\n",
"\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"# Put the directory two levels above the current working directory on `sys.path`\n",
"# (only if it is not already there). Presumably this is what lets the\n",
"# `data_pipeline` imports below resolve when the notebook runs from its own folder.\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e3234c61",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
"pd.options.display.float_format = \"{:.2f}\".format\n",
"# Never truncate columns when displaying wide frames. The fully qualified option name\n",
"# is required: the bare \"max_columns\" pattern also matches `styler.render.max_columns`\n",
"# in newer pandas releases, which makes `set_option` raise an OptionError.\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"# Set some global parameters.\n",
"# DATA_DIR is resolved from the current working directory (`<cwd>/../data`), so every\n",
"# path below depends on where the notebook is started from.\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
"\n",
"# I (Vincent) created this manually on my local machine. It will potentially need to\n",
"# change when this is put into the official ETL scripts.\n",
"EJSCREEN_DATA_DIR = DATA_DIR / \"ejscreen\"\n",
"LOCAL_DATA_OUTPUT_DIR = DATA_DIR / \"local\"\n",
"EJSCREEN_CEQ_NAT_DIR = EJSCREEN_DATA_DIR / \"CEQ_NationalExports\"\n",
"EJSCREEN_CEQ_STA_DIR = EJSCREEN_DATA_DIR / \"CEQ_StateExports\"\n",
"\n",
"# Make the dirs if they don't exist (only the temp and comparison-output dirs are\n",
"# created here; the EJSCREEN input dirs are expected to exist already).\n",
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
"COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
"# and introducing the risk of misspelling the field name.)\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
"\n",
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
"\n",
"# Define some suffixes\n",
"POPULATION_SUFFIX = \" (priority population)\""
]
},
{
"cell_type": "markdown",
"id": "376f5b2e",
"metadata": {},
"source": [
"## Loading EJSCREEN CEQ Data"
]
},
{
"cell_type": "markdown",
"id": "186c15bf",
"metadata": {},
"source": [
"### National"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4843efbd",
"metadata": {},
"outputs": [],
"source": [
"# Names of the national export files, one per percentile suffix (70 through 95 in\n",
"# steps of 5). This fixed list could be replaced by a glob over the export directory.\n",
"filenames = [\n",
"    \"CEQ_EJSCREEN_National_{}.csv\".format(suffix)\n",
"    for suffix in range(70, 100, 5)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0a146972",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70\n",
"75\n",
"80\n",
"85\n",
"90\n",
"95\n"
]
}
],
"source": [
"# Read every national export listed in `filenames` and stack them into one long\n",
"# frame with the columns (ID, percentile, EXCEED_COUNT).\n",
"dfs = []\n",
"for f in filenames:\n",
"    # The percentile is the last underscore-separated token of the file stem\n",
"    # (\"CEQ_EJSCREEN_National_70.csv\" gives \"70\"). Parsing the stem instead of\n",
"    # slicing fixed character positions keeps this correct for suffixes that are\n",
"    # not exactly two characters long.\n",
"    percentile = pathlib.Path(f).stem.rsplit(\"_\", 1)[-1]\n",
"    print(percentile)\n",
"\n",
"    # Read the file that is actually being iterated rather than rebuilding its\n",
"    # name from a template, so that changing `filenames` changes what is loaded.\n",
"    # Everything is read as text so that IDs keep their leading zeros.\n",
"    df = pd.read_csv(\n",
"        os.path.join(EJSCREEN_CEQ_NAT_DIR, f),\n",
"        encoding=\"ISO-8859-1\",\n",
"        dtype=\"str\",\n",
"    )\n",
"    df[\"EXCEED_COUNT\"] = pd.to_numeric(df[\"EXCEED_COUNT\"])\n",
"\n",
"    df = df.rename(columns={\"ID\": GEOID_FIELD_NAME})\n",
"    df[\"percentile\"] = percentile\n",
"    dfs.append(df[[GEOID_FIELD_NAME, \"percentile\", \"EXCEED_COUNT\"]])\n",
"\n",
"df = pd.concat(dfs)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "65622cbd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>percentile</th>\n",
" <th>EXCEED_COUNT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010201001</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010202002</td>\n",
" <td>70</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010203002</td>\n",
" <td>70</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010206001</td>\n",
" <td>70</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010206002</td>\n",
" <td>70</td>\n",
" <td>9</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG percentile EXCEED_COUNT\n",
"0 010010201001 70 3\n",
"1 010010202002 70 5\n",
"2 010010203002 70 7\n",
"3 010010206001 70 8\n",
"4 010010206002 70 9"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the stacked national frame built in the previous cell.\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "75e2d572",
"metadata": {},
"outputs": [],
"source": [
"# Go from long to wide: one row per ID, one EXCEED_COUNT column per percentile.\n",
"df_reshaped_nat = df.pivot(\n",
"    index=GEOID_FIELD_NAME, columns=\"percentile\", values=\"EXCEED_COUNT\"\n",
")\n",
"df_reshaped_nat.columns = [\n",
"    f\"EJSCREEN Areas of Concern, National, {p}th percentile\"\n",
"    for p in df_reshaped_nat.columns\n",
"]\n",
"# An ID that is absent from a given percentile file has no count there; use 0.\n",
"df_reshaped_nat = df_reshaped_nat.fillna(0)\n",
"\n",
"# For every count column add a 0/1 flag column marking rows with a positive count.\n",
"for c in list(df_reshaped_nat.columns):\n",
"    df_reshaped_nat[f\"{c} (communities)\"] = df_reshaped_nat[c].gt(0) * 1\n",
"df_reshaped_nat = df_reshaped_nat.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "78276a83",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010201001</td>\n",
" <td>3.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010202002</td>\n",
" <td>5.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010203002</td>\n",
" <td>7.00</td>\n",
" <td>5.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010206001</td>\n",
" <td>8.00</td>\n",
" <td>4.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010206002</td>\n",
" <td>9.00</td>\n",
" <td>8.00</td>\n",
" <td>5.00</td>\n",
" <td>3.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG EJSCREEN Areas of Concern, National, 70th percentile \\\n",
"0 010010201001 3.00 \n",
"1 010010202002 5.00 \n",
"2 010010203002 7.00 \n",
"3 010010206001 8.00 \n",
"4 010010206002 9.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 5.00 \n",
"3 4.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 1.00 \n",
"3 1.00 \n",
"4 5.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 1.00 \n",
"4 3.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 1.00 \n",
"4 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 0.00 \n",
"4 0.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile (communities) \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the wide national frame (counts plus 0/1 flag columns).\n",
"df_reshaped_nat.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7eedff74",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>7.33</td>\n",
" <td>5.92</td>\n",
" <td>4.54</td>\n",
" <td>3.24</td>\n",
" <td>2.01</td>\n",
" <td>0.90</td>\n",
" <td>1.00</td>\n",
" <td>0.92</td>\n",
" <td>0.81</td>\n",
" <td>0.68</td>\n",
" <td>0.52</td>\n",
" <td>0.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3.36</td>\n",
" <td>3.76</td>\n",
" <td>3.79</td>\n",
" <td>3.50</td>\n",
" <td>2.90</td>\n",
" <td>1.93</td>\n",
" <td>0.00</td>\n",
" <td>0.27</td>\n",
" <td>0.39</td>\n",
" <td>0.47</td>\n",
" <td>0.50</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>9.00</td>\n",
" <td>6.00</td>\n",
" <td>4.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>10.00</td>\n",
" <td>10.00</td>\n",
" <td>8.00</td>\n",
" <td>6.00</td>\n",
" <td>3.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" EJSCREEN Areas of Concern, National, 70th percentile \\\n",
"count 93500.00 \n",
"mean 7.33 \n",
"std 3.36 \n",
"min 1.00 \n",
"25% 5.00 \n",
"50% 9.00 \n",
"75% 10.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile \\\n",
"count 93500.00 \n",
"mean 5.92 \n",
"std 3.76 \n",
"min 0.00 \n",
"25% 2.00 \n",
"50% 6.00 \n",
"75% 10.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile \\\n",
"count 93500.00 \n",
"mean 4.54 \n",
"std 3.79 \n",
"min 0.00 \n",
"25% 1.00 \n",
"50% 4.00 \n",
"75% 8.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile \\\n",
"count 93500.00 \n",
"mean 3.24 \n",
"std 3.50 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 2.00 \n",
"75% 6.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile \\\n",
"count 93500.00 \n",
"mean 2.01 \n",
"std 2.90 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 1.00 \n",
"75% 3.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile \\\n",
"count 93500.00 \n",
"mean 0.90 \n",
"std 1.93 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 0.00 \n",
"75% 1.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 1.00 \n",
"std 0.00 \n",
"min 1.00 \n",
"25% 1.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.92 \n",
"std 0.27 \n",
"min 0.00 \n",
"25% 1.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.81 \n",
"std 0.39 \n",
"min 0.00 \n",
"25% 1.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.68 \n",
"std 0.47 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.52 \n",
"std 0.50 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile (communities) \n",
"count 93500.00 \n",
"mean 0.31 \n",
"std 0.46 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 0.00 \n",
"75% 1.00 \n",
"max 1.00 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the columns of the wide national frame.\n",
"df_reshaped_nat.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "428b94f3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" <td>93500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG EJSCREEN Areas of Concern, National, 70th percentile \\\n",
"count 93500 93500 \n",
"unique 1 1 \n",
"top False False \n",
"freq 93500 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile (communities) \n",
"count 93500 \n",
"unique 1 \n",
"top False \n",
"freq 93500 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Missing-value check: summarize the boolean null mask of every column.\n",
"pd.isnull(df_reshaped_nat).describe()"
]
},
{
"cell_type": "markdown",
"id": "7bc0f71c",
"metadata": {},
"source": [
"### State"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2de68aa5",
"metadata": {},
"outputs": [],
"source": [
"# Names of the state export files, one per percentile suffix (70 through 95 in\n",
"# steps of 5). This fixed list could be replaced by a glob over the export directory.\n",
"filenames = [\n",
"    \"CEQ_EJSCREEN_State_{}.csv\".format(suffix)\n",
"    for suffix in range(70, 100, 5)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fccb416e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70\n",
"75\n",
"80\n",
"85\n",
"90\n",
"95\n"
]
}
],
"source": [
"# Read every state export listed in `filenames` and stack them into one long\n",
"# frame with the columns (ID, percentile, EXCEED_COUNT).\n",
"dfs = []\n",
"for f in filenames:\n",
"    # The percentile is the last underscore-separated token of the file stem\n",
"    # (\"CEQ_EJSCREEN_State_70.csv\" gives \"70\"). Parsing the stem instead of\n",
"    # slicing fixed character positions keeps this correct for suffixes that are\n",
"    # not exactly two characters long.\n",
"    percentile = pathlib.Path(f).stem.rsplit(\"_\", 1)[-1]\n",
"    print(percentile)\n",
"\n",
"    # Read the file that is actually being iterated rather than rebuilding its\n",
"    # name from a template, so that changing `filenames` changes what is loaded.\n",
"    # Everything is read as text so that IDs keep their leading zeros.\n",
"    df = pd.read_csv(\n",
"        os.path.join(EJSCREEN_CEQ_STA_DIR, f),\n",
"        encoding=\"ISO-8859-1\",\n",
"        dtype=\"str\",\n",
"    )\n",
"    df[\"EXCEED_COUNT\"] = pd.to_numeric(df[\"EXCEED_COUNT\"])\n",
"\n",
"    df = df.rename(columns={\"ID\": GEOID_FIELD_NAME})\n",
"    df[\"percentile\"] = percentile\n",
"    dfs.append(df[[GEOID_FIELD_NAME, \"percentile\", \"EXCEED_COUNT\"]])\n",
"\n",
"df = pd.concat(dfs)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8300e454",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>percentile</th>\n",
" <th>EXCEED_COUNT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010202002</td>\n",
" <td>70</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010203002</td>\n",
" <td>70</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010206001</td>\n",
" <td>70</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010206002</td>\n",
" <td>70</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010207001</td>\n",
" <td>70</td>\n",
" <td>11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG percentile EXCEED_COUNT\n",
"0 010010202002 70 4\n",
"1 010010203002 70 3\n",
"2 010010206001 70 4\n",
"3 010010206002 70 9\n",
"4 010010207001 70 11"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the stacked state frame built in the previous cell.\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "5be30b4f",
"metadata": {},
"outputs": [],
"source": [
"# Go from long to wide: one row per ID, one EXCEED_COUNT column per percentile.\n",
"df_reshaped_sta = df.pivot(\n",
"    index=GEOID_FIELD_NAME, columns=\"percentile\", values=\"EXCEED_COUNT\"\n",
")\n",
"df_reshaped_sta.columns = [\n",
"    f\"EJSCREEN Areas of Concern, State, {p}th percentile\"\n",
"    for p in df_reshaped_sta.columns\n",
"]\n",
"# An ID that is absent from a given percentile file has no count there; use 0.\n",
"df_reshaped_sta = df_reshaped_sta.fillna(0)\n",
"\n",
"# For every count column add a 0/1 flag column marking rows with a positive count.\n",
"for c in list(df_reshaped_sta.columns):\n",
"    df_reshaped_sta[f\"{c} (communities)\"] = df_reshaped_sta[c].gt(0) * 1\n",
"df_reshaped_sta = df_reshaped_sta.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "9206132b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>EJSCREEN Areas of Concern, State, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010202002</td>\n",
" <td>4.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010203002</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>2.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010206001</td>\n",
" <td>4.00</td>\n",
" <td>3.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010206002</td>\n",
" <td>9.00</td>\n",
" <td>8.00</td>\n",
" <td>7.00</td>\n",
" <td>4.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010207001</td>\n",
" <td>11.00</td>\n",
" <td>10.00</td>\n",
" <td>10.00</td>\n",
" <td>8.00</td>\n",
" <td>8.00</td>\n",
" <td>8.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG EJSCREEN Areas of Concern, State, 70th percentile \\\n",
"0 010010202002 4.00 \n",
"1 010010203002 3.00 \n",
"2 010010206001 4.00 \n",
"3 010010206002 9.00 \n",
"4 010010207001 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 75th percentile \\\n",
"0 0.00 \n",
"1 3.00 \n",
"2 3.00 \n",
"3 8.00 \n",
"4 10.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 80th percentile \\\n",
"0 0.00 \n",
"1 3.00 \n",
"2 2.00 \n",
"3 7.00 \n",
"4 10.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 85th percentile \\\n",
"0 0.00 \n",
"1 2.00 \n",
"2 1.00 \n",
"3 4.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 90th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 1.00 \n",
"3 2.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 95th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 1.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 70th percentile (communities) \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 75th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 80th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 85th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 90th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 95th percentile (communities) \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the wide state frame (counts plus 0/1 flag columns).\n",
"df_reshaped_sta.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b551a4df",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" <td>93500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>7.33</td>\n",
" <td>5.92</td>\n",
" <td>4.54</td>\n",
" <td>3.24</td>\n",
" <td>2.01</td>\n",
" <td>0.90</td>\n",
" <td>1.00</td>\n",
" <td>0.92</td>\n",
" <td>0.81</td>\n",
" <td>0.68</td>\n",
" <td>0.52</td>\n",
" <td>0.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3.36</td>\n",
" <td>3.76</td>\n",
" <td>3.79</td>\n",
" <td>3.50</td>\n",
" <td>2.90</td>\n",
" <td>1.93</td>\n",
" <td>0.00</td>\n",
" <td>0.27</td>\n",
" <td>0.39</td>\n",
" <td>0.47</td>\n",
" <td>0.50</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>9.00</td>\n",
" <td>6.00</td>\n",
" <td>4.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>10.00</td>\n",
" <td>10.00</td>\n",
" <td>8.00</td>\n",
" <td>6.00</td>\n",
" <td>3.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" EJSCREEN Areas of Concern, National, 70th percentile \\\n",
"count 93500.00 \n",
"mean 7.33 \n",
"std 3.36 \n",
"min 1.00 \n",
"25% 5.00 \n",
"50% 9.00 \n",
"75% 10.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile \\\n",
"count 93500.00 \n",
"mean 5.92 \n",
"std 3.76 \n",
"min 0.00 \n",
"25% 2.00 \n",
"50% 6.00 \n",
"75% 10.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile \\\n",
"count 93500.00 \n",
"mean 4.54 \n",
"std 3.79 \n",
"min 0.00 \n",
"25% 1.00 \n",
"50% 4.00 \n",
"75% 8.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile \\\n",
"count 93500.00 \n",
"mean 3.24 \n",
"std 3.50 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 2.00 \n",
"75% 6.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile \\\n",
"count 93500.00 \n",
"mean 2.01 \n",
"std 2.90 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 1.00 \n",
"75% 3.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile \\\n",
"count 93500.00 \n",
"mean 0.90 \n",
"std 1.93 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 0.00 \n",
"75% 1.00 \n",
"max 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 1.00 \n",
"std 0.00 \n",
"min 1.00 \n",
"25% 1.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.92 \n",
"std 0.27 \n",
"min 0.00 \n",
"25% 1.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.81 \n",
"std 0.39 \n",
"min 0.00 \n",
"25% 1.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.68 \n",
"std 0.47 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n",
"count 93500.00 \n",
"mean 0.52 \n",
"std 0.50 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 1.00 \n",
"75% 1.00 \n",
"max 1.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile (communities) \n",
"count 93500.00 \n",
"mean 0.31 \n",
"std 0.46 \n",
"min 0.00 \n",
"25% 0.00 \n",
"50% 0.00 \n",
"75% 1.00 \n",
"max 1.00 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_reshaped_nat.describe()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c3cb5696",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>EJSCREEN Areas of Concern, State, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" <td>87555</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG EJSCREEN Areas of Concern, State, 70th percentile \\\n",
"count 87555 87555 \n",
"unique 1 1 \n",
"top False False \n",
"freq 87555 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 75th percentile \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 80th percentile \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 85th percentile \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 90th percentile \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 95th percentile \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 70th percentile (communities) \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 75th percentile (communities) \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 80th percentile (communities) \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 85th percentile (communities) \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 90th percentile (communities) \\\n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 \n",
"\n",
" EJSCREEN Areas of Concern, State, 95th percentile (communities) \n",
"count 87555 \n",
"unique 1 \n",
"top False \n",
"freq 87555 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.isnull(df_reshaped_sta).describe()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "099cca8c",
"metadata": {},
"outputs": [],
"source": [
"# Combine the national and state indicators into one frame keyed by CBG.\n",
"#\n",
"# Use an outer join: the default inner join drops every CBG that is present in\n",
"# only one of the two frames, silently discarding its indicators from the export.\n",
"# The source data only lists CBGs with at least one area of concern, so a CBG\n",
"# that is missing from one frame is given 0 for that frame's columns.\n",
"#\n",
"# The join introduces NaN for those gaps (turning integer columns into floats),\n",
"# so remember the input dtypes and restore them after filling.\n",
"indicator_dtypes = {\n",
"    **df_reshaped_nat.dtypes.drop(GEOID_FIELD_NAME, errors=\"ignore\").to_dict(),\n",
"    **df_reshaped_sta.dtypes.drop(GEOID_FIELD_NAME, errors=\"ignore\").to_dict(),\n",
"}\n",
"\n",
"df_reshaped = (\n",
"    df_reshaped_nat.merge(df_reshaped_sta, on=GEOID_FIELD_NAME, how=\"outer\")\n",
"    .fillna({column: 0 for column in indicator_dtypes})\n",
"    .astype(indicator_dtypes)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "23097787",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10_CBG</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, National, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, National, 95th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 70th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 75th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 80th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 85th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 90th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 95th percentile</th>\n",
" <th>EJSCREEN Areas of Concern, State, 70th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 75th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 80th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 85th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 90th percentile (communities)</th>\n",
" <th>EJSCREEN Areas of Concern, State, 95th percentile (communities)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010202002</td>\n",
" <td>5.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010203002</td>\n",
" <td>7.00</td>\n",
" <td>5.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>2.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010206001</td>\n",
" <td>8.00</td>\n",
" <td>4.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>4.00</td>\n",
" <td>3.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010206002</td>\n",
" <td>9.00</td>\n",
" <td>8.00</td>\n",
" <td>5.00</td>\n",
" <td>3.00</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>9.00</td>\n",
" <td>8.00</td>\n",
" <td>7.00</td>\n",
" <td>4.00</td>\n",
" <td>2.00</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010207001</td>\n",
" <td>11.00</td>\n",
" <td>11.00</td>\n",
" <td>8.00</td>\n",
" <td>8.00</td>\n",
" <td>6.00</td>\n",
" <td>3.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>11.00</td>\n",
" <td>10.00</td>\n",
" <td>10.00</td>\n",
" <td>8.00</td>\n",
" <td>8.00</td>\n",
" <td>8.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10_CBG EJSCREEN Areas of Concern, National, 70th percentile \\\n",
"0 010010202002 5.00 \n",
"1 010010203002 7.00 \n",
"2 010010206001 8.00 \n",
"3 010010206002 9.00 \n",
"4 010010207001 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile \\\n",
"0 0.00 \n",
"1 5.00 \n",
"2 4.00 \n",
"3 8.00 \n",
"4 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile \\\n",
"0 0.00 \n",
"1 1.00 \n",
"2 1.00 \n",
"3 5.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 1.00 \n",
"3 3.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 1.00 \n",
"3 1.00 \n",
"4 6.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 0.00 \n",
"4 3.00 \n",
"\n",
" EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, National, 95th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 70th percentile \\\n",
"0 4.00 \n",
"1 3.00 \n",
"2 4.00 \n",
"3 9.00 \n",
"4 11.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 75th percentile \\\n",
"0 0.00 \n",
"1 3.00 \n",
"2 3.00 \n",
"3 8.00 \n",
"4 10.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 80th percentile \\\n",
"0 0.00 \n",
"1 3.00 \n",
"2 2.00 \n",
"3 7.00 \n",
"4 10.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 85th percentile \\\n",
"0 0.00 \n",
"1 2.00 \n",
"2 1.00 \n",
"3 4.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 90th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 1.00 \n",
"3 2.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 95th percentile \\\n",
"0 0.00 \n",
"1 0.00 \n",
"2 0.00 \n",
"3 1.00 \n",
"4 8.00 \n",
"\n",
" EJSCREEN Areas of Concern, State, 70th percentile (communities) \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 75th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 80th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 85th percentile (communities) \\\n",
"0 0 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 90th percentile (communities) \\\n",
"0 0 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"\n",
" EJSCREEN Areas of Concern, State, 95th percentile (communities) \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_reshaped.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "56098d7b",
"metadata": {},
"outputs": [],
"source": [
"# Make sure the output directory exists so the export also works on a fresh\n",
"# checkout where it has not been created by hand (no-op if it already exists).\n",
"LOCAL_DATA_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Write one row per CBG; missing values become empty fields and the pandas\n",
"# index is left out of the file.\n",
"df_reshaped.to_csv(\n",
"    path_or_buf=LOCAL_DATA_OUTPUT_DIR\n",
"    / \"ejscreen_areas_of_concerns_indicators.csv\",\n",
"    na_rep=\"\",\n",
"    index=False,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "403dfbc6",
"metadata": {},
"source": [
"# Next Steps / Questions\n",
"Lucas, here's what the output file looks like. For each CBG I have new columns corresponding to the different percentiles for both State and National. For each percentile there are two columns: one with the `EXCEED_COUNT` value and one boolean indicator for whether `EXCEED_COUNT > 0` at that percentile. I think that's what we wanted, right?\n",
"\n",
"1. Do we have a list of all CBGs? The reason for asking is that I created a CSV that lists each CBG and its number of EJSCREEN Areas of Concern for each percentile. It's not going to have all the CBGs in it, since a CBG that doesn't have an area of concern at least at the 70th percentile wouldn't have appeared in the source data set. Do we want to make sure to add all the remaining CBGs with 0's across the board?\n",
"1. Definitely need to clean up the code, at least so that it is not so duplicative across national and state."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}