diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index de9e3443..de56e634 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -12,7 +12,7 @@ DATASET_LIST = [ { "name": "ejscreen", "module_dir": "ejscreen", - "class_name": "EJScreenETL", + "class_name": "EJSCREENETL", }, { "name": "housing_and_transportation", @@ -69,6 +69,11 @@ DATASET_LIST = [ "module_dir": "persistent_poverty", "class_name": "PersistentPovertyETL", }, + { + "name": "ejscreen_areas_of_concern", + "module_dir": "ejscreen_areas_of_concern", + "class_name": "EJSCREENAreasOfConcernETL", + }, ] CENSUS_INFO = { "name": "census", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index f19f0976..23506b41 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants from data_pipeline.utils import get_module_logger - logger = get_module_logger(__name__) @@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad): self.census_acs_median_incomes_df, self.national_risk_index_df, ] + census_block_group_df = self._join_cbg_dfs(census_block_group_dfs) # Join all the data sources that use census tracts @@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad): field_names.UNEMPLOYMENT_FIELD, field_names.HT_INDEX_FIELD, ] + non_numeric_columns = [ self.GEOID_FIELD_NAME, field_names.PERSISTENT_POVERTY_FIELD, ] + columns_to_keep = non_numeric_columns + numeric_columns df = df[columns_to_keep] diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index cef48080..8bb78f60 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ 
b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) -class EJScreenETL(ExtractTransformLoad): +class EJSCREENETL(ExtractTransformLoad): def __init__(self): self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip" self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv" diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md new file mode 100644 index 00000000..cbcbb27e --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md @@ -0,0 +1,3 @@ +# EJ Screen Areas of Concern Data + +Note: this dataset is not public, so the data file must be stored locally for this ETL to run. diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py new file mode 100644 index 00000000..0bb036d5 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py @@ -0,0 +1,73 @@ +import pandas as pd + +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +class EJSCREENAreasOfConcernETL(ExtractTransformLoad): + # Note: while we normally set these properties in `__init__`, + # we are setting them as class properties here so they can be accessed by the + # class method `ejscreen_areas_of_concern_data_exists`. 
+ LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local" + EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = ( + LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv" + ) + + def __init__(self): + self.OUTPUT_PATH = ( + self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" + ) + + # TODO: Load from the actual source; the issue is that this dataset is not public for now + self.df: pd.DataFrame + + @classmethod + def ejscreen_areas_of_concern_data_exists(cls): + """Check whether or not the EJSCREEN areas of concern data exists. + + Note: this data is provided privately and is not currently publicly available. + + To enable the ETL code for EJSCREEN AoCs to run appropriately whether or not the person + running it has access to that data, this method checks whether the source file exists. + + If it does exist, code can and should include this data. If it does not exist, code should + not reference this data. + + """ + return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file() + + def extract(self) -> None: + if self.ejscreen_areas_of_concern_data_exists(): + logger.info("Loading EJSCREEN Areas of Concern Data Locally") + self.df = pd.read_csv( + filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA, + dtype={ + self.GEOID_FIELD_NAME: "string", + }, + low_memory=False, + ) + else: + logger.info( + "EJSCREEN areas of concern data does not exist locally. Not loading the data." + ) + + def transform(self) -> None: + logger.info("Transforming EJSCREEN Areas of Concern Data") + + # TODO: As a one-off, all the processing was done in a separate notebook; + # it can be added here in a future PR. + pass + + def load(self) -> None: + if self.ejscreen_areas_of_concern_data_exists(): + logger.info("Saving EJSCREEN Areas of Concern Data") + # write nationwide csv + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False) + + else: + logger.info( + "EJSCREEN areas of concern data does not exist locally. 
Not saving the data." + ) diff --git a/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb b/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb new file mode 100644 index 00000000..1042698e --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb @@ -0,0 +1,2308 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "51412a14", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import collections\n", + "from datetime import datetime\n", + "import functools\n", + "import glob\n", + "import itertools\n", + "import os\n", + "import pathlib\n", + "import requests\n", + "import string\n", + "import sys\n", + "import typing\n", + "import zipfile\n", + "\n", + "import IPython\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pypandoc\n", + "\n", + "from tqdm.notebook import tqdm_notebook\n", + "\n", + "module_path = os.path.abspath(os.path.join(\"../..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n", + "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n", + "\n", + "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", + "tqdm_notebook.pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e3234c61", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"max_columns\", None)\n", + "\n", + "# Set some global parameters\n", + "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", + "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n", + "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n", + "\n", + "## I (Vincent) created this manually locally. 
Will need to change potentially when putting into official ETL scripts\n", + "EJSCREEN_DATA_DIR = DATA_DIR / \"ejscreen\"\n", + "LOCAL_DATA_OUTPUT_DIR = DATA_DIR / \"local\"\n", + "EJSCREEN_CEQ_NAT_DIR = EJSCREEN_DATA_DIR / \"CEQ_NationalExports\"\n", + "EJSCREEN_CEQ_STA_DIR = EJSCREEN_DATA_DIR / \"CEQ_StateExports\"\n", + "\n", + "# Make the dirs if they don't exist\n", + "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n", + "# and introducing the risk of misspelling the field name.)\n", + "\n", + "GEOID_FIELD_NAME = \"GEOID10\"\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", + "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", + "COUNTRY_FIELD_NAME = \"Country\"\n", + "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", + "\n", + "CEJST_SCORE_FIELD = \"cejst_score\"\n", + "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", + "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", + "\n", + "# Define some suffixes\n", + "POPULATION_SUFFIX = \" (priority population)\"" + ] + }, + { + "cell_type": "markdown", + "id": "376f5b2e", + "metadata": {}, + "source": [ + "## Loading EJ Screen CEQ Data" + ] + }, + { + "cell_type": "markdown", + "id": "186c15bf", + "metadata": {}, + "source": [ + "### National" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4843efbd", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this with something like glob when you have internet\n", + "filenames = [\n", + " \"CEQ_EJSCREEN_National_70.csv\",\n", + " \"CEQ_EJSCREEN_National_75.csv\",\n", + " \"CEQ_EJSCREEN_National_80.csv\",\n", + " \"CEQ_EJSCREEN_National_85.csv\",\n", + " \"CEQ_EJSCREEN_National_90.csv\",\n", + " \"CEQ_EJSCREEN_National_95.csv\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0a146972", + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "70\n", + "75\n", + "80\n", + "85\n", + "90\n", + "95\n" + ] + } + ], + "source": [ + "dfs = []\n", + "for f in filenames:\n", + " percentile = f[-6:][:-4]\n", + " print(percentile)\n", + "\n", + " df = pd.read_csv(\n", + " os.path.join(\n", + " EJSCREEN_CEQ_NAT_DIR,\n", + " \"CEQ_EJSCREEN_National_{}.csv\".format(percentile),\n", + " ),\n", + " encoding=\"ISO-8859-1\",\n", + " dtype=\"str\",\n", + " )\n", + " df[\"EXCEED_COUNT\"] = pd.to_numeric(df[\"EXCEED_COUNT\"])\n", + "\n", + " df.rename(columns={\"ID\": GEOID_FIELD_NAME}, inplace=True)\n", + " df[\"percentile\"] = percentile\n", + " df = df[[GEOID_FIELD_NAME, \"percentile\", \"EXCEED_COUNT\"]]\n", + " dfs.append(df)\n", + "\n", + "df = pd.concat(dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "65622cbd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGpercentileEXCEED_COUNT
0010010201001703
1010010202002705
2010010203002707
3010010206001708
4010010206002709
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG percentile EXCEED_COUNT\n", + "0 010010201001 70 3\n", + "1 010010202002 70 5\n", + "2 010010203002 70 7\n", + "3 010010206001 70 8\n", + "4 010010206002 70 9" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "75e2d572", + "metadata": {}, + "outputs": [], + "source": [ + "df_reshaped_nat = df.pivot(\n", + " index=GEOID_FIELD_NAME, columns=\"percentile\", values=\"EXCEED_COUNT\"\n", + ")\n", + "df_reshaped_nat.columns = [\n", + " \"EJSCREEN Areas of Concern, National, {}th percentile\".format(p)\n", + " for p in df_reshaped_nat.columns\n", + "]\n", + "df_reshaped_nat.fillna(0, inplace=True)\n", + "\n", + "for c in df_reshaped_nat.columns:\n", + " df_reshaped_nat[c + \" (communities)\"] = (df_reshaped_nat[c] > 0) * 1\n", + "df_reshaped_nat.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78276a83", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGEJSCREEN Areas of Concern, National, 70th percentileEJSCREEN Areas of Concern, National, 75th percentileEJSCREEN Areas of Concern, National, 80th percentileEJSCREEN Areas of Concern, National, 85th percentileEJSCREEN Areas of Concern, National, 90th percentileEJSCREEN Areas of Concern, National, 95th percentileEJSCREEN Areas of Concern, National, 70th percentile (communities)EJSCREEN Areas of Concern, National, 75th percentile (communities)EJSCREEN Areas of Concern, National, 80th percentile (communities)EJSCREEN Areas of Concern, National, 85th percentile (communities)EJSCREEN Areas of Concern, National, 90th percentile (communities)EJSCREEN Areas of Concern, National, 95th percentile (communities)
00100102010013.000.000.000.000.000.00100000
10100102020025.000.000.000.000.000.00100000
20100102030027.005.001.000.000.000.00111000
30100102060018.004.001.001.001.000.00111110
40100102060029.008.005.003.001.000.00111110
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG EJSCREEN Areas of Concern, National, 70th percentile \\\n", + "0 010010201001 3.00 \n", + "1 010010202002 5.00 \n", + "2 010010203002 7.00 \n", + "3 010010206001 8.00 \n", + "4 010010206002 9.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 5.00 \n", + "3 4.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 1.00 \n", + "3 1.00 \n", + "4 5.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 1.00 \n", + "4 3.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 1.00 \n", + "4 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 0.00 \n", + "4 0.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile (communities) \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_reshaped_nat.head()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 8, + "id": "7eedff74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EJSCREEN Areas of Concern, National, 70th percentileEJSCREEN Areas of Concern, National, 75th percentileEJSCREEN Areas of Concern, National, 80th percentileEJSCREEN Areas of Concern, National, 85th percentileEJSCREEN Areas of Concern, National, 90th percentileEJSCREEN Areas of Concern, National, 95th percentileEJSCREEN Areas of Concern, National, 70th percentile (communities)EJSCREEN Areas of Concern, National, 75th percentile (communities)EJSCREEN Areas of Concern, National, 80th percentile (communities)EJSCREEN Areas of Concern, National, 85th percentile (communities)EJSCREEN Areas of Concern, National, 90th percentile (communities)EJSCREEN Areas of Concern, National, 95th percentile (communities)
count93500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.00
mean7.335.924.543.242.010.901.000.920.810.680.520.31
std3.363.763.793.502.901.930.000.270.390.470.500.46
min1.000.000.000.000.000.001.000.000.000.000.000.00
25%5.002.001.000.000.000.001.001.001.000.000.000.00
50%9.006.004.002.001.000.001.001.001.001.001.000.00
75%10.0010.008.006.003.001.001.001.001.001.001.001.00
max11.0011.0011.0011.0011.0011.001.001.001.001.001.001.00
\n", + "
" + ], + "text/plain": [ + " EJSCREEN Areas of Concern, National, 70th percentile \\\n", + "count 93500.00 \n", + "mean 7.33 \n", + "std 3.36 \n", + "min 1.00 \n", + "25% 5.00 \n", + "50% 9.00 \n", + "75% 10.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile \\\n", + "count 93500.00 \n", + "mean 5.92 \n", + "std 3.76 \n", + "min 0.00 \n", + "25% 2.00 \n", + "50% 6.00 \n", + "75% 10.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile \\\n", + "count 93500.00 \n", + "mean 4.54 \n", + "std 3.79 \n", + "min 0.00 \n", + "25% 1.00 \n", + "50% 4.00 \n", + "75% 8.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile \\\n", + "count 93500.00 \n", + "mean 3.24 \n", + "std 3.50 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 2.00 \n", + "75% 6.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile \\\n", + "count 93500.00 \n", + "mean 2.01 \n", + "std 2.90 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 1.00 \n", + "75% 3.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile \\\n", + "count 93500.00 \n", + "mean 0.90 \n", + "std 1.93 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 0.00 \n", + "75% 1.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 1.00 \n", + "std 0.00 \n", + "min 1.00 \n", + "25% 1.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.92 \n", + "std 0.27 \n", + "min 0.00 \n", + "25% 1.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.81 \n", + "std 0.39 \n", + "min 0.00 \n", + "25% 1.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 
\n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.68 \n", + "std 0.47 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.52 \n", + "std 0.50 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile (communities) \n", + "count 93500.00 \n", + "mean 0.31 \n", + "std 0.46 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 0.00 \n", + "75% 1.00 \n", + "max 1.00 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_reshaped_nat.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "428b94f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGEJSCREEN Areas of Concern, National, 70th percentileEJSCREEN Areas of Concern, National, 75th percentileEJSCREEN Areas of Concern, National, 80th percentileEJSCREEN Areas of Concern, National, 85th percentileEJSCREEN Areas of Concern, National, 90th percentileEJSCREEN Areas of Concern, National, 95th percentileEJSCREEN Areas of Concern, National, 70th percentile (communities)EJSCREEN Areas of Concern, National, 75th percentile (communities)EJSCREEN Areas of Concern, National, 80th percentile (communities)EJSCREEN Areas of Concern, National, 85th percentile (communities)EJSCREEN Areas of Concern, National, 90th percentile (communities)EJSCREEN Areas of Concern, National, 95th percentile (communities)
count93500935009350093500935009350093500935009350093500935009350093500
unique1111111111111
topFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
freq93500935009350093500935009350093500935009350093500935009350093500
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG EJSCREEN Areas of Concern, National, 70th percentile \\\n", + "count 93500 93500 \n", + "unique 1 1 \n", + "top False False \n", + "freq 93500 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile (communities) \n", + "count 93500 \n", + "unique 1 \n", + "top False \n", + "freq 93500 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"pd.isnull(df_reshaped_nat).describe()" + ] + }, + { + "cell_type": "markdown", + "id": "7bc0f71c", + "metadata": {}, + "source": [ + "### State" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2de68aa5", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this with something like glob when you have internet\n", + "filenames = [\n", + " \"CEQ_EJSCREEN_State_70.csv\",\n", + " \"CEQ_EJSCREEN_State_75.csv\",\n", + " \"CEQ_EJSCREEN_State_80.csv\",\n", + " \"CEQ_EJSCREEN_State_85.csv\",\n", + " \"CEQ_EJSCREEN_State_90.csv\",\n", + " \"CEQ_EJSCREEN_State_95.csv\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fccb416e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "70\n", + "75\n", + "80\n", + "85\n", + "90\n", + "95\n" + ] + } + ], + "source": [ + "dfs = []\n", + "for f in filenames:\n", + " percentile = f[-6:][:-4]\n", + " print(percentile)\n", + "\n", + " df = pd.read_csv(\n", + " os.path.join(\n", + " EJSCREEN_CEQ_STA_DIR, \"CEQ_EJSCREEN_State_{}.csv\".format(percentile)\n", + " ),\n", + " encoding=\"ISO-8859-1\",\n", + " dtype=\"str\",\n", + " )\n", + " df[\"EXCEED_COUNT\"] = pd.to_numeric(df[\"EXCEED_COUNT\"])\n", + "\n", + " df.rename(columns={\"ID\": GEOID_FIELD_NAME}, inplace=True)\n", + " df[\"percentile\"] = percentile\n", + " df = df[[GEOID_FIELD_NAME, \"percentile\", \"EXCEED_COUNT\"]]\n", + " dfs.append(df)\n", + "\n", + "df = pd.concat(dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8300e454", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGpercentileEXCEED_COUNT
0010010202002704
1010010203002703
2010010206001704
3010010206002709
40100102070017011
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG percentile EXCEED_COUNT\n", + "0 010010202002 70 4\n", + "1 010010203002 70 3\n", + "2 010010206001 70 4\n", + "3 010010206002 70 9\n", + "4 010010207001 70 11" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5be30b4f", + "metadata": {}, + "outputs": [], + "source": [ + "df_reshaped_sta = df.pivot(\n", + " index=GEOID_FIELD_NAME, columns=\"percentile\", values=\"EXCEED_COUNT\"\n", + ")\n", + "df_reshaped_sta.columns = [\n", + " \"EJSCREEN Areas of Concern, State, {}th percentile\".format(p)\n", + " for p in df_reshaped_sta.columns\n", + "]\n", + "df_reshaped_sta.fillna(0, inplace=True)\n", + "\n", + "for c in df_reshaped_sta.columns:\n", + " df_reshaped_sta[c + \" (communities)\"] = (df_reshaped_sta[c] > 0) * 1\n", + "df_reshaped_sta.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9206132b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGEJSCREEN Areas of Concern, State, 70th percentileEJSCREEN Areas of Concern, State, 75th percentileEJSCREEN Areas of Concern, State, 80th percentileEJSCREEN Areas of Concern, State, 85th percentileEJSCREEN Areas of Concern, State, 90th percentileEJSCREEN Areas of Concern, State, 95th percentileEJSCREEN Areas of Concern, State, 70th percentile (communities)EJSCREEN Areas of Concern, State, 75th percentile (communities)EJSCREEN Areas of Concern, State, 80th percentile (communities)EJSCREEN Areas of Concern, State, 85th percentile (communities)EJSCREEN Areas of Concern, State, 90th percentile (communities)EJSCREEN Areas of Concern, State, 95th percentile (communities)
00100102020024.000.000.000.000.000.00100000
10100102030023.003.003.002.000.000.00111100
20100102060014.003.002.001.001.000.00111110
30100102060029.008.007.004.002.001.00111111
401001020700111.0010.0010.008.008.008.00111111
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG EJSCREEN Areas of Concern, State, 70th percentile \\\n", + "0 010010202002 4.00 \n", + "1 010010203002 3.00 \n", + "2 010010206001 4.00 \n", + "3 010010206002 9.00 \n", + "4 010010207001 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 75th percentile \\\n", + "0 0.00 \n", + "1 3.00 \n", + "2 3.00 \n", + "3 8.00 \n", + "4 10.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 80th percentile \\\n", + "0 0.00 \n", + "1 3.00 \n", + "2 2.00 \n", + "3 7.00 \n", + "4 10.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 85th percentile \\\n", + "0 0.00 \n", + "1 2.00 \n", + "2 1.00 \n", + "3 4.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 90th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 1.00 \n", + "3 2.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 95th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 1.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 70th percentile (communities) \\\n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 75th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 80th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 85th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 90th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 95th percentile (communities) \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_reshaped_sta.head()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "id": "b551a4df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EJSCREEN Areas of Concern, National, 70th percentileEJSCREEN Areas of Concern, National, 75th percentileEJSCREEN Areas of Concern, National, 80th percentileEJSCREEN Areas of Concern, National, 85th percentileEJSCREEN Areas of Concern, National, 90th percentileEJSCREEN Areas of Concern, National, 95th percentileEJSCREEN Areas of Concern, National, 70th percentile (communities)EJSCREEN Areas of Concern, National, 75th percentile (communities)EJSCREEN Areas of Concern, National, 80th percentile (communities)EJSCREEN Areas of Concern, National, 85th percentile (communities)EJSCREEN Areas of Concern, National, 90th percentile (communities)EJSCREEN Areas of Concern, National, 95th percentile (communities)
count93500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.0093500.00
mean7.335.924.543.242.010.901.000.920.810.680.520.31
std3.363.763.793.502.901.930.000.270.390.470.500.46
min1.000.000.000.000.000.001.000.000.000.000.000.00
25%5.002.001.000.000.000.001.001.001.000.000.000.00
50%9.006.004.002.001.000.001.001.001.001.001.000.00
75%10.0010.008.006.003.001.001.001.001.001.001.001.00
max11.0011.0011.0011.0011.0011.001.001.001.001.001.001.00
\n", + "
" + ], + "text/plain": [ + " EJSCREEN Areas of Concern, National, 70th percentile \\\n", + "count 93500.00 \n", + "mean 7.33 \n", + "std 3.36 \n", + "min 1.00 \n", + "25% 5.00 \n", + "50% 9.00 \n", + "75% 10.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile \\\n", + "count 93500.00 \n", + "mean 5.92 \n", + "std 3.76 \n", + "min 0.00 \n", + "25% 2.00 \n", + "50% 6.00 \n", + "75% 10.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile \\\n", + "count 93500.00 \n", + "mean 4.54 \n", + "std 3.79 \n", + "min 0.00 \n", + "25% 1.00 \n", + "50% 4.00 \n", + "75% 8.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile \\\n", + "count 93500.00 \n", + "mean 3.24 \n", + "std 3.50 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 2.00 \n", + "75% 6.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile \\\n", + "count 93500.00 \n", + "mean 2.01 \n", + "std 2.90 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 1.00 \n", + "75% 3.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile \\\n", + "count 93500.00 \n", + "mean 0.90 \n", + "std 1.93 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 0.00 \n", + "75% 1.00 \n", + "max 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 1.00 \n", + "std 0.00 \n", + "min 1.00 \n", + "25% 1.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.92 \n", + "std 0.27 \n", + "min 0.00 \n", + "25% 1.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.81 \n", + "std 0.39 \n", + "min 0.00 \n", + "25% 1.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 
\n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.68 \n", + "std 0.47 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n", + "count 93500.00 \n", + "mean 0.52 \n", + "std 0.50 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 1.00 \n", + "75% 1.00 \n", + "max 1.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile (communities) \n", + "count 93500.00 \n", + "mean 0.31 \n", + "std 0.46 \n", + "min 0.00 \n", + "25% 0.00 \n", + "50% 0.00 \n", + "75% 1.00 \n", + "max 1.00 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_reshaped_nat.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c3cb5696", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGEJSCREEN Areas of Concern, State, 70th percentileEJSCREEN Areas of Concern, State, 75th percentileEJSCREEN Areas of Concern, State, 80th percentileEJSCREEN Areas of Concern, State, 85th percentileEJSCREEN Areas of Concern, State, 90th percentileEJSCREEN Areas of Concern, State, 95th percentileEJSCREEN Areas of Concern, State, 70th percentile (communities)EJSCREEN Areas of Concern, State, 75th percentile (communities)EJSCREEN Areas of Concern, State, 80th percentile (communities)EJSCREEN Areas of Concern, State, 85th percentile (communities)EJSCREEN Areas of Concern, State, 90th percentile (communities)EJSCREEN Areas of Concern, State, 95th percentile (communities)
count87555875558755587555875558755587555875558755587555875558755587555
unique1111111111111
topFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
freq87555875558755587555875558755587555875558755587555875558755587555
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG EJSCREEN Areas of Concern, State, 70th percentile \\\n", + "count 87555 87555 \n", + "unique 1 1 \n", + "top False False \n", + "freq 87555 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 75th percentile \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 80th percentile \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 85th percentile \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 90th percentile \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 95th percentile \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 70th percentile (communities) \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 75th percentile (communities) \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 80th percentile (communities) \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 85th percentile (communities) \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 90th percentile (communities) \\\n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 \n", + "\n", + " EJSCREEN Areas of Concern, State, 95th percentile (communities) \n", + "count 87555 \n", + "unique 1 \n", + "top False \n", + "freq 87555 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.isnull(df_reshaped_sta).describe()" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 17, + "id": "099cca8c", + "metadata": {}, + "outputs": [], + "source": [ + "df_reshaped = df_reshaped_nat.merge(df_reshaped_sta, on=GEOID_FIELD_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "23097787", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10_CBGEJSCREEN Areas of Concern, National, 70th percentileEJSCREEN Areas of Concern, National, 75th percentileEJSCREEN Areas of Concern, National, 80th percentileEJSCREEN Areas of Concern, National, 85th percentileEJSCREEN Areas of Concern, National, 90th percentileEJSCREEN Areas of Concern, National, 95th percentileEJSCREEN Areas of Concern, National, 70th percentile (communities)EJSCREEN Areas of Concern, National, 75th percentile (communities)EJSCREEN Areas of Concern, National, 80th percentile (communities)EJSCREEN Areas of Concern, National, 85th percentile (communities)EJSCREEN Areas of Concern, National, 90th percentile (communities)EJSCREEN Areas of Concern, National, 95th percentile (communities)EJSCREEN Areas of Concern, State, 70th percentileEJSCREEN Areas of Concern, State, 75th percentileEJSCREEN Areas of Concern, State, 80th percentileEJSCREEN Areas of Concern, State, 85th percentileEJSCREEN Areas of Concern, State, 90th percentileEJSCREEN Areas of Concern, State, 95th percentileEJSCREEN Areas of Concern, State, 70th percentile (communities)EJSCREEN Areas of Concern, State, 75th percentile (communities)EJSCREEN Areas of Concern, State, 80th percentile (communities)EJSCREEN Areas of Concern, State, 85th percentile (communities)EJSCREEN Areas of Concern, State, 90th percentile (communities)EJSCREEN Areas of Concern, State, 95th percentile (communities)
00100102020025.000.000.000.000.000.001000004.000.000.000.000.000.00100000
10100102030027.005.001.000.000.000.001110003.003.003.002.000.000.00111100
20100102060018.004.001.001.001.000.001111104.003.002.001.001.000.00111110
30100102060029.008.005.003.001.000.001111109.008.007.004.002.001.00111111
401001020700111.0011.008.008.006.003.0011111111.0010.0010.008.008.008.00111111
\n", + "
" + ], + "text/plain": [ + " GEOID10_CBG EJSCREEN Areas of Concern, National, 70th percentile \\\n", + "0 010010202002 5.00 \n", + "1 010010203002 7.00 \n", + "2 010010206001 8.00 \n", + "3 010010206002 9.00 \n", + "4 010010207001 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile \\\n", + "0 0.00 \n", + "1 5.00 \n", + "2 4.00 \n", + "3 8.00 \n", + "4 11.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile \\\n", + "0 0.00 \n", + "1 1.00 \n", + "2 1.00 \n", + "3 5.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 1.00 \n", + "3 3.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 1.00 \n", + "3 1.00 \n", + "4 6.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 0.00 \n", + "4 3.00 \n", + "\n", + " EJSCREEN Areas of Concern, National, 70th percentile (communities) \\\n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 75th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 80th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 85th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 90th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, National, 95th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 70th percentile \\\n", + "0 4.00 \n", + "1 3.00 \n", + "2 4.00 \n", + "3 9.00 \n", + "4 11.00 \n", 
+ "\n", + " EJSCREEN Areas of Concern, State, 75th percentile \\\n", + "0 0.00 \n", + "1 3.00 \n", + "2 3.00 \n", + "3 8.00 \n", + "4 10.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 80th percentile \\\n", + "0 0.00 \n", + "1 3.00 \n", + "2 2.00 \n", + "3 7.00 \n", + "4 10.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 85th percentile \\\n", + "0 0.00 \n", + "1 2.00 \n", + "2 1.00 \n", + "3 4.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 90th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 1.00 \n", + "3 2.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 95th percentile \\\n", + "0 0.00 \n", + "1 0.00 \n", + "2 0.00 \n", + "3 1.00 \n", + "4 8.00 \n", + "\n", + " EJSCREEN Areas of Concern, State, 70th percentile (communities) \\\n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 75th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 80th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 85th percentile (communities) \\\n", + "0 0 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 90th percentile (communities) \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "\n", + " EJSCREEN Areas of Concern, State, 95th percentile (communities) \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_reshaped.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "56098d7b", + "metadata": {}, + "outputs": [], + "source": [ + "df_reshaped.to_csv(\n", + " path_or_buf=LOCAL_DATA_OUTPUT_DIR\n", + " / \"ejscreen_areas_of_concerns_indicators.csv\",\n", + " na_rep=\"\",\n", + " 
index=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "403dfbc6", + "metadata": {}, + "source": [ + "# Next Steps / Questions\n", + "Lucas, here's what the output file looks like. For each CBG I have new columns corresponding to the different percentiles for both State and National. For each percentile there are two columns: one for the number of `EXCEED_COUNT` and a boolean indicator for whether `EXCEED_COUNT > 0` for that percentile. I think that's what we wanted, right?\n", + "\n", + "1. Do we have a list of all CBGs? The reason for asking is I created a CSV that lists each CBG and the number of EJSCREEN Areas of Concern for each percentile. It's not going to have all the CBGs in it, since if a CBG doesn't have an area of concern at least at the 70th percentile, then the CBG wouldn't have appeared in the source data set. Do we want to make sure to add all the remaining CBGs with 0's across the board? \n", + "1. Definitely need to clean up the code, at least make it less duplicative across national and state" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index 3a97dc2e..61ca4b20 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -34,7 +34,9 @@ "\n", "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n", "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n", - 
"\n", + "from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (\n", + " EJSCREENAreasOfConcernETL,\n", + ")\n", "\n", "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", "tqdm_notebook.pandas()" @@ -77,6 +79,14 @@ "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", "\n", + "LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n", + "HEALTH_INSURANCE_FIELD = (\n", + " \"Current lack of health insurance among adults aged 18-64 years\"\n", + ")\n", + "BAD_HEALTH_FIELD = (\n", + " \"Physical health not good for >=14 days among adults aged >=18 years\"\n", + ")\n", + "\n", "# Define some suffixes\n", "POPULATION_SUFFIX = \" (priority population)\"" ] @@ -108,6 +118,55 @@ "cejst_df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b1083e8", + "metadata": {}, + "outputs": [], + "source": [ + "# Load EJSCREEN Areas of Concern data.\n", + "\n", + "# Load EJ Screen Areas of Concern\n", + "# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n", + "ejscreen_areas_of_concern_df: pd.DataFrame = None\n", + "\n", + "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n", + " print(\"Loading EJSCREEN Areas of Concern data for score pipeline.\")\n", + " ejscreen_areas_of_concern_csv = (\n", + " DATA_DIR / \"dataset\" / \"ejscreen_areas_of_concern\" / \"usa.csv\"\n", + " )\n", + " ejscreen_areas_of_concern_df = pd.read_csv(\n", + " ejscreen_areas_of_concern_csv,\n", + " dtype={GEOID_FIELD_NAME: \"string\"},\n", + " low_memory=False,\n", + " )\n", + "else:\n", + " print(\n", + " \"EJSCREEN areas of concern data does not exist locally. 
Not attempting to load data into comparison tool.\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fec0ed63", + "metadata": {}, + "outputs": [], + "source": [ + "# Merge EJSCREEN AoCs into CEJST data.\n", + "# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n", + "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n", + " # If available, merge EJSCREEN AoC data into CBG dfs.\n", + " cejst_df = cejst_df.merge(\n", + " ejscreen_areas_of_concern_df, on=GEOID_FIELD_NAME, how=\"outer\"\n", + " )\n", + "else:\n", + " pass\n", + "\n", + "cejst_df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -343,11 +402,6 @@ " other_census_tract_fields_to_keep=[],\n", " ),\n", " Index(\n", - " method_name=\"Poverty\",\n", - " priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", " method_name=\"Persistent Poverty (CBG)\",\n", " priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n", " other_census_tract_fields_to_keep=[],\n", @@ -355,6 +409,34 @@ " ]\n", ")\n", "\n", + "\n", + "ejscreen_areas_of_concern_census_block_group_indices = [\n", + " Index(\n", + " method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n", + " priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n", + " priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n", + " priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n", + " 
other_census_tract_fields_to_keep=[],\n", + " ),\n", + "]\n", + "\n", + "# Before EJSCREEN AoC indicators are included, check whether or not the EJSCREEN AoC data is available locally.\n", + "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n", + " # Add EJSCREEN AoCs to all of the CBG indices.\n", + " census_block_group_indices.extend(\n", + " ejscreen_areas_of_concern_census_block_group_indices\n", + " )\n", + "else:\n", + " pass\n", + "\n", + "census_tract_indices = [\n", + " Index(\n", + " method_name=\"Persistent Poverty\",\n", @@ -620,6 +702,17 @@ " for index in census_block_group_indices + census_tract_indices\n", "]\n", "\n", + "# Convert all indices to boolean\n", + "for field_to_analyze in fields_to_analyze:\n", + " if \"Areas of Concern\" in field_to_analyze:\n", + " print(f\"Converting {field_to_analyze} to boolean.\")\n", + "\n", + " merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n", + " value=0\n", + " )\n", + " merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n", + "\n", + "\n", "state_fips_codes = get_state_information(DATA_DIR)\n", "\n", "merged_with_state_information_df = merged_df.merge(\n", @@ -835,6 +928,9 @@ " \"Unemployed civilians (percent)\",\n", " \"Median household income in the past 12 months\",\n", " URBAN_HEURISTIC_FIELD,\n", + " LIFE_EXPECTANCY_FIELD,\n", + " HEALTH_INSURANCE_FIELD,\n", + " BAD_HEALTH_FIELD,\n", "]\n", "\n", "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n", @@ -1495,7 +1591,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1509,7 +1605,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 
c2ceb0c3..b2de2a1a 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -56,7 +56,9 @@ POVERTY_LESS_THAN_100_FPL_PERCENTILE_FIELD = ( "Percent of individuals < 100% Federal Poverty Line (percentile)" ) MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)" -MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = "Median household income (% of AMI) (percentile)" +MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = ( + "Median household income (% of AMI) (percentile)" +) STATE_MEDIAN_INCOME_FIELD = ( "Median household income (State; 2019 inflation-adjusted dollars)" ) @@ -153,3 +155,42 @@ OVER_64_FIELD = "Individuals over 64 years old" # Urban Rural Map URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag" + + +# EJSCREEN Areas of Concern +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 70th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 75th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 80th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 85th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 90th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 95th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 70th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 75th 
percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 80th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 85th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 90th percentile (communities)" +) +EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 95th percentile (communities)" +) diff --git a/data/data-pipeline/data_pipeline/score/score_c.py b/data/data-pipeline/data_pipeline/score/score_c.py index 121718c8..633739af 100644 --- a/data/data-pipeline/data_pipeline/score/score_c.py +++ b/data/data-pipeline/data_pipeline/score/score_c.py @@ -10,7 +10,7 @@ logger = get_module_logger(__name__) class ScoreC(Score): def __init__(self, df: pd.DataFrame) -> None: - Bucket = namedtuple('Bucket', ['name', 'fields']) + Bucket = namedtuple(typename="Bucket", field_names=["name", "fields"]) self.BUCKET_SOCIOECONOMIC = Bucket( field_names.C_SOCIOECONOMIC, @@ -20,15 +20,15 @@ class ScoreC(Score): field_names.HIGH_SCHOOL_ED_FIELD, field_names.UNEMPLOYMENT_FIELD, field_names.HT_INDEX_FIELD, - ] - ) + ], + ) self.BUCKET_SENSITIVE = Bucket( field_names.C_SENSITIVE, [ field_names.UNDER_5_FIELD, field_names.OVER_64_FIELD, field_names.LINGUISTIC_ISO_FIELD, - ] + ], ) self.BUCKET_ENVIRONMENTAL = Bucket( field_names.C_ENVIRONMENTAL, @@ -38,7 +38,7 @@ class ScoreC(Score): field_names.NPL_FIELD, field_names.WASTEWATER_FIELD, field_names.LEAD_PAINT_FIELD, - ] + ], ) self.BUCKET_EXPOSURES = Bucket( field_names.C_EXPOSURES, @@ -63,7 +63,7 @@ class ScoreC(Score): def add_columns(self) -> pd.DataFrame: logger.info("Adding Score C") # Average all the percentile values in each bucket into a single score for each of the four buckets. 
- + # TODO just use the percentile fields in the list instead for bucket in self.BUCKETS: fields_to_average = []