Add EJSCREEN Areas of Concern (#843)

* Adding ej screen areas of concern

* Uses it where user has local files, but not otherwise

Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
This commit is contained in:
Lucas Merrill Brown 2021-11-02 15:38:42 -04:00 committed by GitHub
parent 1795be6cb4
commit 1d541be447
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 2546 additions and 18 deletions

View file

@ -12,7 +12,7 @@ DATASET_LIST = [
{
"name": "ejscreen",
"module_dir": "ejscreen",
"class_name": "EJScreenETL",
"class_name": "EJSCREENETL",
},
{
"name": "housing_and_transportation",
@ -69,6 +69,11 @@ DATASET_LIST = [
"module_dir": "persistent_poverty",
"class_name": "PersistentPovertyETL",
},
{
"name": "ejscreen_areas_of_concern",
"module_dir": "ejscreen_areas_of_concern",
"class_name": "EJSCREENAreasOfConcernETL",
},
]
CENSUS_INFO = {
"name": "census",

View file

@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_acs_median_incomes_df,
self.national_risk_index_df,
]
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
# Join all the data sources that use census tracts
@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad):
field_names.UNEMPLOYMENT_FIELD,
field_names.HT_INDEX_FIELD,
]
non_numeric_columns = [
self.GEOID_FIELD_NAME,
field_names.PERSISTENT_POVERTY_FIELD,
]
columns_to_keep = non_numeric_columns + numeric_columns
df = df[columns_to_keep]

View file

@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
class EJSCREENETL(ExtractTransformLoad):
def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"

View file

@ -0,0 +1,3 @@
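# EJSCREEN Areas of Concern Data

Note: this dataset is not publicly available, so the source data file must be stored locally for this ETL to run. If the file is not present locally, the ETL logs a message and skips loading and saving the data.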

View file

@ -0,0 +1,73 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
    """ETL for the EJSCREEN Areas of Concern indicators.

    The source data is provided privately, so every step first checks whether
    the source CSV is present locally and only logs a message when it is not.
    """

    # These paths are class-level (rather than being set in `__init__`, as is
    # done elsewhere) so that the class method
    # `ejscreen_areas_of_concern_data_exists` can read them without an instance.
    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
    )

    def __init__(self):
        dataset_root = self.DATA_PATH / "dataset"
        self.OUTPUT_PATH = dataset_root / "ejscreen_areas_of_concern"

        # TODO: Load from the actual source once this dataset becomes public.
        # Declared only; the attribute is assigned in `extract` when the
        # source file is available.
        self.df: pd.DataFrame

    @classmethod
    def ejscreen_areas_of_concern_data_exists(cls):
        """Return True when the EJSCREEN Areas of Concern source file exists.

        This data is provided privately and is not currently publicly
        available. So that the EJSCREEN AoC ETL code runs appropriately whether
        or not the person running it has access to the data, callers should
        use this check: when it returns True, code can and should include this
        data; when it returns False, code should not reference this data.
        """
        source_file = cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA
        return source_file.is_file()

    def extract(self) -> None:
        """Read the local source CSV into `self.df`, if the file is present."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not loading the data."
            )
            return

        logger.info("Loading EJSCREEN Areas of Concern Data Locally")

        # Read the GEOID column as a string rather than letting pandas infer
        # its type.
        column_types = {self.GEOID_FIELD_NAME: "string"}
        self.df = pd.read_csv(
            filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
            dtype=column_types,
            low_memory=False,
        )

    def transform(self) -> None:
        """Log the transform step; no transformation is applied here yet."""
        # TODO: As a one-off, all of the processing was done in a separate
        # notebook. It can be moved here in a future PR.
        logger.info("Transforming EJSCREEN Areas of Concern Data")

    def load(self) -> None:
        """Write `self.df` to `usa.csv` under OUTPUT_PATH, if source data exists."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not saving the data."
            )
            return

        logger.info("Saving EJSCREEN Areas of Concern Data")

        # Write the nationwide CSV, creating the output directory if needed.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        nationwide_csv = self.OUTPUT_PATH / "usa.csv"
        self.df.to_csv(nationwide_csv, index=False)

File diff suppressed because it is too large Load diff

View file

@ -34,7 +34,9 @@
"\n",
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (\n",
" EJSCREENAreasOfConcernETL,\n",
")\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()"
@ -77,6 +79,14 @@
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
"\n",
"LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n",
"HEALTH_INSURANCE_FIELD = (\n",
" \"Current lack of health insurance among adults aged 18-64 years\"\n",
")\n",
"BAD_HEALTH_FIELD = (\n",
" \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
")\n",
"\n",
"# Define some suffixes\n",
"POPULATION_SUFFIX = \" (priority population)\""
]
@ -108,6 +118,55 @@
"cejst_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b1083e8",
"metadata": {},
"outputs": [],
"source": [
"# Load EJSCREEN Areas of Concern data.\n",
"\n",
"# Load EJ Screen Areas of Concern\n",
"# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n",
"ejscreen_areas_of_concern_df: pd.DataFrame = None\n",
"\n",
"if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
" print(\"Loading EJSCREEN Areas of Concern data for score pipeline.\")\n",
" ejscreen_areas_of_concern_csv = (\n",
" DATA_DIR / \"dataset\" / \"ejscreen_areas_of_concern\" / \"usa.csv\"\n",
" )\n",
" ejscreen_areas_of_concern_df = pd.read_csv(\n",
" ejscreen_areas_of_concern_csv,\n",
" dtype={GEOID_FIELD_NAME: \"string\"},\n",
" low_memory=False,\n",
" )\n",
"else:\n",
" print(\n",
" \"EJSCREEN areas of concern data does not exist locally. Not attempting to load data into comparison tool.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fec0ed63",
"metadata": {},
"outputs": [],
"source": [
"# Merge EJSCREEN AoCs into CEJST data.\n",
"# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n",
"if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
" # If available, merge EJSCREEN AoC data into CBG dfs.\n",
" cejst_df = cejst_df.merge(\n",
" ejscreen_areas_of_concern_df, on=GEOID_FIELD_NAME, how=\"outer\"\n",
" )\n",
"else:\n",
" pass\n",
"\n",
"cejst_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -343,11 +402,6 @@
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Poverty\",\n",
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Persistent Poverty (CBG)\",\n",
" priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
@ -355,6 +409,34 @@
" ]\n",
")\n",
"\n",
"\n",
"ejscreen_areas_of_concern_census_block_group_indices = [\n",
" Index(\n",
" method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
" priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
" priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
" priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"]\n",
"\n",
"# Before including EJSCREEN AoC indicators are included, check whether or not the EJSCREEN AoC data is available locally.\n",
"if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
" # Add EJSCREEN AoCs to all of the CBG indices.\n",
" census_block_group_indices.extend(\n",
" ejscreen_areas_of_concern_census_block_group_indices\n",
" )\n",
"else:\n",
" pass\n",
"\n",
"census_tract_indices = [\n",
" Index(\n",
" method_name=\"Persistent Poverty\",\n",
@ -620,6 +702,17 @@
" for index in census_block_group_indices + census_tract_indices\n",
"]\n",
"\n",
"# Convert all indices to boolean\n",
"for field_to_analyze in fields_to_analyze:\n",
" if \"Areas of Concern\" in field_to_analyze:\n",
" print(f\"Converting {field_to_analyze} to boolean.\")\n",
"\n",
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n",
" value=0\n",
" )\n",
" merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
"\n",
"\n",
"state_fips_codes = get_state_information(DATA_DIR)\n",
"\n",
"merged_with_state_information_df = merged_df.merge(\n",
@ -835,6 +928,9 @@
" \"Unemployed civilians (percent)\",\n",
" \"Median household income in the past 12 months\",\n",
" URBAN_HEURISTIC_FIELD,\n",
" LIFE_EXPECTANCY_FIELD,\n",
" HEALTH_INSURANCE_FIELD,\n",
" BAD_HEALTH_FIELD,\n",
"]\n",
"\n",
"for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",
@ -1495,7 +1591,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -1509,7 +1605,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
"version": "3.9.6"
}
},
"nbformat": 4,

View file

@ -56,7 +56,9 @@ POVERTY_LESS_THAN_100_FPL_PERCENTILE_FIELD = (
"Percent of individuals < 100% Federal Poverty Line (percentile)"
)
MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)"
MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = "Median household income (% of AMI) (percentile)"
MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = (
"Median household income (% of AMI) (percentile)"
)
STATE_MEDIAN_INCOME_FIELD = (
"Median household income (State; 2019 inflation-adjusted dollars)"
)
@ -153,3 +155,42 @@ OVER_64_FIELD = "Individuals over 64 years old"
# Urban Rural Map
URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
# EJSCREEN Areas of Concern
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, National, 70th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, National, 75th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, National, 80th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, National, 85th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, National, 90th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, National, 95th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, State, 70th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, State, 75th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, State, 80th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, State, 85th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, State, 90th percentile (communities)"
)
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
"EJSCREEN Areas of Concern, State, 95th percentile (communities)"
)

View file

@ -10,7 +10,7 @@ logger = get_module_logger(__name__)
class ScoreC(Score):
def __init__(self, df: pd.DataFrame) -> None:
Bucket = namedtuple('Bucket', ['name', 'fields'])
Bucket = namedtuple(typename="Bucket", field_names=["name", "fields"])
self.BUCKET_SOCIOECONOMIC = Bucket(
field_names.C_SOCIOECONOMIC,
@ -20,15 +20,15 @@ class ScoreC(Score):
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.HT_INDEX_FIELD,
]
)
],
)
self.BUCKET_SENSITIVE = Bucket(
field_names.C_SENSITIVE,
[
field_names.UNDER_5_FIELD,
field_names.OVER_64_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
]
],
)
self.BUCKET_ENVIRONMENTAL = Bucket(
field_names.C_ENVIRONMENTAL,
@ -38,7 +38,7 @@ class ScoreC(Score):
field_names.NPL_FIELD,
field_names.WASTEWATER_FIELD,
field_names.LEAD_PAINT_FIELD,
]
],
)
self.BUCKET_EXPOSURES = Bucket(
field_names.C_EXPOSURES,
@ -63,7 +63,7 @@ class ScoreC(Score):
def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score C")
# Average all the percentile values in each bucket into a single score for each of the four buckets.
# TODO just use the percentile fields in the list instead
for bucket in self.BUCKETS:
fields_to_average = []