mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 09:41:26 -08:00
Add EJSCREEN Areas of Concern (#843)
* Adding ej screen areas of concern * Uses it where user has local files, but not otherwise Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
This commit is contained in:
parent
1795be6cb4
commit
1d541be447
10 changed files with 2546 additions and 18 deletions
|
@ -12,7 +12,7 @@ DATASET_LIST = [
|
|||
{
|
||||
"name": "ejscreen",
|
||||
"module_dir": "ejscreen",
|
||||
"class_name": "EJScreenETL",
|
||||
"class_name": "EJSCREENETL",
|
||||
},
|
||||
{
|
||||
"name": "housing_and_transportation",
|
||||
|
@ -69,6 +69,11 @@ DATASET_LIST = [
|
|||
"module_dir": "persistent_poverty",
|
||||
"class_name": "PersistentPovertyETL",
|
||||
},
|
||||
{
|
||||
"name": "ejscreen_areas_of_concern",
|
||||
"module_dir": "ejscreen_areas_of_concern",
|
||||
"class_name": "EJSCREENAreasOfConcernETL",
|
||||
},
|
||||
]
|
||||
CENSUS_INFO = {
|
||||
"name": "census",
|
||||
|
|
|
@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants
|
|||
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
|
@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.census_acs_median_incomes_df,
|
||||
self.national_risk_index_df,
|
||||
]
|
||||
|
||||
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
|
||||
|
||||
# Join all the data sources that use census tracts
|
||||
|
@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.UNEMPLOYMENT_FIELD,
|
||||
field_names.HT_INDEX_FIELD,
|
||||
]
|
||||
|
||||
non_numeric_columns = [
|
||||
self.GEOID_FIELD_NAME,
|
||||
field_names.PERSISTENT_POVERTY_FIELD,
|
||||
]
|
||||
|
||||
columns_to_keep = non_numeric_columns + numeric_columns
|
||||
df = df[columns_to_keep]
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger
|
|||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
class EJScreenETL(ExtractTransformLoad):
|
||||
class EJSCREENETL(ExtractTransformLoad):
|
||||
def __init__(self):
|
||||
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
|
||||
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# EJ Screen Areas of Concern Data
|
||||
|
||||
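Note: this dataset is not public, so the source data file (`ejscreen_areas_of_concerns_indicators.csv`) must be stored in the `local` folder of the pipeline's data directory for this ETL to load anything. If the file is not present, the ETL logs a message and skips loading and saving.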
|
|
@ -0,0 +1,73 @@
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
    """ETL for the EJSCREEN Areas of Concern dataset, read from a local CSV.

    The source data is provided privately and is not currently publicly
    available. `extract` and `load` therefore check whether the local source
    file exists and only log a message when it does not.
    """

    # Note: these paths are normally set in `__init__`, but they are class
    # attributes here so that the class method
    # `ejscreen_areas_of_concern_data_exists` can read them without an instance.
    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
    )

    def __init__(self):
        """Set the output directory for the processed dataset."""
        dataset_root = self.DATA_PATH / "dataset"
        self.OUTPUT_PATH = dataset_root / "ejscreen_areas_of_concern"

        # TO DO: Load from actual source; the issue is that this dataset is not public for now
        # Annotation only: `self.df` is not assigned until `extract` reads the CSV.
        self.df: pd.DataFrame

    @classmethod
    def ejscreen_areas_of_concern_data_exists(cls):
        """Return whether the local EJSCREEN Areas of Concern source file exists.

        Note: this data is provided privately and is not currently publicly
        available.

        To let the EJSCREEN AoC ETL code run appropriately whether or not the
        person running it has access to that data, this method checks whether
        the source file exists.

        If it exists, code can and should include this data. If it does not
        exist, code should not reference this data.
        """
        source_file = cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA
        return source_file.is_file()

    def extract(self) -> None:
        """Read the local source CSV into `self.df` when the file is present."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not loading the data."
            )
            return

        logger.info("Loading EJSCREEN Areas of Concern Data Locally")

        # The GEOID column is read with the pandas "string" dtype.
        column_types = {self.GEOID_FIELD_NAME: "string"}
        self.df = pd.read_csv(
            filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
            dtype=column_types,
            low_memory=False,
        )

    def transform(self) -> None:
        """Log the transform step; no transformation is applied here yet."""
        # TO DO: As a one off we did all the processing in a separate Notebook
        # Can add here later for a future PR
        logger.info("Transforming EJSCREEN Areas of Concern Data")

    def load(self) -> None:
        """Write `self.df` to `usa.csv` in the output directory when the source file is present."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not saving the data."
            )
            return

        logger.info("Saving EJSCREEN Areas of Concern Data")

        # write nationwide csv
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        nationwide_csv = self.OUTPUT_PATH / "usa.csv"
        self.df.to_csv(nationwide_csv, index=False)
|
2308
data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
Normal file
2308
data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
Normal file
File diff suppressed because it is too large
Load diff
|
@ -34,7 +34,9 @@
|
|||
"\n",
|
||||
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
|
||||
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
|
||||
"\n",
|
||||
"from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (\n",
|
||||
" EJSCREENAreasOfConcernETL,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
||||
"tqdm_notebook.pandas()"
|
||||
|
@ -77,6 +79,14 @@
|
|||
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
|
||||
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
|
||||
"\n",
|
||||
"LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n",
|
||||
"HEALTH_INSURANCE_FIELD = (\n",
|
||||
" \"Current lack of health insurance among adults aged 18-64 years\"\n",
|
||||
")\n",
|
||||
"BAD_HEALTH_FIELD = (\n",
|
||||
" \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Define some suffixes\n",
|
||||
"POPULATION_SUFFIX = \" (priority population)\""
|
||||
]
|
||||
|
@ -108,6 +118,55 @@
|
|||
"cejst_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1b1083e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load EJSCREEN Areas of Concern data.\n",
|
||||
"\n",
|
||||
"# Load EJ Screen Areas of Concern\n",
|
||||
"# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n",
|
||||
"ejscreen_areas_of_concern_df: pd.DataFrame = None\n",
|
||||
"\n",
|
||||
"if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
|
||||
" print(\"Loading EJSCREEN Areas of Concern data for score pipeline.\")\n",
|
||||
" ejscreen_areas_of_concern_csv = (\n",
|
||||
" DATA_DIR / \"dataset\" / \"ejscreen_areas_of_concern\" / \"usa.csv\"\n",
|
||||
" )\n",
|
||||
" ejscreen_areas_of_concern_df = pd.read_csv(\n",
|
||||
" ejscreen_areas_of_concern_csv,\n",
|
||||
" dtype={GEOID_FIELD_NAME: \"string\"},\n",
|
||||
" low_memory=False,\n",
|
||||
" )\n",
|
||||
"else:\n",
|
||||
" print(\n",
|
||||
" \"EJSCREEN areas of concern data does not exist locally. Not attempting to load data into comparison tool.\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fec0ed63",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Merge EJSCREEN AoCs into CEJST data.\n",
|
||||
"# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n",
|
||||
"if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
|
||||
" # If available, merge EJSCREEN AoC data into CBG dfs.\n",
|
||||
" cejst_df = cejst_df.merge(\n",
|
||||
" ejscreen_areas_of_concern_df, on=GEOID_FIELD_NAME, how=\"outer\"\n",
|
||||
" )\n",
|
||||
"else:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"cejst_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -343,11 +402,6 @@
|
|||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
" Index(\n",
|
||||
" method_name=\"Poverty\",\n",
|
||||
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
" Index(\n",
|
||||
" method_name=\"Persistent Poverty (CBG)\",\n",
|
||||
" priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
|
@ -355,6 +409,34 @@
|
|||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ejscreen_areas_of_concern_census_block_group_indices = [\n",
|
||||
" Index(\n",
|
||||
" method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
|
||||
" priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
" Index(\n",
|
||||
" method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
|
||||
" priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
" Index(\n",
|
||||
" method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
|
||||
" priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n",
|
||||
" other_census_tract_fields_to_keep=[],\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Before EJSCREEN AoC indicators are included, check whether or not the EJSCREEN AoC data is available locally.\n",
|
||||
"if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
|
||||
" # Add EJSCREEN AoCs to all of the CBG indices.\n",
|
||||
" census_block_group_indices.extend(\n",
|
||||
" ejscreen_areas_of_concern_census_block_group_indices\n",
|
||||
" )\n",
|
||||
"else:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"census_tract_indices = [\n",
|
||||
" Index(\n",
|
||||
" method_name=\"Persistent Poverty\",\n",
|
||||
|
@ -620,6 +702,17 @@
|
|||
" for index in census_block_group_indices + census_tract_indices\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Convert all indices to boolean\n",
|
||||
"for field_to_analyze in fields_to_analyze:\n",
|
||||
" if \"Areas of Concern\" in field_to_analyze:\n",
|
||||
" print(f\"Converting {field_to_analyze} to boolean.\")\n",
|
||||
"\n",
|
||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n",
|
||||
" value=0\n",
|
||||
" )\n",
|
||||
" merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state_fips_codes = get_state_information(DATA_DIR)\n",
|
||||
"\n",
|
||||
"merged_with_state_information_df = merged_df.merge(\n",
|
||||
|
@ -835,6 +928,9 @@
|
|||
" \"Unemployed civilians (percent)\",\n",
|
||||
" \"Median household income in the past 12 months\",\n",
|
||||
" URBAN_HEURISTIC_FIELD,\n",
|
||||
" LIFE_EXPECTANCY_FIELD,\n",
|
||||
" HEALTH_INSURANCE_FIELD,\n",
|
||||
" BAD_HEALTH_FIELD,\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",
|
||||
|
@ -1495,7 +1591,7 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -1509,7 +1605,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.5"
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -56,7 +56,9 @@ POVERTY_LESS_THAN_100_FPL_PERCENTILE_FIELD = (
|
|||
"Percent of individuals < 100% Federal Poverty Line (percentile)"
|
||||
)
|
||||
MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)"
|
||||
MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = "Median household income (% of AMI) (percentile)"
|
||||
MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = (
|
||||
"Median household income (% of AMI) (percentile)"
|
||||
)
|
||||
STATE_MEDIAN_INCOME_FIELD = (
|
||||
"Median household income (State; 2019 inflation-adjusted dollars)"
|
||||
)
|
||||
|
@ -153,3 +155,42 @@ OVER_64_FIELD = "Individuals over 64 years old"
|
|||
|
||||
# Urban Rural Map
|
||||
URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
|
||||
|
||||
|
||||
# EJSCREEN Areas of Concern
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, National, 70th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, National, 75th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, National, 80th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, National, 85th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, National, 90th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, National, 95th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, State, 70th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, State, 75th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, State, 80th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, State, 85th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, State, 90th percentile (communities)"
|
||||
)
|
||||
EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
|
||||
"EJSCREEN Areas of Concern, State, 95th percentile (communities)"
|
||||
)
|
||||
|
|
|
@ -10,7 +10,7 @@ logger = get_module_logger(__name__)
|
|||
|
||||
class ScoreC(Score):
|
||||
def __init__(self, df: pd.DataFrame) -> None:
|
||||
Bucket = namedtuple('Bucket', ['name', 'fields'])
|
||||
Bucket = namedtuple(typename="Bucket", field_names=["name", "fields"])
|
||||
|
||||
self.BUCKET_SOCIOECONOMIC = Bucket(
|
||||
field_names.C_SOCIOECONOMIC,
|
||||
|
@ -20,15 +20,15 @@ class ScoreC(Score):
|
|||
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||
field_names.UNEMPLOYMENT_FIELD,
|
||||
field_names.HT_INDEX_FIELD,
|
||||
]
|
||||
)
|
||||
],
|
||||
)
|
||||
self.BUCKET_SENSITIVE = Bucket(
|
||||
field_names.C_SENSITIVE,
|
||||
[
|
||||
field_names.UNDER_5_FIELD,
|
||||
field_names.OVER_64_FIELD,
|
||||
field_names.LINGUISTIC_ISO_FIELD,
|
||||
]
|
||||
],
|
||||
)
|
||||
self.BUCKET_ENVIRONMENTAL = Bucket(
|
||||
field_names.C_ENVIRONMENTAL,
|
||||
|
@ -38,7 +38,7 @@ class ScoreC(Score):
|
|||
field_names.NPL_FIELD,
|
||||
field_names.WASTEWATER_FIELD,
|
||||
field_names.LEAD_PAINT_FIELD,
|
||||
]
|
||||
],
|
||||
)
|
||||
self.BUCKET_EXPOSURES = Bucket(
|
||||
field_names.C_EXPOSURES,
|
||||
|
@ -63,7 +63,7 @@ class ScoreC(Score):
|
|||
def add_columns(self) -> pd.DataFrame:
|
||||
logger.info("Adding Score C")
|
||||
# Average all the percentile values in each bucket into a single score for each of the four buckets.
|
||||
|
||||
|
||||
# TODO just use the percentile fields in the list instead
|
||||
for bucket in self.BUCKETS:
|
||||
fields_to_average = []
|
||||
|
|
Loading…
Add table
Reference in a new issue