Add EJSCREEN Areas of Concern (#843)

* Adding ej screen areas of concern

* Uses it where user has local files, but not otherwise

Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
This commit is contained in:
Lucas Merrill Brown 2021-11-02 15:38:42 -04:00 committed by GitHub
commit 1d541be447
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 2546 additions and 18 deletions

View file

@ -12,7 +12,7 @@ DATASET_LIST = [
{
"name": "ejscreen",
"module_dir": "ejscreen",
"class_name": "EJScreenETL",
"class_name": "EJSCREENETL",
},
{
"name": "housing_and_transportation",
@ -69,6 +69,11 @@ DATASET_LIST = [
"module_dir": "persistent_poverty",
"class_name": "PersistentPovertyETL",
},
{
"name": "ejscreen_areas_of_concern",
"module_dir": "ejscreen_areas_of_concern",
"class_name": "EJSCREENAreasOfConcernETL",
},
]
CENSUS_INFO = {
"name": "census",

View file

@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_acs_median_incomes_df,
self.national_risk_index_df,
]
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
# Join all the data sources that use census tracts
@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad):
field_names.UNEMPLOYMENT_FIELD,
field_names.HT_INDEX_FIELD,
]
non_numeric_columns = [
self.GEOID_FIELD_NAME,
field_names.PERSISTENT_POVERTY_FIELD,
]
columns_to_keep = non_numeric_columns + numeric_columns
df = df[columns_to_keep]

View file

@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class EJScreenETL(ExtractTransformLoad):
class EJSCREENETL(ExtractTransformLoad):
def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"

View file

@ -0,0 +1,3 @@
# EJSCREEN Areas of Concern Data
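Note: this dataset is not public, so the source data file must be stored locally for this ETL to run. The ETL looks for `ejscreen_areas_of_concerns_indicators.csv` in the `local` directory under the pipeline's data path; if the file is not there, the ETL logs a message and skips loading and saving.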

View file

@ -0,0 +1,73 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
    """ETL for the EJSCREEN Areas of Concern dataset.

    The source data is read from a local CSV file. When that file is not
    present, `extract` and `load` log a message and do nothing else, so the
    ETL can be run whether or not the file is available.
    """

    # These paths are normally set in `__init__`, but they are defined at
    # class level here so that the class method
    # `ejscreen_areas_of_concern_data_exists` can read them without an instance.
    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
    )

    def __init__(self):
        # Directory that `load` writes the nationwide CSV into.
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
        )

        # TODO: Load from the actual source; this dataset is not public for now.
        # Declared here, assigned by `extract` only when the local file exists.
        self.df: pd.DataFrame

    @classmethod
    def ejscreen_areas_of_concern_data_exists(cls):
        """Return whether the local EJSCREEN areas of concern file exists.

        Note: this data is provided privately and is not currently publicly
        available. So that the EJSCREEN AoC ETL code runs appropriately
        whether or not the person running it has access to the data, this
        method checks whether the source file exists. If it exists, code can
        and should include this data. If it does not exist, code should not
        reference this data.
        """
        source_file = cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA
        return source_file.is_file()

    def extract(self) -> None:
        """Read the local source CSV into `self.df`, if the file exists."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not loading the data."
            )
            return

        logger.info("Loading EJSCREEN Areas of Concern Data Locally")

        # Read the GEOID column with the pandas "string" dtype rather than
        # letting pandas infer a type for it.
        column_dtypes = {self.GEOID_FIELD_NAME: "string"}
        self.df = pd.read_csv(
            filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
            dtype=column_dtypes,
            low_memory=False,
        )

    def transform(self) -> None:
        """Log the transform step; no transformation is applied here yet."""
        logger.info("Transforming EJSCREEN Areas of Concern Data")

        # TODO: As a one-off, all the processing was done in a separate
        # notebook. It can be added here in a future PR.

    def load(self) -> None:
        """Write `self.df` to `usa.csv` under the output directory, if the source file exists."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not saving the data."
            )
            return

        logger.info("Saving EJSCREEN Areas of Concern Data")

        # Write the nationwide CSV, creating the output directory if needed.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        nationwide_csv = self.OUTPUT_PATH / "usa.csv"
        self.df.to_csv(nationwide_csv, index=False)