mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-26 19:11:16 -07:00
Add EJSCREEN Areas of Concern (#843)
* Adding ej screen areas of concern * Uses it where user has local files, but not otherwise Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
This commit is contained in:
parent
1795be6cb4
commit
1d541be447
10 changed files with 2546 additions and 18 deletions
|
@ -12,7 +12,7 @@ DATASET_LIST = [
|
|||
{
|
||||
"name": "ejscreen",
|
||||
"module_dir": "ejscreen",
|
||||
"class_name": "EJScreenETL",
|
||||
"class_name": "EJSCREENETL",
|
||||
},
|
||||
{
|
||||
"name": "housing_and_transportation",
|
||||
|
@ -69,6 +69,11 @@ DATASET_LIST = [
|
|||
"module_dir": "persistent_poverty",
|
||||
"class_name": "PersistentPovertyETL",
|
||||
},
|
||||
{
|
||||
"name": "ejscreen_areas_of_concern",
|
||||
"module_dir": "ejscreen_areas_of_concern",
|
||||
"class_name": "EJSCREENAreasOfConcernETL",
|
||||
},
|
||||
]
|
||||
CENSUS_INFO = {
|
||||
"name": "census",
|
||||
|
|
|
@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants
|
|||
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
|
@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.census_acs_median_incomes_df,
|
||||
self.national_risk_index_df,
|
||||
]
|
||||
|
||||
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)
|
||||
|
||||
# Join all the data sources that use census tracts
|
||||
|
@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.UNEMPLOYMENT_FIELD,
|
||||
field_names.HT_INDEX_FIELD,
|
||||
]
|
||||
|
||||
non_numeric_columns = [
|
||||
self.GEOID_FIELD_NAME,
|
||||
field_names.PERSISTENT_POVERTY_FIELD,
|
||||
]
|
||||
|
||||
columns_to_keep = non_numeric_columns + numeric_columns
|
||||
df = df[columns_to_keep]
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger
|
|||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
class EJScreenETL(ExtractTransformLoad):
|
||||
class EJSCREENETL(ExtractTransformLoad):
|
||||
def __init__(self):
|
||||
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
|
||||
self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# EJ Screen Areas of Concern Data
|
||||
|
||||
Note: this dataset is not public, so the data file must be stored locally for this ETL to run.
|
|
@ -0,0 +1,73 @@
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
    """ETL for the EJSCREEN Areas of Concern indicators.

    The source data is provided privately and is not publicly available, so
    every step first checks for a local copy of the CSV. When the file is
    absent, `extract` and `load` log a message and do nothing else.
    """

    # These paths are normally set in `__init__`; they are class attributes
    # here so that the class method `ejscreen_areas_of_concern_data_exists`
    # can read them without an instance.
    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
    )

    def __init__(self):
        """Set the output directory and declare the data frame attribute."""
        self.OUTPUT_PATH = (
            self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
        )

        # TODO: Load from the actual source; for now the dataset is not public.
        # Annotation only: `df` is not assigned until `extract` reads the CSV.
        self.df: pd.DataFrame

    @classmethod
    def ejscreen_areas_of_concern_data_exists(cls):
        """Return whether the local EJSCREEN areas of concern file exists.

        The data is provided privately and is not currently publicly
        available. This check lets the ETL code for EJSCREEN AoCs run
        whether or not the person running it has access to the data.

        If the file exists, code can and should include this data. If it
        does not exist, code should not reference this data.
        """
        source_file = cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA
        return source_file.is_file()

    def extract(self) -> None:
        """Read the local source CSV into `self.df`, if the file exists."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not loading the data."
            )
            return

        logger.info("Loading EJSCREEN Areas of Concern Data Locally")
        # Read the geo ID column with the pandas "string" dtype.
        column_dtypes = {self.GEOID_FIELD_NAME: "string"}
        self.df = pd.read_csv(
            filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
            dtype=column_dtypes,
            low_memory=False,
        )

    def transform(self) -> None:
        """Log the transform step; no transformation is applied here."""
        logger.info("Transforming EJSCREEN Areas of Concern Data")

        # TODO: As a one-off, all the processing was done in a separate
        # notebook. It can be added here in a future PR.

    def load(self) -> None:
        """Write `self.df` to `usa.csv` under `OUTPUT_PATH`, if the source file exists."""
        if not self.ejscreen_areas_of_concern_data_exists():
            logger.info(
                "EJSCREEN areas of concern data does not exist locally. Not saving the data."
            )
            return

        logger.info("Saving EJSCREEN Areas of Concern Data")
        # Write the nationwide CSV, creating the output directory if needed.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        nationwide_csv = self.OUTPUT_PATH / "usa.csv"
        self.df.to_csv(nationwide_csv, index=False)
|
Loading…
Add table
Add a link
Reference in a new issue