Switch the EJScreen Areas of Concern ETL from the EPA download to a local CSV.

Summary of the hunk below (class EJScreenAreasOfConcernETL):
  * __init__: drops the commented-out EJSCREEN_FTP_URL / EJSCREEN_CSV settings,
    replaces CSV_PATH (dataset/ejscreen_2019) with OUTPUT_PATH
    (dataset/ejscreen_areas_of_concern), and adds LOCAL_CSV_PATH
    (DATA_PATH/local) and GEOID_CBG_FIELD_NAME ("GEOID10_CBG").
  * extract(): no longer calls super().extract() to download; it reads
    local/ejscreen_areas_of_concerns_indicators.csv into self.df with the
    GEOID10_CBG column typed as string. The previous "ID" dtype and
    na_values=["None"] options are removed.
  * transform(): becomes a placeholder (pass); its comment says the processing
    was done in a separate notebook.
  * load(): writes self.df to OUTPUT_PATH/usa.csv instead of CSV_PATH/usa.csv.

diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
index 38c6c222..dfb987f3 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
@@ -8,31 +8,34 @@ logger = get_module_logger(__name__)
 
 class EJScreenAreasOfConcernETL(ExtractTransformLoad):
     def __init__(self):
-        # self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
-        # self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
+
+        # TO DO: Load from actual source; the issue is that this dataset is not public for now
+        self.LOCAL_CSV_PATH = self.DATA_PATH / "local"
+        self.GEOID_CBG_FIELD_NAME = "GEOID10_CBG"
         self.df: pd.DataFrame
 
     def extract(self) -> None:
-        logger.info("Downloading EJScreen Data")
-        super().extract(
-            self.EJSCREEN_FTP_URL,
-            self.TMP_PATH,
-            verify=False,  # EPA EJScreen end point has certificate issues often
-        )
+        logger.info("Loading EJScreen Areas of Concern Data Locally")
 
-    def transform(self) -> None:
-        logger.info("Transforming EJScreen Data")
         self.df = pd.read_csv(
-            self.EJSCREEN_CSV,
-            dtype={"ID": "string"},
-            # EJSCREEN writes the word "None" for NA data.
-            na_values=["None"],
+            filepath_or_buffer=self.LOCAL_CSV_PATH
+            / "ejscreen_areas_of_concerns_indicators.csv",
+            dtype={
+                self.GEOID_CBG_FIELD_NAME: "string",
+            },
             low_memory=False,
         )
 
+    def transform(self) -> None:
+        logger.info("Transforming EJScreen Areas of Concern Data")
+
+        # TO DO: As a one off we did all the processing in a separate Notebook
+        # Can add here later for a future PR
+        pass
+
     def load(self) -> None:
-        logger.info("Saving EJScreen CSV")
+        logger.info("Saving EJScreen Areas of Concern Data")
         # write nationwide csv
-        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
-        self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+        self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False)