From 76d4ebe0c3655279bf3ec4d657bd4c1a94fb8fa3 Mon Sep 17 00:00:00 2001
From: Matthew Bowen
Date: Wed, 27 Jul 2022 17:35:56 -0400
Subject: [PATCH] Refactor ETL to use datasets.yml (#1518)

---
 .../etl/sources/doe_energy_burden/etl.py      | 52 +++++++------------
 .../sources/doe_energy_burden/test_etl.py     |  2 +-
 2 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
index 92594f89..a25dfb70 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
-from data_pipeline.utils import get_module_logger, unzip_file_from_url
+from data_pipeline.utils import get_module_logger
 
 logger = get_module_logger(__name__)
@@ -16,52 +16,41 @@ class DOEEnergyBurden(ExtractTransformLoad):
     )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
 
+    REVISED_ENERGY_BURDEN_FIELD_NAME: str
+
     def __init__(self):
-        self.DOE_FILE_URL = self.SOURCE_URL
+        self.DATASET_CONFIG = super().yaml_config_load()
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "doe_energy_burden"
         )
-
-        self.TRACT_INPUT_COLUMN_NAME = "FIP"
         self.INPUT_ENERGY_BURDEN_FIELD_NAME = "BURDEN"
-        self.REVISED_ENERGY_BURDEN_FIELD_NAME = "Energy burden"
-
-        # Constants for output
-        self.COLUMNS_TO_KEEP = [
-            self.GEOID_TRACT_FIELD_NAME,
-            self.REVISED_ENERGY_BURDEN_FIELD_NAME,
-        ]
 
         self.raw_df: pd.DataFrame
         self.output_df: pd.DataFrame
 
     def extract(self) -> None:
-        logger.info("Starting data download.")
-
-        unzip_file_from_url(
-            file_url=self.DOE_FILE_URL,
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=self.get_tmp_path()
-        )
-
-        self.raw_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path()
-            / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
-            # The following need to remain as strings for all of their digits, not get converted to numbers.
-            dtype={
-                self.TRACT_INPUT_COLUMN_NAME: "string",
-            },
-            low_memory=False,
+        # TODO: Make these defaults so etract can be blank most of the time
+        super().extract(
+            source_url=self.SOURCE_URL, extract_path=self.get_tmp_path()
         )
 
     def transform(self) -> None:
         logger.info("Starting transforms.")
+        raw_df: pd.DataFrame = pd.read_csv(
+            filepath_or_buffer=self.get_tmp_path()
+            / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
+            # The following need to remain as strings for all of their digits, not get converted to numbers.
+            dtype={
+                self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
+            },
+            low_memory=False,
+        )
 
-        output_df = self.raw_df.rename(
+        output_df = raw_df.rename(
             columns={
                 self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
-                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
             }
         )
 
@@ -78,7 +67,4 @@ class DOEEnergyBurden(ExtractTransformLoad):
 
     def load(self) -> None:
         logger.info("Saving DOE Energy Burden CSV")
-        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-        self.output_df[self.COLUMNS_TO_KEEP].to_csv(
-            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
-        )
+        super().load(float_format="%.10f")
diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py
index aa5e2824..548b356f 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py
@@ -77,6 +77,6 @@ class TestDOEEnergyBurdenETL(TestETL):
         ]
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
-        assert etl.TRACT_INPUT_COLUMN_NAME == "FIP"
+        assert etl.INPUT_GEOID_TRACT_FIELD_NAME == "FIP"
         assert etl.INPUT_ENERGY_BURDEN_FIELD_NAME == "BURDEN"
         assert etl.REVISED_ENERGY_BURDEN_FIELD_NAME == "Energy burden"