From 12a6b2f10eb31d30a6139188cfb95d94d011543d Mon Sep 17 00:00:00 2001 From: Matthew Bowen Date: Wed, 27 Jul 2022 17:17:34 -0400 Subject: [PATCH] Add tests for DOE energy budren (1518 --- .../etl/sources/doe_energy_burden/etl.py | 17 ++-- .../sources/doe_energy_burden/__init__.py | 0 .../data/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip | Bin 0 -> 376 bytes .../doe_energy_burden/data/extract.csv | 16 ++++ .../sources/doe_energy_burden/data/output.csv | 16 ++++ .../doe_energy_burden/data/transform.csv | 16 ++++ .../sources/doe_energy_burden/test_etl.py | 82 ++++++++++++++++++ 7 files changed, 140 insertions(+), 7 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/__init__.py create mode 100644 data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip create mode 100644 data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/extract.csv create mode 100644 data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/output.csv create mode 100644 data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/transform.csv create mode 100644 data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py index 80407d39..92594f89 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py @@ -2,18 +2,22 @@ from pathlib import Path import pandas as pd from data_pipeline.config import settings -from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel from data_pipeline.utils import get_module_logger, unzip_file_from_url logger = get_module_logger(__name__) class DOEEnergyBurden(ExtractTransformLoad): + NAME = "doe_energy_burden" 
+ SOURCE_URL: str = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" + ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + def __init__(self): - self.DOE_FILE_URL = ( - settings.AWS_JUSTICE40_DATASOURCES_URL - + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" - ) + self.DOE_FILE_URL = self.SOURCE_URL self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "doe_energy_burden" @@ -38,12 +42,11 @@ class DOEEnergyBurden(ExtractTransformLoad): unzip_file_from_url( file_url=self.DOE_FILE_URL, download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "doe_energy_burden", + unzipped_file_path=self.get_tmp_path() ) self.raw_df = pd.read_csv( filepath_or_buffer=self.get_tmp_path() - / "doe_energy_burden" / "DOE_LEAD_AMI_TRACT_2018_ALL.csv", # The following need to remain as strings for all of their digits, not get converted to numbers. dtype={ diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/__init__.py b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip new file mode 100644 index 0000000000000000000000000000000000000000..44af808ec369897921b248863fb9a9c634734042 GIT binary patch literal 376 zcmWIWW@Zs#U|`^2IMwhwr0_UX$Pys0n~{M*oiz9_ex>jI!+Y}l+Zorm_#FDm zFjvEb?kJQEdPDNH(}AgF1Xal*~B@e^O-veYRl8JnG#d2}|Fx^Vn=+!D^}QZ(Uu zAB(QFNOt76fR;B2+Ttl?8{Rgq(Tl(SplMmz{OBv!Bgzd{e6L+2ru{|y_9?Mq&2Jmy x1H2iT1D;r2VBM_zn>AxTj0{~j%e(3-J literal 0 HcmV?d00001 diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/extract.csv b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/extract.csv new file mode 100644 index 00000000..1820c356 --- /dev/null +++ 
b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/extract.csv @@ -0,0 +1,16 @@ +ABV,FIP,BURDEN,QUANTILE +HI,15001021010,0.0380000000,30 +HI,15001021101,0.0410000000,25 +HI,15001021402,0.0240000000,66 +HI,15001021800,0.0290000000,51 +HI,15003010201,0.0270000000,58 +HI,15007040603,0.0440000000,21 +HI,15007040604,0.0330000000,40 +HI,15007040700,0.0260000000,59 +HI,15009030100,0.0350000000,37 +HI,15009030201,0.0220000000,71 +HI,15009030402,0.0200000000,75 +HI,15009030800,0.0190000000,80 +CA,6007040300,0.2000000000,70 +CA,6007040500,0.5000000000,50 +CA,6001020100,0.1990000000,30 diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/output.csv new file mode 100644 index 00000000..218fde8e --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/output.csv @@ -0,0 +1,16 @@ +GEOID10_TRACT,Energy burden +15001021010,0.0380000000 +15001021101,0.0410000000 +15001021402,0.0240000000 +15001021800,0.0290000000 +15003010201,0.0270000000 +15007040603,0.0440000000 +15007040604,0.0330000000 +15007040700,0.0260000000 +15009030100,0.0350000000 +15009030201,0.0220000000 +15009030402,0.0200000000 +15009030800,0.0190000000 +06007040300,0.2000000000 +06007040500,0.5000000000 +06001020100,0.1990000000 diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/transform.csv new file mode 100644 index 00000000..59808a83 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/transform.csv @@ -0,0 +1,16 @@ +ABV,GEOID10_TRACT,Energy burden,QUANTILE +HI,15001021010,0.0380000000,30 +HI,15001021101,0.0410000000,25 +HI,15001021402,0.0240000000,66 +HI,15001021800,0.0290000000,51 +HI,15003010201,0.0270000000,58 +HI,15007040603,0.0440000000,21 +HI,15007040604,0.0330000000,40 
+HI,15007040700,0.0260000000,59 +HI,15009030100,0.0350000000,37 +HI,15009030201,0.0220000000,71 +HI,15009030402,0.0200000000,75 +HI,15009030800,0.0190000000,80 +CA,06007040300,0.2000000000,70 +CA,06007040500,0.5000000000,50 +CA,06001020100,0.1990000000,30 diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py new file mode 100644 index 00000000..aa5e2824 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py @@ -0,0 +1,82 @@ +# pylint: disable=protected-access +from unittest import mock +import pathlib +import requests + +from data_pipeline.etl.sources.doe_energy_burden.etl import ( + DOEEnergyBurden, +) +from data_pipeline.tests.sources.example.test_etl import TestETL +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +class TestDOEEnergyBurdenETL(TestETL): + """Tests the DOE energy burden ETL. + + This uses pytest-snapshot. + To update individual snapshots: $ poetry run pytest + data_pipeline/tests/sources/doe_energy_burden/test_etl.py::TestDOEEnergyBurdenETL:: + --snapshot-update + """ + + _ETL_CLASS = DOEEnergyBurden + + _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data" + _SAMPLE_DATA_FILE_NAME = "DOE_LEAD_AMI_TRACT_2018_ALL.csv" + _SAMPLE_DATA_ZIP_FILE_NAME = "DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" + _EXTRACT_TMP_FOLDER_NAME = "DOEEnergyBurden" + _EXTRACT_CSV_FILE_NAME = "extract.csv" + + def setup_method(self, _method, filename=__file__): + """Invoke `setup_method` from Parent, but using the current file name. + + This code can be copied identically between all child classes.
+ """ + super().setup_method(_method=_method, filename=filename) + + # XXX: Refactor since I just straight copied it out of NRI's + def _setup_etl_instance_and_run_extract(self, mock_etl, mock_paths): + with mock.patch("data_pipeline.utils.requests") as requests_mock: + zip_file_fixture_src = self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_ZIP_FILE_NAME + tmp_path = mock_paths[1] + + # Create mock response. + with open(zip_file_fixture_src, mode="rb") as file: + file_contents = file.read() + response_mock = requests.Response() + response_mock.status_code = 200 + # pylint: disable=protected-access + response_mock._content = file_contents + # Return the zipped fixture: + requests_mock.get = mock.MagicMock(return_value=response_mock) + + # Instantiate the ETL class. + etl = self._ETL_CLASS() + + # Monkey-patch the temporary directory to the one used in the test + etl.TMP_PATH = tmp_path + + # Run the extract method. + etl.extract() + + return etl + + def test_init(self, mock_etl, mock_paths): + """Tests that the DOEEnergyBurden class was initialized + correctly. + """ + + etl = DOEEnergyBurden() + data_path, _ = mock_paths + assert etl.DATA_PATH == data_path + assert etl.COLUMNS_TO_KEEP == [ + "GEOID10_TRACT", + "Energy burden" + ] + assert etl.GEOID_FIELD_NAME == "GEOID10" + assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT" + assert etl.TRACT_INPUT_COLUMN_NAME == "FIP" + assert etl.INPUT_ENERGY_BURDEN_FIELD_NAME == "BURDEN" + assert etl.REVISED_ENERGY_BURDEN_FIELD_NAME == "Energy burden"