Add tests for DOE energy burden (#1518)

2025-02-23 01:54:18 -08:00 · 2022-07-27 17:17:34 -04:00 · 2022-07-27 17:17:34 -04:00 · 12a6b2f10e
commit 12a6b2f10e
parent e77e7aef2e
7 changed files with 140 additions and 7 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
@ -2,18 +2,22 @@ from pathlib import Path
 import pandas as pd
 from data_pipeline.config import settings
-from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
 from data_pipeline.utils import get_module_logger, unzip_file_from_url
 logger = get_module_logger(__name__)
 class DOEEnergyBurden(ExtractTransformLoad):
    NAME = "doe_energy_burden"
    SOURCE_URL: str = (
        settings.AWS_JUSTICE40_DATASOURCES_URL
        + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
    )
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
    def __init__(self):
-        self.DOE_FILE_URL = (
+        self.DOE_FILE_URL = self.SOURCE_URL
            settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
        )
        self.OUTPUT_PATH: Path = (
            self.DATA_PATH / "dataset" / "doe_energy_burden"
@ -38,12 +42,11 @@ class DOEEnergyBurden(ExtractTransformLoad):
        unzip_file_from_url(
            file_url=self.DOE_FILE_URL,
            download_path=self.get_tmp_path(),
-            unzipped_file_path=self.get_tmp_path() / "doe_energy_burden",
+            unzipped_file_path=self.get_tmp_path()
        )
        self.raw_df = pd.read_csv(
            filepath_or_buffer=self.get_tmp_path()
            / "doe_energy_burden"
            / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
            # The following need to remain as strings for all of their digits, not get converted to numbers.
            dtype={
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/init.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/init.py
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/extract.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/extract.csv
@ -0,0 +1,16 @@
 ABV,FIP,BURDEN,QUANTILE
 HI,15001021010,0.0380000000,30
 HI,15001021101,0.0410000000,25
 HI,15001021402,0.0240000000,66
 HI,15001021800,0.0290000000,51
 HI,15003010201,0.0270000000,58
 HI,15007040603,0.0440000000,21
 HI,15007040604,0.0330000000,40
 HI,15007040700,0.0260000000,59
 HI,15009030100,0.0350000000,37
 HI,15009030201,0.0220000000,71
 HI,15009030402,0.0200000000,75
 HI,15009030800,0.0190000000,80
 CA,6007040300,0.2000000000,70
 CA,6007040500,0.5000000000,50
 CA,6001020100,0.1990000000,30
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/output.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/output.csv
@ -0,0 +1,16 @@
 GEOID10_TRACT,Energy burden
 15001021010,0.0380000000
 15001021101,0.0410000000
 15001021402,0.0240000000
 15001021800,0.0290000000
 15003010201,0.0270000000
 15007040603,0.0440000000
 15007040604,0.0330000000
 15007040700,0.0260000000
 15009030100,0.0350000000
 15009030201,0.0220000000
 15009030402,0.0200000000
 15009030800,0.0190000000
 06007040300,0.2000000000
 06007040500,0.5000000000
 06001020100,0.1990000000
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/transform.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/data/transform.csv
@ -0,0 +1,16 @@
 ABV,GEOID10_TRACT,Energy burden,QUANTILE
 HI,15001021010,0.0380000000,30
 HI,15001021101,0.0410000000,25
 HI,15001021402,0.0240000000,66
 HI,15001021800,0.0290000000,51
 HI,15003010201,0.0270000000,58
 HI,15007040603,0.0440000000,21
 HI,15007040604,0.0330000000,40
 HI,15007040700,0.0260000000,59
 HI,15009030100,0.0350000000,37
 HI,15009030201,0.0220000000,71
 HI,15009030402,0.0200000000,75
 HI,15009030800,0.0190000000,80
 CA,06007040300,0.2000000000,70
 CA,06007040500,0.5000000000,50
 CA,06001020100,0.1990000000,30
--- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py
@ -0,0 +1,82 @@
 # pylint: disable=protected-access
 from unittest import mock
 import pathlib
 import requests
 from data_pipeline.etl.sources.doe_energy_burden.etl import (
    DOEEnergyBurden,
 )
 from data_pipeline.tests.sources.example.test_etl import TestETL
 from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)
 class TestDOEEnergyBurdenETL(TestETL):
    """Tests the DOE Energy Burden ETL (`DOEEnergyBurden`).

    This uses pytest-snapshot.
    To update individual snapshots: $ poetry run pytest
            data_pipeline/tests/sources/doe_energy_burden/test_etl.py::TestDOEEnergyBurdenETL::<testname>
            --snapshot-update
    """
    # ETL class under test; instantiated by the extract helper below.
    _ETL_CLASS = DOEEnergyBurden
    # The "data" directory that sits next to this test file.
    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
    _SAMPLE_DATA_FILE_NAME = "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
    # Zip fixture served as the body of the mocked HTTP response.
    _SAMPLE_DATA_ZIP_FILE_NAME = "DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
    # NOTE(review): the two names below are not read in this class; presumably
    # they are consumed by the parent `TestETL` -- confirm.
    _EXTRACT_TMP_FOLDER_NAME = "DOEEnergyBurden"
    _EXTRACT_CSV_FILE_NAME = "extract.csv"
    def setup_method(self, _method, filename=__file__):
        """Invoke `setup_method` from Parent, but using the current file name.

        This code can be copied identically between all child classes.
        """
        super().setup_method(_method=_method, filename=filename)
    # XXX: Refactor since I just straight copied it out of NRI's
    def _setup_etl_instance_and_run_extract(self, mock_etl, mock_paths):
        """Instantiate the ETL and run `extract()` against a mocked download.

        `data_pipeline.utils.requests` is patched so that `requests.get`
        returns a 200 response whose content is the bytes of the zip fixture
        read from `self._DATA_DIRECTORY_FOR_TEST`. The ETL's `TMP_PATH` is
        pointed at `mock_paths[1]` before `extract()` is called.

        `mock_etl` is not referenced in the body; presumably it is a pytest
        fixture requested only for its side effects -- confirm.

        Returns the ETL instance after `extract()` has run.
        """
        with mock.patch("data_pipeline.utils.requests") as requests_mock:
            zip_file_fixture_src = self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_ZIP_FILE_NAME
            # Second element of `mock_paths` is used as the temporary directory.
            tmp_path = mock_paths[1]
            # Create mock response.
            with open(zip_file_fixture_src, mode="rb") as file:
                file_contents = file.read()
            response_mock = requests.Response()
            response_mock.status_code = 200
            # pylint: disable=protected-access
            response_mock._content = file_contents
            # Make the patched `requests.get` return the zip fixture response.
            requests_mock.get = mock.MagicMock(return_value=response_mock)
            # Instantiate the ETL class.
            etl = self._ETL_CLASS()
            # Monkey-patch the temporary directory to the one used in the test
            etl.TMP_PATH = tmp_path
            # Run the extract method.
            etl.extract()
        return etl
    def test_init(self, mock_etl, mock_paths):
        """Tests that the DOEEnergyBurden class was initialized
        correctly.
        """
        etl = DOEEnergyBurden()
        # First element of `mock_paths` is the expected data path.
        data_path, _ = mock_paths
        assert etl.DATA_PATH == data_path
        assert etl.COLUMNS_TO_KEEP == [
            "GEOID10_TRACT",
            "Energy burden"
        ]
        assert etl.GEOID_FIELD_NAME == "GEOID10"
        assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
        assert etl.TRACT_INPUT_COLUMN_NAME == "FIP"
        assert etl.INPUT_ENERGY_BURDEN_FIELD_NAME == "BURDEN"
        assert etl.REVISED_ENERGY_BURDEN_FIELD_NAME == "Energy burden"