Refactor DOE Energy Burden and COI to use YAML (#1796)

* added tribalId for Supplemental dataset (#1804)

* Setting zoom levels for tribal map (#1810)

* NRI dataset and initial score YAML configuration (#1534)

* update be staging gha

* NRI dataset and initial score YAML configuration

* checkpoint

* adding data checks for release branch

* passing tests

* adding INPUT_EXTRACTED_FILE_NAME to base class

* lint

* columns to keep and tests

* update be staging gha

* checkpoint

* PR Review

* removing source url

* tests

* stop execution of ETL if there's a YAML schema issue (see the validation sketch below)

* update be staging gha

* adding source url as class var again

* clean up

* force cache bust

* gha cache bust

* dynamically set score vars from YAML

* docstrings

* removing last updated year - optional reverse percentile

* passing tests

* sort order

* column ordering

* PR review

* class level vars

* Updating DatasetsConfig

* fix pylint errors

* moving metadata hint back to code

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
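
The YAML commits above ("stop execution of ETL if there's a YAML schema issue" and "dynamically set score vars from YAML") follow a pattern along these lines. This is a minimal sketch: the required keys and the hard exit are illustrative assumptions, not the repository's actual DatasetsConfig schema.

    import sys
    import yaml

    # Hypothetical required keys; the real schema lives in DatasetsConfig.
    REQUIRED_KEYS = {"long_name", "short_name", "input_geoid_tract_field_name"}

    def load_dataset_config(path: str) -> dict:
        """Load a dataset's YAML config, halting the ETL on schema problems."""
        with open(path, encoding="utf-8") as f:
            config = yaml.safe_load(f)
        missing = REQUIRED_KEYS - set(config)
        if missing:
            # Stop execution instead of running the ETL on a half-valid config.
            sys.exit(f"YAML schema issue in {path}: missing keys {sorted(missing)}")
        return config

    class EtlWithYamlVars:
        def __init__(self, config: dict):
            # Dynamically set score/class vars from YAML rather than
            # hard-coding them on each ETL subclass.
            for key, value in config.items():
                setattr(self, key.upper(), value)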

* Correct copy typo (#1809)

* Add basic test suite for COI (#1518)

* Update COI to use new yaml (#1518)

* Add tests for DOE energy burden (#1518)

* Add dataset config for energy burden (#1518)

* Refactor ETL to use datasets.yml (#1518) (a sketch of this pattern follows the commit list)

* Add fake GEOIDs to COI tests (#1518)

* Refactor _setup_etl_instance_and_run_extract to base (#1518)

For the three classes we've done so far, a generic
_setup_etl_instance_and_run_extract works fine. For the moment we can
reuse the same setup method until we decide future classes need more
flexibility; they can also always subclass and override it.
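
A rough illustration of that escape hatch; the class names here are hypothetical, not the actual test classes:

    class TestETLBase:
        def _setup_etl_instance_and_run_extract(self, mock_etl, mock_paths):
            # Shared implementation: patch requests, feed in the zip fixture,
            # and run extract() (see the diff further down for the real code).
            raise NotImplementedError

    class TestUnusualSourceETL(TestETLBase):
        def _setup_etl_instance_and_run_extract(self, mock_etl, mock_paths):
            # A future ETL with a different download mechanism just overrides
            # the shared setup instead of complicating the base class.
            etl = self._ETL_CLASS()
            etl.extract()
            return etl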

* Add output-path tests (#1518)

* Update YAML to match constant (#1518)

* Don't blindly set float format (#1518)

* Add defaults for extract (#1518)

* Run YAML load on all subclasses (#1518)

* Update description fields (#1518)

* Update YAML per final format (#1518)

* Update fixture tract IDs (#1518)

* Update base class refactor (#1518)

Now that NRI is final, I needed to make a small number of updates to my
refactored code.

* Remove old comment (#1518)

* Fix type signature and return (#1518)

* Update per code review (#1518)

Co-authored-by: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Co-authored-by: Vim <86254807+vim-usds@users.noreply.github.com>
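
Taken together, these commits move per-dataset constants out of the ETL classes and into datasets.yml. A minimal sketch of that pattern, with made-up field names rather than the repository's real schema:

    import textwrap
    import yaml

    # Made-up datasets.yml entry for illustration only.
    DATASETS_YML = textwrap.dedent(
        """
        datasets:
          - long_name: Example Dataset
            short_name: example
            load_fields:
              - df_field_name: EXAMPLE_FIELD
                long_name: Example Field 1
        """
    )

    class YamlConfiguredETL:
        def __init__(self, short_name: str):
            config = yaml.safe_load(DATASETS_YML)
            dataset = next(
                d for d in config["datasets"] if d["short_name"] == short_name
            )
            # Columns to keep come from YAML instead of class constants.
            self.COLUMNS_TO_KEEP = [
                field["long_name"] for field in dataset["load_fields"]
            ]

    etl = YamlConfiguredETL("example")
    print(etl.COLUMNS_TO_KEEP)  # ['Example Field 1']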
Matt Bowen 2022-08-10 16:02:59 -04:00 committed by Emma Nechamkin
commit 97e17546cc
28 changed files with 455 additions and 189 deletions


@@ -1,7 +1,7 @@
 GEOID10_TRACT,Input Field 1
-06007040300,2.0000000000
-06001020100,6.1000000000
-06007040500,-7.8000000000
+06027000800,2.0000000000
+06069000802,6.1000000000
+06061021322,-7.8000000000
 15001021010,12.0000000000
 15001021101,12.0552478300
 15007040603,13.5141757800


@@ -1,7 +1,7 @@
 GEOID10_TRACT,Example Field 1
-06007040300,4.0000000000
-06001020100,12.2000000000
-06007040500,-15.6000000000
+06027000800,4.0000000000
+06069000802,12.2000000000
+06061021322,-15.6000000000
 15001021010,24.0000000000
 15001021101,24.1104956600
 15007040603,27.0283515600


@@ -1,7 +1,7 @@
 GEOID10_TRACT,Input Field 1,Example Field 1
-06007040300,2.0000000000,4.0000000000
-06001020100,6.1000000000,12.2000000000
-06007040500,-7.8000000000,-15.6000000000
+06027000800,2.0000000000,4.0000000000
+06069000802,6.1000000000,12.2000000000
+06061021322,-7.8000000000,-15.6000000000
 15001021010,12.0000000000,24.0000000000
 15001021101,12.0552478300,24.1104956600
 15007040603,13.5141757800,27.0283515600


@@ -3,8 +3,10 @@ import copy
 import os
 import pathlib
 from typing import Type
+from unittest import mock
 
 import pytest
+import requests
 import numpy as np
 import pandas as pd

@@ -47,9 +49,9 @@ class TestETL:
     # we use the same tract IDs across fixtures.
     # The test fixtures may also contain other tract IDs that are not on this list.
     _FIXTURES_SHARED_TRACT_IDS = [
-        "06007040300",
-        "06001020100",
-        "06007040500",
+        "06027000800",
+        "06069000802",
+        "06061021322",
         "15001021010",
         "15001021101",
         "15007040603",

@@ -98,18 +100,32 @@ class TestETL:
         In order to re-implement this method, usually it will involve a
         decent amount of work to monkeypatch `requests` or another method that's
         used to retrieve data in order to force that method to retrieve the fixture
-        data.
+        data. A basic version of that patching is included here for classes that can use it.
         """
-        # When running this in child classes, make sure the child class re-implements
-        # this method.
-        if self._ETL_CLASS is not ExampleETL:
-            raise NotImplementedError(
-                "Prepare and run extract method not defined for this class."
-            )
-        # The rest of this method applies for `ExampleETL` only.
-        etl = self._get_instance_of_etl_class()
-        etl.extract()
+        with mock.patch("data_pipeline.utils.requests") as requests_mock:
+            zip_file_fixture_src = (
+                self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_ZIP_FILE_NAME
+            )
+            tmp_path = mock_paths[1]
+            # Create mock response.
+            with open(zip_file_fixture_src, mode="rb") as file:
+                file_contents = file.read()
+            response_mock = requests.Response()
+            response_mock.status_code = 200
+            # pylint: disable=protected-access
+            response_mock._content = file_contents
+            # Return text fixture:
+            requests_mock.get = mock.MagicMock(return_value=response_mock)
+            # Instantiate the ETL class.
+            etl = self._ETL_CLASS()
+            # Monkey-patch the temporary directory to the one used in the test
+            etl.TMP_PATH = tmp_path
+            # Run the extract method.
+            etl.extract()
         return etl

@@ -367,9 +383,14 @@ class TestETL:
         etl_with_duplicate_geo_field.output_df = actual_output_df.copy(
             deep=True
         )
+        etl_with_duplicate_geo_field.output_df.reset_index(inplace=True)
         etl_with_duplicate_geo_field.output_df.loc[
             0:1, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
-        ] = "06007040300"
+        ] = etl_with_duplicate_geo_field.output_df[
+            ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
+        ].iloc[
+            0
+        ]
         with pytest.raises(ValueError) as error:
             etl_with_duplicate_geo_field.validate()
         assert str(error.value).startswith("Duplicate values:")

@@ -440,7 +461,7 @@ class TestETL:
         # Remove another column to keep and make sure error occurs.
         etl_with_missing_column = copy.deepcopy(etl)
-        columns_to_keep = actual_output_df.columns[:-1]
+        columns_to_keep = etl.COLUMNS_TO_KEEP[:-1]
         etl_with_missing_column.output_df = actual_output_df[columns_to_keep]
         with pytest.raises(ValueError) as error:
             etl_with_missing_column.validate()
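
For context, the checks those last two hunks exercise look roughly like this sketch (not the repository's exact validate() implementation):

    import pandas as pd

    def validate(output_df: pd.DataFrame, geo_field: str, columns_to_keep: list) -> None:
        # Duplicate tract IDs indicate a bad join or a bad fixture.
        duplicates = output_df[output_df[geo_field].duplicated()][geo_field]
        if not duplicates.empty:
            raise ValueError(f"Duplicate values: {duplicates.tolist()}")
        # Every column in COLUMNS_TO_KEEP must survive the transform.
        missing = set(columns_to_keep) - set(output_df.columns)
        if missing:
            raise ValueError(f"Missing columns: {sorted(missing)}")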