Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-07-23 05:20:16 -07:00
Refactor DOE Energy Burden and COI to use YAML (#1796)
* added tribalId for Supplemental dataset (#1804)
* Setting zoom levels for tribal map (#1810)
* NRI dataset and initial score YAML configuration (#1534)
  * update be staging gha
  * NRI dataset and initial score YAML configuration
  * checkpoint
  * adding data checks for release branch
  * passing tests
  * adding INPUT_EXTRACTED_FILE_NAME to base class
  * lint
  * columns to keep and tests
  * update be staging gha
  * checkpoint
  * update be staging gha
  * NRI dataset and initial score YAML configuration
  * checkpoint
  * adding data checks for release branch
  * passing tests
  * adding INPUT_EXTRACTED_FILE_NAME to base class
  * lint
  * columns to keep and tests
  * checkpoint
  * PR Review
  * removing source url
  * tests
  * stop execution of ETL if there's a YAML schema issue
  * update be staging gha
  * adding source url as class var again
  * clean up
  * force cache bust
  * gha cache bust
  * dynamically set score vars from YAML
  * docstrings
  * removing last updated year - optional reverse percentile
  * passing tests
  * sort order
  * column ordering
  * PR review
  * class level vars
  * Updating DatasetsConfig
  * fix pylint errors
  * moving metadata hint back to code

  Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
* Correct copy typo (#1809)
* Add basic test suite for COI (#1518)
* Update COI to use new yaml (#1518)
* Add tests for DOE energy burden (#1518)
* Add dataset config for energy burden (#1518)
* Refactor ETL to use datasets.yml (#1518)
* Add fake GEOIDs to COI tests (#1518)
* Refactor _setup_etl_instance_and_run_extract to base (#1518)

  For the three classes we've done so far, a generic
  _setup_etl_instance_and_run_extract will work fine; for the moment we can
  reuse the same setup method until we decide future classes need more
  flexibility --- but they can also always subclass so...
* Add output-path tests (#1518)
* Update YAML to match constant (#1518)
* Don't blindly set float format (#1518)
* Add defaults for extract (#1518)
* Run YAML load on all subclasses (#1518)
* Update description fields (#1518)
* Update YAML per final format (#1518)
* Update fixture tract IDs (#1518)
* Update base class refactor (#1518)

  Now that NRI is final I needed to make a small number of updates to my
  refactored code.
* Remove old comment (#1518)
* Fix type signature and return (#1518)
* Update per code review (#1518)

Co-authored-by: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Co-authored-by: Vim <86254807+vim-usds@users.noreply.github.com>
parent baa591a6c6
commit 97e17546cc
28 changed files with 455 additions and 189 deletions
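The through-line of the commit message is moving per-dataset ETL configuration into a datasets.yml file and halting the pipeline when that file is malformed ("stop execution of ETL if there's a YAML schema issue"). Below is a minimal sketch of that pattern; the entry's field names are hypothetical guesses, not the project's actual schema:

```python
# Minimal sketch of YAML-driven dataset config with a fail-fast schema check.
# The datasets.yml entry below is hypothetical; the real schema may differ.
import yaml

DATASETS_YML = """
datasets:
  - long_name: "Example Dataset"
    short_name: "example"
    input_extracted_file_name: "input.csv"
    load_fields:
      - df_field_name: "Input Field 1"
        include_in_tiles: true
"""

REQUIRED_KEYS = {"long_name", "short_name", "input_extracted_file_name"}

config = yaml.safe_load(DATASETS_YML)
for dataset in config["datasets"]:
    missing = REQUIRED_KEYS - dataset.keys()
    if missing:
        # Stop the ETL rather than continuing with a bad config.
        raise ValueError(f"datasets.yml schema issue: missing keys {missing}")
    print(f"Loaded config for dataset: {dataset['short_name']}")
```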
@@ -1,7 +1,7 @@
GEOID10_TRACT,Input Field 1
06007040300,2.0000000000
06001020100,6.1000000000
06007040500,-7.8000000000
06027000800,2.0000000000
06069000802,6.1000000000
06061021322,-7.8000000000
15001021010,12.0000000000
15001021101,12.0552478300
15007040603,13.5141757800
Binary file not shown.
@@ -1,7 +1,7 @@
GEOID10_TRACT,Example Field 1
06007040300,4.0000000000
06001020100,12.2000000000
06007040500,-15.6000000000
06027000800,4.0000000000
06069000802,12.2000000000
06061021322,-15.6000000000
15001021010,24.0000000000
15001021101,24.1104956600
15007040603,27.0283515600
@@ -1,7 +1,7 @@
GEOID10_TRACT,Input Field 1,Example Field 1
06007040300,2.0000000000,4.0000000000
06001020100,6.1000000000,12.2000000000
06007040500,-7.8000000000,-15.6000000000
06027000800,2.0000000000,4.0000000000
06069000802,6.1000000000,12.2000000000
06061021322,-7.8000000000,-15.6000000000
15001021010,12.0000000000,24.0000000000
15001021101,12.0552478300,24.1104956600
15007040603,13.5141757800,27.0283515600
@@ -3,8 +3,10 @@ import copy
 import os
 import pathlib
 from typing import Type
+from unittest import mock
 import pytest

+import requests
 import numpy as np
 import pandas as pd
@@ -47,9 +49,9 @@ class TestETL:
    # we use the same tract IDs across fixtures.
    # The test fixtures may also contain other tract IDs that are not on this list.
    _FIXTURES_SHARED_TRACT_IDS = [
        "06007040300",
        "06001020100",
        "06007040500",
        "06027000800",
        "06069000802",
        "06061021322",
        "15001021010",
        "15001021101",
        "15007040603",
@@ -98,18 +100,32 @@ class TestETL:
         In order to re-implement this method, usually it will involve a
         decent amount of work to monkeypatch `requests` or another method that's
         used to retrieve data in order to force that method to retrieve the fixture
-        data.
+        data. A basic version of that patching is included here for classes that can use it.
         """
-        # When running this in child classes, make sure the child class re-implements
-        # this method.
-        if self._ETL_CLASS is not ExampleETL:
-            raise NotImplementedError(
-                "Prepare and run extract method not defined for this class."
-            )
-
-        # The rest of this method applies for `ExampleETL` only.
-        etl = self._get_instance_of_etl_class()
-        etl.extract()
+        with mock.patch("data_pipeline.utils.requests") as requests_mock:
+            zip_file_fixture_src = (
+                self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_ZIP_FILE_NAME
+            )
+            tmp_path = mock_paths[1]
+
+            # Create mock response.
+            with open(zip_file_fixture_src, mode="rb") as file:
+                file_contents = file.read()
+            response_mock = requests.Response()
+            response_mock.status_code = 200
+            # pylint: disable=protected-access
+            response_mock._content = file_contents
+            # Return text fixture:
+            requests_mock.get = mock.MagicMock(return_value=response_mock)
+
+            # Instantiate the ETL class.
+            etl = self._ETL_CLASS()
+
+            # Monkey-patch the temporary directory to the one used in the test
+            etl.TMP_PATH = tmp_path
+
+            # Run the extract method.
+            etl.extract()

         return etl
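The new base-class body above patches `data_pipeline.utils.requests` and serves a zip fixture through a hand-built `requests.Response`. Here is a self-contained sketch of that same pattern; `download_bytes` is an invented stand-in for the pipeline's real download helper:

```python
# Demonstration of the fixture-serving pattern: patch a module's `requests`
# so the code under test receives canned bytes instead of hitting the network.
from unittest import mock
import requests

def download_bytes(url: str) -> bytes:
    """Invented stand-in for a helper that fetches a file over HTTP."""
    return requests.get(url).content

# Build the canned response up front, because the module-level `requests`
# name is patched inside the `with` block below.
response_mock = requests.Response()
response_mock.status_code = 200
# pylint: disable=protected-access
response_mock._content = b"fixture bytes"

with mock.patch(f"{__name__}.requests") as requests_mock:
    requests_mock.get = mock.MagicMock(return_value=response_mock)
    # The helper now returns the fixture's bytes, no network involved.
    assert download_bytes("https://example.com/data.zip") == b"fixture bytes"
    requests_mock.get.assert_called_once_with("https://example.com/data.zip")
```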
@@ -367,9 +383,14 @@ class TestETL:
         etl_with_duplicate_geo_field.output_df = actual_output_df.copy(
             deep=True
         )
+        etl_with_duplicate_geo_field.output_df.reset_index(inplace=True)
         etl_with_duplicate_geo_field.output_df.loc[
             0:1, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
-        ] = "06007040300"
+        ] = etl_with_duplicate_geo_field.output_df[
+            ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
+        ].iloc[
+            0
+        ]
         with pytest.raises(ValueError) as error:
             etl_with_duplicate_geo_field.validate()
         assert str(error.value).startswith("Duplicate values:")
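The assertion in this hunk expects `validate()` to reject an output frame whose tract ID column repeats, with a message starting "Duplicate values:". A minimal sketch of such a check (the repo's actual validate() logic may differ):

```python
import pandas as pd

def check_no_duplicate_tracts(output_df: pd.DataFrame, geo_field: str) -> None:
    """Raise if any tract ID appears more than once, as the test expects."""
    duplicates = output_df[geo_field][output_df[geo_field].duplicated()]
    if not duplicates.empty:
        # Message prefix matches what the test asserts on.
        raise ValueError(f"Duplicate values: {duplicates.tolist()}")

# Example: two rows share a tract ID, so the check raises.
df = pd.DataFrame({"GEOID10_TRACT": ["06007040300", "06007040300", "06001020100"]})
try:
    check_no_duplicate_tracts(df, "GEOID10_TRACT")
except ValueError as err:
    print(err)  # Duplicate values: ['06007040300']
```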
@@ -440,7 +461,7 @@ class TestETL:

         # Remove another column to keep and make sure error occurs.
         etl_with_missing_column = copy.deepcopy(etl)
-        columns_to_keep = actual_output_df.columns[:-1]
+        columns_to_keep = etl.COLUMNS_TO_KEEP[:-1]
         etl_with_missing_column.output_df = actual_output_df[columns_to_keep]
         with pytest.raises(ValueError) as error:
             etl_with_missing_column.validate()
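Here the test drops the last entry of `COLUMNS_TO_KEEP` from the output frame, so validation should fail. A rough sketch of the corresponding check, assuming validate() verifies that every declared column is present (the real implementation may differ):

```python
import pandas as pd

def check_columns_to_keep(output_df: pd.DataFrame, columns_to_keep: list) -> None:
    """Raise if a declared column is missing from the output frame."""
    missing = [col for col in columns_to_keep if col not in output_df.columns]
    if missing:
        raise ValueError(f"Missing columns to keep: {missing}")

df = pd.DataFrame({"GEOID10_TRACT": ["06007040300"], "Input Field 1": [2.0]})
check_columns_to_keep(df, ["GEOID10_TRACT", "Input Field 1"])  # passes silently
# Dropping a declared column, as the test above does, makes it raise:
# check_columns_to_keep(df[["GEOID10_TRACT"]], ["GEOID10_TRACT", "Input Field 1"])
```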