NRI dataset and initial score YAML configuration (#1534)

* update be staging gha * NRI dataset and initial score YAML configuration * checkpoint * adding data checks for release branch * passing tests * adding INPUT_EXTRACTED_FILE_NAME to base class * lint * columns to keep and tests * update be staging gha * checkpoint * update be staging gha * NRI dataset and initial score YAML configuration * checkpoint * adding data checks for release branch * passing tests * adding INPUT_EXTRACTED_FILE_NAME to base class * lint * columns to keep and tests * checkpoint * PR Review * removing source url * tests * stop execution of ETL if there's a YAML schema issue * update be staging gha * adding source url as class var again * clean up * force cache bust * gha cache bust * dynamically set score vars from YAML * docstrings * removing last updated year - optional reverse percentile * passing tests * sort order * column ordering * PR review * class level vars * Updating DatasetsConfig * fix pylint errors * moving metadata hint back to code Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
2025-07-23 05:10:36 -07:00 · 2022-08-09 16:37:10 -04:00 · 2022-08-09 16:37:10 -04:00 · 1c448a77f9
commit 1c448a77f9
parent 1833e3e794
15 changed files with 272 additions and 3485 deletions
--- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
@ -119,6 +119,7 @@ class TestETL:
        """
        # Setup
        etl = self._get_instance_of_etl_class()
+        etl.__init__()
        data_path, tmp_path = mock_paths

        assert etl.DATA_PATH == data_path
@ -126,8 +127,6 @@ class TestETL:

        # Also make sure all parameters that need to be non-null are non-null
        assert etl.NAME is not None
-        assert etl.LAST_UPDATED_YEAR is not None
-        assert etl.SOURCE_URL is not None
        assert etl.GEO_LEVEL is not None
        assert etl.COLUMNS_TO_KEEP is not None
        assert len(etl.COLUMNS_TO_KEEP) > 0
@ -148,14 +147,10 @@ class TestETL:
        etl = self._get_instance_of_etl_class()
        data_path, tmp_path = mock_paths

+        etl.__init__()
        actual_file_path = etl._get_output_file_path()

-        expected_file_path = (
-            data_path
-            / "dataset"
-            / f"{etl.NAME}_{etl.LAST_UPDATED_YEAR}"
-            / "usa.csv"
-        )
+        expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv"

        logger.info(f"Expected: {expected_file_path}")

@ -255,6 +250,7 @@ class TestETL:
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )
+        etl.__init__()
        etl.transform()

        assert etl.output_df is not None
@ -272,6 +268,7 @@ class TestETL:
        """
        # setup - input variables
        etl = self._get_instance_of_etl_class()
+        etl.__init__()

        # setup - mock transform step
        df_transform = pd.read_csv(