NRI dataset and initial score YAML configuration (#1534)

* update be staging gha

* NRI dataset and initial score YAML configuration

* checkpoint

* adding data checks for release branch

* passing tests

* adding INPUT_EXTRACTED_FILE_NAME to base class

* lint

* columns to keep and tests

* update be staging gha

* checkpoint

* update be staging gha

* NRI dataset and initial score YAML configuration

* checkpoint

* adding data checks for release branch

* passing tests

* adding INPUT_EXTRACTED_FILE_NAME to base class

* lint

* columns to keep and tests

* checkpoint

* PR Review

* removing source url

* tests

* stop execution of ETL if there's a YAML schema issue

* update be staging gha

* adding source url as class var again

* clean up

* force cache bust

* gha cache bust

* dynamically set score vars from YAML

* docstrings

* removing last updated year - optional reverse percentile

* passing tests

* sort order

* column ordering

* PR review

* class level vars

* Updating DatasetsConfig

* fix pylint errors

* moving metadata hint back to code

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Jorge Escobar 2022-08-09 16:37:10 -04:00 committed by GitHub
commit 1c448a77f9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 272 additions and 3485 deletions

View file

@ -119,6 +119,7 @@ class TestETL:
"""
# Setup
etl = self._get_instance_of_etl_class()
etl.__init__()
data_path, tmp_path = mock_paths
assert etl.DATA_PATH == data_path
@ -126,8 +127,6 @@ class TestETL:
# Also make sure all parameters that need to be non-null are non-null
assert etl.NAME is not None
assert etl.LAST_UPDATED_YEAR is not None
assert etl.SOURCE_URL is not None
assert etl.GEO_LEVEL is not None
assert etl.COLUMNS_TO_KEEP is not None
assert len(etl.COLUMNS_TO_KEEP) > 0
@ -148,14 +147,10 @@ class TestETL:
etl = self._get_instance_of_etl_class()
data_path, tmp_path = mock_paths
etl.__init__()
actual_file_path = etl._get_output_file_path()
expected_file_path = (
data_path
/ "dataset"
/ f"{etl.NAME}_{etl.LAST_UPDATED_YEAR}"
/ "usa.csv"
)
expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv"
logger.info(f"Expected: {expected_file_path}")
@ -255,6 +250,7 @@ class TestETL:
etl = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl, mock_paths=mock_paths
)
etl.__init__()
etl.transform()
assert etl.output_df is not None
@ -272,6 +268,7 @@ class TestETL:
"""
# setup - input variables
etl = self._get_instance_of_etl_class()
etl.__init__()
# setup - mock transform step
df_transform = pd.read_csv(