Add tests for all non-census sources (#1899)

* Refactor CDC life-expectancy (#1554) * Update to new tract list (#1554) * Adjust for tests (#1848) * Add tests for cdc_places (#1848) * Add EJScreen tests (#1848) * Add tests for HUD housing (#1848) * Add tests for GeoCorr (#1848) * Add persistent poverty tests (#1848) * Update for sources without zips, for new validation (#1848) * Update tests for new multi-CSV bug (#1848) Lucas updated the CDC life expectancy data to handle a bug where two states are missing from the US Overall download. Since virtually none of our other ETL classes download multiple CSVs directly like this, it required a pretty invasive new mocking strategy. * Add basic tests for nature deprived (#1848) * Add wildfire tests (#1848) * Add flood risk tests (#1848) * Add DOT travel tests (#1848) * Add historic redlining tests (#1848) * Add tests for ME and WI (#1848) * Update now that validation exists (#1848) * Adjust for validation (#1848) * Add health insurance back to cdc places (#1848) Oops * Update tests with new field (#1848) * Test for blank tract removal (#1848) * Add tracts for clipping behavior * Test clipping and zfill behavior (#1848) * Fix bad test assumption (#1848) * Simplify class, add test for tract padding (#1848) * Fix percentage inversion, update tests (#1848) Looking through the transformations, I noticed that we were subtracting a percentage that is usually between 0-100 from 1 instead of 100, and so were ending up with some surprising results. Confirmed with lucasmbrown-usds * Add note about first street data (#1848)
2025-10-02 01:33:17 -07:00 · 2022-09-19 15:17:00 -04:00 · 2022-09-19 15:17:00 -04:00 · 876655d2b2
commit 876655d2b2
parent 4d02525bb3
88 changed files with 2032 additions and 178 deletions
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@ -115,56 +115,59 @@ class ExtractTransformLoad:
    #  periods. https://github.com/usds/justice40-tool/issues/964
    EXPECTED_MAX_CENSUS_TRACTS: int = 74160

+    # Should this dataset load its configuration from
+    # the YAML files?
+    LOAD_YAML_CONFIG: bool = False
+
    # We use output_df as the final dataframe to use to write to the CSV
    # It is used on the "load" base class method
    output_df: pd.DataFrame = None

    def __init_subclass__(cls) -> None:
-        cls.DATASET_CONFIG = cls.yaml_config_load()
+        if cls.LOAD_YAML_CONFIG:
+            cls.DATASET_CONFIG = cls.yaml_config_load()

    @classmethod
-    def yaml_config_load(cls) -> Optional[dict]:
+    def yaml_config_load(cls) -> dict:
        """Generate config dictionary and set instance variables from YAML dataset."""
-        if cls.NAME is not None:
-            # check if the class instance has score YAML definitions
-            datasets_config = load_yaml_dict_from_file(
-                cls.DATASET_CONFIG_PATH / "datasets.yml",
-                DatasetsConfig,
+        # check if the class instance has score YAML definitions
+        datasets_config = load_yaml_dict_from_file(
+            cls.DATASET_CONFIG_PATH / "datasets.yml",
+            DatasetsConfig,
+        )
+
+        # get the config for this dataset
+        try:
+            dataset_config = next(
+                item
+                for item in datasets_config.get("datasets")
+                if item["module_name"] == cls.NAME
            )
+        except StopIteration:
+            # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
+            logger.error(
+                f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
+            )
+            sys.exit()

-            # get the config for this dataset
-            try:
-                dataset_config = next(
-                    item
-                    for item in datasets_config.get("datasets")
-                    if item["module_name"] == cls.NAME
-                )
-            except StopIteration:
-                # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
-                logger.error(
-                    f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
-                )
-                sys.exit()
-
-            # set some of the basic fields
-            if "input_geoid_tract_field_name" in dataset_config:
-                cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
-                    "input_geoid_tract_field_name"
-                ]
-
-            # get the columns to write on the CSV
-            # and set the constants
-            cls.COLUMNS_TO_KEEP = [
-                cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+        # set some of the basic fields
+        if "input_geoid_tract_field_name" in dataset_config:
+            cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+                "input_geoid_tract_field_name"
            ]
-            for field in dataset_config["load_fields"]:
-                cls.COLUMNS_TO_KEEP.append(field["long_name"])
-                setattr(cls, field["df_field_name"], field["long_name"])

-                # set the constants for the class
-                setattr(cls, field["df_field_name"], field["long_name"])
-            return dataset_config
-        return None
+        # get the columns to write on the CSV
+        # and set the constants
+        cls.COLUMNS_TO_KEEP = [
+            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+        ]
+        for field in dataset_config["load_fields"]:
+            cls.COLUMNS_TO_KEEP.append(field["long_name"])
+            setattr(cls, field["df_field_name"], field["long_name"])
+
+            # set the constants for the class
+            setattr(cls, field["df_field_name"], field["long_name"])
+        return dataset_config

    # This is a classmethod so it can be used by `get_data_frame` without
    # needing to create an instance of the class. This is a use case in `etl_score`.