Add tests for all non-census sources (#1899)

* Refactor CDC life-expectancy (#1554) * Update to new tract list (#1554) * Adjust for tests (#1848) * Add tests for cdc_places (#1848) * Add EJScreen tests (#1848) * Add tests for HUD housing (#1848) * Add tests for GeoCorr (#1848) * Add persistent poverty tests (#1848) * Update for sources without zips, for new validation (#1848) * Update tests for new multi-CSV bug (#1848) Lucas updated the CDC life expectancy data to handle a bug where two states are missing from the US Overall download. Since virtually none of our other ETL classes download multiple CSVs directly like this, it required a pretty invasive new mocking strategy. * Add basic tests for nature deprived (#1848) * Add wildfire tests (#1848) * Add flood risk tests (#1848) * Add DOT travel tests (#1848) * Add historic redlining tests (#1848) * Add tests for ME and WI (#1848) * Update now that validation exists (#1848) * Adjust for validation (#1848) * Add health insurance back to cdc places (#1848) Oops * Update tests with new field (#1848) * Test for blank tract removal (#1848) * Add tracts for clipping behavior * Test clipping and zfill behavior (#1848) * Fix bad test assumption (#1848) * Simplify class, add test for tract padding (#1848) * Fix percentage inversion, update tests (#1848) Looking through the transformations, I noticed that we were subtracting a percentage that is usually between 0-100 from 1 instead of 100, and so were ending up with some surprising results. Confirmed with lucasmbrown-usds * Add note about first street data (#1848)
2025-09-30 11:03:18 -07:00 · 2022-09-19 15:17:00 -04:00 · 2022-09-19 15:17:00 -04:00 · 876655d2b2
commit 876655d2b2
parent 4d02525bb3
88 changed files with 2032 additions and 178 deletions
--- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/init.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/init.py
--- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/data/HRS_2010.zip
+++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/data/HRS_2010.zip
--- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/data/output.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/data/output.csv
@ -0,0 +1,16 @@
+GEOID10_TRACT,Tract-level redlining score,Tract-level redlining score meets or exceeds 3.25,Tract-level redlining score meets or exceeds 3.5,Tract-level redlining score meets or exceeds 3.75
+06027000800,3.3000000000,True,False,False
+06061021322,3.9900000000,True,True,True
+06069000802,3.7800000000,True,True,True
+15001021010,4.0000000000,True,True,True
+15001021101,4.0000000000,True,True,True
+15001021402,3.8600000000,True,True,True
+15001021800,4.0000000000,True,True,True
+15003010201,3.9600000000,True,True,True
+15007040603,3.9700000000,True,True,True
+15007040604,3.9400000000,True,True,True
+15007040700,3.2000000000,False,False,False
+15009030100,3.7700000000,True,True,True
+15009030201,3.2300000000,False,False,False
+15009030402,3.0000000000,False,False,False
+15009030800,3.4000000000,True,False,False
--- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/data/transform.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/data/transform.csv
@ -0,0 +1,16 @@
+GEOID10,CBSA_NAME,CBSA_NUM,EQINTERVAL2010,Tract-level redlining score,GEOID10_TRACT,Tract-level redlining score meets or exceeds 3.25,Tract-level redlining score meets or exceeds 3.5,Tract-level redlining score meets or exceeds 3.75
+6027000800,"Birmingham-Hoover, AL",13820,4,3.3000000000,06027000800,True,False,False
+6061021322,"Birmingham-Hoover, AL",13820,4,3.9900000000,06061021322,True,True,True
+6069000802,"Birmingham-Hoover, AL",13820,4,3.7800000000,06069000802,True,True,True
+15001021010,"Birmingham-Hoover, AL",13820,4,4.0000000000,15001021010,True,True,True
+15001021101,"Birmingham-Hoover, AL",13820,4,4.0000000000,15001021101,True,True,True
+15001021402,"Birmingham-Hoover, AL",13820,4,3.8600000000,15001021402,True,True,True
+15001021800,"Birmingham-Hoover, AL",13820,4,4.0000000000,15001021800,True,True,True
+15003010201,"Birmingham-Hoover, AL",13820,4,3.9600000000,15003010201,True,True,True
+15007040603,"Birmingham-Hoover, AL",13820,4,3.9700000000,15007040603,True,True,True
+15007040604,"Birmingham-Hoover, AL",13820,4,3.9400000000,15007040604,True,True,True
+15007040700,"Birmingham-Hoover, AL",13820,3,3.2000000000,15007040700,False,False,False
+15009030100,"Birmingham-Hoover, AL",13820,4,3.7700000000,15009030100,True,True,True
+15009030201,"Birmingham-Hoover, AL",13820,3,3.2300000000,15009030201,False,False,False
+15009030402,"Birmingham-Hoover, AL",13820,3,3.0000000000,15009030402,False,False,False
+15009030800,"Birmingham-Hoover, AL",13820,4,3.4000000000,15009030800,True,False,False
--- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py
@ -0,0 +1,66 @@
+# pylint: disable=protected-access
+import pathlib
+import pandas as pd
+from data_pipeline.tests.sources.example.test_etl import TestETL
+from data_pipeline.etl.sources.historic_redlining.etl import (
+    HistoricRedliningETL,
+)
+
+
+class TestHistoricRedliningETL(TestETL):
+    """Exercise `HistoricRedliningETL` against the bundled sample archive."""
+
+    _ETL_CLASS = HistoricRedliningETL
+
+    # Sample fixtures live in the `data` folder next to this test module.
+    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parent / "data"
+    _SAMPLE_DATA_FILE_NAME = "HRS_2010.xlsx"
+    _SAMPLE_DATA_ZIP_FILE_NAME = "HRS_2010.zip"
+    _EXTRACT_TMP_FOLDER_NAME = "HistoricRedliningETL"
+
+    def setup_method(self, _method, filename=__file__):
+        """Delegate to the parent `setup_method`, defaulting to this file.
+
+        The `filename` default is bound to this module's `__file__`, so the
+        same override can be pasted unchanged into every child test class.
+        """
+        super().setup_method(_method=_method, filename=filename)
+
+    def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
+        """Run extract and check the dimensions of the extracted workbook."""
+        etl_instance = self._setup_etl_instance_and_run_extract(
+            mock_etl=mock_etl,
+            mock_paths=mock_paths,
+        )
+        extracted_workbook = (
+            etl_instance.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME
+        )
+        extracted_df = pd.read_excel(
+            extracted_workbook,
+            dtype={etl_instance.GEOID_TRACT_FIELD_NAME: str},
+        )
+        assert extracted_df.shape == (15, 5)
+
+    def test_load_base(self, snapshot, mock_etl, mock_paths):
+        """Run extract, transform and load, then verify the written CSV.
+
+        Transform is executed for real (rather than mocked) so that the
+        dynamically added columns to keep are present before loading.
+        """
+        # Arrange and act: drive the full pipeline on the sample data.
+        etl_instance = self._setup_etl_instance_and_run_extract(
+            mock_etl=mock_etl,
+            mock_paths=mock_paths,
+        )
+        etl_instance.transform()
+        etl_instance.load()
+
+        # The load step must have written the output file.
+        output_path = etl_instance._get_output_file_path()
+        assert output_path.exists()
+
+        # Every column listed in COLUMNS_TO_KEEP must survive to the output.
+        output_df = pd.read_csv(
+            output_path,
+            dtype={etl_instance.GEOID_TRACT_FIELD_NAME: str},
+        )
+        written_columns = output_df.columns
+        for col in etl_instance.COLUMNS_TO_KEEP:
+            assert col in written_columns, f"{col} is missing from output"
+
+        # Compare the serialized output against the stored snapshot.
+        snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
+        rendered_csv = output_df.to_csv(
+            index=False, float_format=self._FLOAT_FORMAT
+        )
+        snapshot.assert_match(rendered_csv, self._OUTPUT_CSV_FILE_NAME)