Adds National Risk Index data to ETL pipeline (#549)

* Adds dev dependencies to requirements.txt and re-runs black on codebase

* Adds test and code for national risk index etl, still in progress

* Removes test_data from .gitignore

* Adds test data to national_risk_index tests

* Creates tests and ETL class for NRI data

* Adds tests for load() and transform() methods of NationalRiskIndexETL

* Updates README.md with info about the NRI dataset

* Adds to-dos

* Moves tests and test data into a tests/ dir in national_risk_index

* Moves tmp_dir for tests into data/tmp/tests/

* Promotes fixtures to conftest and relocates national_risk_index tests:
  the relocation of the national_risk_index tests is necessary because tests
  can only use fixtures specified in a conftest.py within the same package
  (see the sketch after this list)

* Fixes issue with df.equals() in test_transform()

* Files reformatted by black

* Commit changes to other files after re-running black

* Fixes unused import that caused lint checks to fail

* Moves tests/ directory to app root for data_pipeline
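
The conftest promotion above leans on pytest's scoping rule: fixtures defined in a `conftest.py` are only visible to test modules inside the same package (directory tree), which is why the national_risk_index tests had to move under the shared tests/ package. A minimal sketch of what the promoted fixtures could look like; the fixture names `mock_paths` and `mock_etl` appear in the test module below, but the bodies here are assumptions, not the code from this commit:

```python
# conftest.py at the root of the shared tests/ package (sketch only)
import pytest


@pytest.fixture
def mock_paths(tmp_path):
    """Builds a throwaway data/ and data/tmp/ layout for the ETL tests."""
    data_path = tmp_path / "data"
    tmp_dir = data_path / "tmp"
    tmp_dir.mkdir(parents=True)
    return data_path, tmp_dir


@pytest.fixture
def mock_etl(mock_paths, monkeypatch):
    """Points the ETL's path configuration at the temporary layout.

    The real patching mechanism is not visible in this diff; treat this
    as a placeholder for whatever the actual fixture does.
    """
    data_path, tmp_dir = mock_paths
    # e.g. monkeypatch the settings the ETL derives DATA_PATH / TMP_PATH from
    yield data_path, tmp_dir
```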
Commit f0900f7b69 by Billy Daly, 2021-09-07 20:51:34 -04:00, committed by GitHub
14 changed files with 307 additions and 7 deletions

@@ -0,0 +1,11 @@
GEOID10,POPULATION
050070403001,1000
050070403002,1500
050010201001,1000
050010201002,1500
150070405001,2000
150070405002,2250
150010210101,2000
150010210102,1500
150010211011,1750
150010211012,1500

@@ -0,0 +1,6 @@
TRACT,TRACTFIPS,RISK_SCORE,RISK_RATNG,RISK_NPCTL
40300,05007040300,10.492015,Very Low,15.3494
20100,05001020100,14.705854,Relatively Low,36.725828
40500,15007040500,10.234981,Very Low,13.997993
21010,15001021010,21.537231,Relatively Moderate,59.488033
21101,15001021101,19.434585,Relatively Low,53.392265

@@ -0,0 +1,11 @@
GEOID10,GEOID10_TRACT,TRACT,RISK_SCORE,RISK_RATNG,RISK_NPCTL
050070403001,05007040300,40300,10.492015,Very Low,15.3494
050070403002,05007040300,40300,10.492015,Very Low,15.3494
050010201001,05001020100,20100,14.705854,Relatively Low,36.725828
050010201002,05001020100,20100,14.705854,Relatively Low,36.725828
150070405001,15007040500,40500,10.234981,Very Low,13.997993
150070405002,15007040500,40500,10.234981,Very Low,13.997993
150010210101,15001021010,21010,21.537231,Relatively Moderate,59.488033
150010210102,15001021010,21010,21.537231,Relatively Moderate,59.488033
150010211011,15001021101,21101,19.434585,Relatively Low,53.392265
150010211012,15001021101,21101,19.434585,Relatively Low,53.392265
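
The expected output above simply repeats each tract's NRI values for every block group in that tract. A minimal pandas sketch of that expansion, using the sample files from this diff and assuming the parent tract ID is the first 11 characters of the 12-digit block-group `GEOID10` (variable names are illustrative, not the ETL's actual implementation):

```python
import pandas as pd

# Read with string dtypes so leading zeros in the GEOIDs survive.
acs = pd.read_csv("acs.csv", dtype={"GEOID10": "string"})
nri = pd.read_csv("input.csv", dtype={"TRACTFIPS": "string"})

# Derive the parent tract for each block group: first 11 of 12 digits.
acs["GEOID10_TRACT"] = acs["GEOID10"].str[:11]

# Broadcast the tract-level risk scores onto every block group in the tract.
merged = acs.merge(
    nri.rename(columns={"TRACTFIPS": "GEOID10_TRACT"}),
    on="GEOID10_TRACT",
    how="left",
)[["GEOID10", "GEOID10_TRACT", "TRACT", "RISK_SCORE", "RISK_RATNG", "RISK_NPCTL"]]

assert merged.shape == (10, 6)  # matches the shape checked in test_transform()
```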

@@ -0,0 +1,110 @@
from pathlib import Path
from shutil import copyfile

import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.sources.national_risk_index.etl import (
    NationalRiskIndexETL,
)

DATA_DIR = (
    settings.APP_ROOT / "tests" / "sources" / "national_risk_index" / "data"
)


def copy_data_files(src: Path, dst: Path) -> None:
    """Copies test data from src Path to dst Path for use in testing

    Args:
        src: pathlib.Path instance. The location of the source data file.
        dst: pathlib.Path instance. Where to copy the source data file to.

    Returns:
        None. This is a void function.
    """
    if not dst.exists():
        dst.parent.mkdir(parents=True, exist_ok=True)
        copyfile(src, dst)

    assert dst.exists()


class TestNationalRiskIndexETL:
    def test_init(self, mock_etl, mock_paths):
        """Tests that the mock NationalRiskIndexETL class instance was
        initialized correctly.

        Validates the following conditions:
        - self.DATA_PATH points to the "data" folder in the temp directory
        - self.TMP_PATH points to the "data/tmp" folder in the temp directory
        - self.INPUT_PATH points to the correct path in the temp directory
        - self.OUTPUT_PATH points to the correct path in the temp directory
        """
        # setup
        etl = NationalRiskIndexETL()
        data_path, tmp_path = mock_paths
        input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
        output_dir = data_path / "dataset" / "national_risk_index_2020"
        # validation
        assert etl.DATA_PATH == data_path
        assert etl.TMP_PATH == tmp_path
        assert etl.INPUT_CSV == input_csv
        assert etl.OUTPUT_DIR == output_dir
        assert etl.GEOID_FIELD_NAME == "GEOID10"
        assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"

    def test_transform(self, mock_etl):
        """Tests the transform() method for NationalRiskIndexETL

        Validates the following conditions:
        - The columns have been renamed correctly
        - The values for each tract have been applied to each of the block
          groups in that tract
        """
        # setup - copy sample data into tmp_dir
        etl = NationalRiskIndexETL()
        input_src = DATA_DIR / "input.csv"
        input_dst = etl.INPUT_CSV
        acs_src = DATA_DIR / "acs.csv"
        acs_dst = DATA_DIR / etl.BLOCK_GROUP_CSV
        for src, dst in [(input_src, input_dst), (acs_src, acs_dst)]:
            copy_data_files(src, dst)
        # setup - read in sample output as dataframe
        TRACT_COL = etl.GEOID_TRACT_FIELD_NAME
        BLOCK_COL = etl.GEOID_FIELD_NAME
        expected = pd.read_csv(
            DATA_DIR / "output.csv",
            dtype={BLOCK_COL: "string", TRACT_COL: "string"},
        )
        # execution
        etl.transform()
        # validation
        assert etl.df.shape == (10, 6)
        assert etl.df.equals(expected)

    def test_load(self, mock_etl):
        """Tests the load() method for NationalRiskIndexETL

        Validates the following conditions:
        - The transformed dataframe is written to the directory specified by
          self.OUTPUT_DIR
        - The content of the file that's written matches the data in self.df
        """
        # setup
        etl = NationalRiskIndexETL()
        output_path = etl.OUTPUT_DIR / "usa.csv"
        TRACT_COL = etl.GEOID_TRACT_FIELD_NAME
        BLOCK_COL = etl.GEOID_FIELD_NAME
        expected = pd.read_csv(
            DATA_DIR / "output.csv",
            dtype={BLOCK_COL: str, TRACT_COL: str},
        )
        etl.df = expected
        # execution
        etl.load()
        output = pd.read_csv(
            output_path, dtype={BLOCK_COL: str, TRACT_COL: str}
        )
        # validation
        assert output_path.exists()
        assert output.equals(expected)
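
One detail worth calling out in the `read_csv` calls above: the GEOID columns are read with an explicit string dtype because pandas' default type inference would parse them as integers and silently drop the leading zeros that are significant in FIPS-based identifiers. A tiny illustration (hypothetical snippet, not part of this commit):

```python
import io

import pandas as pd

sample = io.StringIO("GEOID10\n050070403001")

# Default inference parses the GEOID as an integer and drops the leading zero.
print(pd.read_csv(sample).iloc[0, 0])  # 50070403001

sample.seek(0)
# An explicit string dtype preserves the full 12-character identifier.
print(pd.read_csv(sample, dtype={"GEOID10": str}).iloc[0, 0])  # 050070403001
```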