mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 13:51:16 -07:00
Adds National Risk Index data to ETL pipeline (#549)
* Adds dev dependencies to requirements.txt and re-runs black on codebase * Adds test and code for national risk index etl, still in progress * Removes test_data from .gitignore * Adds test data to nation_risk_index tests * Creates tests and ETL class for NRI data * Adds tests for load() and transform() methods of NationalRiskIndexETL * Updates README.md with info about the NRI dataset * Adds to dos * Moves tests and test data into a tests/ dir in national_risk_index * Moves tmp_dir for tests into data/tmp/tests/ * Promotes fixtures to conftest and relocates national_risk_index tests: The relocation of national_risk_index tests is necessary because tests can only use fixtures specified in conftests within the same package * Fixes issue with df.equals() in test_transform() * Files reformatted by black * Commit changes to other files after re-running black * Fixes unused import that caused lint checks to fail * Moves tests/ directory to app root for data_pipeline
This commit is contained in:
parent
94298635c2
commit
f0900f7b69
14 changed files with 307 additions and 7 deletions
|
@ -0,0 +1,11 @@
|
|||
GEOID10,POPULATION
|
||||
050070403001,1000
|
||||
050070403002,1500
|
||||
050010201001,1000
|
||||
050010201002,1500
|
||||
150070405001,2000
|
||||
150070405002,2250
|
||||
150010210101,2000
|
||||
150010210102,1500
|
||||
150010211011,1750
|
||||
150010211012,1500
|
|
|
@ -0,0 +1,6 @@
|
|||
TRACT,TRACTFIPS,RISK_SCORE,RISK_RATNG,RISK_NPCTL
|
||||
40300,05007040300,10.492015,Very Low,15.3494
|
||||
20100,05001020100,14.705854,Relatively Low,36.725828
|
||||
40500,15007040500,10.234981,Very Low,13.997993
|
||||
21010,15001021010,21.537231,Relatively Moderate,59.488033
|
||||
21101,15001021101,19.434585,Relatively Low,53.392265
|
|
|
@ -0,0 +1,11 @@
|
|||
GEOID10,GEOID10_TRACT,TRACT,RISK_SCORE,RISK_RATNG,RISK_NPCTL
|
||||
050070403001,05007040300,40300,10.492015,Very Low,15.3494
|
||||
050070403002,05007040300,40300,10.492015,Very Low,15.3494
|
||||
050010201001,05001020100,20100,14.705854,Relatively Low,36.725828
|
||||
050010201002,05001020100,20100,14.705854,Relatively Low,36.725828
|
||||
150070405001,15007040500,40500,10.234981,Very Low,13.997993
|
||||
150070405002,15007040500,40500,10.234981,Very Low,13.997993
|
||||
150010210101,15001021010,21010,21.537231,Relatively Moderate,59.488033
|
||||
150010210102,15001021010,21010,21.537231,Relatively Moderate,59.488033
|
||||
150010211011,15001021101,21101,19.434585,Relatively Low,53.392265
|
||||
150010211012,15001021101,21101,19.434585,Relatively Low,53.392265
|
|
|
@ -0,0 +1,110 @@
|
|||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.etl.sources.national_risk_index.etl import (
|
||||
NationalRiskIndexETL,
|
||||
)
|
||||
|
||||
# Directory containing the sample input/acs/output CSV fixtures used by the
# NationalRiskIndexETL tests below.
DATA_DIR = (
    settings.APP_ROOT / "tests" / "sources" / "national_risk_index" / "data"
)
|
||||
|
||||
|
||||
def copy_data_files(src: Path, dst: Path) -> None:
    """Copy the test data file at src to dst for use in testing.

    Parent directories of dst are created on demand, and the copy is
    skipped entirely when dst already exists, so repeated calls are
    idempotent.

    Args
        src: pathlib.Path instance. The location of the source data file.
        dst: pathlib.Path instance. Where to copy the source data file to.

    Returns
        None. This is a void function
    """
    # Guard clause: an existing destination means there is nothing to do.
    if dst.exists():
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    copyfile(src, dst)
    assert dst.exists()
|
||||
|
||||
|
||||
class TestNationalRiskIndexETL:
    """Unit tests for the NationalRiskIndexETL class."""

    def test_init(self, mock_etl, mock_paths):
        """Tests that the mock NationalRiskIndexETL class instance was
        initialized correctly.

        Validates the following conditions:
        - self.DATA_PATH points to the "data" folder in the temp directory
        - self.TMP_PATH points to the "data/tmp" folder in the temp directory
        - self.INPUT_PATH points to the correct path in the temp directory
        - self.OUTPUT_PATH points to the correct path in the temp directory
        """
        # setup
        etl = NationalRiskIndexETL()
        data_path, tmp_path = mock_paths

        # validation - every derived path and field name hangs off the
        # mocked data/tmp directories supplied by the fixtures
        assert etl.DATA_PATH == data_path
        assert etl.TMP_PATH == tmp_path
        assert etl.INPUT_CSV == tmp_path / "NRI_Table_CensusTracts.csv"
        assert (
            etl.OUTPUT_DIR
            == data_path / "dataset" / "national_risk_index_2020"
        )
        assert etl.GEOID_FIELD_NAME == "GEOID10"
        assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"

    def test_transform(self, mock_etl):
        """Tests the transform() method for NationalRiskIndexETL

        Validates the following conditions:
        - The columns have been renamed correctly
        - The values for each tract has been applied to each of the block
          groups in that tract
        """
        # setup - stage the sample data where transform() expects to read it
        etl = NationalRiskIndexETL()
        # NOTE(review): `DATA_DIR / etl.BLOCK_GROUP_CSV` only lands outside
        # DATA_DIR if BLOCK_GROUP_CSV is an absolute path (pathlib drops the
        # left operand then) — confirm against the ETL base class.
        file_pairs = [
            (DATA_DIR / "input.csv", etl.INPUT_CSV),
            (DATA_DIR / "acs.csv", DATA_DIR / etl.BLOCK_GROUP_CSV),
        ]
        for source, destination in file_pairs:
            copy_data_files(source, destination)

        # setup - read in sample output as dataframe, keeping the GEOID
        # columns as strings so leading zeros survive the round trip
        tract_col = etl.GEOID_TRACT_FIELD_NAME
        block_col = etl.GEOID_FIELD_NAME
        expected = pd.read_csv(
            DATA_DIR / "output.csv",
            dtype={block_col: "string", tract_col: "string"},
        )

        # execution
        etl.transform()

        # validation
        assert etl.df.shape == (10, 6)
        assert etl.df.equals(expected)

    def test_load(self, mock_etl):
        """Tests the load() method for NationalRiskIndexETL

        Validates the following conditions:
        - The transformed dataframe is written to the directory specified by
          self.OUTPUT_DIR
        - The content of the file that's written matches the data in self.df
        """
        # setup - seed the ETL instance with the expected output dataframe
        etl = NationalRiskIndexETL()
        tract_col = etl.GEOID_TRACT_FIELD_NAME
        block_col = etl.GEOID_FIELD_NAME
        expected = pd.read_csv(
            DATA_DIR / "output.csv",
            dtype={block_col: str, tract_col: str},
        )
        etl.df = expected

        # execution
        etl.load()

        # validation - load() writes usa.csv under OUTPUT_DIR, and reading
        # it back reproduces self.df exactly
        output_path = etl.OUTPUT_DIR / "usa.csv"
        assert output_path.exists()
        output = pd.read_csv(
            output_path, dtype={block_col: str, tract_col: str}
        )
        assert output.equals(expected)
|
Loading…
Add table
Add a link
Reference in a new issue