Adds National Risk Index data to ETL pipeline (#549)

* Adds dev dependencies to requirements.txt and re-runs black on codebase

* Adds test and code for national risk index etl, still in progress

* Removes test_data from .gitignore

* Adds test data to national_risk_index tests

* Creates tests and ETL class for NRI data

* Adds tests for load() and transform() methods of NationalRiskIndexETL

* Updates README.md with info about the NRI dataset

* Adds to-dos

* Moves tests and test data into a tests/ dir in national_risk_index

* Moves tmp_dir for tests into data/tmp/tests/

* Promotes fixtures to conftest and relocates national_risk_index tests:
  the relocation of the national_risk_index tests is necessary because tests
  can only use fixtures specified in a conftest.py within the same package
  (see the sketch after this list)

* Fixes issue with df.equals() in test_transform()

* Files reformatted by black

* Commit changes to other files after re-running black

* Fixes unused import that caused lint checks to fail

* Moves tests/ directory to app root for data_pipeline
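
The conftest promotion above leans on pytest's scoping rule: fixtures defined in a `conftest.py` are only visible to test modules inside the same package (directory tree), which is why the national_risk_index tests had to move under the shared tests/ package. A minimal sketch of what the promoted fixtures could look like; the fixture names `mock_paths` and `mock_etl` appear in the test module below, but the bodies here are assumptions, not the code from this commit:

```python
# conftest.py at the root of the shared tests/ package (sketch only)
import pytest


@pytest.fixture
def mock_paths(tmp_path):
    """Builds a throwaway data/ and data/tmp/ layout for the ETL tests."""
    data_path = tmp_path / "data"
    tmp_dir = data_path / "tmp"
    tmp_dir.mkdir(parents=True)
    return data_path, tmp_dir


@pytest.fixture
def mock_etl(mock_paths, monkeypatch):
    """Points the ETL's path configuration at the temporary layout.

    The real patching mechanism is not visible in this diff; treat this
    as a placeholder for whatever the actual fixture does.
    """
    data_path, tmp_dir = mock_paths
    # e.g. monkeypatch the settings the ETL derives DATA_PATH / TMP_PATH from
    yield data_path, tmp_dir
```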
Commit f0900f7b69 by Billy Daly, 2021-09-07 20:51:34 -04:00, committed by GitHub
14 changed files with 307 additions and 7 deletions

@@ -0,0 +1,11 @@
GEOID10,POPULATION
050070403001,1000
050070403002,1500
050010201001,1000
050010201002,1500
150070405001,2000
150070405002,2250
150010210101,2000
150010210102,1500
150010211011,1750
150010211012,1500

@@ -0,0 +1,6 @@
TRACT,TRACTFIPS,RISK_SCORE,RISK_RATNG,RISK_NPCTL
40300,05007040300,10.492015,Very Low,15.3494
20100,05001020100,14.705854,Relatively Low,36.725828
40500,15007040500,10.234981,Very Low,13.997993
21010,15001021010,21.537231,Relatively Moderate,59.488033
21101,15001021101,19.434585,Relatively Low,53.392265

@@ -0,0 +1,11 @@
GEOID10,GEOID10_TRACT,TRACT,RISK_SCORE,RISK_RATNG,RISK_NPCTL
050070403001,05007040300,40300,10.492015,Very Low,15.3494
050070403002,05007040300,40300,10.492015,Very Low,15.3494
050010201001,05001020100,20100,14.705854,Relatively Low,36.725828
050010201002,05001020100,20100,14.705854,Relatively Low,36.725828
150070405001,15007040500,40500,10.234981,Very Low,13.997993
150070405002,15007040500,40500,10.234981,Very Low,13.997993
150010210101,15001021010,21010,21.537231,Relatively Moderate,59.488033
150010210102,15001021010,21010,21.537231,Relatively Moderate,59.488033
150010211011,15001021101,21101,19.434585,Relatively Low,53.392265
150010211012,15001021101,21101,19.434585,Relatively Low,53.392265
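
The expected output above simply repeats each tract's NRI values for every block group in that tract. A minimal pandas sketch of that expansion, using the sample files from this diff and assuming the parent tract ID is the first 11 characters of the 12-digit block-group `GEOID10` (variable names are illustrative, not the ETL's actual implementation):

```python
import pandas as pd

# Read with string dtypes so leading zeros in the GEOIDs survive.
acs = pd.read_csv("acs.csv", dtype={"GEOID10": "string"})
nri = pd.read_csv("input.csv", dtype={"TRACTFIPS": "string"})

# Derive the parent tract for each block group: first 11 of 12 digits.
acs["GEOID10_TRACT"] = acs["GEOID10"].str[:11]

# Broadcast the tract-level risk scores onto every block group in the tract.
merged = acs.merge(
    nri.rename(columns={"TRACTFIPS": "GEOID10_TRACT"}),
    on="GEOID10_TRACT",
    how="left",
)[["GEOID10", "GEOID10_TRACT", "TRACT", "RISK_SCORE", "RISK_RATNG", "RISK_NPCTL"]]

assert merged.shape == (10, 6)  # matches the shape checked in test_transform()
```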

@@ -0,0 +1,110 @@
from pathlib import Path
from shutil import copyfile

import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.sources.national_risk_index.etl import (
    NationalRiskIndexETL,
)

DATA_DIR = (
    settings.APP_ROOT / "tests" / "sources" / "national_risk_index" / "data"
)


def copy_data_files(src: Path, dst: Path) -> None:
    """Copies test data from src Path to dst Path for use in testing

    Args:
        src: pathlib.Path instance. The location of the source data file.
        dst: pathlib.Path instance. Where to copy the source data file to.

    Returns:
        None. This is a void function.
    """
    if not dst.exists():
        dst.parent.mkdir(parents=True, exist_ok=True)
        copyfile(src, dst)

    assert dst.exists()


class TestNationalRiskIndexETL:
    def test_init(self, mock_etl, mock_paths):
        """Tests that the mock NationalRiskIndexETL class instance was
        initialized correctly.

        Validates the following conditions:
        - self.DATA_PATH points to the "data" folder in the temp directory
        - self.TMP_PATH points to the "data/tmp" folder in the temp directory
        - self.INPUT_PATH points to the correct path in the temp directory
        - self.OUTPUT_PATH points to the correct path in the temp directory
        """
        # setup
        etl = NationalRiskIndexETL()
        data_path, tmp_path = mock_paths
        input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
        output_dir = data_path / "dataset" / "national_risk_index_2020"
        # validation
        assert etl.DATA_PATH == data_path
        assert etl.TMP_PATH == tmp_path
        assert etl.INPUT_CSV == input_csv
        assert etl.OUTPUT_DIR == output_dir
        assert etl.GEOID_FIELD_NAME == "GEOID10"
        assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"

    def test_transform(self, mock_etl):
        """Tests the transform() method for NationalRiskIndexETL

        Validates the following conditions:
        - The columns have been renamed correctly
        - The values for each tract have been applied to each of the block
          groups in that tract
        """
        # setup - copy sample data into tmp_dir
        etl = NationalRiskIndexETL()
        input_src = DATA_DIR / "input.csv"
        input_dst = etl.INPUT_CSV
        acs_src = DATA_DIR / "acs.csv"
        acs_dst = DATA_DIR / etl.BLOCK_GROUP_CSV
        for src, dst in [(input_src, input_dst), (acs_src, acs_dst)]:
            copy_data_files(src, dst)
        # setup - read in sample output as dataframe
        TRACT_COL = etl.GEOID_TRACT_FIELD_NAME
        BLOCK_COL = etl.GEOID_FIELD_NAME
        expected = pd.read_csv(
            DATA_DIR / "output.csv",
            dtype={BLOCK_COL: "string", TRACT_COL: "string"},
        )
        # execution
        etl.transform()
        # validation
        assert etl.df.shape == (10, 6)
        assert etl.df.equals(expected)

    def test_load(self, mock_etl):
        """Tests the load() method for NationalRiskIndexETL

        Validates the following conditions:
        - The transformed dataframe is written to the directory specified by
          self.OUTPUT_DIR
        - The content of the file that's written matches the data in self.df
        """
        # setup
        etl = NationalRiskIndexETL()
        output_path = etl.OUTPUT_DIR / "usa.csv"
        TRACT_COL = etl.GEOID_TRACT_FIELD_NAME
        BLOCK_COL = etl.GEOID_FIELD_NAME
        expected = pd.read_csv(
            DATA_DIR / "output.csv",
            dtype={BLOCK_COL: str, TRACT_COL: str},
        )
        etl.df = expected
        # execution
        etl.load()
        output = pd.read_csv(
            output_path, dtype={BLOCK_COL: str, TRACT_COL: str}
        )
        # validation
        assert output_path.exists()
        assert output.equals(expected)
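
One detail worth calling out in the `read_csv` calls above: the GEOID columns are read with an explicit string dtype because pandas' default type inference would parse them as integers and silently drop the leading zeros that are significant in FIPS-based identifiers. A tiny illustration (hypothetical snippet, not part of this commit):

```python
import io

import pandas as pd

sample = io.StringIO("GEOID10\n050070403001")

# Default inference parses the GEOID as an integer and drops the leading zero.
print(pd.read_csv(sample).iloc[0, 0])  # 50070403001

sample.seek(0)
# An explicit string dtype preserves the full 12-character identifier.
print(pd.read_csv(sample, dtype={"GEOID10": str}).iloc[0, 0])  # 050070403001
```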