Add tests for DOE energy burden (#1518)

This commit is contained in:
Matthew Bowen 2022-07-27 17:17:34 -04:00 committed by matt bowen
parent e77e7aef2e
commit 12a6b2f10e
7 changed files with 140 additions and 7 deletions

View file

@ -2,18 +2,22 @@ from pathlib import Path
import pandas as pd import pandas as pd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger, unzip_file_from_url from data_pipeline.utils import get_module_logger, unzip_file_from_url
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad): class DOEEnergyBurden(ExtractTransformLoad):
def __init__(self): NAME = "doe_energy_burden"
self.DOE_FILE_URL = ( SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
) )
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.DOE_FILE_URL = self.SOURCE_URL
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden" self.DATA_PATH / "dataset" / "doe_energy_burden"
@ -38,12 +42,11 @@ class DOEEnergyBurden(ExtractTransformLoad):
unzip_file_from_url( unzip_file_from_url(
file_url=self.DOE_FILE_URL, file_url=self.DOE_FILE_URL,
download_path=self.get_tmp_path(), download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "doe_energy_burden", unzipped_file_path=self.get_tmp_path()
) )
self.raw_df = pd.read_csv( self.raw_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.get_tmp_path()
/ "doe_energy_burden"
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv", / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={ dtype={

View file

@ -0,0 +1,16 @@
ABV,FIP,BURDEN,QUANTILE
HI,15001021010,0.0380000000,30
HI,15001021101,0.0410000000,25
HI,15001021402,0.0240000000,66
HI,15001021800,0.0290000000,51
HI,15003010201,0.0270000000,58
HI,15007040603,0.0440000000,21
HI,15007040604,0.0330000000,40
HI,15007040700,0.0260000000,59
HI,15009030100,0.0350000000,37
HI,15009030201,0.0220000000,71
HI,15009030402,0.0200000000,75
HI,15009030800,0.0190000000,80
CA,6007040300,0.2000000000,70
CA,6007040500,0.5000000000,50
CA,6001020100,0.1990000000,30
1 ABV FIP BURDEN QUANTILE
2 HI 15001021010 0.0380000000 30
3 HI 15001021101 0.0410000000 25
4 HI 15001021402 0.0240000000 66
5 HI 15001021800 0.0290000000 51
6 HI 15003010201 0.0270000000 58
7 HI 15007040603 0.0440000000 21
8 HI 15007040604 0.0330000000 40
9 HI 15007040700 0.0260000000 59
10 HI 15009030100 0.0350000000 37
11 HI 15009030201 0.0220000000 71
12 HI 15009030402 0.0200000000 75
13 HI 15009030800 0.0190000000 80
14 CA 6007040300 0.2000000000 70
15 CA 6007040500 0.5000000000 50
16 CA 6001020100 0.1990000000 30

View file

@ -0,0 +1,16 @@
GEOID10_TRACT,Energy burden
15001021010,0.0380000000
15001021101,0.0410000000
15001021402,0.0240000000
15001021800,0.0290000000
15003010201,0.0270000000
15007040603,0.0440000000
15007040604,0.0330000000
15007040700,0.0260000000
15009030100,0.0350000000
15009030201,0.0220000000
15009030402,0.0200000000
15009030800,0.0190000000
06007040300,0.2000000000
06007040500,0.5000000000
06001020100,0.1990000000
1 GEOID10_TRACT Energy burden
2 15001021010 0.0380000000
3 15001021101 0.0410000000
4 15001021402 0.0240000000
5 15001021800 0.0290000000
6 15003010201 0.0270000000
7 15007040603 0.0440000000
8 15007040604 0.0330000000
9 15007040700 0.0260000000
10 15009030100 0.0350000000
11 15009030201 0.0220000000
12 15009030402 0.0200000000
13 15009030800 0.0190000000
14 06007040300 0.2000000000
15 06007040500 0.5000000000
16 06001020100 0.1990000000

View file

@ -0,0 +1,16 @@
ABV,GEOID10_TRACT,Energy burden,QUANTILE
HI,15001021010,0.0380000000,30
HI,15001021101,0.0410000000,25
HI,15001021402,0.0240000000,66
HI,15001021800,0.0290000000,51
HI,15003010201,0.0270000000,58
HI,15007040603,0.0440000000,21
HI,15007040604,0.0330000000,40
HI,15007040700,0.0260000000,59
HI,15009030100,0.0350000000,37
HI,15009030201,0.0220000000,71
HI,15009030402,0.0200000000,75
HI,15009030800,0.0190000000,80
CA,06007040300,0.2000000000,70
CA,06007040500,0.5000000000,50
CA,06001020100,0.1990000000,30
1 ABV GEOID10_TRACT Energy burden QUANTILE
2 HI 15001021010 0.0380000000 30
3 HI 15001021101 0.0410000000 25
4 HI 15001021402 0.0240000000 66
5 HI 15001021800 0.0290000000 51
6 HI 15003010201 0.0270000000 58
7 HI 15007040603 0.0440000000 21
8 HI 15007040604 0.0330000000 40
9 HI 15007040700 0.0260000000 59
10 HI 15009030100 0.0350000000 37
11 HI 15009030201 0.0220000000 71
12 HI 15009030402 0.0200000000 75
13 HI 15009030800 0.0190000000 80
14 CA 06007040300 0.2000000000 70
15 CA 06007040500 0.5000000000 50
16 CA 06001020100 0.1990000000 30

View file

@ -0,0 +1,82 @@
# pylint: disable=protected-access
from unittest import mock
import pathlib
import requests
from data_pipeline.etl.sources.doe_energy_burden.etl import (
DOEEnergyBurden,
)
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.utils import get_module_logger
# Logger for this test module, created via the project's get_module_logger helper.
logger = get_module_logger(__name__)
class TestDOEEnergyBurdenETL(TestETL):
    """Tests the DOE energy burden ETL.

    This uses pytest-snapshot.
    To update individual snapshots: $ poetry run pytest
    data_pipeline/tests/sources/doe_energy_burden/test_etl.py::TestDOEEnergyBurdenETL::<testname>
    --snapshot-update
    """

    # ETL class under test; the helpers below instantiate it via this attribute.
    _ETL_CLASS = DOEEnergyBurden

    # Fixture locations and names, resolved relative to this test file.
    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
    _SAMPLE_DATA_FILE_NAME = "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
    _SAMPLE_DATA_ZIP_FILE_NAME = "DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
    _EXTRACT_TMP_FOLDER_NAME = "DOEEnergyBurden"
    _EXTRACT_CSV_FILE_NAME = "extract.csv"

    def setup_method(self, _method, filename=__file__):
        """Invoke `setup_method` from Parent, but using the current file name.

        This code can be copied identically between all child classes.
        """
        super().setup_method(_method=_method, filename=filename)

    # XXX: Refactor since I just straight copied it out of NRI's
    def _setup_etl_instance_and_run_extract(self, mock_etl, mock_paths):
        """Instantiate the ETL and run `extract` with HTTP access mocked out.

        `data_pipeline.utils.requests` is patched so that `requests.get`
        returns a 200 response whose body is the zipped sample fixture; the
        extract step therefore never touches the network.

        Args:
            mock_etl: Not used in the body; presumably a pytest fixture that
                is requested for its side effects (confirm against conftest).
            mock_paths: Indexable fixture value; element 1 is used as the
                temporary directory for the ETL.

        Returns:
            The ETL instance after `extract()` has been called on it.
        """
        with mock.patch("data_pipeline.utils.requests") as requests_mock:
            zip_file_fixture_src = (
                self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_ZIP_FILE_NAME
            )
            tmp_path = mock_paths[1]

            # Create mock response carrying the raw bytes of the zip fixture.
            with open(zip_file_fixture_src, mode="rb") as file:
                file_contents = file.read()
            response_mock = requests.Response()
            response_mock.status_code = 200
            # pylint: disable=protected-access
            response_mock._content = file_contents
            # Make `requests.get` return the canned (binary zip) response.
            requests_mock.get = mock.MagicMock(return_value=response_mock)

            # Instantiate the ETL class.
            etl = self._ETL_CLASS()

            # Monkey-patch the temporary directory to the one used in the test
            etl.TMP_PATH = tmp_path

            # Run the extract method while the requests patch is still active.
            etl.extract()

        return etl

    def test_init(self, mock_etl, mock_paths):
        """Tests that the DOEEnergyBurden class was initialized correctly."""
        # Go through `_ETL_CLASS` so this test stays consistent with the
        # extract helper above.
        etl = self._ETL_CLASS()
        data_path, _ = mock_paths

        assert etl.DATA_PATH == data_path
        assert etl.COLUMNS_TO_KEEP == [
            "GEOID10_TRACT",
            "Energy burden",
        ]
        assert etl.GEOID_FIELD_NAME == "GEOID10"
        assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
        assert etl.TRACT_INPUT_COLUMN_NAME == "FIP"
        assert etl.INPUT_ENERGY_BURDEN_FIELD_NAME == "BURDEN"
        assert etl.REVISED_ENERGY_BURDEN_FIELD_NAME == "Energy burden"