Add tests for all non-census sources (#1899)

* Refactor CDC life-expectancy (#1554)

* Update to new tract list (#1554)

* Adjust for tests (#1848)

* Add tests for cdc_places (#1848)

* Add EJScreen tests (#1848)

* Add tests for HUD housing (#1848)

* Add tests for GeoCorr (#1848)

* Add persistent poverty tests (#1848)

* Update for sources without zips, for new validation (#1848)

* Update tests for new multi-CSV bug (#1848)

Lucas updated the CDC life expectancy data to handle a bug where two
states are missing from the US Overall download. Since virtually none of
our other ETL classes download multiple CSVs directly like this, it
required a pretty invasive new mocking strategy.

* Add basic tests for nature deprived (#1848)

* Add wildfire tests (#1848)

* Add flood risk tests (#1848)

* Add DOT travel tests (#1848)

* Add historic redlining tests (#1848)

* Add tests for ME and WI (#1848)

* Update now that validation exists (#1848)

* Adjust for validation (#1848)

* Add health insurance back to cdc places (#1848)

Oops

* Update tests with new field (#1848)

* Test for blank tract removal (#1848)

* Add tracts for clipping behavior

* Test clipping and zfill behavior (#1848)

* Fix bad test assumption (#1848)

* Simplify class, add test for tract padding (#1848)

* Fix percentage inversion, update tests (#1848)

Looking through the transformations, I noticed that we were subtracting
a percentage that is usually between 0-100 from 1 instead of 100, and so
were ending up with some surprising results. Confirmed with lucasmbrown-usds

* Add note about first street data (#1848)
This commit is contained in:
Matt Bowen 2022-09-19 15:17:00 -04:00 committed by GitHub
commit 876655d2b2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
88 changed files with 2032 additions and 178 deletions

View file

@ -21,6 +21,7 @@ class ExampleETL(ExtractTransformLoad):
LAST_UPDATED_YEAR = 2017
SOURCE_URL = "https://www.example.com/example.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
def __init__(self):
self.COLUMNS_TO_KEEP = [

View file

@ -2,7 +2,7 @@
import copy
import os
import pathlib
from typing import Type
from typing import Type, Optional
from unittest import mock
import pytest
@ -45,7 +45,7 @@ class TestETL:
# so that we do not have to manually copy the "sample data" when we run the tests.
_SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
_SAMPLE_DATA_FILE_NAME = "input.csv"
_SAMPLE_DATA_ZIP_FILE_NAME = "input.zip"
_SAMPLE_DATA_ZIP_FILE_NAME: Optional[str] = "input.zip"
_EXTRACT_TMP_FOLDER_NAME = "ExampleETL"
# Note: We used shared census tract IDs so that later our tests can join all the
@ -124,22 +124,37 @@ class TestETL:
used to retrieve data in order to force that method to retrieve the fixture
data. A basic version of that patching is included here for classes that can use it.
"""
with mock.patch("data_pipeline.utils.requests") as requests_mock:
zip_file_fixture_src = (
self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_ZIP_FILE_NAME
)
tmp_path = mock_paths[1]
# Create mock response.
with open(zip_file_fixture_src, mode="rb") as file:
file_contents = file.read()
with mock.patch(
"data_pipeline.utils.requests"
) as requests_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
tmp_path = mock_paths[1]
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
zip_file_fixture_src = (
self._DATA_DIRECTORY_FOR_TEST
/ self._SAMPLE_DATA_ZIP_FILE_NAME
)
# Create mock response.
with open(zip_file_fixture_src, mode="rb") as file:
file_contents = file.read()
else:
with open(
self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_FILE_NAME,
"rb",
) as file:
file_contents = file.read()
response_mock = requests.Response()
response_mock.status_code = 200
# pylint: disable=protected-access
response_mock._content = file_contents
# Return text fixture:
requests_mock.get = mock.MagicMock(return_value=response_mock)
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
@ -225,9 +240,14 @@ class TestETL:
"""This will test that the sample data exists where it's supposed to as it's supposed to
As per conversation with Jorge, here we can *just* test that the zip file exists.
"""
assert (
self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_ZIP_FILE_NAME
).exists()
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
assert (
self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_ZIP_FILE_NAME
).exists()
else:
assert (
self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_FILE_NAME
).exists()
def test_extract_unzips_base(self, mock_etl, mock_paths):
"""Tests the extract method.
@ -235,17 +255,18 @@ class TestETL:
As per conversation with Jorge, no longer includes snapshot. Instead, verifies that the
file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
"""
tmp_path = mock_paths[1]
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
tmp_path = mock_paths[1]
_ = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl,
mock_paths=mock_paths,
)
assert (
tmp_path
/ self._EXTRACT_TMP_FOLDER_NAME
/ self._SAMPLE_DATA_FILE_NAME
).exists()
_ = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl,
mock_paths=mock_paths,
)
assert (
tmp_path
/ self._EXTRACT_TMP_FOLDER_NAME
/ self._SAMPLE_DATA_FILE_NAME
).exists()
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
"""Tests the extract method.