From c6a7a28ca63776335f323c069530f2ea110d8d5d Mon Sep 17 00:00:00 2001 From: matt bowen Date: Wed, 3 Aug 2022 15:35:31 -0400 Subject: [PATCH] Refactor CDC life-expectancy (1554) --- .../etl/score/config/datasets.yml | 17 ++++++ .../etl/sources/cdc_life_expectancy/etl.py | 38 ++++++------ .../sources/cdc_life_expectancy/__init__.py | 0 .../sources/cdc_life_expectancy/data/US_A.CSV | 16 +++++ .../cdc_life_expectancy/data/extract.csv | 16 +++++ .../cdc_life_expectancy/data/output.csv | 16 +++++ .../cdc_life_expectancy/data/transform.csv | 16 +++++ .../sources/cdc_life_expectancy/test_etl.py | 59 +++++++++++++++++++ 8 files changed, 159 insertions(+), 19 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/__init__.py create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index 96f64749..1edeeb7f 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -101,6 +101,23 @@ datasets: include_in_csv: true include_in_excel: true column_position: 1 + - long_name: "U.S. Small-area Life Expectancy Estimates Project (USALEEP)" + short_name: "USALEEP" + module_name: "cdc_life_expectancy" + description_short: "Average number of years of life a person who has attained a given age can expect to live. 
" + description_long: "Average number of years of life a person who has attained a given age can expect to live. " + input_geoid_tract_field_name: "Tract ID" + load_fields: + - short_name: "CDC_USALEEP_LE" + df_field_name: "Life expectancy (years)" + long_name: "Life expectancy (years)" + create_reverse_percentile: true + field_type: float + include_in_tiles: False + include_in_csv: true + include_in_excel: true + column_position: 60 + - long_name: "Exaple ETL" short_name: "Example" module_name: "example_dataset" diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 2aac7412..f6c0bc2d 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -1,21 +1,24 @@ from pathlib import Path import pandas as pd -from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.utils import get_module_logger, download_file_from_url +from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel +from data_pipeline.utils import download_file_from_url, get_module_logger logger = get_module_logger(__name__) class CDCLifeExpectancy(ExtractTransformLoad): + NAME = "cdc_life_expectancy" + GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT + def __init__(self): self.FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "cdc_life_expectancy" ) - self.TRACT_INPUT_COLUMN_NAME = "Tract ID" self.LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)" + self._DOWNLOAD_FILE_LOCATION = self.get_tmp_path() / "US_A.CSV" # Constants for output self.COLUMNS_TO_KEEP = [ @@ -23,37 +26,34 @@ class CDCLifeExpectancy(ExtractTransformLoad): self.LIFE_EXPECTANCY_FIELD_NAME, ] - self.raw_df: pd.DataFrame self.output_df: pd.DataFrame def extract(self) -> None: + # Needs 
to be overridden because the data aren't zipped logger.info("Starting data download.") - download_file_name = ( - self.get_tmp_path() / "cdc_life_expectancy" / "usa.csv" - ) download_file_from_url( file_url=self.FILE_URL, - download_file_name=download_file_name, + download_file_name=self._DOWNLOAD_FILE_LOCATION, verify=True, ) - self.raw_df = pd.read_csv( - filepath_or_buffer=download_file_name, - dtype={ - # The following need to remain as strings for all of their digits, not get converted to numbers. - self.TRACT_INPUT_COLUMN_NAME: "string", - }, - low_memory=False, - ) - def transform(self) -> None: logger.info("Starting DOE energy burden transform.") - self.output_df = self.raw_df.rename( + raw_df = pd.read_csv( + filepath_or_buffer=self._DOWNLOAD_FILE_LOCATION, + dtype={ + # The following need to remain as strings for all of their digits, not get converted to numbers. + self.INPUT_GEOID_TRACT_FIELD_NAME: "string", + }, + low_memory=False, + ) + + self.output_df = raw_df.rename( columns={ "e(0)": self.LIFE_EXPECTANCY_FIELD_NAME, - self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, + self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, } ) diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/__init__.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV new file mode 100644 index 00000000..e0698261 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV @@ -0,0 +1,16 @@ +Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag +15001021010,15,001,021010,77.4,1.6548,2 +15001021101,15,001,021101,82.5,3.9086,3 +15001021402,15,001,021402,80.4,1.093,2 +15001021800,15,001,021800,79.5,1.132,2 
+15003010201,15,003,010201,79.4,1.5261,3 +15007040603,15,007,040603,86.3,2.2285,3 +15007040604,15,007,040604,84.9,2.1995,3 +15007040700,15,007,040700,80.4,0.7571,2 +15009030100,15,009,030100,77.2,1.8736,3 +15009030402,15,009,030402,83.5,1.8267,3 +15009030800,15,009,030800,82.2,1.6251,3 +06007040500,06,007,040500,99.1,3.1415,3 +06001020100,06,001,020100,99.1,3.1415,3 +06007040300,06,007,040300,99.1,3.1415,3 +15009030201,15,009,030201,99.1,3.1415,3 diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv new file mode 100644 index 00000000..7e5d872b --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv @@ -0,0 +1,16 @@ +Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag +15001021010,15,1,21010,77.4000000000,1.6548000000,2 +15001021101,15,1,21101,82.5000000000,3.9086000000,3 +15001021402,15,1,21402,80.4000000000,1.0930000000,2 +15001021800,15,1,21800,79.5000000000,1.1320000000,2 +15003010201,15,3,10201,79.4000000000,1.5261000000,3 +15007040603,15,7,40603,86.3000000000,2.2285000000,3 +15007040604,15,7,40604,84.9000000000,2.1995000000,3 +15007040700,15,7,40700,80.4000000000,0.7571000000,2 +15009030100,15,9,30100,77.2000000000,1.8736000000,3 +15009030402,15,9,30402,83.5000000000,1.8267000000,3 +15009030800,15,9,30800,82.2000000000,1.6251000000,3 +6007040500,6,7,40500,99.1000000000,3.1415000000,3 +6001020100,6,1,20100,99.1000000000,3.1415000000,3 +6007040300,6,7,40300,99.1000000000,3.1415000000,3 +15009030201,15,9,30201,99.1000000000,3.1415000000,3 diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv new file mode 100644 index 00000000..461a21e8 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv @@ 
-0,0 +1,16 @@ +GEOID10_TRACT,Life expectancy (years) +15001021010,77.4000000000 +15001021101,82.5000000000 +15001021402,80.4000000000 +15001021800,79.5000000000 +15003010201,79.4000000000 +15007040603,86.3000000000 +15007040604,84.9000000000 +15007040700,80.4000000000 +15009030100,77.2000000000 +15009030402,83.5000000000 +15009030800,82.2000000000 +06007040500,99.1000000000 +06001020100,99.1000000000 +06007040300,99.1000000000 +15009030201,99.1000000000 diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv new file mode 100644 index 00000000..6cbccac0 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv @@ -0,0 +1,16 @@ +GEOID10_TRACT,STATE2KX,CNTY2KX,TRACT2KX,Life expectancy (years),se(e(0)),Abridged life table flag +15001021010,15,1,21010,77.4000000000,1.6548000000,2 +15001021101,15,1,21101,82.5000000000,3.9086000000,3 +15001021402,15,1,21402,80.4000000000,1.0930000000,2 +15001021800,15,1,21800,79.5000000000,1.1320000000,2 +15003010201,15,3,10201,79.4000000000,1.5261000000,3 +15007040603,15,7,40603,86.3000000000,2.2285000000,3 +15007040604,15,7,40604,84.9000000000,2.1995000000,3 +15007040700,15,7,40700,80.4000000000,0.7571000000,2 +15009030100,15,9,30100,77.2000000000,1.8736000000,3 +15009030402,15,9,30402,83.5000000000,1.8267000000,3 +15009030800,15,9,30800,82.2000000000,1.6251000000,3 +06007040500,6,7,40500,99.1000000000,3.1415000000,3 +06001020100,6,1,20100,99.1000000000,3.1415000000,3 +06007040300,6,7,40300,99.1000000000,3.1415000000,3 +15009030201,15,9,30201,99.1000000000,3.1415000000,3 diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py new file mode 100644 index 00000000..a92bb253 --- /dev/null +++ 
b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py @@ -0,0 +1,59 @@ +# pylint: disable=protected-access +import pathlib + +from data_pipeline.etl.sources.cdc_life_expectancy.etl import CDCLifeExpectancy +from data_pipeline.tests.sources.example.test_etl import TestETL +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +class TestCDCLifeExpectency(TestETL): + """Tests the CDC Life Expectancy ETL. + + This uses pytest-snapshot. + To update individual snapshots: $ poetry run pytest + data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py::TestCDCLifeExpectency:: + --snapshot-update + """ + + _ETL_CLASS = CDCLifeExpectancy + + _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data" + _SAMPLE_DATA_FILE_NAME = "US_A.CSV" + _SAMPLE_DATA_ZIP_FILE_NAME = "US_A.CSV" + _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectancy" + _EXTRACT_CSV_FILE_NAME = "extract.csv" + + def setup_method(self, _method, filename=__file__): + """Invoke `setup_method` from Parent, but using the current file name. + + This code can be copied identically between all child classes. + """ + super().setup_method(_method=_method, filename=filename) + + def test_init(self, mock_etl, mock_paths): + """Tests that the CDCLifeExpectancy class was initialized + correctly.
+ """ + + etl = self._ETL_CLASS() + data_path, _ = mock_paths + assert etl.DATA_PATH == data_path + assert etl.COLUMNS_TO_KEEP == [ + "GEOID10_TRACT", + "Life expectancy (years)", + ] + assert etl.INPUT_GEOID_TRACT_FIELD_NAME == "Tract ID" + assert etl.LIFE_EXPECTANCY_FIELD_NAME == "Life expectancy (years)" + + def test_get_output_file_path(self, mock_etl, mock_paths): + """Tests the right file name is returned.""" + etl = self._ETL_CLASS() + data_path, tmp_path = mock_paths + + output_file_path = etl._get_output_file_path() + expected_output_file_path = ( + data_path / "dataset" / "cdc_life_expectancy" / "usa.csv" + ) + assert output_file_path == expected_output_file_path