From c6a7a28ca63776335f323c069530f2ea110d8d5d Mon Sep 17 00:00:00 2001
From: matt bowen <matthew.r.bowen@omb.eop.gov>
Date: Wed, 3 Aug 2022 15:35:31 -0400
Subject: [PATCH] Refactor CDC life-expectancy (1554)

---
 .../etl/score/config/datasets.yml             | 17 ++++++
 .../etl/sources/cdc_life_expectancy/etl.py    | 38 ++++++------
 .../sources/cdc_life_expectancy/__init__.py   |  0
 .../sources/cdc_life_expectancy/data/US_A.CSV | 16 +++++
 .../cdc_life_expectancy/data/extract.csv      | 16 +++++
 .../cdc_life_expectancy/data/output.csv       | 16 +++++
 .../cdc_life_expectancy/data/transform.csv    | 16 +++++
 .../sources/cdc_life_expectancy/test_etl.py   | 59 +++++++++++++++++++
 8 files changed, 159 insertions(+), 19 deletions(-)
 create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/__init__.py
 create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV
 create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv
 create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv
 create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv
 create mode 100644 data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py

diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
index 96f64749..1edeeb7f 100644
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@@ -101,6 +101,23 @@ datasets:
         include_in_csv: true
         include_in_excel: true
         column_position: 1
+  - long_name: "U.S. Small-area Life Expectancy Estimates Project (USALEEP)"
+    short_name: "USALEEP"
+    module_name: "cdc_life_expectancy"
+    description_short: "Average number of years of life a person who has attained a given age can expect to live. "
+    description_long: "Average number of years of life a person who has attained a given age can expect to live. "
+    input_geoid_tract_field_name: "Tract ID"
+    load_fields:
+      - short_name: "CDC_USALEEP_LE"
+        df_field_name: "Life expectancy (years)"
+        long_name: "Life expectancy (years)"
+        create_reverse_percentile: true
+        field_type: float
+        include_in_tiles: False
+        include_in_csv: true
+        include_in_excel: true
+        column_position: 60
+
   - long_name: "Exaple ETL"
     short_name: "Example"
     module_name: "example_dataset"
diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
index 2aac7412..f6c0bc2d 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@@ -1,21 +1,24 @@
 from pathlib import Path
 import pandas as pd
 
-from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.utils import get_module_logger, download_file_from_url
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
+from data_pipeline.utils import download_file_from_url, get_module_logger
 
 logger = get_module_logger(__name__)
 
 
 class CDCLifeExpectancy(ExtractTransformLoad):
+    NAME = "cdc_life_expectancy"
+    GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
+
     def __init__(self):
         self.FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "cdc_life_expectancy"
         )
 
-        self.TRACT_INPUT_COLUMN_NAME = "Tract ID"
         self.LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
+        self._DOWNLOAD_FILE_LOCATION = self.get_tmp_path() / "US_A.CSV"
 
         # Constants for output
         self.COLUMNS_TO_KEEP = [
@@ -23,37 +26,34 @@ class CDCLifeExpectancy(ExtractTransformLoad):
             self.LIFE_EXPECTANCY_FIELD_NAME,
         ]
 
-        self.raw_df: pd.DataFrame
         self.output_df: pd.DataFrame
 
     def extract(self) -> None:
+        # Needs to be overridden because the data aren't zipped
         logger.info("Starting data download.")
 
-        download_file_name = (
-            self.get_tmp_path() / "cdc_life_expectancy" / "usa.csv"
-        )
         download_file_from_url(
             file_url=self.FILE_URL,
-            download_file_name=download_file_name,
+            download_file_name=self._DOWNLOAD_FILE_LOCATION,
             verify=True,
         )
 
-        self.raw_df = pd.read_csv(
-            filepath_or_buffer=download_file_name,
-            dtype={
-                # The following need to remain as strings for all of their digits, not get converted to numbers.
-                self.TRACT_INPUT_COLUMN_NAME: "string",
-            },
-            low_memory=False,
-        )
-
     def transform(self) -> None:
         logger.info("Starting DOE energy burden transform.")
 
-        self.output_df = self.raw_df.rename(
+        raw_df = pd.read_csv(
+            filepath_or_buffer=self._DOWNLOAD_FILE_LOCATION,
+            dtype={
+                # Read tract IDs as strings so leading zeros (e.g. state FIPS "06") are preserved.
+                self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
+            },
+            low_memory=False,
+        )
+
+        self.output_df = raw_df.rename(
             columns={
                 "e(0)": self.LIFE_EXPECTANCY_FIELD_NAME,
-                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
             }
         )
 
diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/__init__.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV
new file mode 100644
index 00000000..e0698261
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/US_A.CSV
@@ -0,0 +1,16 @@
+Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag
+15001021010,15,001,021010,77.4,1.6548,2
+15001021101,15,001,021101,82.5,3.9086,3
+15001021402,15,001,021402,80.4,1.093,2
+15001021800,15,001,021800,79.5,1.132,2
+15003010201,15,003,010201,79.4,1.5261,3
+15007040603,15,007,040603,86.3,2.2285,3
+15007040604,15,007,040604,84.9,2.1995,3
+15007040700,15,007,040700,80.4,0.7571,2
+15009030100,15,009,030100,77.2,1.8736,3
+15009030402,15,009,030402,83.5,1.8267,3
+15009030800,15,009,030800,82.2,1.6251,3
+06007040500,06,007,040500,99.1,3.1415,3
+06001020100,06,001,020100,99.1,3.1415,3
+06007040300,06,007,040300,99.1,3.1415,3
+15009030201,15,009,030201,99.1,3.1415,3
diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv
new file mode 100644
index 00000000..7e5d872b
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/extract.csv
@@ -0,0 +1,16 @@
+Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag
+15001021010,15,1,21010,77.4000000000,1.6548000000,2
+15001021101,15,1,21101,82.5000000000,3.9086000000,3
+15001021402,15,1,21402,80.4000000000,1.0930000000,2
+15001021800,15,1,21800,79.5000000000,1.1320000000,2
+15003010201,15,3,10201,79.4000000000,1.5261000000,3
+15007040603,15,7,40603,86.3000000000,2.2285000000,3
+15007040604,15,7,40604,84.9000000000,2.1995000000,3
+15007040700,15,7,40700,80.4000000000,0.7571000000,2
+15009030100,15,9,30100,77.2000000000,1.8736000000,3
+15009030402,15,9,30402,83.5000000000,1.8267000000,3
+15009030800,15,9,30800,82.2000000000,1.6251000000,3
+6007040500,6,7,40500,99.1000000000,3.1415000000,3
+6001020100,6,1,20100,99.1000000000,3.1415000000,3
+6007040300,6,7,40300,99.1000000000,3.1415000000,3
+15009030201,15,9,30201,99.1000000000,3.1415000000,3
diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv
new file mode 100644
index 00000000..461a21e8
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/output.csv
@@ -0,0 +1,16 @@
+GEOID10_TRACT,Life expectancy (years)
+15001021010,77.4000000000
+15001021101,82.5000000000
+15001021402,80.4000000000
+15001021800,79.5000000000
+15003010201,79.4000000000
+15007040603,86.3000000000
+15007040604,84.9000000000
+15007040700,80.4000000000
+15009030100,77.2000000000
+15009030402,83.5000000000
+15009030800,82.2000000000
+06007040500,99.1000000000
+06001020100,99.1000000000
+06007040300,99.1000000000
+15009030201,99.1000000000
diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv
new file mode 100644
index 00000000..6cbccac0
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv
@@ -0,0 +1,16 @@
+GEOID10_TRACT,STATE2KX,CNTY2KX,TRACT2KX,Life expectancy (years),se(e(0)),Abridged life table flag
+15001021010,15,1,21010,77.4000000000,1.6548000000,2
+15001021101,15,1,21101,82.5000000000,3.9086000000,3
+15001021402,15,1,21402,80.4000000000,1.0930000000,2
+15001021800,15,1,21800,79.5000000000,1.1320000000,2
+15003010201,15,3,10201,79.4000000000,1.5261000000,3
+15007040603,15,7,40603,86.3000000000,2.2285000000,3
+15007040604,15,7,40604,84.9000000000,2.1995000000,3
+15007040700,15,7,40700,80.4000000000,0.7571000000,2
+15009030100,15,9,30100,77.2000000000,1.8736000000,3
+15009030402,15,9,30402,83.5000000000,1.8267000000,3
+15009030800,15,9,30800,82.2000000000,1.6251000000,3
+06007040500,6,7,40500,99.1000000000,3.1415000000,3
+06001020100,6,1,20100,99.1000000000,3.1415000000,3
+06007040300,6,7,40300,99.1000000000,3.1415000000,3
+15009030201,15,9,30201,99.1000000000,3.1415000000,3
diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py
new file mode 100644
index 00000000..a92bb253
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py
@@ -0,0 +1,59 @@
+# pylint: disable=protected-access
+import pathlib
+
+from data_pipeline.etl.sources.cdc_life_expectancy.etl import CDCLifeExpectancy
+from data_pipeline.tests.sources.example.test_etl import TestETL
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class TestCDCLifeExpectency(TestETL):
+    """Tests the CDC Life Expectancy ETL.
+
+    This uses pytest-snapshot.
+    To update individual snapshots: $ poetry run pytest
+            data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py::TestCDCLifeExpectency::<testname>
+            --snapshot-update
+    """
+
+    _ETL_CLASS = CDCLifeExpectancy
+
+    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
+    _SAMPLE_DATA_FILE_NAME = "US_A.CSV"
+    _SAMPLE_DATA_ZIP_FILE_NAME = "US_A.CSV"
+    _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectancy"
+    _EXTRACT_CSV_FILE_NAME = "extract.csv"
+
+    def setup_method(self, _method, filename=__file__):
+        """Invoke `setup_method` from Parent, but using the current file name.
+
+        This code can be copied identically between all child classes.
+        """
+        super().setup_method(_method=_method, filename=filename)
+
+    def test_init(self, mock_etl, mock_paths):
+        """Tests that the CDCLifeExpectancy class was initialized
+        correctly.
+        """
+
+        etl = self._ETL_CLASS()
+        data_path, _ = mock_paths
+        assert etl.DATA_PATH == data_path
+        assert etl.COLUMNS_TO_KEEP == [
+            "GEOID10_TRACT",
+            "Life expectancy (years)",
+        ]
+        assert etl.INPUT_GEOID_TRACT_FIELD_NAME == "Tract ID"
+        assert etl.LIFE_EXPECTANCY_FIELD_NAME == "Life expectancy (years)"
+
+    def test_get_output_file_path(self, mock_etl, mock_paths):
+        """Tests the right file name is returned."""
+        etl = self._ETL_CLASS()
+        data_path, tmp_path = mock_paths
+
+        output_file_path = etl._get_output_file_path()
+        expected_output_file_path = (
+            data_path / "dataset" / "cdc_life_expectancy" / "usa.csv"
+        )
+        assert output_file_path == expected_output_file_path