Refactor CDC life-expectancy (1554)

This commit is contained in:
matt bowen 2022-08-03 15:35:31 -04:00
parent 95f88587ec
commit c6a7a28ca6
8 changed files with 159 additions and 19 deletions

View file

@ -101,6 +101,23 @@ datasets:
include_in_csv: true
include_in_excel: true
column_position: 1
- long_name: "U.S. Small-area Life Expectancy Estimates Project (USALEEP)"
short_name: "USALEEP"
module_name: "cdc_life_expectancy"
description_short: "Average number of years of life a person who has attained a given age can expect to live. "
description_long: "Average number of years of life a person who has attained a given age can expect to live. "
input_geoid_tract_field_name: "Tract ID"
load_fields:
- short_name: "CDC_USALEEP_LE"
df_field_name: "Life expectancy (years)"
long_name: "Life expectancy (years)"
create_reverse_percentile: true
field_type: float
include_in_tiles: False
include_in_csv: true
include_in_excel: true
column_position: 60
- long_name: "Example ETL"
short_name: "Example"
module_name: "example_dataset"

View file

@ -1,21 +1,24 @@
from pathlib import Path
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import download_file_from_url, get_module_logger
logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad):
NAME = "cdc_life_expectancy"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
)
self.TRACT_INPUT_COLUMN_NAME = "Tract ID"
self.LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
self._DOWNLOAD_FILE_LOCATION = self.get_tmp_path() / "US_A.CSV"
# Constants for output
self.COLUMNS_TO_KEEP = [
@ -23,37 +26,34 @@ class CDCLifeExpectancy(ExtractTransformLoad):
self.LIFE_EXPECTANCY_FIELD_NAME,
]
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
# Needs to be overridden because the data aren't zipped
logger.info("Starting data download.")
download_file_name = (
self.get_tmp_path() / "cdc_life_expectancy" / "usa.csv"
)
download_file_from_url(
file_url=self.FILE_URL,
download_file_name=download_file_name,
download_file_name=self._DOWNLOAD_FILE_LOCATION,
verify=True,
)
self.raw_df = pd.read_csv(
filepath_or_buffer=download_file_name,
dtype={
# The following need to remain as strings for all of their digits, not get converted to numbers.
self.TRACT_INPUT_COLUMN_NAME: "string",
},
low_memory=False,
)
def transform(self) -> None:
logger.info("Starting DOE energy burden transform.")
self.output_df = self.raw_df.rename(
raw_df = pd.read_csv(
filepath_or_buffer=self._DOWNLOAD_FILE_LOCATION,
dtype={
# The following need to remain as strings for all of their digits, not get converted to numbers.
self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
},
low_memory=False,
)
self.output_df = raw_df.rename(
columns={
"e(0)": self.LIFE_EXPECTANCY_FIELD_NAME,
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
}
)

View file

@ -0,0 +1,16 @@
Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag
15001021010,15,001,021010,77.4,1.6548,2
15001021101,15,001,021101,82.5,3.9086,3
15001021402,15,001,021402,80.4,1.093,2
15001021800,15,001,021800,79.5,1.132,2
15003010201,15,003,010201,79.4,1.5261,3
15007040603,15,007,040603,86.3,2.2285,3
15007040604,15,007,040604,84.9,2.1995,3
15007040700,15,007,040700,80.4,0.7571,2
15009030100,15,009,030100,77.2,1.8736,3
15009030402,15,009,030402,83.5,1.8267,3
15009030800,15,009,030800,82.2,1.6251,3
06007040500,06,007,040500,99.1,3.1415,3
06001020100,06,001,020100,99.1,3.1415,3
06007040300,06,007,040300,99.1,3.1415,3
15009030201,15,009,030201,99.1,3.1415,3
1 Tract ID STATE2KX CNTY2KX TRACT2KX e(0) se(e(0)) Abridged life table flag
2 15001021010 15 001 021010 77.4 1.6548 2
3 15001021101 15 001 021101 82.5 3.9086 3
4 15001021402 15 001 021402 80.4 1.093 2
5 15001021800 15 001 021800 79.5 1.132 2
6 15003010201 15 003 010201 79.4 1.5261 3
7 15007040603 15 007 040603 86.3 2.2285 3
8 15007040604 15 007 040604 84.9 2.1995 3
9 15007040700 15 007 040700 80.4 0.7571 2
10 15009030100 15 009 030100 77.2 1.8736 3
11 15009030402 15 009 030402 83.5 1.8267 3
12 15009030800 15 009 030800 82.2 1.6251 3
13 06007040500 06 007 040500 99.1 3.1415 3
14 06001020100 06 001 020100 99.1 3.1415 3
15 06007040300 06 007 040300 99.1 3.1415 3
16 15009030201 15 009 030201 99.1 3.1415 3

View file

@ -0,0 +1,16 @@
Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag
15001021010,15,1,21010,77.4000000000,1.6548000000,2
15001021101,15,1,21101,82.5000000000,3.9086000000,3
15001021402,15,1,21402,80.4000000000,1.0930000000,2
15001021800,15,1,21800,79.5000000000,1.1320000000,2
15003010201,15,3,10201,79.4000000000,1.5261000000,3
15007040603,15,7,40603,86.3000000000,2.2285000000,3
15007040604,15,7,40604,84.9000000000,2.1995000000,3
15007040700,15,7,40700,80.4000000000,0.7571000000,2
15009030100,15,9,30100,77.2000000000,1.8736000000,3
15009030402,15,9,30402,83.5000000000,1.8267000000,3
15009030800,15,9,30800,82.2000000000,1.6251000000,3
6007040500,6,7,40500,99.1000000000,3.1415000000,3
6001020100,6,1,20100,99.1000000000,3.1415000000,3
6007040300,6,7,40300,99.1000000000,3.1415000000,3
15009030201,15,9,30201,99.1000000000,3.1415000000,3
1 Tract ID STATE2KX CNTY2KX TRACT2KX e(0) se(e(0)) Abridged life table flag
2 15001021010 15 1 21010 77.4000000000 1.6548000000 2
3 15001021101 15 1 21101 82.5000000000 3.9086000000 3
4 15001021402 15 1 21402 80.4000000000 1.0930000000 2
5 15001021800 15 1 21800 79.5000000000 1.1320000000 2
6 15003010201 15 3 10201 79.4000000000 1.5261000000 3
7 15007040603 15 7 40603 86.3000000000 2.2285000000 3
8 15007040604 15 7 40604 84.9000000000 2.1995000000 3
9 15007040700 15 7 40700 80.4000000000 0.7571000000 2
10 15009030100 15 9 30100 77.2000000000 1.8736000000 3
11 15009030402 15 9 30402 83.5000000000 1.8267000000 3
12 15009030800 15 9 30800 82.2000000000 1.6251000000 3
13 6007040500 6 7 40500 99.1000000000 3.1415000000 3
14 6001020100 6 1 20100 99.1000000000 3.1415000000 3
15 6007040300 6 7 40300 99.1000000000 3.1415000000 3
16 15009030201 15 9 30201 99.1000000000 3.1415000000 3

View file

@ -0,0 +1,16 @@
GEOID10_TRACT,Life expectancy (years)
15001021010,77.4000000000
15001021101,82.5000000000
15001021402,80.4000000000
15001021800,79.5000000000
15003010201,79.4000000000
15007040603,86.3000000000
15007040604,84.9000000000
15007040700,80.4000000000
15009030100,77.2000000000
15009030402,83.5000000000
15009030800,82.2000000000
06007040500,99.1000000000
06001020100,99.1000000000
06007040300,99.1000000000
15009030201,99.1000000000
1 GEOID10_TRACT Life expectancy (years)
2 15001021010 77.4000000000
3 15001021101 82.5000000000
4 15001021402 80.4000000000
5 15001021800 79.5000000000
6 15003010201 79.4000000000
7 15007040603 86.3000000000
8 15007040604 84.9000000000
9 15007040700 80.4000000000
10 15009030100 77.2000000000
11 15009030402 83.5000000000
12 15009030800 82.2000000000
13 06007040500 99.1000000000
14 06001020100 99.1000000000
15 06007040300 99.1000000000
16 15009030201 99.1000000000

View file

@ -0,0 +1,16 @@
GEOID10_TRACT,STATE2KX,CNTY2KX,TRACT2KX,Life expectancy (years),se(e(0)),Abridged life table flag
15001021010,15,1,21010,77.4000000000,1.6548000000,2
15001021101,15,1,21101,82.5000000000,3.9086000000,3
15001021402,15,1,21402,80.4000000000,1.0930000000,2
15001021800,15,1,21800,79.5000000000,1.1320000000,2
15003010201,15,3,10201,79.4000000000,1.5261000000,3
15007040603,15,7,40603,86.3000000000,2.2285000000,3
15007040604,15,7,40604,84.9000000000,2.1995000000,3
15007040700,15,7,40700,80.4000000000,0.7571000000,2
15009030100,15,9,30100,77.2000000000,1.8736000000,3
15009030402,15,9,30402,83.5000000000,1.8267000000,3
15009030800,15,9,30800,82.2000000000,1.6251000000,3
06007040500,6,7,40500,99.1000000000,3.1415000000,3
06001020100,6,1,20100,99.1000000000,3.1415000000,3
06007040300,6,7,40300,99.1000000000,3.1415000000,3
15009030201,15,9,30201,99.1000000000,3.1415000000,3
1 GEOID10_TRACT STATE2KX CNTY2KX TRACT2KX Life expectancy (years) se(e(0)) Abridged life table flag
2 15001021010 15 1 21010 77.4000000000 1.6548000000 2
3 15001021101 15 1 21101 82.5000000000 3.9086000000 3
4 15001021402 15 1 21402 80.4000000000 1.0930000000 2
5 15001021800 15 1 21800 79.5000000000 1.1320000000 2
6 15003010201 15 3 10201 79.4000000000 1.5261000000 3
7 15007040603 15 7 40603 86.3000000000 2.2285000000 3
8 15007040604 15 7 40604 84.9000000000 2.1995000000 3
9 15007040700 15 7 40700 80.4000000000 0.7571000000 2
10 15009030100 15 9 30100 77.2000000000 1.8736000000 3
11 15009030402 15 9 30402 83.5000000000 1.8267000000 3
12 15009030800 15 9 30800 82.2000000000 1.6251000000 3
13 06007040500 6 7 40500 99.1000000000 3.1415000000 3
14 06001020100 6 1 20100 99.1000000000 3.1415000000 3
15 06007040300 6 7 40300 99.1000000000 3.1415000000 3
16 15009030201 15 9 30201 99.1000000000 3.1415000000 3

View file

@ -0,0 +1,59 @@
# pylint: disable=protected-access
import pathlib
from data_pipeline.etl.sources.cdc_life_expectancy.etl import CDCLifeExpectancy
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class TestCDCLifeExpectency(TestETL):
    """Tests the CDC Life Expectancy ETL.

    This uses pytest-snapshot.
    To update individual snapshots: $ poetry run pytest
    data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py::TestCDCLifeExpectency::<testname>
    --snapshot-update
    """

    # ETL class under test; the test methods below instantiate it via
    # `self._ETL_CLASS()`.
    _ETL_CLASS = CDCLifeExpectancy

    # Sample input lives in a `data` directory next to this test file.
    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
    _SAMPLE_DATA_FILE_NAME = "US_A.CSV"
    # NOTE(review): the "zip" name is the same plain CSV file name —
    # presumably because this source is not distributed as a zip archive;
    # confirm against how the parent TestETL consumes this attribute.
    _SAMPLE_DATA_ZIP_FILE_NAME = "US_A.CSV"
    _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectancy"
    _EXTRACT_CSV_FILE_NAME = "extract.csv"

    def setup_method(self, _method, filename=__file__):
        """Invoke `setup_method` from Parent, but using the current file name.

        This code can be copied identically between all child classes.
        """
        super().setup_method(_method=_method, filename=filename)

    def test_init(self, mock_etl, mock_paths):
        """Tests that the CDCLifeExpectancy class was initialized
        correctly.
        """
        etl = self._ETL_CLASS()
        # Only the data path is checked here; the second element of
        # `mock_paths` is not needed.
        data_path, _ = mock_paths

        assert etl.DATA_PATH == data_path
        assert etl.COLUMNS_TO_KEEP == [
            "GEOID10_TRACT",
            "Life expectancy (years)",
        ]
        assert etl.INPUT_GEOID_TRACT_FIELD_NAME == "Tract ID"
        assert etl.LIFE_EXPECTANCY_FIELD_NAME == "Life expectancy (years)"

    def test_get_output_file_path(self, mock_etl, mock_paths):
        """Tests the right file name is returned."""
        etl = self._ETL_CLASS()
        # The second element of `mock_paths` is unused in this test.
        data_path, _ = mock_paths

        output_file_path = etl._get_output_file_path()
        expected_output_file_path = (
            data_path / "dataset" / "cdc_life_expectancy" / "usa.csv"
        )
        assert output_file_path == expected_output_file_path