diff --git a/data/data-pipeline/data_pipeline/config.py b/data/data-pipeline/data_pipeline/config.py index 23e550a8..5dc336c8 100644 --- a/data/data-pipeline/data_pipeline/config.py +++ b/data/data-pipeline/data_pipeline/config.py @@ -12,6 +12,7 @@ settings = Dynaconf( # set root dir settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent +settings.DATA_PATH = settings.APP_ROOT / "data" settings.REQUESTS_DEFAULT_TIMOUT = 3600 # To set an environment use: # Linux/OSX: export ENV_FOR_DYNACONF=staging diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 211fbc31..65580f9a 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -7,6 +7,9 @@ from typing import Optional import pandas as pd from data_pipeline.config import settings +from data_pipeline.etl.score.etl_utils import ( + compare_to_list_of_expected_state_fips_codes, +) from data_pipeline.etl.score.schemas.datasets import DatasetsConfig from data_pipeline.utils import ( load_yaml_dict_from_file, @@ -43,7 +46,7 @@ class ExtractTransformLoad: APP_ROOT: pathlib.Path = settings.APP_ROOT # Directories - DATA_PATH: pathlib.Path = APP_ROOT / "data" + DATA_PATH: pathlib.Path = settings.DATA_PATH TMP_PATH: pathlib.Path = DATA_PATH / "tmp" CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config" DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config" @@ -82,6 +85,23 @@ class ExtractTransformLoad: # NULL_REPRESENTATION is how nulls are represented on the input field NULL_REPRESENTATION: str = None + # Whether this ETL contains data for the continental nation (DC & the US states + # except for Alaska and Hawaii) + CONTINENTAL_US_EXPECTED_IN_DATA: bool = True + + # Whether this ETL contains data for Alaska and Hawaii + ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = True + + # Whether this ETL contains data for Puerto Rico + PUERTO_RICO_EXPECTED_IN_DATA: bool = True + + # Whether this ETL contains data for the island areas + ISLAND_AREAS_EXPECTED_IN_DATA: bool = False + + # Whether this ETL contains known missing data for any additional + # states/territories + EXPECTED_MISSING_STATES: typing.List[str] = [] + # Thirteen digits in a census block group ID. EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13 # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might @@ -289,6 +309,24 @@ class ExtractTransformLoad: f"`{geo_field}`." ) + # Check whether data contains expected states + states_in_output_df = ( + self.output_df[self.GEOID_TRACT_FIELD_NAME] + .str[0:2] + .unique() + .tolist() + ) + + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=states_in_output_df, + continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA, + alaska_and_hawaii_expected=self.ALASKA_AND_HAWAII_EXPECTED_IN_DATA, + puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, + island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, + additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES, + dataset_name=self.NAME, + ) + def load(self, float_format=None) -> None: """Saves the transformed data. diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index f50eadaa..c112eec0 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -131,6 +131,58 @@ TILES_NATION_THRESHOLD_COUNT = 21 # 60: American Samoa, 66: Guam, 69: N. 
Mariana Islands, 78: US Virgin Islands TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"] TILES_PUERTO_RICO_FIPS_CODE = ["72"] +TILES_ALASKA_AND_HAWAII_FIPS_CODE = ["02", "15"] +TILES_CONTINENTAL_US_FIPS_CODE = [ + "01", + "04", + "05", + "06", + "08", + "09", + "10", + "11", + "12", + "13", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "53", + "54", + "55", + "56", +] # Constant to reflect UI Experience version # "Nation" referring to 50 states and DC is from Census @@ -399,5 +451,5 @@ TILES_SCORE_FLOAT_COLUMNS = [ # that use null to signify missing information in a boolean field. field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME, field_names.AML_BOOLEAN, - field_names.HISTORIC_REDLINING_SCORE_EXCEEDED + field_names.HISTORIC_REDLINING_SCORE_EXCEEDED, ] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index f5222620..5f7b6ecd 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -1,11 +1,19 @@ import os import sys +import typing from pathlib import Path from collections import namedtuple import numpy as np import pandas as pd from data_pipeline.config import settings +from data_pipeline.etl.score.constants import ( + TILES_ISLAND_AREA_FIPS_CODES, + TILES_PUERTO_RICO_FIPS_CODE, + TILES_CONTINENTAL_US_FIPS_CODE, + TILES_ALASKA_AND_HAWAII_FIPS_CODE, +) +from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import ( download_file_from_url, get_module_logger, @@ -305,3 +313,106 @@ def create_codebook( return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename( columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"} ) + + +# pylint: disable=too-many-arguments +def compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes: typing.List[str], + continental_us_expected: bool = True, + alaska_and_hawaii_expected: bool = True, + puerto_rico_expected: bool = True, + island_areas_expected: bool = True, + additional_fips_codes_not_expected: typing.List[str] = None, + dataset_name: str = None, +) -> None: + """Check whether a list of state/territory FIPS codes match expectations. + + Args: + actual_state_fips_codes (List of str): Actual state codes observed in data + continental_us_expected (bool, optional): Do you expect the continental nation + (DC & states except for Alaska and Hawaii) to be represented in data? + alaska_and_hawaii_expected (bool, optional): Do you expect Alaska and Hawaii + to be represented in the data? Note: if only *1* of Alaska and Hawaii are + not expected to be included, do not use this argument -- instead, + use `additional_fips_codes_not_expected` for the 1 state you expected to + be missing. + puerto_rico_expected (bool, optional): Do you expect PR to be represented in data? + island_areas_expected (bool, optional): Do you expect Island Areas to be represented in + data? + additional_fips_codes_not_expected (List of str, optional): Additional state codes + not expected in the data. For example, the data may be known to be missing + data from Maine and Wisconsin. + dataset_name (str, optional): The name of the data set, used only in printing an + error message. (This is helpful for debugging during parallel etl runs.) 
+ + Returns: + None: Does not return any values. + + Raises: + ValueError: if lists do not match expectations. + """ + # Setting default argument of [] here to avoid mutability problems. + if additional_fips_codes_not_expected is None: + additional_fips_codes_not_expected = [] + + # Cast input to a set. + actual_state_fips_codes_set = set(actual_state_fips_codes) + + # Start with the list of all FIPS codes for all states and territories. + expected_states_set = set(get_state_fips_codes(settings.DATA_PATH)) + + # If continental US is not expected to be included, remove it from the + # expected states set. + if not continental_us_expected: + expected_states_set = expected_states_set - set( + TILES_CONTINENTAL_US_FIPS_CODE + ) + + # If both Alaska and Hawaii are not expected to be included, remove them from the + # expected states set. + # Note: if only *1* of Alaska and Hawaii are not expected to be included, + # do not use this argument -- instead, use `additional_fips_codes_not_expected` + # for the 1 state you expected to be missing. + if not alaska_and_hawaii_expected: + expected_states_set = expected_states_set - set( + TILES_ALASKA_AND_HAWAII_FIPS_CODE + ) + + # If Puerto Rico is not expected to be included, remove it from the expected + # states set. + if not puerto_rico_expected: + expected_states_set = expected_states_set - set( + TILES_PUERTO_RICO_FIPS_CODE + ) + + # If island areas are not expected to be included, remove them from the expected + # states set. + if not island_areas_expected: + expected_states_set = expected_states_set - set( + TILES_ISLAND_AREA_FIPS_CODES + ) + + # If additional FIPS codes are not expected to be included, remove them from the + # expected states set. + expected_states_set = expected_states_set - set( + additional_fips_codes_not_expected + ) + + dataset_name_phrase = ( + f" for dataset `{dataset_name}`" if dataset_name is not None else "" + ) + + if expected_states_set != actual_state_fips_codes_set: + raise ValueError( + f"The states and territories in the data{dataset_name_phrase} are not " + f"as expected.\n" + "FIPS state codes expected that are not present in the data:\n" + f"{sorted(list(expected_states_set - actual_state_fips_codes_set))}\n" + "FIPS state codes in the data that were not expected:\n" + f"{sorted(list(actual_state_fips_codes_set - expected_states_set))}\n" + ) + else: + logger.info( + "Data matches expected state and territory representation" + f"{dataset_name_phrase}." 
+ ) diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py index 594f4856..ed33c63e 100644 --- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py @@ -2,7 +2,10 @@ import pandas as pd import numpy as np import pytest -from data_pipeline.etl.score.etl_utils import floor_series +from data_pipeline.etl.score.etl_utils import ( + floor_series, + compare_to_list_of_expected_state_fips_codes, +) def test_floor_series(): @@ -70,3 +73,159 @@ def test_floor_series(): match="Argument series must be of type pandas series, not of type list.", ): floor_series(invalid_type, number_of_decimals=3) + + +def test_compare_to_list_of_expected_state_fips_codes(): + # Has every state/territory/DC code + fips_codes_test_1 = [ + "01", + "02", + "04", + "05", + "06", + "08", + "09", + "10", + "11", + "12", + "13", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "53", + "54", + "55", + "56", + "60", + "66", + "69", + "72", + "78", + ] + + # Should not raise any errors + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_1 + ) + + # Should raise error because Puerto Rico is not expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_1, + puerto_rico_expected=False, + ) + partial_expected_error_message = ( + "FIPS state codes in the data that were not expected:\n['72']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # Should raise error because Island Areas are not expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_1, + island_areas_expected=False, + ) + partial_expected_error_message = ( + "FIPS state codes in the data that were not expected:\n" + "['60', '66', '69', '78']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # List missing PR and Guam + fips_codes_test_2 = [x for x in fips_codes_test_1 if x not in ["66", "72"]] + + # Should raise error because all Island Areas and PR are expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_2, + ) + partial_expected_error_message = ( + "FIPS state codes expected that are not present in the data:\n" + "['66', '72']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # Missing Maine and Wisconsin + fips_codes_test_3 = [x for x in fips_codes_test_1 if x not in ["23", "55"]] + + # Should raise error because Maine and Wisconsin are expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_3, + ) + partial_expected_error_message = ( + "FIPS state codes expected that are not present in the data:\n" + "['23', '55']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # Should not raise error because Maine and Wisconsin are expected to be missing + compare_to_list_of_expected_state_fips_codes( + 
actual_state_fips_codes=fips_codes_test_3, + additional_fips_codes_not_expected=["23", "55"], + ) + + # Missing the continental & AK/HI nation + fips_codes_test_4 = [ + "60", + "66", + "69", + "72", + "78", + ] + + # Should raise error because the nation is expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_4, + ) + + partial_expected_error_message = ( + "FIPS state codes expected that are not present in the data:\n" + "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', " + "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', " + "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', " + "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']" + ) + + assert partial_expected_error_message in str(exception_info.value) + + # Should not raise error because continental US and AK/HI is not to be missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_4, + continental_us_expected=False, + alaska_and_hawaii_expected=False, + ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 2aac7412..d75ca85b 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -1,58 +1,137 @@ +import pathlib from pathlib import Path import pandas as pd -from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel +from data_pipeline.etl.score.etl_utils import ( + compare_to_list_of_expected_state_fips_codes, +) +from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger, download_file_from_url logger = get_module_logger(__name__) class CDCLifeExpectancy(ExtractTransformLoad): + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + PUERTO_RICO_EXPECTED_IN_DATA = False + + USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" + + STATES_MISSING_FROM_USA_FILE = ["23", "55"] + + # For some reason, LEEP does not include Maine or Wisconsin in its "All of + # USA" file. Load these separately. 
+ WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" + MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV" + + TRACT_INPUT_COLUMN_NAME = "Tract ID" + STATE_INPUT_COLUMN_NAME = "STATE2KX" + + raw_df: pd.DataFrame + output_df: pd.DataFrame + def __init__(self): - self.FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "cdc_life_expectancy" ) - self.TRACT_INPUT_COLUMN_NAME = "Tract ID" - self.LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)" - # Constants for output self.COLUMNS_TO_KEEP = [ self.GEOID_TRACT_FIELD_NAME, - self.LIFE_EXPECTANCY_FIELD_NAME, + field_names.LIFE_EXPECTANCY_FIELD, ] - self.raw_df: pd.DataFrame - self.output_df: pd.DataFrame - - def extract(self) -> None: - logger.info("Starting data download.") - - download_file_name = ( - self.get_tmp_path() / "cdc_life_expectancy" / "usa.csv" - ) + def _download_and_prep_data( + self, file_url: str, download_file_name: pathlib.Path + ) -> pd.DataFrame: download_file_from_url( - file_url=self.FILE_URL, + file_url=file_url, download_file_name=download_file_name, verify=True, ) - self.raw_df = pd.read_csv( + df = pd.read_csv( filepath_or_buffer=download_file_name, dtype={ # The following need to remain as strings for all of their digits, not get converted to numbers. self.TRACT_INPUT_COLUMN_NAME: "string", + self.STATE_INPUT_COLUMN_NAME: "string", }, low_memory=False, ) + return df + + def extract(self) -> None: + logger.info("Starting data download.") + + all_usa_raw_df = self._download_and_prep_data( + file_url=self.USA_FILE_URL, + download_file_name=self.get_tmp_path() + / "cdc_life_expectancy" + / "usa.csv", + ) + + # Check which states are missing + states_in_life_expectancy_usa_file = list( + all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique() + ) + + # Expect that PR, Island Areas, and Maine/Wisconsin are missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=states_in_life_expectancy_usa_file, + continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA, + puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, + island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, + additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE, + ) + + logger.info("Downloading data for Maine") + maine_raw_df = self._download_and_prep_data( + file_url=self.MAINE_FILE_URL, + download_file_name=self.get_tmp_path() + / "cdc_life_expectancy" + / "maine.csv", + ) + + logger.info("Downloading data for Wisconsin") + wisconsin_raw_df = self._download_and_prep_data( + file_url=self.WISCONSIN_FILE_URL, + download_file_name=self.get_tmp_path() + / "cdc_life_expectancy" + / "wisconsin.csv", + ) + + combined_df = pd.concat( + objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df], + ignore_index=True, + verify_integrity=True, + axis=0, + ) + + states_in_combined_df = list( + combined_df[self.STATE_INPUT_COLUMN_NAME].unique() + ) + + # Expect that PR and Island Areas are the only things now missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=states_in_combined_df, + continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA, + puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, + island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, + additional_fips_codes_not_expected=[], + ) + + # Save the updated version + self.raw_df = combined_df + def transform(self) 
-> None: - logger.info("Starting DOE energy burden transform.") + logger.info("Starting CDC life expectancy transform.") self.output_df = self.raw_df.rename( columns={ - "e(0)": self.LIFE_EXPECTANCY_FIELD_NAME, + "e(0)": field_names.LIFE_EXPECTANCY_FIELD, self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, } ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py index b3e40e3a..d2b7143c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py @@ -32,6 +32,8 @@ class ChildOpportunityIndex(ExtractTransformLoad): IMPENETRABLE_SURFACES_FIELD: str READING_FIELD: str + PUERTO_RICO_EXPECTED_IN_DATA = False + def __init__(self): self.SOURCE_URL = ( "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-" diff --git a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py index 2a99f76f..9e9d0f3f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py @@ -16,6 +16,7 @@ class TravelCompositeETL(ExtractTransformLoad): NAME = "travel_composite" SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip" GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + PUERTO_RICO_EXPECTED_IN_DATA = False # Output score variables (values set on datasets.yml) for linting purposes TRAVEL_BURDEN_FIELD_NAME: str diff --git a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py index 0c09b711..457890db 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py @@ -20,6 +20,23 @@ class AbandonedMineETL(ExtractTransformLoad): GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT AML_BOOLEAN: str + PUERTO_RICO_EXPECTED_IN_DATA = False + EXPECTED_MISSING_STATES = [ + "10", + "11", + "12", + "15", + "23", + "27", + "31", + "33", + "34", + "36", + "45", + "50", + "55", + ] + # Define these for easy code completion def __init__(self): self.SOURCE_URL = ( diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 2a26370e..b623206c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -16,6 +16,8 @@ class WildfireRiskETL(ExtractTransformLoad): NAME = "fsf_wildfire_risk" SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip" GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + PUERTO_RICO_EXPECTED_IN_DATA = False + ALASKA_AND_HAWAII_EXPECTED_IN_DATA = False # Output score variables (values set on datasets.yml) for linting purposes COUNT_PROPERTIES: str diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index c6a312c0..57681974 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -17,6 +17,7 @@ class NationalRiskIndexETL(ExtractTransformLoad): NAME = "national_risk_index" SOURCE_URL = 
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip" GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + PUERTO_RICO_EXPECTED_IN_DATA = False # Output score variables (values set on datasets.yml) for linting purposes RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME: str diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md deleted file mode 100644 index d8736d54..00000000 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# Nature deprived communities data - -The following dataset was compiled by TPL using NCLD data. We define as: AREA - [CROPLAND] - [IMPERVIOUS SURFACES]. - -## Codebook -- GEOID10 – Census tract ID -- SF – State Name -- CF – County Name -- P200_PFS – Percent of individuals below 200% Federal Poverty Line (from CEJST source data). -- CA_LT20 – Percent higher ed enrollment rate is less than 20% (from CEJST source data). -- TractAcres – Acres of tract calculated from ALAND10 field (area land/meters) in 2010 census tracts. - - CAVEAT: Some census tracts in the CEJST source file extend into open water. ALAND10 area was used to constrain percent calculations (e.g. cropland area) to land only. -- AcresCrops – Acres crops calculated by summing all cells in the NLCD Cropland Data Layer crop classes. -- PctCrops – Formula: AcresCrops/TractAcres*100. -- PctImperv – Mean imperviousness for each census tract. - - CAVEAT: Where tracts extend into open water, mean imperviousness may be underestimated. -- __TO USE__ PctNatural – Formula: 100 – PctCrops – PctImperv. -- PctNat90 – Tract in or below 10th percentile for PctNatural. 1 = True, 0 = False. - - PctNatural 10th percentile = 28.6439% -- ImpOrCrop – If tract >= 90th percentile for PctImperv OR PctCrops. 1 = True, 0 = False. - - PctImperv 90th percentile = 67.4146 % - - PctCrops 90th percentile = 27.8116 % -- LowInAndEd – If tract >= 65th percentile for P200_PFS AND CA_LT20. - - P200_PFS 65th percentile = 64.0% -- NatureDep – ImpOrCrp = 1 AND LowInAndEd = 1. - -We added `GEOID10_TRACT` before converting shapefile to csv. - -## Instructions to recreate - -### Creating Impervious plus Cropland Attributes for Census Tracts - -The Cropland Data Layer and NLCD Impervious layer were too big to put on our OneDrive, but you can download them here: - CDL: https://www.nass.usda.gov/Research_and_Science/Cropland/Release/datasets/2021_30m_cdls.zip - Impervious: https://s3-us-west-2.amazonaws.com/mrlc/nlcd_2019_impervious_l48_20210604.zip - - -#### Crops - -Add an attribute called TractAcres (or similar) to the census tracts to hold a value representing acres covered by the census tract. -Calculate the TractAcres field for each census tract by using the Calculate Geometry tool (set the Property to Area (geodesic), and the Units to Acres). -From the Cropland Data Layer (CDL), extract only the pixels representing crops, using the Extract by Attributes tool in ArcGIS Spatial Analyst toolbox. -a. The attribute table tells you the names of each type of land cover. Since the CDL also contains NLCD classes and empty classes, the actual crop classes must be extracted. -From the crops-only raster extracted from the CDL, run the Reclassify tool to create a binary layer where all crops = 1, and everything else is Null. -Run the Tabulate Area tool: -a. Zone data = census tracts -b. Input raster data = the binary crops layer -c. 
This will produce a table with the square meters of crops in each census tract contained in an attribute called VALUE_1 -Run the Join Field tool to join the table to the census tracts, with the VALUE_1 field as the Transfer Field, to transfer the VALUE_1 field (square meters of crops) to the census tracts. -Add a field to the census tracts called AcresCrops (or similar) to hold the acreage of crops in each census tract. -Calculate the AcresCrops field by multiplying the VALUE_1 field by 0.000247105 to produce acres of crops in each census tracts. -a. You can delete the VALUE_1 field. -Add a field called PctCrops (or similar) to hold the percent of each census tract occupied by crops. -Calculate the PctCrops field by dividing the AcresCrops field by the TractAcres field, and multiply by 100 to get the percent. -Impervious - -Run the Zonal Statistics as Table tool: -a. Zone data = census tracts -b. Input raster data = impervious data raster layer -c. Statistics type = Mean -d. This will produce a table with the percent of each census tract occupied by impervious surfaces, contained in an attribute called MEAN - -Run the Join Field tool to join the table to the census tracts, with the MEAN field as the Transfer Field, to transfer the MEAN field (percent impervious) to the census tracts. - -Add a field called PctImperv (or similar) to hold the percent impervious value. - -Calculate the PctImperv field by setting it equal to the MEAN field. -a. You can delete the MEAN field. -Combine the Crops and Impervious Data - -Open the census tracts attribute table and add a field called PctNatural (or similar). Calculate this field using this equation: 100 – PctCrops – PctImperv . This produces a value that tells you the percent of each census tract covered in natural land cover. - -Define the census tracts that fall in the 90th percentile of non-natural land cover: -a. Add a field called PctNat90 (or similar) -b. Right-click on the PctNatural field, and click Sort Ascending (lowest PctNatural values on top) -c. Select the top 10 percent of rows after the sort -d. Click on Show Selected Records in the attribute table -e. Calculate the PctNat90 field for the selected records = 1 -f. Clear the selection -g. 
The rows that now have a value of 1 for PctNat90 are the most lacking for natural land cover, and can be symbolized accordingly in a map diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py deleted file mode 100644 index 14d49c52..00000000 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py +++ /dev/null @@ -1,77 +0,0 @@ -# pylint: disable=unsubscriptable-object -# pylint: disable=unsupported-assignment-operation - -import pandas as pd -from data_pipeline.config import settings - -from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class NatureDeprivedETL(ExtractTransformLoad): - """ETL class for the Nature Deprived Communities dataset""" - - NAME = "ncld_nature_deprived" - SOURCE_URL = ( - settings.AWS_JUSTICE40_DATASOURCES_URL - + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" - ) - GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT - - # Output score variables (values set on datasets.yml) for linting purposes - ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME: str - TRACT_PERCENT_IMPERVIOUS_FIELD_NAME: str - TRACT_PERCENT_NON_NATURAL_FIELD_NAME: str - TRACT_PERCENT_CROPLAND_FIELD_NAME: str - - def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = ( - self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv" - ) - - # this is the main dataframe - self.df: pd.DataFrame - - # Start dataset-specific vars here - self.PERCENT_NATURAL_FIELD_NAME = "PctNatural" - self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv" - self.PERCENT_CROPLAND_FIELD_NAME = "PctCrops" - self.TRACT_ACRES_FIELD_NAME = "TractAcres" - # In order to ensure that tracts with very small Acreage, we want to create an eligibility criterion - # similar to agrivalue. Here, we are ensuring that a tract has at least 35 acres, or is above the 1st percentile - # for area. 
This does indeed remove tracts from the 90th+ percentile later on - self.TRACT_ACRES_LOWER_BOUND = 35 - - def transform(self) -> None: - """Reads the unzipped data file into memory and applies the following - transformations to prepare it for the load() method: - - - Renames columns as needed - """ - logger.info("Transforming NCLD Data") - - logger.info(self.COLUMNS_TO_KEEP) - - df_ncld: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, - low_memory=False, - ) - - df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = ( - df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND - ) - df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = ( - 1 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME] - ) - - # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_ncld.rename( - columns={ - self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME, - self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME, - } - ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py index e9951da2..651d7f68 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py @@ -19,6 +19,11 @@ class NatureDeprivedETL(ExtractTransformLoad): + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" ) GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + PUERTO_RICO_EXPECTED_IN_DATA = False + ALASKA_AND_HAWAII_EXPECTED_IN_DATA = False + + # Alaska and Hawaii are missing + EXPECTED_MISSING_STATES = ["02", "15"] # Output score variables (values set on datasets.yml) for linting purposes ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME: str diff --git a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py index f35d4749..945f1039 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py @@ -20,6 +20,8 @@ class USArmyFUDS(ExtractTransformLoad): ELIGIBLE_FUDS_BINARY_FIELD_NAME: str GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT + ISLAND_AREAS_EXPECTED_IN_DATA = True + def __init__(self): self.FILE_URL: str = ( "https://opendata.arcgis.com/api/v3/datasets/" diff --git a/data/data-pipeline/data_pipeline/tests/sources/child_opportunity_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/child_opportunity_index/test_etl.py index 7183f911..6fe2ffd3 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/child_opportunity_index/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/child_opportunity_index/test_etl.py @@ -59,7 +59,7 @@ class TestChildOpportunityIndexETL(TestETL): def test_get_output_file_path(self, mock_etl, mock_paths): """Tests the right file name is returned.""" - etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() data_path, tmp_path = mock_paths output_file_path = etl._get_output_file_path() diff --git a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py index efa70d57..bb24ba3e 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/doe_energy_burden/test_etl.py @@ -51,7 +51,7 @@ class 
TestDOEEnergyBurdenETL(TestETL): def test_get_output_file_path(self, mock_etl, mock_paths): """Tests the right file name is returned.""" - etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() data_path, tmp_path = mock_paths output_file_path = etl._get_output_file_path() diff --git a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py index 2f85b55e..b2a5f44b 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py @@ -65,7 +65,7 @@ class TestAbandondedLandMineETL(TestETL): initiliazed correctly. """ # setup - etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() # validation assert etl.GEOID_FIELD_NAME == "GEOID10" assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT" @@ -78,7 +78,7 @@ class TestAbandondedLandMineETL(TestETL): def test_get_output_file_path(self, mock_etl, mock_paths): """Tests the right file name is returned.""" - etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() data_path, tmp_path = mock_paths output_file_path = etl._get_output_file_path() diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py index e4c8305f..8855baad 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py @@ -11,6 +11,10 @@ import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel +from data_pipeline.etl.score.constants import ( + TILES_CONTINENTAL_US_FIPS_CODE, + TILES_ALASKA_AND_HAWAII_FIPS_CODE, +) from data_pipeline.tests.sources.example.etl import ExampleETL from data_pipeline.utils import get_module_logger @@ -86,7 +90,25 @@ class TestETL: self._DATA_DIRECTORY_FOR_TEST = pathlib.Path(filename).parent / "data" def _get_instance_of_etl_class(self) -> Type[ExtractTransformLoad]: - return self._ETL_CLASS() + etl_class = self._ETL_CLASS() + + # Find out what unique state codes are present in the test fixture data. + states_expected_from_fixtures = { + x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS + } + + # Set values to match test fixtures + etl_class.EXPECTED_MISSING_STATES = [ + x + for x in TILES_CONTINENTAL_US_FIPS_CODE + + TILES_ALASKA_AND_HAWAII_FIPS_CODE + if x not in states_expected_from_fixtures + ] + etl_class.PUERTO_RICO_EXPECTED_IN_DATA = False + etl_class.ISLAND_AREAS_EXPECTED_IN_DATA = False + etl_class.ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True + + return etl_class def _setup_etl_instance_and_run_extract( self, mock_etl, mock_paths @@ -119,7 +141,7 @@ class TestETL: requests_mock.get = mock.MagicMock(return_value=response_mock) # Instantiate the ETL class. 
- etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() # Monkey-patch the temporary directory to the one used in the test etl.TMP_PATH = tmp_path diff --git a/data/data-pipeline/data_pipeline/tests/sources/us_army_fuds/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/us_army_fuds/test_etl.py index 5d390943..ce2b63c4 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/us_army_fuds/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/us_army_fuds/test_etl.py @@ -98,7 +98,7 @@ class TestUSArmyFUDSETL(TestETL): - self.OUTPUT_PATH points to the correct path in the temp directory """ # setup - etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() # validation assert etl.GEOID_FIELD_NAME == "GEOID10" assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT" @@ -113,7 +113,7 @@ class TestUSArmyFUDSETL(TestETL): def test_get_output_file_path(self, mock_etl, mock_paths): """Tests the right file name is returned.""" - etl = self._ETL_CLASS() + etl = self._get_instance_of_etl_class() data_path, tmp_path = mock_paths output_file_path = etl._get_output_file_path()
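
Note (illustrative, not part of the patch): a minimal sketch of how the new `compare_to_list_of_expected_state_fips_codes` helper introduced in `etl_utils.py` is meant to be called, assuming the census FIPS data referenced by `settings.DATA_PATH` has already been fetched (the helper reads it via `get_state_fips_codes`). The dataframe and dataset name below are hypothetical.

    import pandas as pd

    from data_pipeline.etl.score.etl_utils import (
        compare_to_list_of_expected_state_fips_codes,
    )

    # Hypothetical output with a tract-ID column; the first two characters of a
    # census tract GEOID are the state FIPS code (here: Alaska "02", Hawaii "15").
    output_df = pd.DataFrame(
        {"GEOID10_TRACT": ["02013000100", "15001021010"]}
    )

    states_in_output_df = (
        output_df["GEOID10_TRACT"].str[0:2].unique().tolist()
    )

    # Raises ValueError if the observed state FIPS codes differ from expectations.
    # Here only Alaska and Hawaii are expected, so the continental US, Puerto Rico,
    # and the island areas are all flagged as not expected.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=states_in_output_df,
        continental_us_expected=False,
        alaska_and_hawaii_expected=True,
        puerto_rico_expected=False,
        island_areas_expected=False,
        dataset_name="example_dataset",  # hypothetical; used only in messages
    )

This mirrors what `ExtractTransformLoad` now does in its validation step: it derives the state codes from the first two characters of `GEOID_TRACT_FIELD_NAME` and passes the class-level expectation flags (e.g. `PUERTO_RICO_EXPECTED_IN_DATA`) straight through to the helper.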