Issue 1075: Add refactored ETL tests to NRI (#1088)

* Adds a substantially refactored ETL test to the National Risk Index, to be used as a model for other tests
This commit is contained in:
Lucas Merrill Brown 2022-02-08 19:05:32 -05:00 committed by GitHub
commit 43e005cc10
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
41 changed files with 1155 additions and 619 deletions

View file

@ -55,11 +55,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
}
)
def validate(self) -> None:
    """Validate transformed CDC Life Expectancy data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating CDC Life Expectancy Data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).
def load(self) -> None:
logger.info("Saving CDC Life Expectancy CSV")

View file

@ -74,8 +74,3 @@ class CDCPlacesETL(ExtractTransformLoad):
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
    """Validate transformed CDC Places data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    # Fixed copy-paste bug: this method lives on CDCPlacesETL but logged
    # "Validating Census ACS Data". Also dropped the redundant trailing
    # `pass` (no-op after a statement, ruff PIE790).
    logger.info("Validating CDC Places Data")

View file

@ -377,8 +377,3 @@ class CensusACSETL(ExtractTransformLoad):
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
    """Validate transformed Census ACS data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating Census ACS Data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).

View file

@ -190,8 +190,3 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
    """Validate transformed Census ACS 2010 data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    # Message now names the 2010 vintage so this class's log output is
    # distinguishable from CensusACSETL's identical "Validating Census
    # ACS Data" line. Also dropped the redundant trailing `pass`
    # (ruff PIE790).
    logger.info("Validating Census ACS 2010 Data")

View file

@ -316,11 +316,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.output_df = merged_with_state_income_df
def validate(self) -> None:
    """Validate transformed Census ACS Median Income data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating Census ACS Median Income Data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).
def load(self) -> None:
logger.info("Saving Census ACS Median Income CSV")

View file

@ -405,8 +405,3 @@ class CensusDecennialETL(ExtractTransformLoad):
self.df_all[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
def validate(self) -> None:
    """Validate transformed Census Decennial data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating Census Decennial Data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).

View file

@ -106,11 +106,6 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.output_df = output_df
def validate(self) -> None:
    """Validate transformed Child Opportunity Index data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating data.")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).
def load(self) -> None:
logger.info("Saving CSV")

View file

@ -72,11 +72,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.output_df = output_df
def validate(self) -> None:
    """Validate transformed DOE Energy Burden data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating DOE Energy Burden Data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).
def load(self) -> None:
logger.info("Saving DOE Energy Burden CSV")

View file

@ -103,11 +103,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
"bool"
)
def validate(self) -> None:
    """Validate transformed alternative-definition energy data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).
def load(self) -> None:
logger.info("Saving CSV")

View file

@ -144,11 +144,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
f"GEOID Tract must be length of {expected_census_tract_field_length}"
)
def validate(self) -> None:
    """Validate transformed EPA Risk-Screening Environmental Indicators data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating data.")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).
def load(self) -> None:
logger.info("Saving CSV")

View file

@ -69,8 +69,3 @@ class GeoCorrETL(ExtractTransformLoad):
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
def validate(self) -> None:
    """Validate transformed GeoCorr urban/rural map data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating GeoCorr Urban Rural Map Data")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).

View file

@ -5,7 +5,7 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -14,15 +14,14 @@ logger = get_module_logger(__name__)
class NationalRiskIndexETL(ExtractTransformLoad):
"""ETL class for the FEMA National Risk Index dataset"""
NAME = "national_risk_index"
LAST_UPDATED_YEAR = 2020
SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.NRI_FTP_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
self.INPUT_CSV = self.TMP_PATH / "NRI_Table_CensusTracts.csv"
self.OUTPUT_DIR = (
self.DATA_PATH / "dataset" / "national_risk_index_2020"
)
self.BLOCK_GROUP_CSV = (
self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
)
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
"EAL_SCORE"
)
@ -52,7 +51,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
"Expected population loss rate (Natural Hazards Risk Index)"
)
# Note: also need to edit transform step to add fields to output.
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@ -69,8 +67,8 @@ class NationalRiskIndexETL(ExtractTransformLoad):
"""
logger.info("Downloading 405MB National Risk Index Data")
super().extract(
self.NRI_FTP_URL,
self.TMP_PATH,
source_url=self.SOURCE_URL,
extract_path=self.TMP_PATH,
)
def transform(self) -> None:
@ -164,14 +162,12 @@ class NationalRiskIndexETL(ExtractTransformLoad):
/ df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
)
self.df = df_nri
# Round all float columns to just 10 digits.
# Note: `round` is smart enough to only apply to float columns.
df_nri = df_nri.round(10)
self.output_df = df_nri
def load(self) -> None:
"""Writes the NRI data as a csv to the directory at self.OUTPUT_DIR"""
logger.info("Saving National Risk Index CSV")
# write nationwide csv
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
self.OUTPUT_DIR / "usa.csv", index=False, float_format="%.10f"
)
# Suppress scientific notation.
super().load(float_format="%.10f")

View file

@ -167,8 +167,3 @@ class PersistentPovertyETL(ExtractTransformLoad):
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
def validate(self) -> None:
    """Validate transformed persistent poverty data.

    Currently a stub: it only logs that validation ran and performs
    no actual checks.
    """
    logger.info("Validating persistent poverty data.")
    # Removed redundant `pass`: a trailing `pass` after a real statement
    # is a no-op (ruff PIE790).