Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-07-28 06:11:16 -07:00
Issue 1075: Add refactored ETL tests to NRI (#1088)
* Adds a substantially refactored ETL test to the National Risk Index, to be used as a model for other tests
Parent: f5fe8d90e2
Commit: 43e005cc10
41 changed files with 1155 additions and 619 deletions
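The commit description above calls the new NRI test "a model for other tests," but the test file itself is not among the hunks shown below. The following is only a minimal sketch of what such a reusable, subclassable ETL test pattern might look like under pytest; the class and attribute names here (ExampleETL, TestETLBase, _ETL_CLASS) are assumptions for illustration, not code taken from this commit.

# Hypothetical sketch of the "model test" pattern: a generic base test class
# holds shared assertions, and each dataset's test subclasses it, overriding
# only its ETL class (and, in practice, its sample fixture files).
# All names here are illustrative, not from this PR.
import pandas as pd


class ExampleETL:
    """Stand-in for an ExtractTransformLoad subclass; swap in a real ETL."""

    NAME = "example_dataset"
    GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
    COLUMNS_TO_KEEP = ["GEOID10_TRACT", "Example score"]

    def transform(self) -> None:
        # A real ETL would read and reshape source data here.
        self.output_df = pd.DataFrame(
            {"GEOID10_TRACT": ["01001020100"], "Example score": [10.5]}
        )


class TestETLBase:
    """Shared checks every dataset-specific test inherits."""

    _ETL_CLASS = ExampleETL

    def test_columns_to_keep_include_tract_id(self):
        etl = self._ETL_CLASS()
        assert etl.GEOID_TRACT_FIELD_NAME in etl.COLUMNS_TO_KEEP

    def test_transform_sets_output_df(self):
        etl = self._ETL_CLASS()
        etl.transform()
        # The refactored flow loads from output_df, so it must be populated
        # and must contain every column the ETL promises to keep.
        assert not etl.output_df.empty
        assert set(etl.COLUMNS_TO_KEEP).issubset(set(etl.output_df.columns))


class TestExampleETL(TestETLBase):
    # A new dataset's test only needs to point at its own ETL class.
    _ETL_CLASS = ExampleETL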
@@ -55,11 +55,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
            }
        )

    def validate(self) -> None:
        logger.info("Validating CDC Life Expectancy Data")

        pass

    def load(self) -> None:
        logger.info("Saving CDC Life Expectancy CSV")

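Across the hunks in this commit, the per-dataset validate() stubs (a logger.info call followed by pass) are removed, presumably because validation is being centralized in the shared ExtractTransformLoad base class. The function below is only a rough sketch of what a shared tract-level validation step could check; none of it is quoted from the base class, and the field name default is an assumption.

# Illustrative only: one possible shape for a shared validation step, so that
# individual datasets no longer need their own empty validate() stubs.
import pandas as pd


def validate_tract_output(
    output_df: pd.DataFrame, geoid_tract_field: str = "GEOID10_TRACT"
) -> None:
    """Basic sanity checks for a census-tract-level ETL output."""
    if output_df is None or output_df.empty:
        raise ValueError("ETL produced no output rows")

    if geoid_tract_field not in output_df.columns:
        raise ValueError(f"Missing tract ID column: {geoid_tract_field}")

    # Census tract GEOIDs are 11-character strings (state + county + tract).
    bad_lengths = output_df[geoid_tract_field].astype(str).str.len() != 11
    if bad_lengths.any():
        raise ValueError("Found tract GEOIDs that are not 11 characters long")

    if output_df[geoid_tract_field].duplicated().any():
        raise ValueError("Found duplicate tract GEOIDs in output")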
@@ -74,8 +74,3 @@ class CDCPlacesETL(ExtractTransformLoad):
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating Census ACS Data")

        pass

@@ -377,8 +377,3 @@ class CensusACSETL(ExtractTransformLoad):
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating Census ACS Data")

        pass

@@ -190,8 +190,3 @@ class CensusACS2010ETL(ExtractTransformLoad):
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating Census ACS Data")

        pass

@@ -316,11 +316,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):

        self.output_df = merged_with_state_income_df

    def validate(self) -> None:
        logger.info("Validating Census ACS Median Income Data")

        pass

    def load(self) -> None:
        logger.info("Saving Census ACS Median Income CSV")

@@ -405,8 +405,3 @@ class CensusDecennialETL(ExtractTransformLoad):
        self.df_all[columns_to_include].to_csv(
            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
        )

    def validate(self) -> None:
        logger.info("Validating Census Decennial Data")

        pass

@@ -106,11 +106,6 @@ class ChildOpportunityIndex(ExtractTransformLoad):

        self.output_df = output_df

    def validate(self) -> None:
        logger.info("Validating data.")

        pass

    def load(self) -> None:
        logger.info("Saving CSV")

@@ -72,11 +72,6 @@ class DOEEnergyBurden(ExtractTransformLoad):

        self.output_df = output_df

    def validate(self) -> None:
        logger.info("Validating DOE Energy Burden Data")

        pass

    def load(self) -> None:
        logger.info("Saving DOE Energy Burden CSV")

@@ -103,11 +103,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
            "bool"
        )

    def validate(self) -> None:
        logger.info("Validating data")

        pass

    def load(self) -> None:
        logger.info("Saving CSV")

@@ -144,11 +144,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
                f"GEOID Tract must be length of {expected_census_tract_field_length}"
            )

    def validate(self) -> None:
        logger.info("Validating data.")

        pass

    def load(self) -> None:
        logger.info("Saving CSV")

@@ -69,8 +69,3 @@ class GeoCorrETL(ExtractTransformLoad):
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating GeoCorr Urban Rural Map Data")

        pass

@@ -5,7 +5,7 @@

import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)

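The NRI module now imports ValidGeoLevel alongside ExtractTransformLoad and sets GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT in the next hunk. The enum's definition is not part of this diff; a plausible shape, offered only as an assumption about data_pipeline.etl.base, would be:

# Assumed shape of ValidGeoLevel, inferred from its usage below; the actual
# members and values in data_pipeline.etl.base may differ.
import enum


class ValidGeoLevel(enum.Enum):
    """Geographic levels an ETL's output can be keyed on."""

    CENSUS_TRACT = "census_tract"
    CENSUS_BLOCK_GROUP = "census_block_group"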
@@ -14,15 +14,14 @@ logger = get_module_logger(__name__)
class NationalRiskIndexETL(ExtractTransformLoad):
    """ETL class for the FEMA National Risk Index dataset"""

    NAME = "national_risk_index"
    LAST_UPDATED_YEAR = 2020
    SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

    def __init__(self):
        self.NRI_FTP_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
        self.INPUT_CSV = self.TMP_PATH / "NRI_Table_CensusTracts.csv"
        self.OUTPUT_DIR = (
            self.DATA_PATH / "dataset" / "national_risk_index_2020"
        )
        self.BLOCK_GROUP_CSV = (
            self.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
        )

        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
            "EAL_SCORE"
        )

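This hunk moves dataset metadata out of __init__ and into class-level attributes (NAME, LAST_UPDATED_YEAR, SOURCE_URL, GEO_LEVEL). One way a base class could use those attributes to derive shared paths is sketched below; the property name and directory layout are assumptions, not the repository's actual base-class code.

# Illustrative: how class-level metadata could drive shared behavior in a
# base class. The property name and directory layout are assumptions.
from pathlib import Path


class ExtractTransformLoadSketch:
    DATA_PATH = Path("data_pipeline") / "data"

    # Subclasses override these class attributes instead of repeating
    # path-building logic in __init__.
    NAME: str
    LAST_UPDATED_YEAR: int
    SOURCE_URL: str

    @property
    def output_dir(self) -> Path:
        # e.g. data_pipeline/data/dataset/national_risk_index_2020
        return self.DATA_PATH / "dataset" / f"{self.NAME}_{self.LAST_UPDATED_YEAR}"


class NationalRiskIndexSketch(ExtractTransformLoadSketch):
    NAME = "national_risk_index"
    LAST_UPDATED_YEAR = 2020
    SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"


if __name__ == "__main__":
    # Prints: data_pipeline/data/dataset/national_risk_index_2020
    print(NationalRiskIndexSketch().output_dir)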
@@ -52,7 +51,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
            "Expected population loss rate (Natural Hazards Risk Index)"
        )

        # Note: also need to edit transform step to add fields to output.
        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,

@@ -69,8 +67,8 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        """
        logger.info("Downloading 405MB National Risk Index Data")
        super().extract(
            self.NRI_FTP_URL,
            self.TMP_PATH,
            source_url=self.SOURCE_URL,
            extract_path=self.TMP_PATH,
        )

    def transform(self) -> None:

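In this hunk, extract() switches from passing the old positional FTP URL and temp path to passing source_url and extract_path as keyword arguments to the base class. The real ExtractTransformLoad.extract is not shown in this diff; the sketch below only illustrates the download-and-unzip idea behind that signature, using standard-library calls.

# Illustrative base-class extract with the keyword signature used above.
# The actual implementation may differ; this just downloads a zip from
# source_url and unpacks it into extract_path.
import io
import urllib.request
import zipfile
from pathlib import Path


def extract(source_url: str, extract_path: Path) -> None:
    extract_path.mkdir(parents=True, exist_ok=True)
    with urllib.request.urlopen(source_url) as response:
        archive_bytes = response.read()
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
        archive.extractall(extract_path)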
@@ -164,14 +162,12 @@ class NationalRiskIndexETL(ExtractTransformLoad):
            / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
        )

        self.df = df_nri
        # Round all float columns to just 10 digits.
        # Note: `round` is smart enough to only apply to float columns.
        df_nri = df_nri.round(10)

        self.output_df = df_nri

    def load(self) -> None:
        """Writes the NRI data as a csv to the directory at self.OUTPUT_DIR"""
        logger.info("Saving National Risk Index CSV")

        # write nationwide csv
        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        self.df[self.COLUMNS_TO_KEEP].to_csv(
            self.OUTPUT_DIR / "usa.csv", index=False, float_format="%.10f"
        )
        # Suppress scientific notation.
        super().load(float_format="%.10f")

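Here transform() rounds float columns to 10 digits and hands the result to the shared machinery via self.output_df, while load() delegates to super().load(float_format="%.10f") so the CSV avoids scientific notation. The generic load sketched below shows how output_df, COLUMNS_TO_KEEP, and float_format could fit together; the output path convention is an assumption, not the base class's actual code.

# Illustrative generic load() driven by output_df and COLUMNS_TO_KEEP.
# The actual base-class implementation and output layout may differ.
from pathlib import Path
from typing import List, Optional

import pandas as pd


def load(
    output_df: pd.DataFrame,
    columns_to_keep: List[str],
    output_dir: Path,
    float_format: Optional[str] = None,
) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    # float_format="%.10f" writes 0.0000000001 instead of 1e-10.
    output_df[columns_to_keep].to_csv(
        output_dir / "usa.csv", index=False, float_format=float_format
    )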
@@ -167,8 +167,3 @@ class PersistentPovertyETL(ExtractTransformLoad):
        self.df[self.COLUMNS_TO_KEEP].to_csv(
            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
        )

    def validate(self) -> None:
        logger.info("Validating persistent poverty data.")

        pass