columns to keep and tests
commit d0841706d5 (parent 8dc03b2111)
6 changed files with 44 additions and 29 deletions
.github/workflows/deploy_be_staging.yml (vendored, 2 changes)
@@ -109,7 +109,7 @@ jobs:
          # Deploy to S3 for the staging URL
          message: |
            ** Map Deployed! **
-            Map with Staging Backend: https://screeningtool.geoplatform.gov/en/cejst/?flags=stage_hash=${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}
+            Map with Staging Backend: https://screeningtool.geoplatform.gov/en?flags=stage_hash=${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}
            Find tiles here: https://justice40-data.s3.amazonaws.com/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/tiles
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          repo-token-user-login: "github-actions[bot]"
@@ -71,6 +71,13 @@ class ExtractTransformLoad:
     # COLUMNS_TO_KEEP is used to identify which columns to keep in the output df.
     COLUMNS_TO_KEEP: typing.List[str] = None

+    # INPUT_GEOID_TRACT_FIELD_NAME is the field name that identifies the Census Tract ID
+    # on the input file
+    INPUT_GEOID_TRACT_FIELD_NAME: str = None
+
+    # NULL_REPRESENTATION is how nulls are represented on the input field
+    NULL_REPRESENTATION: str = None
+
     # Thirteen digits in a census block group ID.
     EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@@ -84,10 +91,12 @@ class ExtractTransformLoad:
     # periods. https://github.com/usds/justice40-tool/issues/964
     EXPECTED_MAX_CENSUS_TRACTS: int = 74160

+    # We use output_df as the final dataframe to write to the CSV.
+    # It is used in the "load" base class method.
     output_df: pd.DataFrame = None

     @classmethod
-    def yaml_config_load(cls):
+    def yaml_config_load(cls) -> dict:
         # check if the class instance has score YAML definitions
         datasets_config = load_yaml_dict_from_file(
             cls.DATASET_CONFIG / "datasets.yml",
@@ -108,10 +117,24 @@ class ExtractTransformLoad:
            )
            sys.exit()

-        # set the fields
+        # set some of the basic fields
        cls.LAST_UPDATED_YEAR = dataset_config["last_updated_year"]
        cls.SOURCE_URL = dataset_config["source_url"]
        cls.INPUT_EXTRACTED_FILE_NAME = dataset_config["extracted_file_name"]
+        cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+            "input_geoid_tract_field_name"
+        ]
+        cls.NULL_REPRESENTATION = dataset_config["null_representation"]
+
+        # get the columns to write on the CSV
+        cls.COLUMNS_TO_KEEP = [
+            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+        ]
+        for field in dataset_config["load_fields"]:
+            cls.COLUMNS_TO_KEEP.append(field["long_name"])
+
+        # return the config dict
+        return dataset_config

    # This is a classmethod so it can be used by `get_data_frame` without
    # needing to create an instance of the class. This is a use case in `etl_score`.
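Note on the hunk above: the classmethod now derives COLUMNS_TO_KEEP entirely from the dataset's YAML entry. The following is a minimal, self-contained sketch of that idea; the config dict, field values, and helper function are illustrative stand-ins for datasets.yml and load_yaml_dict_from_file, and GEOID10_TRACT is an assumed name for the tract index column.

from typing import Dict, List

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"  # assumed canonical tract index column

# Hypothetical stand-in for the dict that the YAML loader would return
# for one entry of datasets.yml.
dataset_config: Dict = {
    "last_updated_year": 2020,                          # placeholder value
    "source_url": "https://example.com/nri.zip",        # placeholder URL
    "extracted_file_name": "NRI_Table_CensusTracts.csv",
    "input_geoid_tract_field_name": "TRACTFIPS",
    "null_representation": "None",
    "load_fields": [
        {"short_name": "has_ag_val", "long_name": "Contains agricultural value"},
    ],
}

def build_columns_to_keep(config: Dict) -> List[str]:
    # always index with the geoid tract id, then append each load field's long_name
    columns = [GEOID_TRACT_FIELD_NAME]
    for field in config["load_fields"]:
        columns.append(field["long_name"])
    return columns

print(build_columns_to_keep(dataset_config))
# ['GEOID10_TRACT', 'Contains agricultural value']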
@@ -263,8 +286,7 @@ class ExtractTransformLoad:

        Data is written in the specified local data folder or remote AWS S3 bucket.

-        Uses the directory from `self.OUTPUT_DIR` and the file name from
-        `self._get_output_file_path`.
+        Uses the directory and the file name from `self._get_output_file_path`.
        """
        logger.info(f"Saving `{self.NAME}` CSV")
@@ -40,7 +40,7 @@ datasets:
        excel_download: true
      - short_name: "has_ag_val"
        df_field_name: "CONTAINS_AGRIVALUE"
-        long_name: "Expected building loss rate (Natural Hazards Risk Index)"
+        long_name: "Contains agricultural value"
        field_type: bool
        tile_include: true
        csv_download: true
@@ -26,8 +26,15 @@ class NationalRiskIndexETL(ExtractTransformLoad):

    def __init__(self):
        # load YAML config
-        super().yaml_config_load()
+        self.DATASET_CONFIG = super().yaml_config_load()
+
+        # define the full path for the input CSV file
+        self.INPUT_CSV = self.get_tmp_path() / self.INPUT_EXTRACTED_FILE_NAME
+
+        # this is the main dataframe
+        self.df: pd.DataFrame

+        # Start dataset-specific vars here
        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
            "EAL_SCORE"
        )
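The effect of this change is that a dataset ETL's __init__ delegates the shared, YAML-driven setup to the base class and keeps only dataset-specific variables. Below is a simplified sketch of that pattern, with a stubbed config loader standing in for the real datasets.yml plumbing; the class names and values here are illustrative, not the pipeline's actual API.

from pathlib import Path
import pandas as pd

class BaseETLSketch:
    # sentinel overwritten by the config loader
    INPUT_EXTRACTED_FILE_NAME: str = None

    @classmethod
    def yaml_config_load(cls) -> dict:
        # stand-in for reading datasets.yml with the pipeline's YAML helper
        config = {"extracted_file_name": "NRI_Table_CensusTracts.csv"}
        cls.INPUT_EXTRACTED_FILE_NAME = config["extracted_file_name"]
        return config

    def get_tmp_path(self) -> Path:
        return Path("/tmp")  # placeholder for the pipeline's tmp directory helper

class NationalRiskIndexSketch(BaseETLSketch):
    def __init__(self):
        # the base class hydrates shared fields from config and returns the dict
        self.DATASET_CONFIG = super().yaml_config_load()
        # only dataset-specific setup remains in the subclass
        self.INPUT_CSV = self.get_tmp_path() / self.INPUT_EXTRACTED_FILE_NAME
        self.df: pd.DataFrame  # main dataframe, populated later by transform()

etl = NationalRiskIndexSketch()
print(etl.INPUT_CSV)  # /tmp/NRI_Table_CensusTracts.csv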
@@ -58,18 +65,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        )
        self.CONTAINS_AGRIVALUE = "Contains agricultural value"

-        self.COLUMNS_TO_KEEP = [
-            self.GEOID_TRACT_FIELD_NAME,
-            self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
-            self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
-            self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
-            self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
-            self.CONTAINS_AGRIVALUE,
-        ]
-
-        self.INPUT_CSV = self.get_tmp_path() / self.INPUT_EXTRACTED_FILE_NAME
-        self.df: pd.DataFrame
-
    def extract(self) -> None:
        """Unzips NRI dataset from the FEMA data source and writes the files
        to the temporary data folder for use in the transform() method
@@ -90,19 +85,18 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        """
        logger.info("Transforming National Risk Index Data")

-        NRI_TRACT_COL = "TRACTFIPS"  # Census Tract Column in NRI data
        # read in the unzipped csv from NRI data source then rename the
        # Census Tract column for merging
        df_nri: pd.DataFrame = pd.read_csv(
            self.INPUT_CSV,
-            dtype={NRI_TRACT_COL: "string"},
-            na_values=["None"],
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
+            na_values=[self.NULL_REPRESENTATION],
            low_memory=False,
        )

        df_nri.rename(
            columns={
-                NRI_TRACT_COL: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
            },
            inplace=True,
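With the tract column name and null marker now coming from configuration, the read step no longer hard-codes TRACTFIPS or "None". A minimal sketch of that config-driven read follows, using an in-memory CSV and module-level constants in place of the class attributes; the constant values and sample rows are assumptions for illustration only.

import io
import pandas as pd

INPUT_GEOID_TRACT_FIELD_NAME = "TRACTFIPS"   # would come from datasets.yml
GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"     # assumed canonical tract column name
NULL_REPRESENTATION = "None"                 # would come from datasets.yml

# Tiny in-memory stand-in for the extracted NRI CSV.
csv_text = "TRACTFIPS,EAL_SCORE\n06001400100,None\n06001400200,12.5\n"

df_nri = pd.read_csv(
    io.StringIO(csv_text),
    dtype={INPUT_GEOID_TRACT_FIELD_NAME: "string"},  # keep leading zeros in tract IDs
    na_values=[NULL_REPRESENTATION],                 # treat the configured marker as NaN
    low_memory=False,
)
df_nri.rename(
    columns={INPUT_GEOID_TRACT_FIELD_NAME: GEOID_TRACT_FIELD_NAME},
    inplace=True,
)
print(df_nri)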
@@ -185,6 +179,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        # Note: `round` is smart enough to only apply to float columns.
        df_nri = df_nri.round(10)

+        # Assign the final df to the class' output_df for the load method
        self.output_df = df_nri

    def load(self) -> None:
@@ -52,8 +52,3 @@ def mock_etl(monkeypatch, mock_paths) -> None:
    data_path, tmp_path = mock_paths
    monkeypatch.setattr(ExtractTransformLoad, "DATA_PATH", data_path)
    monkeypatch.setattr(ExtractTransformLoad, "TMP_PATH", tmp_path)
-    monkeypatch.setattr(
-        ExtractTransformLoad,
-        "CONTENT_CONFIG",
-        Path.cwd() / "data_pipeline" / "score" / "config",
-    )
@@ -119,6 +119,7 @@ class TestETL:
        """
        # Setup
        etl = self._get_instance_of_etl_class()
+        etl.__init__()
        data_path, tmp_path = mock_paths

        assert etl.DATA_PATH == data_path
@@ -256,6 +257,7 @@ class TestETL:
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )
+        etl.__init__()
        etl.transform()

        assert etl.output_df is not None
@@ -273,6 +275,7 @@ class TestETL:
        """
        # setup - input variables
        etl = self._get_instance_of_etl_class()
+        etl.__init__()

        # setup - mock transform step
        df_transform = pd.read_csv(