Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-21 09:11:26 -08:00)
NRI dataset and initial score YAML configuration (#1534)
* update be staging gha
* NRI dataset and initial score YAML configuration
* checkpoint
* adding data checks for release branch
* passing tests
* adding INPUT_EXTRACTED_FILE_NAME to base class
* lint
* columns to keep and tests
* update be staging gha
* checkpoint
* update be staging gha
* NRI dataset and initial score YAML configuration
* checkpoint
* adding data checks for release branch
* passing tests
* adding INPUT_EXTRACTED_FILE_NAME to base class
* lint
* columns to keep and tests
* checkpoint
* PR Review
* removing source url
* tests
* stop execution of ETL if there's a YAML schema issue
* update be staging gha
* adding source url as class var again
* clean up
* force cache bust
* gha cache bust
* dynamically set score vars from YAML
* docstrings
* removing last updated year - optional reverse percentile
* passing tests
* sort order
* column ordering
* PR review
* class level vars
* Updating DatasetsConfig
* fix pylint errors
* moving metadata hint back to code

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
parent 1833e3e794
commit 1c448a77f9

15 changed files with 272 additions and 3485 deletions
.github/workflows/data-checks.yml (vendored): 6 changed lines
@@ -2,7 +2,9 @@
 name: Data Checks
 on:
   pull_request:
-    branches: [main] # runs on any PR against main
+    branches:
+      - main
+      - "**/release/**"
     paths:
       - "data/**"
 jobs:
@@ -16,7 +18,7 @@ jobs:
         # checks all of the versions allowed in pyproject.toml
         python-version: [3.8, 3.9]
     steps:
-      # installs python
+      # installs Python
       # one execution of the tests per version listed above
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
.github/workflows/deploy_be_staging.yml (vendored): 6 changed lines
@@ -1,7 +1,9 @@
 name: Deploy Backend Staging
 on:
   pull_request:
-    branches: [main]
+    branches:
+      - main
+      - "**/release/**"
     paths:
      - "data/**"
 env:
@@ -60,7 +62,7 @@ jobs:
      - name: Update PR with deployed Score URLs
        uses: mshick/add-pr-comment@v1
        with:
-         # Deploy to S3 for the staging URL
+         # Deploy to S3 for the Staging URL
          message: |
            ** Score Deployed! **
            Find it here:
@@ -1,12 +1,15 @@
 import enum
 import pathlib
+import sys
 import typing
 from typing import Optional

 import pandas as pd

 from data_pipeline.config import settings
+from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import (
+    load_yaml_dict_from_file,
     unzip_file_from_url,
     remove_all_from_dir,
     get_module_logger,
@@ -30,6 +33,9 @@ class ExtractTransformLoad:
     Attributes:
         DATA_PATH (pathlib.Path): Local path where all data will be stored
         TMP_PATH (pathlib.Path): Local path where temporary data will be stored
+
+    TODO: Fill missing attrs here
+
         GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
         GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
     """
@@ -40,6 +46,7 @@ class ExtractTransformLoad:
     DATA_PATH: pathlib.Path = APP_ROOT / "data"
     TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
     CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
+    DATASET_CONFIG: pathlib.Path = APP_ROOT / "etl" / "score" / "config"

     # Parameters
     GEOID_FIELD_NAME: str = "GEOID10"
@@ -55,6 +62,9 @@ class ExtractTransformLoad:
     # SOURCE_URL is used to extract source data in extract().
     SOURCE_URL: str = None

+    # INPUT_EXTRACTED_FILE_NAME is the name of the file after extract().
+    INPUT_EXTRACTED_FILE_NAME: str = None
+
     # GEO_LEVEL is used to identify whether output data is at the unit of the tract or
     # census block group.
     # TODO: add tests that enforce seeing the expected geographic identifier field
@@ -64,6 +74,13 @@ class ExtractTransformLoad:
     # COLUMNS_TO_KEEP is used to identify which columns to keep in the output df.
     COLUMNS_TO_KEEP: typing.List[str] = None

+    # INPUT_GEOID_TRACT_FIELD_NAME is the field name that identifies the Census Tract ID
+    # on the input file
+    INPUT_GEOID_TRACT_FIELD_NAME: str = None
+
+    # NULL_REPRESENTATION is how nulls are represented on the input field
+    NULL_REPRESENTATION: str = None
+
     # Thirteen digits in a census block group ID.
     EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@@ -77,8 +94,53 @@ class ExtractTransformLoad:
     # periods. https://github.com/usds/justice40-tool/issues/964
     EXPECTED_MAX_CENSUS_TRACTS: int = 74160

+    # We use output_df as the final dataframe to use to write to the CSV
+    # It is used on the "load" base class method
     output_df: pd.DataFrame = None

+    @classmethod
+    def yaml_config_load(cls) -> dict:
+        """Generate config dictionary and set instance variables from YAML dataset."""
+
+        # check if the class instance has score YAML definitions
+        datasets_config = load_yaml_dict_from_file(
+            cls.DATASET_CONFIG / "datasets.yml",
+            DatasetsConfig,
+        )
+
+        # get the config for this dataset
+        try:
+            dataset_config = next(
+                item
+                for item in datasets_config.get("datasets")
+                if item["module_name"] == cls.NAME
+            )
+        except StopIteration:
+            # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
+            logger.error(
+                f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
+            )
+            sys.exit()
+
+        # set some of the basic fields
+        cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+            "input_geoid_tract_field_name"
+        ]
+
+        # get the columns to write on the CSV
+        # and set the constants
+        cls.COLUMNS_TO_KEEP = [
+            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+        ]
+        for field in dataset_config["load_fields"]:
+            cls.COLUMNS_TO_KEEP.append(field["long_name"])

+            # set the constants for the class
+            setattr(cls, field["df_field_name"], field["long_name"])
+
+        # return the config dict
+        return dataset_config
+
     # This is a classmethod so it can be used by `get_data_frame` without
     # needing to create an instance of the class. This is a use case in `etl_score`.
     @classmethod
@@ -87,16 +149,10 @@ class ExtractTransformLoad:
         if cls.NAME is None:
             raise NotImplementedError(
                 f"Child ETL class needs to specify `cls.NAME` (currently "
-                f"{cls.NAME}) and `cls.LAST_UPDATED_YEAR` (currently "
-                f"{cls.LAST_UPDATED_YEAR})."
+                f"{cls.NAME})."
             )

-        output_file_path = (
-            cls.DATA_PATH
-            / "dataset"
-            / f"{cls.NAME}_{cls.LAST_UPDATED_YEAR}"
-            / "usa.csv"
-        )
+        output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
         return output_file_path

     def get_tmp_path(self) -> pathlib.Path:
@@ -229,8 +285,7 @@ class ExtractTransformLoad:

         Data is written in the specified local data folder or remote AWS S3 bucket.

-        Uses the directory from `self.OUTPUT_DIR` and the file name from
-        `self._get_output_file_path`.
+        Uses the directory and the file name from `self._get_output_file_path`.
         """
         logger.info(f"Saving `{self.NAME}` CSV")
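To make the new mechanism concrete, here is a minimal, self-contained sketch (not code from this commit; DemoETL and the abridged config dict are hypothetical stand-ins) of what yaml_config_load does with each load_fields entry: the YAML df_field_name becomes a class constant whose value is the human-readable long_name, and that long_name is appended to COLUMNS_TO_KEEP.

# Hypothetical, abridged stand-ins for a datasets.yml entry and an ETL class.
dataset_config = {
    "module_name": "national_risk_index",
    "input_geoid_tract_field_name": "TRACTFIPS",
    "load_fields": [
        {
            "short_name": "ex_loss",
            "df_field_name": "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME",
            "long_name": "FEMA Risk Index Expected Annual Loss Score",
        },
    ],
}


class DemoETL:
    NAME = "national_risk_index"
    GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"


# Mirrors the loop at the end of yaml_config_load(): keep the tract ID column,
# then register each YAML field as both an output column and a class constant.
DemoETL.COLUMNS_TO_KEEP = [DemoETL.GEOID_TRACT_FIELD_NAME]
for load_field in dataset_config["load_fields"]:
    DemoETL.COLUMNS_TO_KEEP.append(load_field["long_name"])
    setattr(DemoETL, load_field["df_field_name"], load_field["long_name"])

# Downstream code can now reference the output column through the constant:
assert (
    DemoETL.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME
    == "FEMA Risk Index Expected Annual Loss Score"
)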
@@ -0,0 +1,79 @@ (new file)
---
datasets:
  - long_name: "FEMA National Risk Index"
    short_name: "nri"
    module_name: national_risk_index
    input_geoid_tract_field_name: "TRACTFIPS"
    load_fields:
      - short_name: "ex_loss"
        df_field_name: "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME"
        long_name: "FEMA Risk Index Expected Annual Loss Score"
        field_type: float
        number_of_decimals_in_output: 6

      - short_name: "ex_pop_loss"
        df_field_name: "EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME"
        long_name: "Expected population loss rate (Natural Hazards Risk Index)"
        description_short:
          "Rate of fatalities and injuries resulting from natural hazards each year"
        description_long:
          "Rate relative to the population of fatalities and injuries due to fourteen
          types of natural hazards each year that have some link to climate change:
          avalanche, coastal flooding, cold wave, drought, hail, heat wave, hurricane,
          ice storm, landslide, riverine flooding, strong wind, tornado, wildfire, and
          winter weather. Population loss is defined as the Spatial Hazard Events and
          Losses and National Centers for Environmental Information’s (NCEI) reported
          number of fatalities and injuries caused by the hazard occurrence. To combine
          fatalities and injuries for the computation of population loss value, an
          injury is counted as one-tenth (1/10) of a fatality. The NCEI Storm Events
          Database classifies injuries and fatalities as direct or indirect. Both direct
          and indirect injuries and fatalities are counted as population loss. This
          total number of injuries and fatalities is then divided by the population in
          the census tract to get a per-capita rate of population risk."
        field_type: float
        number_of_decimals_in_output: 6
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true

      - short_name: "ex_ag_loss"
        df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME"
        long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)"
        description_short:
          "Economic loss rate to agricultural value resulting from natural hazards each
          year"
        description_long:
          "Percent of agricultural value at risk from losses due to fourteen types of
          natural hazards that have some link to climate change: avalanche, coastal
          flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
          landslide, riverine flooding, strong wind, tornado, wildfire, and winter
          weather. Rate calculated by dividing the agricultural value at risk in a
          census tract by the total agricultural value in that census tract."
        field_type: float
        number_of_decimals_in_output: 6
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true

      - short_name: "ex_bldg_loss"
        df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME"
        long_name: "Expected building loss rate (Natural Hazards Risk Index)"
        description_short:
          "Economic loss rate to building value resulting from natural hazards each year"
        description_long:
          "Percent of building value at risk from losses due to fourteen types of
          natural hazards that have some link to climate change: avalanche, coastal
          flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
          landslide, riverine flooding, strong wind, tornado, wildfire, and winter
          weather. Rate calculated by dividing the building value at risk in a census
          tract by the total building value in that census tract."
        field_type: float
        number_of_decimals_in_output: 6
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true

      - short_name: "has_ag_val"
        df_field_name: "CONTAINS_AGRIVALUE"
        long_name: "Contains agricultural value"
        field_type: bool
@@ -480,6 +480,7 @@ class ScoreETL(ExtractTransformLoad):
             # for instance, 3rd grade reading level : Low 3rd grade reading level.
             # This low field will not exist yet, it is only calculated for the
             # percentile.
+            # TODO: This will come from the YAML dataset config
             ReversePercentile(
                 field_name=field_names.READING_FIELD,
                 low_field_name=field_names.LOW_READING_FIELD,
@@ -0,0 +1,83 @@ (new file)
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional


class FieldType(Enum):
    STRING = "string"
    INT64 = "int64"
    BOOL = "bool"
    FLOAT = "float"
    PERCENTAGE = "percentage"


@dataclass
class DatasetsConfig:
    @dataclass
    class Dataset:
        """A class that defines a dataset and its load variables.

        Attributes:
            long_name (str): A human readable title for the dataset.
            short_name (str): Used to compose the short variable names for tiles/arcgis. All short variable
                names will be prepended with the short name of the data set it comes from, i.e. `nri__ex_loss`.
            module_name (str): A string that matches both the Python module name for the dataset and the
                `NAME` property on the ETL class.
            load_fields (LoadField): A list of type LoadField that will drive the score ETL and side effects
                (tiles, downloadables).
        """

        @dataclass
        class LoadField:
            """A class to define the fields to be saved on the dataset's output.

            These fields will be then imported by the score generation ETL.

            Attributes:
                short_name (str): Used in conjunction with the dataset's `short_name` for files where short names are needed.
                df_field_name (str): Name for the field in the etl class.
                long_name (str): Column name for the dataset's output csv.
                field_type (FieldType): An enum that dictates what type of field this is.
                description_short (Optional str): Description used if the field appears in the side panel.
                description_long (Optional str): Description used if the field appears in the Methodology page.
                number_of_decimals_in_output (Optional int): Used to represent number of decimals in side effects,
                    like Excel. Defaults to 2 decimals.
                include_in_tiles (Optional bool): Include this field on the tile export. Defaults to False.
                include_in_downloadable_files (Optional bool): Include this field on the CSV and Excel exports.
                    Defaults to False.
                create_percentile (Optional bool): Whether or not the backend processing should create a percentile
                    field (ranked in ascending order) from the values in this field. Defaults to False.
                create_reverse_percentile (Optional bool): Whether or not the backend processing should create a
                    "reverse percentile" field (ranked in descending order) from the values in this field.
                    Defaults to False.
                include_in_comparison_tool_as_index (Optional bool): Whether or not to include this field in the
                    comparison tool as an index used as comparison (e.g., this field might be a state or national
                    index that identifies priority communities). The field itself must be a boolean for the
                    comparison tool to work appropriately. Defaults to False.
                include_in_comparison_tool_as_statistical_descriptor (Optional bool): Whether or not to include
                    this field in the comparison tool as a statistical descriptor of census tracts (e.g., this
                    field might be income levels, life expectancy, etc.). This will be used to generate reports
                    that produce information such as: tracts identified by Index A but not Index B have higher
                    income levels but lower life expectancy. Defaults to False.
            """

            short_name: str
            df_field_name: str
            long_name: str
            field_type: FieldType = field(
                metadata={"by_value": True}
            )  # This will be used on the `etl_score_post` for the data manipulation. The `by_value` metadata
            # prop will load the field type's Enum value instead of the index, i.e. "string" and not STRING
            description_short: Optional[str] = None
            description_long: Optional[str] = None
            number_of_decimals_in_output: Optional[int] = 2
            include_in_tiles: Optional[bool] = False
            include_in_downloadable_files: Optional[bool] = False
            create_percentile: Optional[bool] = False
            create_reverse_percentile: Optional[bool] = False
            include_in_comparison_tool_as_index: Optional[bool] = False
            include_in_comparison_tool_as_statistical_descriptor: Optional[
                bool
            ] = False

        long_name: str
        short_name: str
        module_name: str
        input_geoid_tract_field_name: str
        load_fields: List[LoadField]

    datasets: List[Dataset]
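To see how this schema gates a run, here is a sketch, assuming the marshmallow_dataclass package (which load_yaml_dict_from_file uses via class_schema); the bad_config dict is invented for illustration. An invalid field_type value fails validation, which the hardened loader later in this commit turns into a logged error and an exit.

from marshmallow import ValidationError
from marshmallow_dataclass import class_schema

from data_pipeline.etl.score.schemas.datasets import DatasetsConfig

schema = class_schema(DatasetsConfig)()

# Invented config with a deliberate typo in field_type ("floatt" is not a
# valid FieldType value), so schema validation should fail.
bad_config = {
    "datasets": [
        {
            "long_name": "FEMA National Risk Index",
            "short_name": "nri",
            "module_name": "national_risk_index",
            "input_geoid_tract_field_name": "TRACTFIPS",
            "load_fields": [
                {
                    "short_name": "ex_loss",
                    "df_field_name": "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME",
                    "long_name": "FEMA Risk Index Expected Annual Loss Score",
                    "field_type": "floatt",
                }
            ],
        }
    ]
}

try:
    schema.load(bad_config)
except ValidationError as e:
    # This is the same error object whose normalized_messages() the utils
    # change below logs before calling sys.exit().
    print(e.normalized_messages())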
@@ -15,10 +15,16 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     """ETL class for the FEMA National Risk Index dataset"""

     NAME = "national_risk_index"
-    LAST_UPDATED_YEAR = 2020
     SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

+    # Output score variables (values set on datasets.yml) for linting purposes
+    RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME: str
+    EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME: str
+    EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME: str
+    EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME: str
+    CONTAINS_AGRIVALUE: str
+
     ## TEMPORARILY HERE
     ## To get this value up in time for launch, we've hard coded it. We would like
     ## to, in the future, have this pull the 10th percentile (or nth percentile)
@@ -27,54 +33,34 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000

     def __init__(self):
+        # load YAML config
+        self.DATASET_CONFIG = super().yaml_config_load()
+
+        # define the full path for the input CSV file
         self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
+
+        # this is the main dataframe
+        self.df: pd.DataFrame
+
+        # Start dataset-specific vars here
         self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
             "EAL_SCORE"
         )

-        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME = (
-            "FEMA Risk Index Expected Annual Loss Score"
-        )
-
         self.EXPECTED_ANNUAL_LOSS_BUILDING_VALUE_INPUT_FIELD_NAME = "EAL_VALB"
         self.EXPECTED_ANNUAL_LOSS_AGRICULTURAL_VALUE_INPUT_FIELD_NAME = (
             "EAL_VALA"
         )
         self.EXPECTED_ANNUAL_LOSS_POPULATION_VALUE_INPUT_FIELD_NAME = "EAL_VALP"

         self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME = "AGRIVALUE"
         self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
         self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"

-        self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
-            "Expected building loss rate (Natural Hazards Risk Index)"
-        )
-        self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
-            "Expected agricultural loss rate (Natural Hazards Risk Index)"
-        )
-        self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
-            "Expected population loss rate (Natural Hazards Risk Index)"
-        )
-        self.CONTAINS_AGRIVALUE = "Contains agricultural value"
-
-        self.COLUMNS_TO_KEEP = [
-            self.GEOID_TRACT_FIELD_NAME,
-            self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
-            self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
-            self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
-            self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
-            self.CONTAINS_AGRIVALUE,
-        ]
-
-        self.df: pd.DataFrame
-
     def extract(self) -> None:
         """Unzips NRI dataset from the FEMA data source and writes the files
         to the temporary data folder for use in the transform() method
         """
         logger.info("Downloading 405MB National Risk Index Data")

         super().extract(
             source_url=self.SOURCE_URL,
             extract_path=self.get_tmp_path(),
@@ -90,19 +76,18 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         """
         logger.info("Transforming National Risk Index Data")

-        NRI_TRACT_COL = "TRACTFIPS"  # Census Tract Column in NRI data
-
         # read in the unzipped csv from NRI data source then rename the
         # Census Tract column for merging
         df_nri: pd.DataFrame = pd.read_csv(
             self.INPUT_CSV,
-            dtype={NRI_TRACT_COL: "string"},
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
             na_values=["None"],
             low_memory=False,
         )

         df_nri.rename(
             columns={
-                NRI_TRACT_COL: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
             },
             inplace=True,
@@ -170,6 +155,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         ].clip(
             lower=self.AGRIVALUE_LOWER_BOUND
         )
+
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (
             df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
@@ -185,6 +171,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         # Note: `round` is smart enough to only apply to float columns.
         df_nri = df_nri.round(10)

+        # Assign the final df to the class' output_df for the load method
         self.output_df = df_nri

     def load(self) -> None:
@@ -119,6 +119,7 @@ class TestETL:
         """
         # Setup
         etl = self._get_instance_of_etl_class()
+        etl.__init__()
         data_path, tmp_path = mock_paths

         assert etl.DATA_PATH == data_path
@@ -126,8 +127,6 @@ class TestETL:

         # Also make sure all parameters that need to be non-null are non-null
         assert etl.NAME is not None
-        assert etl.LAST_UPDATED_YEAR is not None
-        assert etl.SOURCE_URL is not None
         assert etl.GEO_LEVEL is not None
         assert etl.COLUMNS_TO_KEEP is not None
         assert len(etl.COLUMNS_TO_KEEP) > 0
@@ -148,14 +147,10 @@ class TestETL:
         etl = self._get_instance_of_etl_class()
         data_path, tmp_path = mock_paths

+        etl.__init__()
         actual_file_path = etl._get_output_file_path()

-        expected_file_path = (
-            data_path
-            / "dataset"
-            / f"{etl.NAME}_{etl.LAST_UPDATED_YEAR}"
-            / "usa.csv"
-        )
+        expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv"

         logger.info(f"Expected: {expected_file_path}")

@@ -255,6 +250,7 @@ class TestETL:
         etl = self._setup_etl_instance_and_run_extract(
             mock_etl=mock_etl, mock_paths=mock_paths
         )
+        etl.__init__()
         etl.transform()

         assert etl.output_df is not None
@@ -272,6 +268,7 @@ class TestETL:
         """
         # setup - input variables
         etl = self._get_instance_of_etl_class()
+        etl.__init__()

         # setup - mock transform step
         df_transform = pd.read_csv(
@@ -87,11 +87,6 @@ class TestNationalRiskIndexETL(TestETL):
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
         assert etl.NAME == "national_risk_index"
-        assert etl.LAST_UPDATED_YEAR == 2020
-        assert (
-            etl.SOURCE_URL
-            == "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
-        )
         assert etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT
         assert etl.COLUMNS_TO_KEEP == [
             etl.GEOID_TRACT_FIELD_NAME,
@@ -109,6 +104,6 @@ class TestNationalRiskIndexETL(TestETL):

         output_file_path = etl._get_output_file_path()
         expected_output_file_path = (
-            data_path / "dataset" / "national_risk_index_2020" / "usa.csv"
+            data_path / "dataset" / "national_risk_index" / "usa.csv"
         )
         assert output_file_path == expected_output_file_path
@@ -8,6 +8,7 @@ import shutil
 import uuid
 import zipfile
 from pathlib import Path
+from marshmallow import ValidationError
 import urllib3
 import requests
 import yaml
@@ -350,7 +351,13 @@ def load_yaml_dict_from_file(

     # validate YAML
     yaml_config_schema = class_schema(schema_class)
-    yaml_config_schema().load(yaml_dict)
+
+    try:
+        yaml_config_schema().load(yaml_dict)
+    except ValidationError as e:
+        logger.error(f"Invalid YAML config file {yaml_file_path}")
+        logger.error(e.normalized_messages())
+        sys.exit()

     return yaml_dict
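A call-site sketch tying the pieces together (mirroring the yaml_config_load call in the ETL base class above; the final print is illustrative only):

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import load_yaml_dict_from_file

# Validates datasets.yml against DatasetsConfig; on a schema error the loader
# now logs the normalized messages and exits instead of raising mid-run.
datasets_config = load_yaml_dict_from_file(
    ExtractTransformLoad.DATASET_CONFIG / "datasets.yml",
    DatasetsConfig,
)
print([dataset["module_name"] for dataset in datasets_config["datasets"]])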
@@ -1,4 +1,5 @@
 {
+  "_comment": "Markdown Link Checker configuration, see https://github.com/gaurav-nelson/github-action-markdown-link-check and https://github.com/tcort/markdown-link-check",
   "ignorePatterns": [
     {
       "pattern": "^http://localhost"
package-lock.json (generated): 3415 changed lines. File diff suppressed because it is too large.
@@ -1,7 +0,0 @@
-{
-  "dependencies": {
-    "@turf/turf": "^6.5.0",
-    "@types/d3-ease": "^3.0.0",
-    "d3-ease": "^3.0.1"
-  }
-}