From e5b84dc28d3a5d5b67b43da400731d50bfdae1e8 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Mon, 27 Jun 2022 12:59:17 -0400 Subject: [PATCH] removing last updated year - optional reverse percentile --- data/data-pipeline/data_pipeline/etl/base.py | 11 ++----- .../etl/score/config/datasets.yml | 31 +++++++++---------- .../data_pipeline/etl/score/etl_score.py | 1 + .../etl/score/schemas/datasets.py | 5 ++- .../tests/sources/example/test_etl.py | 1 - .../sources/national_risk_index/test_etl.py | 3 +- 6 files changed, 23 insertions(+), 29 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index b1aed335..873aa74c 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -123,7 +123,6 @@ class ExtractTransformLoad: sys.exit() # set some of the basic fields - cls.LAST_UPDATED_YEAR = dataset_config["last_updated_year"] cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[ "input_geoid_tract_field_name" ] @@ -150,16 +149,10 @@ class ExtractTransformLoad: if cls.NAME is None: raise NotImplementedError( f"Child ETL class needs to specify `cls.NAME` (currently " - f"{cls.NAME}) and `cls.LAST_UPDATED_YEAR` (currently " - f"{cls.LAST_UPDATED_YEAR})." + f"{cls.NAME})." ) - output_file_path = ( - cls.DATA_PATH - / "dataset" - / f"{cls.NAME}_{cls.LAST_UPDATED_YEAR}" - / "usa.csv" - ) + output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv" return output_file_path def get_tmp_path(self) -> pathlib.Path: diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index 3986ad34..8d18ae38 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -3,7 +3,6 @@ datasets: - long_name: "FEMA National Risk Index" short_name: "nri" module_name: national_risk_index - last_updated_year: 2020 description: "Dataset from FEMA that identifies communities most at risk to 18 natural hazards." input_geoid_tract_field_name: "TRACTFIPS" load_fields: @@ -11,34 +10,34 @@ datasets: df_field_name: "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME" long_name: "FEMA Risk Index Expected Annual Loss Score" field_type: float - tile_include: true - csv_download: true - excel_download: true + include_in_tiles: true + include_in_csv: true + include_in_excel: true - short_name: "ex_pop_loss" df_field_name: "EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME" long_name: "Expected population loss rate (Natural Hazards Risk Index)" field_type: float - tile_include: true - csv_download: true - excel_download: true + include_in_tiles: true + include_in_csv: true + include_in_excel: true - short_name: "ex_ag_loss" df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME" long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)" field_type: float - tile_include: true - csv_download: true - excel_download: true + include_in_tiles: true + include_in_csv: true + include_in_excel: true - short_name: "ex_bldg_loss" df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME" long_name: "Expected building loss rate (Natural Hazards Risk Index)" field_type: float - tile_include: true - csv_download: true - excel_download: true + include_in_tiles: true + include_in_csv: true + include_in_excel: true - short_name: "has_ag_val" df_field_name: "CONTAINS_AGRIVALUE" long_name: "Contains agricultural value" field_type: bool - tile_include: true - csv_download: true - excel_download: true + include_in_tiles: true + include_in_csv: true + include_in_excel: true diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index fbef9dc2..6d952d0d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -480,6 +480,7 @@ class ScoreETL(ExtractTransformLoad): # for instance, 3rd grade reading level : Low 3rd grade reading level. # This low field will not exist yet, it is only calculated for the # percentile. + # TODO: This will come from the YAML dataset config ReversePercentile( field_name=field_names.READING_FIELD, low_field_name=field_names.LOW_READING_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py index 0d1e6f84..369e8c15 100644 --- a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py +++ b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from enum import Enum -from typing import List +from typing import List, Optional class FieldType(Enum): @@ -40,6 +40,8 @@ class DatasetsConfig: field_type (FieldType): An enum that dictates what type of field this is. This will be used on the `etl_score_post` for the data manipulation. The `by_value` metadata prop will load the field type's Enum value instead of the index, i.e. "string" and not STRING + reverse_percentile (Optional bool): An optional bool to denote this field to be a reverse_percentile. + TODO: data/data-pipeline/data_pipeline/etl/score/etl_score.py:477 include_in_tiles (bool): Include this field on the tile export. include_in_csv (bool): Include this field on the CSV export. include_in_excel (bool): Include this field on the Excel export. @@ -49,6 +51,7 @@ class DatasetsConfig: df_field_name: str long_name: str field_type: FieldType = field(metadata={"by_value": True}) + reverse_percentile: Optional[bool] include_in_tiles: bool include_in_csv: bool include_in_excel: bool diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py index 82ab807d..6bbf2672 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py @@ -127,7 +127,6 @@ class TestETL: # Also make sure all parameters that need to be non-null are non-null assert etl.NAME is not None - assert etl.LAST_UPDATED_YEAR is not None assert etl.GEO_LEVEL is not None assert etl.COLUMNS_TO_KEEP is not None assert len(etl.COLUMNS_TO_KEEP) > 0 diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py index 0798fcce..f428565f 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py @@ -87,7 +87,6 @@ class TestNationalRiskIndexETL(TestETL): assert etl.GEOID_FIELD_NAME == "GEOID10" assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT" assert etl.NAME == "national_risk_index" - assert etl.LAST_UPDATED_YEAR == 2020 assert etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT assert etl.COLUMNS_TO_KEEP == [ etl.GEOID_TRACT_FIELD_NAME, @@ -105,6 +104,6 @@ class TestNationalRiskIndexETL(TestETL): output_file_path = etl._get_output_file_path() expected_output_file_path = ( - data_path / "dataset" / "national_risk_index_2020" / "usa.csv" + data_path / "dataset" / "national_risk_index" / "usa.csv" ) assert output_file_path == expected_output_file_path