diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 0f32b6c1..2639e2dd 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -226,7 +226,6 @@ class ScoreETL(ExtractTransformLoad): census_block_group_dfs = [ self.ejscreen_df, self.census_acs_median_incomes_df, - self.national_risk_index_df, ] census_block_group_df = self._join_cbg_dfs(census_block_group_dfs) @@ -241,6 +240,7 @@ class ScoreETL(ExtractTransformLoad): self.geocorr_urban_rural_df, self.persistent_poverty_df, self.housing_and_transportation_df, + self.national_risk_index_df, ] census_tract_df = self._join_tract_dfs(census_tract_dfs) diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index 24d2303c..a5340ed4 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -52,7 +52,7 @@ class NationalRiskIndexETL(ExtractTransformLoad): # Note: also need to edit transform step to add fields to output. self.COLUMNS_TO_KEEP = [ - self.GEOID_FIELD_NAME, + self.GEOID_TRACT_FIELD_NAME, self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME, self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME, @@ -82,7 +82,6 @@ class NationalRiskIndexETL(ExtractTransformLoad): logger.info("Transforming National Risk Index Data") NRI_TRACT_COL = "TRACTFIPS" # Census Tract Column in NRI data - TRACT_COL = self.GEOID_TRACT_FIELD_NAME # Census Tract column name # read in the unzipped csv from NRI data source then rename the # Census Tract column for merging @@ -94,7 +93,7 @@ class NationalRiskIndexETL(ExtractTransformLoad): ) df_nri.rename( columns={ - NRI_TRACT_COL: TRACT_COL, + NRI_TRACT_COL: self.GEOID_TRACT_FIELD_NAME, self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, }, inplace=True, @@ -120,30 +119,7 @@ class NationalRiskIndexETL(ExtractTransformLoad): / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME] ) - # Reduce columns. - # Note: normally we wait until writing to CSV for this step, but since the file is so huge, - # move this up here for performance reasons. - df_nri = df_nri[ - [ - self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, - self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME, - self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME, - self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME, - TRACT_COL, - ] - ] - - # get the full list of Census Block Groups from the ACS data - # and extract the Census Tract ID from each Block Group ID - df_acs = pd.read_csv( - self.BLOCK_GROUP_CSV, dtype={self.GEOID_FIELD_NAME: "string"} - ) - df_acs[TRACT_COL] = df_acs[self.GEOID_FIELD_NAME].str[0:11] - df_block_group = df_acs[[self.GEOID_FIELD_NAME, TRACT_COL]] - - # merge NRI data on the Census Tract ID so that each - # Block Group inherits the NRI score of its Census Tract - self.df = df_block_group.merge(df_nri, how="left", on=TRACT_COL) + self.df = df_nri def load(self) -> None: """Writes the NRI data as a csv to the directory at self.OUTPUT_DIR""" diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv index 9f8cd7f4..baf51468 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv @@ -1,11 +1,6 @@ -GEOID10,FEMA Risk Index Expected Annual Loss Score,Expected population loss rate (Natural Hazards Risk Index),Expected agricultural loss rate (Natural Hazards Risk Index),Expected building loss rate (Natural Hazards Risk Index) -050070403001,11.5,0.2415949482342093,0.2066075060457531,0.3995003157638629 -050070403002,11.5,0.2415949482342093,0.2066075060457531,0.3995003157638629 -050010201001,12.5,0.2813432586919213,0.2071197417936341,0.5350898265541664 -050010201002,12.5,0.2813432586919213,0.2071197417936341,0.5350898265541664 -150070405001,13.5,0.2807261849372409,0.2692193373944453,0.4930937667416781 -150070405002,13.5,0.2807261849372409,0.2692193373944453,0.4930937667416781 -150010210101,14.5,0.221568983494752,0.3608865970965789,0.3050513470809191 -150010210102,14.5,0.221568983494752,0.3608865970965789,0.3050513470809191 -150010211011,15.5,0.8054882693313613,0.2041612037778874,0.306186120042156 -150010211012,15.5,0.8054882693313613,0.2041612037778874,0.306186120042156 +GEOID10_TRACT,FEMA Risk Index Expected Annual Loss Score,Expected population loss rate (Natural Hazards Risk Index),Expected agricultural loss rate (Natural Hazards Risk Index),Expected building loss rate (Natural Hazards Risk Index) +05007040300,11.5,0.241594948,0.206607506,0.399500316 +05001020100,12.5,0.281343259,0.207119742,0.535089827 +15007040500,13.5,0.280726185,0.269219337,0.493093767 +15001021010,14.5,0.221568983,0.360886597,0.305051347 +15001021101,15.5,0.805488269,0.204161204,0.30618612 diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv index 8b33b2fc..b627928f 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv @@ -1,11 +1,6 @@ -GEOID10,GEOID10_TRACT,FEMA Risk Index Expected Annual Loss Score,Expected population loss rate (Natural Hazards Risk Index),Expected agricultural loss rate (Natural Hazards Risk Index),Expected building loss rate (Natural Hazards Risk Index) -050070403001,05007040300,11.5,0.24159494823420938,0.2066075060457531,0.39950031576386297 -050070403002,05007040300,11.5,0.24159494823420938,0.2066075060457531,0.39950031576386297 -050010201001,05001020100,12.5,0.2813432586919213,0.20711974179363413,0.5350898265541664 -050010201002,05001020100,12.5,0.2813432586919213,0.20711974179363413,0.5350898265541664 -150070405001,15007040500,13.5,0.28072618493724094,0.26921933739444537,0.4930937667416781 -150070405002,15007040500,13.5,0.28072618493724094,0.26921933739444537,0.4930937667416781 -150010210101,15001021010,14.5,0.22156898349475204,0.3608865970965789,0.30505134708091913 -150010210102,15001021010,14.5,0.22156898349475204,0.3608865970965789,0.30505134708091913 -150010211011,15001021101,15.5,0.8054882693313613,0.20416120377788743,0.30618612004215606 -150010211012,15001021101,15.5,0.8054882693313613,0.20416120377788743,0.30618612004215606 +TRACT,GEOID10_TRACT,RISK_SCORE,RISK_RATNG,RISK_NPCTL,FEMA Risk Index Expected Annual Loss Score,AVLN_EALT,CFLD_EALT,CWAV_EALT,DRGT_EALT,ERQK_EALT,HAIL_EALT,HWAV_EALT,HRCN_EALT,ISTM_EALT,LNDS_EALT,LTNG_EALT,RFLD_EALT,SWND_EALT,TRND_EALT,TSUN_EALT,VLCN_EALT,WFIR_EALT,WNTW_EALT,AVLN_EXPT,CFLD_EXPT,CWAV_EXPT,DRGT_EXPT,ERQK_EXPT,HAIL_EXPT,HWAV_EXPT,HRCN_EXPT,ISTM_EXPT,LNDS_EXPT,LTNG_EXPT,RFLD_EXPT,SWND_EXPT,TRND_EXPT,TSUN_EXPT,VLCN_EXPT,WFIR_EXPT,WNTW_EXPT,EAL_VALA,EAL_VALP,EAL_VALB,AGRIVALUE,POPULATION,BUILDVALUE,Expected population loss rate (Natural Hazards Risk Index),Expected agricultural loss rate (Natural Hazards Risk Index),Expected building loss rate (Natural Hazards Risk Index) +40300,05007040300,10.492015,Very Low,15.3494,11.5,12.5,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5,234.7446176,204.8883901,126.4079101,0.24159494823420938,0.2066075060457531,0.39950031576386297 +20100,05001020100,14.705854,Relatively Low,36.725828,12.5,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5,51.5,238.9921867,179.4960371,96.24552261,0.2813432586919213,0.20711974179363413,0.5350898265541664 +40500,15007040500,10.234981,Very Low,13.997993,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5,51.5,52.5,187.5793934,183.4527834,106.4706219,0.28072618493724094,0.26921933739444537,0.4930937667416781 +21010,15001021010,21.537231,Relatively Moderate,59.488033,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5,51.5,52.5,53.5,142.7041082,236.9465219,175.3803106,0.22156898349475204,0.3608865970965789,0.30505134708091913 +21101,15001021101,19.434585,Relatively Low,53.392265,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5,51.5,52.5,53.5,54.5,257.1497377,66.41934096,177.9963115,0.8054882693313613,0.20416120377788743,0.30618612004215606 diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py index 7cef406f..d58bd8ef 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py @@ -52,6 +52,7 @@ class TestNationalRiskIndexETL: acs_dst = etl.BLOCK_GROUP_CSV for src, dst in [(input_src, input_dst), (acs_src, acs_dst)]: copy_data_files(src, dst) + # setup - read in sample output as dataframe TRACT_COL = etl.GEOID_TRACT_FIELD_NAME BLOCK_COL = etl.GEOID_FIELD_NAME @@ -59,11 +60,12 @@ class TestNationalRiskIndexETL: DATA_DIR / "transform.csv", dtype={BLOCK_COL: "string", TRACT_COL: "string"}, ) + # execution etl.transform() # validation - assert etl.df.shape == (10, 6) + assert etl.df.shape == (5, 51) pd.testing.assert_frame_equal(etl.df, expected) def test_load(self, mock_etl): @@ -93,5 +95,5 @@ class TestNationalRiskIndexETL: # validation assert output_path.exists() - assert output.shape == (10, 5) + assert output.shape == (5, 5) pd.testing.assert_frame_equal(output, expected)