From 03e59f2abdf518f16d1eace5ce986f09d399b5d4 Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown
Date: Fri, 5 Nov 2021 15:43:52 -0400
Subject: [PATCH] Definition L updates (#862)

* Changing FEMA risk measure

* Adding "basic stats" feature to comparison tool

* Tweaking Definition L
---
 .../data_pipeline/etl/score/etl_score.py      |   1 +
 .../etl/sources/national_risk_index/etl.py    |  56 +++++-
 .../ipython/scoring_comparison.ipynb          | 183 ++++++++++++++++--
 .../data_pipeline/score/field_names.py        |   3 +
 .../data_pipeline/score/score_l.py            |  19 +-
 .../national_risk_index/data/input.csv        |  12 +-
 .../national_risk_index/data/output.csv       |  22 +--
 .../national_risk_index/data/transform.csv    |  22 +--
 .../sources/national_risk_index/test_etl.py   |  10 +-
 9 files changed, 265 insertions(+), 63 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index 23506b41..729586f0 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -291,6 +291,7 @@ class ScoreETL(ExtractTransformLoad):
             field_names.LIFE_EXPECTANCY_FIELD,
             field_names.ENERGY_BURDEN_FIELD,
             field_names.FEMA_RISK_FIELD,
+            field_names.FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD,
             field_names.URBAN_HERUISTIC_FIELD,
             field_names.AIR_TOXICS_CANCER_RISK_FIELD,
             field_names.RESPITORY_HAZARD_FIELD,
diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 1775b05a..bbfdd31f 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -25,10 +25,15 @@ class NationalRiskIndexETL(ExtractTransformLoad):
             "FEMA Risk Index Expected Annual Loss Score"
         )

+        self.EXPECTED_ANNUAL_LOSS_RATE = (
+            "FEMA Risk Index Expected Annual Loss Rate"
+        )
+
+        # Note: also need to edit transform step to add fields to output.
         self.COLUMNS_TO_KEEP = [
             self.GEOID_FIELD_NAME,
             self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
+            self.EXPECTED_ANNUAL_LOSS_RATE,
         ]

         self.df: pd.DataFrame
@@ -37,7 +42,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         """Unzips NRI dataset from the FEMA data source and writes the files
         to the temporary data folder for use in the transform() method
         """
-        logger.info("Downloading National Risk Index Data")
+        logger.info("Downloading 405MB National Risk Index Data")
         super().extract(
             self.NRI_FTP_URL,
             self.TMP_PATH,
@@ -72,11 +77,58 @@ class NationalRiskIndexETL(ExtractTransformLoad):
             inplace=True,
         )

+        # Calculate a risk measure that does not include FEMA's measure of
+        # community vulnerability.
+        disaster_categories = [
+            "AVLN",  # Avalanche
+            "CFLD",  # Coastal Flooding
+            "CWAV",  # Cold Wave
+            "DRGT",  # Drought
+            "ERQK",  # Earthquake
+            "HAIL",  # Hail
+            "HWAV",  # Heat Wave
+            "HRCN",  # Hurricane
+            "ISTM",  # Ice Storm
+            "LNDS",  # Landslide
+            "LTNG",  # Lightning
+            "RFLD",  # Riverine Flooding
+            "SWND",  # Strong Wind
+            "TRND",  # Tornado
+            "TSUN",  # Tsunami
+            "VLCN",  # Volcanic Activity
+            "WFIR",  # Wildfire
+            "WNTW",  # Winter Weather
+        ]
+
+        # Note: pylint incorrectly flags assignment and subscript operations
+        # on this particular dataframe, which may be a known pylint bug:
+        # https://github.com/PyCQA/pylint/issues/1498
+        for category in disaster_categories:
+            df_nri[  # pylint: disable=unsupported-assignment-operation
+                f"{category}"
+            ] = (
+                df_nri[  # pylint: disable=unsubscriptable-object
+                    f"{category}_EALT"
+                ]  # Expected Annual Loss - Total
+                / df_nri[  # pylint: disable=unsubscriptable-object
+                    f"{category}_EXPT"
+                ]
+            )
+        df_nri[  # pylint: disable=unsupported-assignment-operation
+            self.EXPECTED_ANNUAL_LOSS_RATE
+        ] = df_nri[  # pylint: disable=unsubscriptable-object
+            disaster_categories
+        ].sum(
+            axis=1
+        )
+
         # Reduce columns.
         # Note: normally we wait until writing to CSV for this step, but since the file is so huge,
         # move this up here for performance reasons.
         df_nri = df_nri[  # pylint: disable=unsubscriptable-object
-            [self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, TRACT_COL]
+            [
+                self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
+                self.EXPECTED_ANNUAL_LOSS_RATE,
+                TRACT_COL,
+            ]
         ]

         # get the full list of Census Block Groups from the ACS data
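The transform above computes, for each of the 18 hazard categories, expected annual loss (the *_EALT column) divided by total exposure (the *_EXPT column), then sums those per-category rates into the new Expected Annual Loss Rate column. A minimal vectorized sketch of the same arithmetic, assuming a dataframe that already carries the *_EALT and *_EXPT columns (the standalone function name is illustrative, not part of this patch):

    import pandas as pd

    DISASTER_CATEGORIES = [
        "AVLN", "CFLD", "CWAV", "DRGT", "ERQK", "HAIL", "HWAV", "HRCN", "ISTM",
        "LNDS", "LTNG", "RFLD", "SWND", "TRND", "TSUN", "VLCN", "WFIR", "WNTW",
    ]

    def expected_annual_loss_rate(df_nri: pd.DataFrame) -> pd.Series:
        # Per-category loss per dollar of exposure, summed across all hazards.
        ealt = df_nri[[f"{c}_EALT" for c in DISASTER_CATEGORIES]].to_numpy()
        expt = df_nri[[f"{c}_EXPT" for c in DISASTER_CATEGORIES]].to_numpy()
        return pd.Series((ealt / expt).sum(axis=1), index=df_nri.index)

Unlike the patched loop, this sketch does not materialize the intermediate per-category columns; it is equivalent only in the final rate.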
\"Median household income in the past 12 months\",\n", + " URBAN_HEURISTIC_FIELD,\n", + " LIFE_EXPECTANCY_FIELD,\n", + " HEALTH_INSURANCE_FIELD,\n", + " BAD_HEALTH_FIELD,\n", "]" ] }, @@ -735,7 +778,120 @@ "write_state_distribution_excel(\n", " state_distribution_df=state_distribution_df,\n", " file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n", - ")" + ")\n", + "\n", + "# Note: this is helpful because this file is extremely long-running, so it alerts the user when the first step\n", + "# of data analysis is done. Can be removed when converted into scripts. -LMB.\n", + "import os\n", + "\n", + "os.system(\"say 'state analysis is written.'\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4d0e783", + "metadata": {}, + "outputs": [], + "source": [ + "directory = COMPARISON_OUTPUTS_DIR / \"cbg_basic_stats\"\n", + "directory.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# TODO: this Excel-writing function is extremely similar to other Excel-writing functions in this notebook.\n", + "# Refactor to use the same Excel-writing function.\n", + "def write_basic_stats_excel(\n", + " basic_stats_df: pd.DataFrame, file_path: pathlib.PosixPath\n", + ") -> None:\n", + " \"\"\"Write the dataframe to excel with special formatting.\"\"\"\n", + " # Create a Pandas Excel writer using XlsxWriter as the engine.\n", + " writer = pd.ExcelWriter(file_path, engine=\"xlsxwriter\")\n", + "\n", + " # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n", + " # index column at the left of the output dataframe.\n", + " basic_stats_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n", + "\n", + " # Get the xlsxwriter workbook and worksheet objects.\n", + " workbook = writer.book\n", + " worksheet = writer.sheets[\"Sheet1\"]\n", + " worksheet.autofilter(0, 0, basic_stats_df.shape[0], basic_stats_df.shape[1])\n", + "\n", + " # Set a width parameter for all columns\n", + " # Note: this is parameterized because every call to `set_column` requires setting the width.\n", + " column_width = 15\n", + "\n", + " for column in basic_stats_df.columns:\n", + " # Turn the column index into excel ranges (e.g., column #95 is \"CR\" and the range may be \"CR2:CR53\").\n", + " column_index = basic_stats_df.columns.get_loc(column)\n", + " column_character = get_excel_column_name(column_index)\n", + "\n", + " # Set all columns to larger width\n", + " worksheet.set_column(\n", + " f\"{column_character}:{column_character}\", column_width\n", + " )\n", + "\n", + " # Add green to red conditional formatting.\n", + " column_ranges = (\n", + " f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n", + " )\n", + " worksheet.conditional_format(\n", + " column_ranges,\n", + " # Min: green, max: red.\n", + " {\n", + " \"type\": \"2_color_scale\",\n", + " \"min_color\": \"#00FF7F\",\n", + " \"max_color\": \"#C82538\",\n", + " },\n", + " )\n", + "\n", + " # Special formatting for all percent columns\n", + " # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n", + " if (\n", + " \"percent \" in column\n", + " or \"(percent)\" in column\n", + " or \"Percent \" in column\n", + " ):\n", + " # Make these columns percentages.\n", + " percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n", + " worksheet.set_column(\n", + " f\"{column_character}:{column_character}\",\n", + " column_width,\n", + " percentage_format,\n", + " )\n", + "\n", + " header_format = workbook.add_format(\n", + " {\"bold\": True, \"text_wrap\": 
True, \"valign\": \"bottom\"}\n", + " )\n", + "\n", + " # Overwrite both the value and the format of each header cell\n", + " # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n", + " # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n", + " for col_num, value in enumerate(basic_stats_df.columns.values):\n", + " worksheet.write(0, col_num, value, header_format)\n", + "\n", + " writer.save()\n", + "\n", + "\n", + "for index in census_block_group_indices:\n", + " print(f\"Basic stats for {index.method_name}\")\n", + " temp_df = merged_df\n", + " temp_df[index.priority_communities_field] = (\n", + " temp_df[index.priority_communities_field] == True\n", + " )\n", + "\n", + " # print(sum(temp_df[\"is_a_priority_cbg\"]))\n", + " grouped_df = (\n", + " temp_df.groupby(index.priority_communities_field).mean().reset_index()\n", + " )\n", + " result_df = grouped_df[\n", + " [index.priority_communities_field] + comparison_fields\n", + " ]\n", + " result_df.to_csv(\n", + " directory / f\"{index.method_name} Basic Stats.csv\", index=False\n", + " )\n", + " write_basic_stats_excel(\n", + " basic_stats_df=result_df,\n", + " file_path=directory / f\"{index.method_name} Basic Stats.xlsx\",\n", + " )" ] }, { @@ -918,21 +1074,6 @@ " )\n", "\n", "\n", - "comparison_fields = [\n", - " \"Percent of individuals < 100% Federal Poverty Line\",\n", - " \"Percent of individuals < 200% Federal Poverty Line\",\n", - " \"Median household income (% of AMI)\",\n", - " \"Percent of households in linguistic isolation\",\n", - " \"Percent individuals age 25 or over with less than high school degree\",\n", - " \"Linguistic isolation (percent)\",\n", - " \"Unemployed civilians (percent)\",\n", - " \"Median household income in the past 12 months\",\n", - " URBAN_HEURISTIC_FIELD,\n", - " LIFE_EXPECTANCY_FIELD,\n", - " HEALTH_INSURANCE_FIELD,\n", - " BAD_HEALTH_FIELD,\n", - "]\n", - "\n", "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n", " print(f\"Comparing {index_a} and {index_b}.\")\n", " compare_cbg_scores(\n", diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index a4114989..9c7c22d9 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -57,6 +57,9 @@ AMI_FIELD = "Area Median Income (State or metropolitan)" # Climate FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score" +FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD = ( + "FEMA Risk Index Expected Annual Loss Rate" +) # Environment DIESEL_FIELD = "Diesel particulate matter" diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py index 831539c2..fa149300 100644 --- a/data/data-pipeline/data_pipeline/score/score_l.py +++ b/data/data-pipeline/data_pipeline/score/score_l.py @@ -9,7 +9,7 @@ logger = get_module_logger(__name__) class ScoreL(Score): def __init__(self, df: pd.DataFrame) -> None: - self.LOW_INCOME_THRESHOLD: float = 0.60 + self.LOW_INCOME_THRESHOLD: float = 0.65 self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 super().__init__(df) @@ -71,7 +71,7 @@ class ScoreL(Score): > self.LOW_INCOME_THRESHOLD ) & ( self.df[ - field_names.FEMA_RISK_FIELD + field_names.FEMA_EXPECTED_ANNUAL_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX ] > self.ENVIRONMENTAL_BURDEN_THRESHOLD @@ -170,13 +170,16 @@ class ScoreL(Score): # Low income: In 60th percentile or 
diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/input.csv b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/input.csv
index 715ab55e..8d778f09 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/input.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/input.csv
@@ -1,6 +1,6 @@
-TRACT,TRACTFIPS,RISK_SCORE,RISK_RATNG,RISK_NPCTL,EAL_SCORE
-40300,05007040300,10.492015,Very Low,15.3494,11.5
-20100,05001020100,14.705854,Relatively Low,36.725828,12.5
-40500,15007040500,10.234981,Very Low,13.997993,13.5
-21010,15001021010,21.537231,Relatively Moderate,59.488033,14.5
-21101,15001021101,19.434585,Relatively Low,53.392265,15.5
+TRACT,TRACTFIPS,RISK_SCORE,RISK_RATNG,RISK_NPCTL,EAL_SCORE,AVLN_EALT,CFLD_EALT,CWAV_EALT,DRGT_EALT,ERQK_EALT,HAIL_EALT,HWAV_EALT,HRCN_EALT,ISTM_EALT,LNDS_EALT,LTNG_EALT,RFLD_EALT,SWND_EALT,TRND_EALT,TSUN_EALT,VLCN_EALT,WFIR_EALT,WNTW_EALT,AVLN_EXPT,CFLD_EXPT,CWAV_EXPT,DRGT_EXPT,ERQK_EXPT,HAIL_EXPT,HWAV_EXPT,HRCN_EXPT,ISTM_EXPT,LNDS_EXPT,LTNG_EXPT,RFLD_EXPT,SWND_EXPT,TRND_EXPT,TSUN_EXPT,VLCN_EXPT,WFIR_EXPT,WNTW_EXPT
+40300,05007040300,10.492015,Very Low,15.3494,11.5,12.5,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5
+20100,05001020100,14.705854,Relatively Low,36.725828,12.5,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5
+40500,15007040500,10.234981,Very Low,13.997993,13.5,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5
+21010,15001021010,21.537231,Relatively Moderate,59.488033,14.5,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5
+21101,15001021101,19.434585,Relatively Low,53.392265,15.5,16.5,17.5,18.5,19.5,20.5,21.5,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5,32.5,33.5,34.5,35.5,36.5,37.5,38.5,39.5,40.5,41.5,42.5,43.5,44.5,45.5,46.5,47.5,48.5,49.5,50.5,51.5
diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv
index 67ac5d00..480cd330 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/output.csv
@@ -1,11 +1,11 @@
-GEOID10,FEMA Risk Index Expected Annual Loss Score
-050070403001,11.5
-050070403002,11.5
-050010201001,12.5
-050010201002,12.5 -150070405001,13.5 -150070405002,13.5 -150010210101,14.5 -150010210102,14.5 -150010211011,15.5 -150010211012,15.5 +GEOID10,FEMA Risk Index Expected Annual Loss Score,FEMA Risk Index Expected Annual Loss Rate +050070403001,11.5,9.540442348853764 +050070403002,11.5,9.540442348853764 +050010201001,12.5,9.759472262661436 +050010201002,12.5,9.759472262661436 +150070405001,13.5,9.967264470453644 +150070405002,13.5,9.967264470453644 +150010210101,14.5,10.16467498073544 +150010210102,14.5,10.16467498073544 +150010211011,15.5,10.352473850464468 +150010211012,15.5,10.352473850464468 diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv index 662cde19..d5d2b130 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/data/transform.csv @@ -1,11 +1,11 @@ -GEOID10,GEOID10_TRACT,FEMA Risk Index Expected Annual Loss Score -050070403001,05007040300,11.5 -050070403002,05007040300,11.5 -050010201001,05001020100,12.5 -050010201002,05001020100,12.5 -150070405001,15007040500,13.5 -150070405002,15007040500,13.5 -150010210101,15001021010,14.5 -150010210102,15001021010,14.5 -150010211011,15001021101,15.5 -150010211012,15001021101,15.5 +GEOID10,GEOID10_TRACT,FEMA Risk Index Expected Annual Loss Score,FEMA Risk Index Expected Annual Loss Rate +050070403001,05007040300,11.5,9.540442348853764 +050070403002,05007040300,11.5,9.540442348853764 +050010201001,05001020100,12.5,9.759472262661436 +050010201002,05001020100,12.5,9.759472262661436 +150070405001,15007040500,13.5,9.967264470453644 +150070405002,15007040500,13.5,9.967264470453644 +150010210101,15001021010,14.5,10.164674980735441 +150010210102,15001021010,14.5,10.164674980735441 +150010211011,15001021101,15.5,10.352473850464467 +150010211012,15001021101,15.5,10.352473850464467 diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py index 6c4b40d7..04c904a0 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py @@ -61,9 +61,10 @@ class TestNationalRiskIndexETL: ) # execution etl.transform() + # validation - assert etl.df.shape == (10, 3) - assert etl.df.equals(expected) + assert etl.df.shape == (10, 4) + pd.testing.assert_frame_equal(etl.df, expected) def test_load(self, mock_etl): """Tests the load() method for NationalRiskIndexETL @@ -89,7 +90,8 @@ class TestNationalRiskIndexETL: # execution etl.load() output = pd.read_csv(output_path, dtype={BLOCK_COL: str}) + # validation assert output_path.exists() - assert output.shape == (10, 2) - assert output.equals(expected) + assert output.shape == (10, 3) + pd.testing.assert_frame_equal(output, expected)
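As a quick check on the updated fixtures: in input.csv the first tract's 18 EALT values run 12.5 through 29.5 and its EXPT values run 30.5 through 47.5, so each category rate is e / (e + 18), and the sum of those ratios reproduces that tract's rate in output.csv and transform.csv:

    rate = sum(e / (e + 18) for e in (12.5 + i for i in range(18)))
    assert abs(rate - 9.540442348853764) < 1e-8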