From 0ad64588ea509af11bcd65de00a67c8bd292013e Mon Sep 17 00:00:00 2001 From: Carlos Felix <63804190+carlosfelix2@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:14:02 -0500 Subject: [PATCH] Comparator generate delta even with column differences --- .../data-pipeline/data_pipeline/comparator.py | 221 ++++++++++-------- 1 file changed, 121 insertions(+), 100 deletions(-) diff --git a/data/data-pipeline/data_pipeline/comparator.py b/data/data-pipeline/data_pipeline/comparator.py index 6aa85ffd..eed14b4f 100644 --- a/data/data-pipeline/data_pipeline/comparator.py +++ b/data/data-pipeline/data_pipeline/comparator.py @@ -1,6 +1,5 @@ import sys import click -import difflib import pandas as pd from pathlib import Path @@ -16,6 +15,26 @@ pd.set_option("display.max_rows", None) pd.set_option("display.width", 10000) pd.set_option("display.colheader_justify", "left") +result_text = [] + + +def _add_text(text: str): + """Add a line to the output result. + + Args: + line (str): a line to add + """ + result_text.append(text) + + +def _get_result_doc() -> str: + """Gets the document with results. + + Returns: + str: the results document as text + """ + return "".join(result_text) + def _read_from_file(file_path: Path): """Read a CSV file into a Dataframe.""" @@ -65,7 +84,9 @@ def cli(): default=constants.DATA_SCORE_CSV_FULL_FILE_PATH, help="Compare to the specified score CSV file instead of downloading from production", ) -def compare_score(compare_to_version: str, compare_to_file: str, local_score_file: str): +def compare_score( + compare_to_version: str, compare_to_file: str, local_score_file: str +): """Compares the score in the production environment to the locally generated score. The algorithm is pretty simple: @@ -81,10 +102,6 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil FLOAT_ROUNDING_PLACES = 2 WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score" - summary = "# Score Comparison Summary\n" - summary += f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the" - summary += " locally calculated score. Here are the results.\n" - log_title("Compare Score", "Compare production score to local score") if compare_to_file: @@ -118,59 +135,73 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil production_score_df_columns = sorted( production_score_df.columns.array.tolist() ) - - log_info("Comparing columns (production vs local). Differences are: ") - summary += "\n## Columns\n" - summary += "I compared the columns. Here's what I found.\n" - - col_diff = difflib.unified_diff( - production_score_df_columns, local_score_df_columns + extra_cols_in_local = set(local_score_df_columns) - set( + production_score_df_columns + ) + extra_cols_in_prod = set(production_score_df_columns) - set( + local_score_df_columns ) - col_diff_res = "" - for d in col_diff: - col_diff_res += str(d) + "\n" - if len(col_diff_res) == 0: - log_info("None. Columns are the same") - summary += "* There are no differences in the column names.\n" + _add_text("# Score Comparison Summary\n") + _add_text( + f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the" + " locally calculated score. Here are the results:\n\n" + ) + + ##################### + # Compare the columns + ##################### + log_info("Comparing columns (production vs local)") + _add_text("## Columns\n") + if len(extra_cols_in_local) == 0 and len(extra_cols_in_prod) == 0: + _add_text("* There are no differences in the column names.\n") else: - log_info("There are differences. The diff is:") - log_info(col_diff_res) - summary += f"* There are differences in the column names. Here's a diff:\n{col_diff_res}\n" + _add_text( + f"* There are {len(extra_cols_in_local)} columns that were added as compared to the production score." + ) + if len(extra_cols_in_local) > 0: + _add_text(f" Those colums are:\n{extra_cols_in_local}") + _add_text( + f"\n* There are {len(extra_cols_in_prod)} columns that were removed as compared to the production score." + ) + if len(extra_cols_in_prod) > 0: + _add_text(f" Those colums are:\n{extra_cols_in_prod}") + #################### + # Compare the scores + #################### log_info("Comparing dataframe contents (production vs local)") - summary += "\n## Scores\n" - summary += "I compared the scores, too. Here's what I found.\n" + _add_text("\n\n## Scores\n") production_row_count = len(production_score_df.index) local_row_count = len(local_score_df.index) - summary += f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}." - summary += ( - " They match!\n" - if production_row_count == local_row_count - else " They don't match.\n" + _add_text( + f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}." ) + if production_row_count == local_row_count: + _add_text(" They match!\n") + else: + _add_text(" They don't match. The differences are:\n") + _add_text( + " * New tracts added to the local score are:\n" + f"{local_score_df.index.difference(production_score_df.index).to_list()}" + "\n * Tracts removed from the local score are:\n" + f"{production_score_df.index.difference(local_score_df.index).to_list()}" + "\n" + ) production_total_population = production_score_df["Total population"].sum() local_total_population = local_score_df["Total population"].sum() - log_info( - f"The total population in all census tracts in production is {production_total_population:,}." + _add_text( + f"* The total population in all census tracts in the production score is {production_total_population:,}. " + f"The total population in all census tracts locally is {local_total_population:,}. " ) - log_info( - f"The total population in all census tracts locally is {local_total_population:,}." - ) - log_info( - f"The difference in population is {abs(production_total_population - local_total_population):,}." - ) - - summary += f"* The total population in all census tracts in the production score is {production_total_population:,}." - summary += f" The total population in all census tracts locally is {local_total_population:,}." - summary += ( - " They match!\n" + _add_text( + "They match!\n" if production_total_population == local_total_population - else f" The difference is {abs(production_total_population - local_total_population):,}.\n" + else f"The difference is {abs(production_total_population - local_total_population):,}.\n" ) production_disadvantaged_tracts_df = production_score_df.query( @@ -196,30 +227,18 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil / local_total_population ) - log_info( - f"There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score." + _add_text( + f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing" + f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}" ) - log_info( - f"This represents {production_pct_of_population_represented:.1%} of the total population." + _add_text( + f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population." ) - log_info( - f"There are {len(local_disadvantaged_tracts_set):,} in the locally generated score." - ) - log_info( - f"This represents {local_pct_of_population_represented:.1%} of the total population." - ) - log_info( - f"The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set)):,} tract(s)." - ) - - summary += f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing" - summary += f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}" - summary += f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population." - summary += ( - " The number of tracts match!\n" + _add_text( + " The number of tracts match!\n " if len(production_disadvantaged_tracts_set) == len(local_disadvantaged_tracts_set) - else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n" + else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n " ) removed_tracts = production_disadvantaged_tracts_set.difference( @@ -228,58 +247,60 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil added_tracts = local_disadvantaged_tracts_set.difference( production_disadvantaged_tracts_set ) - removed_tracts_str = ", ".join(list(removed_tracts)) - added_tracts_str = ", ".join(list(added_tracts)) - - log_info( - f"There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the prod " - "score that are not disadvantaged in the local score. Those tracts are:" - ) - log_info(removed_tracts_str) - log_info( - f"There are {len(added_tracts):,} tract(s) marked as disadvantaged in the local " - "score that are not disadvantaged in the prod score. Those tracts are:" - ) - log_info(added_tracts_str) - - summary += ( + _add_text( f"* There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the production score that are not disadvantaged in the locally" - f" generated score (i.e. disadvantaged tracts that were removed by the new score). Those tracts are:\n{removed_tracts_str}\n" - f" There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the" - f" production score (i.e. disadvantaged tracts that were added by the new score). Those tracts are:\n{added_tracts_str}\n\n" + f" generated score (i.e. disadvantaged tracts that were removed by the new score). " ) + if len(removed_tracts) > 0: + _add_text(f"Those tracts are:\n{removed_tracts}\n") + _add_text( + f"* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the" + f" production score (i.e. disadvantaged tracts that were added by the new score). " + ) + if len(added_tracts) > 0: + _add_text(f"Those tracts are:\n{added_tracts}\n") + + ################ + # Create a delta + ################ + _add_text("\n## Delta\n") + # First we make the columns on two dataframes to be the same to be able to compare + trimmed_prod_df = production_score_df.drop(extra_cols_in_prod, axis=1) + trimmed_local_df = local_score_df.drop(extra_cols_in_local, axis=1) try: - comparison_results_df = production_score_df.compare( - local_score_df, align_axis=1, keep_shape=False, keep_equal=False + comparison_results_df = trimmed_prod_df.compare( + trimmed_local_df, align_axis=1, keep_shape=False, keep_equal=False ).rename({"self": "Production", "other": "Local"}, axis=1, level=1) - summary += "* I compared all values across all census tracts." - summary += f" There are {len(comparison_results_df.index):,} tracts with at least one difference." - summary += " Please examine the logs or run the score comparison locally to view them all.\n" - - log_info( - f"There are {len(comparison_results_df.index)} rows with any differences." + _add_text( + "* I compared all values across all census tracts. Note this ignores any columns that have been added or removed." + f" There are {len(comparison_results_df.index):,} tracts with at least one difference.\n" ) - if len(comparison_results_df.index) > 0: - log_info("Those differences are:") - log_info("\n" + str(comparison_results_df)) - comparison_path = WORKING_PATH / "deltas.csv" - comparison_results_df.to_csv(path_or_buf=comparison_path) - log_info(f"Wrote comparison results to {comparison_path}") + + comparison_path = WORKING_PATH / "deltas.csv" + comparison_results_df.to_csv(path_or_buf=comparison_path) + + _add_text(f"* Wrote comparison results to {comparison_path}") except ValueError as e: - summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences." - summary += " Please examine the logs or run the score comparison locally to find out more.\n" - log_info( - f"Encountered an exception while performing the comparison: {repr(e)}" + _add_text( + "* I could not run a full comparison. This is likely because there are column or index (census tract) differences." + " Please examine the logs or run the score comparison locally to find out more.\n" + ) + _add_text( + f"Encountered an exception while performing the comparison: {repr(e)}\n" ) + result_doc = _get_result_doc() + print(result_doc) + + # Write the report summary_path = WORKING_PATH / "comparison-summary.md" with open(summary_path, "w", encoding="utf-8") as f: - f.write(summary) + f.write(result_doc) log_info(f"Wrote comparison summary to {summary_path}") log_goodbye()