Comparator generate delta even with column differences

This commit is contained in:
Carlos Felix 2024-12-02 15:14:02 -05:00 committed by Carlos Felix
commit 0ad64588ea

View file

@ -1,6 +1,5 @@
import sys import sys
import click import click
import difflib
import pandas as pd import pandas as pd
from pathlib import Path from pathlib import Path
@ -16,6 +15,26 @@ pd.set_option("display.max_rows", None)
pd.set_option("display.width", 10000) pd.set_option("display.width", 10000)
pd.set_option("display.colheader_justify", "left") pd.set_option("display.colheader_justify", "left")
result_text = []
def _add_text(text: str):
"""Add a line to the output result.
Args:
line (str): a line to add
"""
result_text.append(text)
def _get_result_doc() -> str:
"""Gets the document with results.
Returns:
str: the results document as text
"""
return "".join(result_text)
def _read_from_file(file_path: Path): def _read_from_file(file_path: Path):
"""Read a CSV file into a Dataframe.""" """Read a CSV file into a Dataframe."""
@ -65,7 +84,9 @@ def cli():
default=constants.DATA_SCORE_CSV_FULL_FILE_PATH, default=constants.DATA_SCORE_CSV_FULL_FILE_PATH,
help="Compare to the specified score CSV file instead of downloading from production", help="Compare to the specified score CSV file instead of downloading from production",
) )
def compare_score(compare_to_version: str, compare_to_file: str, local_score_file: str): def compare_score(
compare_to_version: str, compare_to_file: str, local_score_file: str
):
"""Compares the score in the production environment to the locally generated score. The """Compares the score in the production environment to the locally generated score. The
algorithm is pretty simple: algorithm is pretty simple:
@ -81,10 +102,6 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
FLOAT_ROUNDING_PLACES = 2 FLOAT_ROUNDING_PLACES = 2
WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score" WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"
summary = "# Score Comparison Summary\n"
summary += f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
summary += " locally calculated score. Here are the results.\n"
log_title("Compare Score", "Compare production score to local score") log_title("Compare Score", "Compare production score to local score")
if compare_to_file: if compare_to_file:
@ -118,59 +135,73 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
production_score_df_columns = sorted( production_score_df_columns = sorted(
production_score_df.columns.array.tolist() production_score_df.columns.array.tolist()
) )
extra_cols_in_local = set(local_score_df_columns) - set(
log_info("Comparing columns (production vs local). Differences are: ") production_score_df_columns
summary += "\n## Columns\n" )
summary += "I compared the columns. Here's what I found.\n" extra_cols_in_prod = set(production_score_df_columns) - set(
local_score_df_columns
col_diff = difflib.unified_diff(
production_score_df_columns, local_score_df_columns
) )
col_diff_res = ""
for d in col_diff:
col_diff_res += str(d) + "\n"
if len(col_diff_res) == 0: _add_text("# Score Comparison Summary\n")
log_info("None. Columns are the same") _add_text(
summary += "* There are no differences in the column names.\n" f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
" locally calculated score. Here are the results:\n\n"
)
#####################
# Compare the columns
#####################
log_info("Comparing columns (production vs local)")
_add_text("## Columns\n")
if len(extra_cols_in_local) == 0 and len(extra_cols_in_prod) == 0:
_add_text("* There are no differences in the column names.\n")
else: else:
log_info("There are differences. The diff is:") _add_text(
log_info(col_diff_res) f"* There are {len(extra_cols_in_local)} columns that were added as compared to the production score."
summary += f"* There are differences in the column names. Here's a diff:\n{col_diff_res}\n" )
if len(extra_cols_in_local) > 0:
_add_text(f" Those colums are:\n{extra_cols_in_local}")
_add_text(
f"\n* There are {len(extra_cols_in_prod)} columns that were removed as compared to the production score."
)
if len(extra_cols_in_prod) > 0:
_add_text(f" Those colums are:\n{extra_cols_in_prod}")
####################
# Compare the scores
####################
log_info("Comparing dataframe contents (production vs local)") log_info("Comparing dataframe contents (production vs local)")
summary += "\n## Scores\n" _add_text("\n\n## Scores\n")
summary += "I compared the scores, too. Here's what I found.\n"
production_row_count = len(production_score_df.index) production_row_count = len(production_score_df.index)
local_row_count = len(local_score_df.index) local_row_count = len(local_score_df.index)
summary += f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}." _add_text(
summary += ( f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
" They match!\n"
if production_row_count == local_row_count
else " They don't match.\n"
) )
if production_row_count == local_row_count:
_add_text(" They match!\n")
else:
_add_text(" They don't match. The differences are:\n")
_add_text(
" * New tracts added to the local score are:\n"
f"{local_score_df.index.difference(production_score_df.index).to_list()}"
"\n * Tracts removed from the local score are:\n"
f"{production_score_df.index.difference(local_score_df.index).to_list()}"
"\n"
)
production_total_population = production_score_df["Total population"].sum() production_total_population = production_score_df["Total population"].sum()
local_total_population = local_score_df["Total population"].sum() local_total_population = local_score_df["Total population"].sum()
log_info( _add_text(
f"The total population in all census tracts in production is {production_total_population:,}." f"* The total population in all census tracts in the production score is {production_total_population:,}. "
f"The total population in all census tracts locally is {local_total_population:,}. "
) )
log_info( _add_text(
f"The total population in all census tracts locally is {local_total_population:,}." "They match!\n"
)
log_info(
f"The difference in population is {abs(production_total_population - local_total_population):,}."
)
summary += f"* The total population in all census tracts in the production score is {production_total_population:,}."
summary += f" The total population in all census tracts locally is {local_total_population:,}."
summary += (
" They match!\n"
if production_total_population == local_total_population if production_total_population == local_total_population
else f" The difference is {abs(production_total_population - local_total_population):,}.\n" else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
) )
production_disadvantaged_tracts_df = production_score_df.query( production_disadvantaged_tracts_df = production_score_df.query(
@ -196,30 +227,18 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
/ local_total_population / local_total_population
) )
log_info( _add_text(
f"There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score." f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
) )
log_info( _add_text(
f"This represents {production_pct_of_population_represented:.1%} of the total population." f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
) )
log_info( _add_text(
f"There are {len(local_disadvantaged_tracts_set):,} in the locally generated score." " The number of tracts match!\n "
)
log_info(
f"This represents {local_pct_of_population_represented:.1%} of the total population."
)
log_info(
f"The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set)):,} tract(s)."
)
summary += f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
summary += f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
summary += f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
summary += (
" The number of tracts match!\n"
if len(production_disadvantaged_tracts_set) if len(production_disadvantaged_tracts_set)
== len(local_disadvantaged_tracts_set) == len(local_disadvantaged_tracts_set)
else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n" else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
) )
removed_tracts = production_disadvantaged_tracts_set.difference( removed_tracts = production_disadvantaged_tracts_set.difference(
@ -228,58 +247,60 @@ def compare_score(compare_to_version: str, compare_to_file: str, local_score_fil
added_tracts = local_disadvantaged_tracts_set.difference( added_tracts = local_disadvantaged_tracts_set.difference(
production_disadvantaged_tracts_set production_disadvantaged_tracts_set
) )
removed_tracts_str = ", ".join(list(removed_tracts)) _add_text(
added_tracts_str = ", ".join(list(added_tracts))
log_info(
f"There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the prod "
"score that are not disadvantaged in the local score. Those tracts are:"
)
log_info(removed_tracts_str)
log_info(
f"There are {len(added_tracts):,} tract(s) marked as disadvantaged in the local "
"score that are not disadvantaged in the prod score. Those tracts are:"
)
log_info(added_tracts_str)
summary += (
f"* There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the production score that are not disadvantaged in the locally" f"* There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the production score that are not disadvantaged in the locally"
f" generated score (i.e. disadvantaged tracts that were removed by the new score). Those tracts are:\n{removed_tracts_str}\n" f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
f" There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
f" production score (i.e. disadvantaged tracts that were added by the new score). Those tracts are:\n{added_tracts_str}\n\n"
) )
if len(removed_tracts) > 0:
_add_text(f"Those tracts are:\n{removed_tracts}\n")
_add_text(
f"* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
f" production score (i.e. disadvantaged tracts that were added by the new score). "
)
if len(added_tracts) > 0:
_add_text(f"Those tracts are:\n{added_tracts}\n")
################
# Create a delta
################
_add_text("\n## Delta\n")
# First we make the columns on two dataframes to be the same to be able to compare
trimmed_prod_df = production_score_df.drop(extra_cols_in_prod, axis=1)
trimmed_local_df = local_score_df.drop(extra_cols_in_local, axis=1)
try: try:
comparison_results_df = production_score_df.compare( comparison_results_df = trimmed_prod_df.compare(
local_score_df, align_axis=1, keep_shape=False, keep_equal=False trimmed_local_df, align_axis=1, keep_shape=False, keep_equal=False
).rename({"self": "Production", "other": "Local"}, axis=1, level=1) ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)
summary += "* I compared all values across all census tracts." _add_text(
summary += f" There are {len(comparison_results_df.index):,} tracts with at least one difference." "* I compared all values across all census tracts. Note this ignores any columns that have been added or removed."
summary += " Please examine the logs or run the score comparison locally to view them all.\n" f" There are {len(comparison_results_df.index):,} tracts with at least one difference.\n"
log_info(
f"There are {len(comparison_results_df.index)} rows with any differences."
) )
if len(comparison_results_df.index) > 0:
log_info("Those differences are:") comparison_path = WORKING_PATH / "deltas.csv"
log_info("\n" + str(comparison_results_df)) comparison_results_df.to_csv(path_or_buf=comparison_path)
comparison_path = WORKING_PATH / "deltas.csv"
comparison_results_df.to_csv(path_or_buf=comparison_path) _add_text(f"* Wrote comparison results to {comparison_path}")
log_info(f"Wrote comparison results to {comparison_path}")
except ValueError as e: except ValueError as e:
summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences." _add_text(
summary += " Please examine the logs or run the score comparison locally to find out more.\n" "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
log_info( " Please examine the logs or run the score comparison locally to find out more.\n"
f"Encountered an exception while performing the comparison: {repr(e)}" )
_add_text(
f"Encountered an exception while performing the comparison: {repr(e)}\n"
) )
result_doc = _get_result_doc()
print(result_doc)
# Write the report
summary_path = WORKING_PATH / "comparison-summary.md" summary_path = WORKING_PATH / "comparison-summary.md"
with open(summary_path, "w", encoding="utf-8") as f: with open(summary_path, "w", encoding="utf-8") as f:
f.write(summary) f.write(result_doc)
log_info(f"Wrote comparison summary to {summary_path}") log_info(f"Wrote comparison summary to {summary_path}")
log_goodbye() log_goodbye()