From 35f1cffbb451704b8c3e8de3e96f9d41c9d4fc8d Mon Sep 17 00:00:00 2001 From: Carlos Felix <63804190+carlosfelix2@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:51:00 -0500 Subject: [PATCH] Comparison tool can use local files instead of downloading production score --- .../data-pipeline/data_pipeline/comparator.py | 96 +++++++++++-------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/data/data-pipeline/data_pipeline/comparator.py b/data/data-pipeline/data_pipeline/comparator.py index 6cf184b6..6aa85ffd 100644 --- a/data/data-pipeline/data_pipeline/comparator.py +++ b/data/data-pipeline/data_pipeline/comparator.py @@ -2,6 +2,7 @@ import sys import click import difflib import pandas as pd +from pathlib import Path from data_pipeline.etl.score import constants from data_pipeline.utils import get_module_logger, download_file_from_url @@ -16,6 +17,22 @@ pd.set_option("display.width", 10000) pd.set_option("display.colheader_justify", "left") +def _read_from_file(file_path: Path): + """Read a CSV file into a Dataframe.""" + if not file_path.is_file(): + logger.error( + f"- No score file exists at {file_path}. " + "Please generate the score and try again." + ) + sys.exit(1) + return pd.read_csv( + file_path, + index_col="GEOID10_TRACT", + dtype={"GEOID10_TRACT": str}, + low_memory=False, + ).sort_index() + + @click.group() def cli(): """ @@ -33,8 +50,22 @@ def cli(): default="1.0", required=False, type=str, + help="Set the production score version to compare to", ) -def compare_score(compare_to_version: str): +@click.option( + "-f", + "--compare_to_file", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + help="Compare to the specified score CSV file instead of downloading from production", +) +@click.option( + "-l", + "--local_score_file", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + default=constants.DATA_SCORE_CSV_FULL_FILE_PATH, + help="Compare to the specified score CSV file instead of downloading from production", +) +def compare_score(compare_to_version: str, compare_to_file: str, local_score_file: str): """Compares the score in the production environment to the locally generated score. The algorithm is pretty simple: @@ -56,39 +87,25 @@ def compare_score(compare_to_version: str): log_title("Compare Score", "Compare production score to local score") - locally_generated_score_path = constants.DATA_SCORE_CSV_FULL_FILE_PATH - if not locally_generated_score_path.is_file(): - logger.error( - f"- No score file exists at {locally_generated_score_path}. Please generate the score and try again." + if compare_to_file: + log_info(f"Comparing to file {compare_to_file}...") + production_score_path = compare_to_file + else: + # TODO: transition to downloader code when it's available + production_score_url = f"https://justice40-data.s3.amazonaws.com/data-versions/{compare_to_version}/data/score/csv/full/usa.csv" + production_score_path = WORKING_PATH / "usa.csv" + + log_info(f"Fetching score version {compare_to_version} from AWS") + production_score_path.parent.mkdir(parents=True, exist_ok=True) + download_file_from_url( + file_url=production_score_url, + download_file_name=production_score_path, ) - sys.exit(1) - # TODO: transition to downloader code when it's available - production_score_url = f"https://justice40-data.s3.amazonaws.com/data-versions/{compare_to_version}/data/score/csv/full/usa.csv" - production_score_path = ( - WORKING_PATH / f"prod-score-csv-full-{compare_to_version}-usa.csv" - ) - - log_info(f"Fetching score version {compare_to_version} from AWS") - production_score_path.parent.mkdir(parents=True, exist_ok=True) - download_file_from_url( - file_url=production_score_url, download_file_name=production_score_path - ) - - log_info("Loading files into pandas for comparisons") - - local_score_df = pd.read_csv( - locally_generated_score_path, - index_col="GEOID10_TRACT", - dtype={"GEOID10_TRACT": str}, - low_memory=False, - ).sort_index() - production_score_df = pd.read_csv( - production_score_path, - index_col="GEOID10_TRACT", - dtype={"GEOID10_TRACT": str}, - low_memory=False, - ).sort_index() + log_info(f"Loading local score from {local_score_file}") + local_score_df = _read_from_file(local_score_file) + log_info(f"Loading production score from {production_score_path}") + production_score_df = _read_from_file(production_score_path) # Because of variations in Python versions and machine-level calculations, some of # our numbers can be really close but not the same. That throws off our comparisons. @@ -241,17 +258,16 @@ def compare_score(compare_to_version: str): summary += "* I compared all values across all census tracts." summary += f" There are {len(comparison_results_df.index):,} tracts with at least one difference." summary += " Please examine the logs or run the score comparison locally to view them all.\n" + log_info( f"There are {len(comparison_results_df.index)} rows with any differences." ) - - log_info("Those differences are:") - log_info("\n" + str(comparison_results_df)) - - comparison_path = WORKING_PATH / "deltas.csv" - comparison_results_df.to_csv(path_or_buf=comparison_path) - - log_info(f"Wrote comparison results to {comparison_path}") + if len(comparison_results_df.index) > 0: + log_info("Those differences are:") + log_info("\n" + str(comparison_results_df)) + comparison_path = WORKING_PATH / "deltas.csv" + comparison_results_df.to_csv(path_or_buf=comparison_path) + log_info(f"Wrote comparison results to {comparison_path}") except ValueError as e: summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."