Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-21 09:11:26 -08:00
Add ability to compare scores (#2172)
* Add ability to perform score comparisons, and include that ability in github action
* Update version of add-pr-comment
* Update code with black
* Update comparator.py
* Fix message-path and remove extra parameter
* Update the text to be a bit more friendly and easy to read
Parent: a49770038d
Commit: 79c223b646

2 changed files with 183 additions and 1 deletion
.github/workflows/deploy_be_staging.yml (vendored): 10 changes
@@ -72,7 +72,6 @@ jobs:
         run: |
           poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
           poetry run s4cmd put ./data_pipeline/files/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
-
       - name: Update PR with deployed Score URLs
         uses: mshick/add-pr-comment@v1
         with:
@@ -85,6 +84,15 @@ jobs:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           repo-token-user-login: "github-actions[bot]"
           allow-repeats: false
+      - name: Perform Score Comparisons
+        run: |
+          poetry run python3 data_pipeline/comparator.py compare-score
+      - name: Update PR with Score Comparisons
+        uses: mshick/add-pr-comment@v2
+        with:
+          message-path: ./data/data-pipeline/data_pipeline/data/tmp/Comparator/Score/comparison-summary.md
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          allow-repeats: false
       - name: Set timezone for tippecanoe
         uses: szenius/set-timezone@v1.0
         with:
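For reference, the comparison step added above can also be run by hand; a minimal sketch, assuming the same Poetry environment the workflow uses, a previously generated local score, and the data/data-pipeline directory as the working directory:

    # Compare the freshly generated local score against production version 1.0 (the default)
    poetry run python3 data_pipeline/comparator.py compare-score

    # Compare against a specific production score version instead
    poetry run python3 data_pipeline/comparator.py compare-score --compare-to-version 1.0

The summary that the new add-pr-comment step posts back to the PR is written to data_pipeline/data/tmp/Comparator/Score/comparison-summary.md, alongside a deltas.csv containing the per-tract differences.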
data/data-pipeline/data_pipeline/comparator.py (new file): 174 additions
@@ -0,0 +1,174 @@
import sys
import click
import difflib
import pandas as pd

from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.application import log_title, log_info, log_goodbye

logger = get_module_logger(__name__)

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 10000)
pd.set_option("display.colheader_justify", "left")


@click.group()
def cli():
    """
    A helper tool to run comparisons between files in production and those
    in the local file system.
    """


@cli.command(
    help="Compare the score stored in the AWS production environment to the locally generated score. Defaults to checking against version 1.0.",
)
@click.option(
    "-v",
    "--compare-to-version",
    default="1.0",
    required=False,
    type=str,
)
def compare_score(compare_to_version: str):
    """Compares the score in the production environment to the locally generated score. The
    algorithm is pretty simple:

    1. Fetch and load both scores into dataframes.
    2. Round floats to a number of decimal places to account for differences in the machine
    and python versions used to generate the scores. If we skip this step, there are usually
    thousands of extremely minor differences.
    3. Compare the columns. Print out the deltas.
    4. Compare the values. Print out the deltas. Save the deltas to deltas.csv.
    5. Save a nice summary to comparison-summary.md. End.
    """

    FLOAT_ROUNDING_PLACES = 2
    WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"

    summary = "# Score Comparison Summary\n"
    summary += f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
    summary += " freshly calculated score. Here are the results.\n"

    log_title("Compare Score", "Compare production score to local score")

    locally_generated_score_path = constants.DATA_SCORE_CSV_FULL_FILE_PATH
    if not locally_generated_score_path.is_file():
        logger.error(
            f"- No score file exists at {locally_generated_score_path}. Please generate the score and try again."
        )
        sys.exit(1)

    # TODO: transition to downloader code when it's available
    production_score_url = f"https://justice40-data.s3.amazonaws.com/data-versions/{compare_to_version}/data/score/csv/full/usa.csv"
    production_score_path = WORKING_PATH / "usa.csv"

    log_info(f"Fetching score version {compare_to_version} from AWS")
    production_score_path.parent.mkdir(parents=True, exist_ok=True)
    download_file_from_url(
        file_url=production_score_url, download_file_name=production_score_path
    )

    log_info("Loading files into pandas for comparisons")

    local_score_df = pd.read_csv(
        locally_generated_score_path,
        index_col="GEOID10_TRACT",
        dtype={"GEOID10_TRACT": str},
        low_memory=False,
    ).sort_index()
    production_score_df = pd.read_csv(
        production_score_path,
        index_col="GEOID10_TRACT",
        dtype={"GEOID10_TRACT": str},
        low_memory=False,
    ).sort_index()

    # Because of variations in Python versions and machine-level calculations, some of
    # our numbers can be really close but not the same. That throws off our comparisons.
    # So we're going to round to a reasonable amount of digits before doing anything else.

    production_score_df = production_score_df.round(FLOAT_ROUNDING_PLACES)
    local_score_df = local_score_df.round(FLOAT_ROUNDING_PLACES)

    local_score_df_columns = sorted(local_score_df.columns.array.tolist())
    production_score_df_columns = sorted(
        production_score_df.columns.array.tolist()
    )

    log_info("Comparing columns (production vs local). Differences are: ")
    summary += "\n## Columns\n"
    summary += "I compared the columns. Here's what I found.\n"

    col_diff = difflib.unified_diff(
        production_score_df_columns, local_score_df_columns
    )
    col_diff_res = ""
    for d in col_diff:
        col_diff_res += str(d) + "\n"

    if len(col_diff_res) == 0:
        log_info("None. Columns are the same")
        summary += "* There are no differences in the column names.\n"
    else:
        log_info("There are differences. The diff is:")
        log_info(col_diff_res)
        summary += f"* There are differences in the column names. Here's a diff:\n{col_diff_res}\n"

    log_info("Comparing dataframe contents (production vs local)")
    summary += "\n## Scores\n"
    summary += "I compared the scores, too. Here's what I found.\n"

    production_row_count = len(production_score_df.index)
    local_row_count = len(local_score_df.index)

    summary += f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
    summary += (
        " They match!\n"
        if production_row_count == local_row_count
        else " They don't match.\n"
    )

    try:

        comparison_results_df = production_score_df.compare(
            local_score_df, align_axis=1, keep_shape=False, keep_equal=False
        ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)

        summary += f"* I compared all of the census tracts. There are {len(comparison_results_df.index):,} tracts with at least one score difference."
        summary += " Please examine the logs or run the score comparison locally to view them all.\n"
        log_info(
            f"There are {len(comparison_results_df.index)} rows with differences"
        )

        log_info("Those differences are:")
        log_info("\n" + str(comparison_results_df))

        comparison_path = WORKING_PATH / "deltas.csv"
        comparison_results_df.to_csv(path_or_buf=comparison_path)

        log_info(f"Wrote comparison results to {comparison_path}")

    except ValueError as e:
        summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
        summary += " Please examine the logs or run the score comparison locally to find out more.\n"
        log_info(
            f"Encountered an exception while performing the comparison: {repr(e)}"
        )

    summary_path = WORKING_PATH / "comparison-summary.md"

    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(summary)
        log_info(f"Wrote comparison summary to {summary_path}")

    log_goodbye()
    sys.exit()


if __name__ == "__main__":
    cli()
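The deltas.csv and the per-tract difference counts above come straight out of pandas' DataFrame.compare. A minimal sketch of that call with made-up tract IDs and column names, assuming pandas >= 1.1 (which introduced DataFrame.compare):

    import pandas as pd

    # Two tiny score frames indexed by tract ID; the column names here are invented.
    production = pd.DataFrame(
        {"Total population": [100, 250], "Definition N (percentile)": [0.12, 0.87]},
        index=pd.Index(["01001020100", "01001020200"], name="GEOID10_TRACT"),
    )
    local = production.copy()
    local.loc["01001020200", "Definition N (percentile)"] = 0.91  # one changed value

    # Same call as in compare_score above: keep only rows/columns that differ,
    # and put the production/local values side by side under each column.
    deltas = production.compare(
        local, align_axis=1, keep_shape=False, keep_equal=False
    ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)

    # deltas now has one row (01001020200) and a two-level column
    # ("Definition N (percentile)", "Production"/"Local") holding 0.87 and 0.91.
    print(deltas)

Because keep_shape=False drops everything that matches, an empty deltas frame means the two scores are identical after rounding.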