Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-21 09:11:26 -08:00
Add ability to compare scores (#2172)
* Add ability to perform score comparisons, and include that ability in github action
* Update version of add-pr-comment
* Update code with black
* Update comparator.py
* Fix message-path and remove extra parameter
* Update the text to be a bit more friendly and easy to read
Parent: a49770038d
Commit: 79c223b646

2 changed files with 183 additions and 1 deletion
.github/workflows/deploy_be_staging.yml (vendored): 10 changes
@@ -72,7 +72,6 @@ jobs:
         run: |
           poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
           poetry run s4cmd put ./data_pipeline/files/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
-
       - name: Update PR with deployed Score URLs
         uses: mshick/add-pr-comment@v1
         with:
@@ -85,6 +84,15 @@ jobs:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           repo-token-user-login: "github-actions[bot]"
           allow-repeats: false
+      - name: Perform Score Comparisons
+        run: |
+          poetry run python3 data_pipeline/comparator.py compare-score
+      - name: Update PR with Score Comparisons
+        uses: mshick/add-pr-comment@v2
+        with:
+          message-path: ./data/data-pipeline/data_pipeline/data/tmp/Comparator/Score/comparison-summary.md
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          allow-repeats: false
       - name: Set timezone for tippecanoe
         uses: szenius/set-timezone@v1.0
         with:
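For reference, the comparison step added above can also be run by hand; a minimal sketch, assuming the same Poetry environment the workflow uses, a previously generated local score, and the data/data-pipeline directory as the working directory:

    # Compare the freshly generated local score against production version 1.0 (the default)
    poetry run python3 data_pipeline/comparator.py compare-score

    # Compare against a specific production score version instead
    poetry run python3 data_pipeline/comparator.py compare-score --compare-to-version 1.0

The summary that the new add-pr-comment step posts back to the PR is written to data_pipeline/data/tmp/Comparator/Score/comparison-summary.md, alongside a deltas.csv containing the per-tract differences.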
data/data-pipeline/data_pipeline/comparator.py (new file): 174 additions
@@ -0,0 +1,174 @@
import sys
import click
import difflib
import pandas as pd

from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.application import log_title, log_info, log_goodbye

logger = get_module_logger(__name__)

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 10000)
pd.set_option("display.colheader_justify", "left")


@click.group()
def cli():
    """
    A helper tool to run comparisons between files in production and those
    in the local file system.
    """


@cli.command(
    help="Compare the score stored in the AWS production environment to the locally generated score. Defaults to checking against version 1.0.",
)
@click.option(
    "-v",
    "--compare-to-version",
    default="1.0",
    required=False,
    type=str,
)
def compare_score(compare_to_version: str):
    """Compares the score in the production environment to the locally generated score. The
    algorithm is pretty simple:

    1. Fetch and load both scores into dataframes.
    2. Round floats to a number of decimal places to account for differences in the machine
    and python versions used to generate the scores. If we skip this step, there are usually
    thousands of extremely minor differences.
    3. Compare the columns. Print out the deltas.
    4. Compare the values. Print out the deltas. Save the deltas to deltas.csv.
    5. Save a nice summary to comparison-summary.md. End.
    """

    FLOAT_ROUNDING_PLACES = 2
    WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"

    summary = "# Score Comparison Summary\n"
    summary += f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
    summary += " freshly calculated score. Here are the results.\n"

    log_title("Compare Score", "Compare production score to local score")

    locally_generated_score_path = constants.DATA_SCORE_CSV_FULL_FILE_PATH
    if not locally_generated_score_path.is_file():
        logger.error(
            f"- No score file exists at {locally_generated_score_path}. Please generate the score and try again."
        )
        sys.exit(1)

    # TODO: transition to downloader code when it's available
    production_score_url = f"https://justice40-data.s3.amazonaws.com/data-versions/{compare_to_version}/data/score/csv/full/usa.csv"
    production_score_path = WORKING_PATH / "usa.csv"

    log_info(f"Fetching score version {compare_to_version} from AWS")
    production_score_path.parent.mkdir(parents=True, exist_ok=True)
    download_file_from_url(
        file_url=production_score_url, download_file_name=production_score_path
    )

    log_info("Loading files into pandas for comparisons")

    local_score_df = pd.read_csv(
        locally_generated_score_path,
        index_col="GEOID10_TRACT",
        dtype={"GEOID10_TRACT": str},
        low_memory=False,
    ).sort_index()
    production_score_df = pd.read_csv(
        production_score_path,
        index_col="GEOID10_TRACT",
        dtype={"GEOID10_TRACT": str},
        low_memory=False,
    ).sort_index()

    # Because of variations in Python versions and machine-level calculations, some of
    # our numbers can be really close but not the same. That throws off our comparisons.
    # So we're going to round to a reasonable amount of digits before doing anything else.

    production_score_df = production_score_df.round(FLOAT_ROUNDING_PLACES)
    local_score_df = local_score_df.round(FLOAT_ROUNDING_PLACES)

    local_score_df_columns = sorted(local_score_df.columns.array.tolist())
    production_score_df_columns = sorted(
        production_score_df.columns.array.tolist()
    )

    log_info("Comparing columns (production vs local). Differences are: ")
    summary += "\n## Columns\n"
    summary += "I compared the columns. Here's what I found.\n"

    col_diff = difflib.unified_diff(
        production_score_df_columns, local_score_df_columns
    )
    col_diff_res = ""
    for d in col_diff:
        col_diff_res += str(d) + "\n"

    if len(col_diff_res) == 0:
        log_info("None. Columns are the same")
        summary += "* There are no differences in the column names.\n"
    else:
        log_info("There are differences. The diff is:")
        log_info(col_diff_res)
        summary += f"* There are differences in the column names. Here's a diff:\n{col_diff_res}\n"

    log_info("Comparing dataframe contents (production vs local)")
    summary += "\n## Scores\n"
    summary += "I compared the scores, too. Here's what I found.\n"

    production_row_count = len(production_score_df.index)
    local_row_count = len(local_score_df.index)

    summary += f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
    summary += (
        " They match!\n"
        if production_row_count == local_row_count
        else " They don't match.\n"
    )

    try:

        comparison_results_df = production_score_df.compare(
            local_score_df, align_axis=1, keep_shape=False, keep_equal=False
        ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)

        summary += f"* I compared all of the census tracts. There are {len(comparison_results_df.index):,} tracts with at least one score difference."
        summary += " Please examine the logs or run the score comparison locally to view them all.\n"
        log_info(
            f"There are {len(comparison_results_df.index)} rows with differences"
        )

        log_info("Those differences are:")
        log_info("\n" + str(comparison_results_df))

        comparison_path = WORKING_PATH / "deltas.csv"
        comparison_results_df.to_csv(path_or_buf=comparison_path)

        log_info(f"Wrote comparison results to {comparison_path}")

    except ValueError as e:
        summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
        summary += " Please examine the logs or run the score comparison locally to find out more.\n"
        log_info(
            f"Encountered an exception while performing the comparison: {repr(e)}"
        )

    summary_path = WORKING_PATH / "comparison-summary.md"

    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(summary)
        log_info(f"Wrote comparison summary to {summary_path}")

    log_goodbye()
    sys.exit()


if __name__ == "__main__":
    cli()
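The deltas.csv and the per-tract difference counts above come straight out of pandas' DataFrame.compare. A minimal sketch of that call with made-up tract IDs and column names, assuming pandas >= 1.1 (which introduced DataFrame.compare):

    import pandas as pd

    # Two tiny score frames indexed by tract ID; the column names here are invented.
    production = pd.DataFrame(
        {"Total population": [100, 250], "Definition N (percentile)": [0.12, 0.87]},
        index=pd.Index(["01001020100", "01001020200"], name="GEOID10_TRACT"),
    )
    local = production.copy()
    local.loc["01001020200", "Definition N (percentile)"] = 0.91  # one changed value

    # Same call as in compare_score above: keep only rows/columns that differ,
    # and put the production/local values side by side under each column.
    deltas = production.compare(
        local, align_axis=1, keep_shape=False, keep_equal=False
    ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)

    # deltas now has one row (01001020200) and a two-level column
    # ("Definition N (percentile)", "Production"/"Local") holding 0.87 and 0.91.
    print(deltas)

Because keep_shape=False drops everything that matches, an empty deltas frame means the two scores are identical after rounding.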