Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-22 01:31:25 -08:00
Add ability to compare scores (#2172)
* Add ability to perform score comparisons, and include that ability in the GitHub Action
* Update version of add-pr-comment
* Update code with black
* Update comparator.py
* Fix message-path and remove extra parameter
* Update the text to be a bit more friendly and easy to read
commit 79c223b646 (parent a49770038d)
2 changed files with 183 additions and 1 deletion
10  .github/workflows/deploy_be_staging.yml (vendored)
```diff
@@ -72,7 +72,6 @@ jobs:
         run: |
           poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
           poetry run s4cmd put ./data_pipeline/files/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
-
       - name: Update PR with deployed Score URLs
         uses: mshick/add-pr-comment@v1
         with:
@@ -85,6 +84,15 @@ jobs:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           repo-token-user-login: "github-actions[bot]"
           allow-repeats: false
+      - name: Perform Score Comparisons
+        run: |
+          poetry run python3 data_pipeline/comparator.py compare-score
+      - name: Update PR with Score Comparisons
+        uses: mshick/add-pr-comment@v2
+        with:
+          message-path: ./data/data-pipeline/data_pipeline/data/tmp/Comparator/Score/comparison-summary.md
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          allow-repeats: false
       - name: Set timezone for tippecanoe
         uses: szenius/set-timezone@v1.0
         with:
```
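For local verification, the same Click command the new "Perform Score Comparisons" step runs can also be driven from Python. A minimal sketch, assuming the data_pipeline package is installed (e.g. via `poetry install`) and a local score has already been generated; the CliRunner usage here is illustrative, not part of this change:

```python
# Hypothetical local invocation of the new compare-score command using
# Click's built-in test runner; CI instead shells out to
# `poetry run python3 data_pipeline/comparator.py compare-score`.
from click.testing import CliRunner

from data_pipeline.comparator import cli

runner = CliRunner()
# --compare-to-version defaults to "1.0"; pass another published version
# to compare against a different production score.
result = runner.invoke(cli, ["compare-score", "--compare-to-version", "1.0"])
print(result.output)
print(f"exit code: {result.exit_code}")
```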
174  data/data-pipeline/data_pipeline/comparator.py (new file)
@@ -0,0 +1,174 @@
```python
import sys
import click
import difflib
import pandas as pd

from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.application import log_title, log_info, log_goodbye

logger = get_module_logger(__name__)

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 10000)
pd.set_option("display.colheader_justify", "left")


@click.group()
def cli():
    """
    A helper tool to run comparisons between files in production and those
    in the local file system.
    """


@cli.command(
    help="Compare the score stored in the AWS production environment to the locally generated score. Defaults to checking against version 1.0.",
)
@click.option(
    "-v",
    "--compare-to-version",
    default="1.0",
    required=False,
    type=str,
)
def compare_score(compare_to_version: str):
    """Compares the score in the production environment to the locally generated score. The
    algorithm is pretty simple:

    1. Fetch and load both scores into dataframes.
    2. Round floats to a number of decimal places to account for differences in the machine
       and Python versions used to generate the scores. If we skip this step, there are usually
       thousands of extremely minor differences.
    3. Compare the columns. Print out the deltas.
    4. Compare the values. Print out the deltas. Save the deltas to deltas.csv.
    5. Save a nice summary to comparison-summary.md. End.
    """
    FLOAT_ROUNDING_PLACES = 2
    WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"

    summary = "# Score Comparison Summary\n"
    summary += f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
    summary += " freshly calculated score. Here are the results.\n"

    log_title("Compare Score", "Compare production score to local score")

    locally_generated_score_path = constants.DATA_SCORE_CSV_FULL_FILE_PATH
    if not locally_generated_score_path.is_file():
        logger.error(
            f"- No score file exists at {locally_generated_score_path}. Please generate the score and try again."
        )
        sys.exit(1)

    # TODO: transition to downloader code when it's available
    production_score_url = f"https://justice40-data.s3.amazonaws.com/data-versions/{compare_to_version}/data/score/csv/full/usa.csv"
    production_score_path = WORKING_PATH / "usa.csv"

    log_info(f"Fetching score version {compare_to_version} from AWS")
    production_score_path.parent.mkdir(parents=True, exist_ok=True)
    download_file_from_url(
        file_url=production_score_url, download_file_name=production_score_path
    )

    log_info("Loading files into pandas for comparisons")

    local_score_df = pd.read_csv(
        locally_generated_score_path,
        index_col="GEOID10_TRACT",
        dtype={"GEOID10_TRACT": str},
        low_memory=False,
    ).sort_index()
    production_score_df = pd.read_csv(
        production_score_path,
        index_col="GEOID10_TRACT",
        dtype={"GEOID10_TRACT": str},
        low_memory=False,
    ).sort_index()

    # Because of variations in Python versions and machine-level calculations, some of
    # our numbers can be really close but not the same. That throws off our comparisons.
    # So we're going to round to a reasonable number of digits before doing anything else.
    production_score_df = production_score_df.round(FLOAT_ROUNDING_PLACES)
    local_score_df = local_score_df.round(FLOAT_ROUNDING_PLACES)

    local_score_df_columns = sorted(local_score_df.columns.array.tolist())
    production_score_df_columns = sorted(
        production_score_df.columns.array.tolist()
    )

    log_info("Comparing columns (production vs local). Differences are:")
    summary += "\n## Columns\n"
    summary += "I compared the columns. Here's what I found.\n"

    col_diff = difflib.unified_diff(
        production_score_df_columns, local_score_df_columns
    )
    col_diff_res = ""
    for d in col_diff:
        col_diff_res += str(d) + "\n"

    if len(col_diff_res) == 0:
        log_info("None. Columns are the same")
        summary += "* There are no differences in the column names.\n"
    else:
        log_info("There are differences. The diff is:")
        log_info(col_diff_res)
        summary += f"* There are differences in the column names. Here's a diff:\n{col_diff_res}\n"

    log_info("Comparing dataframe contents (production vs local)")
    summary += "\n## Scores\n"
    summary += "I compared the scores, too. Here's what I found.\n"

    production_row_count = len(production_score_df.index)
    local_row_count = len(local_score_df.index)

    summary += f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
    summary += (
        " They match!\n"
        if production_row_count == local_row_count
        else " They don't match.\n"
    )

    try:
        comparison_results_df = production_score_df.compare(
            local_score_df, align_axis=1, keep_shape=False, keep_equal=False
        ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)

        summary += f"* I compared all of the census tracts. There are {len(comparison_results_df.index):,} tracts with at least one score difference."
        summary += " Please examine the logs or run the score comparison locally to view them all.\n"
        log_info(
            f"There are {len(comparison_results_df.index)} rows with differences"
        )

        log_info("Those differences are:")
        log_info("\n" + str(comparison_results_df))

        comparison_path = WORKING_PATH / "deltas.csv"
        comparison_results_df.to_csv(path_or_buf=comparison_path)

        log_info(f"Wrote comparison results to {comparison_path}")

    except ValueError as e:
        summary += "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
        summary += " Please examine the logs or run the score comparison locally to find out more.\n"
        log_info(
            f"Encountered an exception while performing the comparison: {repr(e)}"
        )

    summary_path = WORKING_PATH / "comparison-summary.md"
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(summary)
    log_info(f"Wrote comparison summary to {summary_path}")

    log_goodbye()
    sys.exit()


if __name__ == "__main__":
    cli()
```
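To make steps 2 and 4 of the algorithm concrete, here is a toy sketch of the round-then-compare technique comparator.py relies on. The dataframes and values below are hypothetical, not drawn from real score files:

```python
import pandas as pd

# Two hypothetical score frames indexed by census tract, mimicking the
# production and locally generated scores.
tracts = pd.Index(["06001400100", "06001400200"], name="GEOID10_TRACT")
production = pd.DataFrame(
    {"score A": [0.500001, 0.75], "score B": [0.25, 0.25]}, index=tracts
)
local = pd.DataFrame(
    {"score A": [0.500002, 0.80], "score B": [0.25, 0.25]}, index=tracts
)

# Rounding first collapses machine-epsilon noise (0.500001 vs 0.500002),
# so compare() reports only the genuine delta (0.75 vs 0.80), with the
# same Production/Local column layout that comparator.py writes to deltas.csv.
deltas = (
    production.round(2)
    .compare(local.round(2), align_axis=1, keep_shape=False, keep_equal=False)
    .rename({"self": "Production", "other": "Local"}, axis=1, level=1)
)
print(deltas)
# Expected: a single row for tract 06001400200 showing
# ("score A", "Production") = 0.75 and ("score A", "Local") = 0.80.
```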