From 2f97674413696dd6e119672049b9d50abaa85897 Mon Sep 17 00:00:00 2001
From: Carlos Felix <63804190+carlosfelix2@users.noreply.github.com>
Date: Fri, 6 Dec 2024 09:57:31 -0500
Subject: [PATCH] Updates to comparator and libraries

---
 .../data-pipeline/data_pipeline/comparator.py | 395 ++++++++++--------
 data/data-pipeline/poetry.lock                |   2 +-
 data/data-pipeline/pyproject.toml             |   5 +
 3 files changed, 229 insertions(+), 173 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/comparator.py b/data/data-pipeline/data_pipeline/comparator.py
index 860be7bb..97e512c3 100644
--- a/data/data-pipeline/data_pipeline/comparator.py
+++ b/data/data-pipeline/data_pipeline/comparator.py
@@ -17,6 +17,7 @@ pd.set_option("display.width", 10000)
 pd.set_option("display.colheader_justify", "left")
 
 result_text = []
+WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"
 
 
 def _add_text(text: str):
@@ -38,7 +39,12 @@ def _get_result_doc() -> str:
 
 
 def _read_from_file(file_path: Path):
-    """Read a CSV file into a Dataframe."""
+    """
+    Read a CSV file into a Dataframe.
+
+    Args:
+        file_path (Path): the path of the file to read
+    """
     if not file_path.is_file():
         logger.error(
             f"- No score file exists at {file_path}. "
@@ -53,6 +59,219 @@ def _read_from_file(file_path: Path):
     ).sort_index()
 
 
+def _add_tract_list(tract_list: list[str]):
+    """
+    Adds a list of tracts to the output grouped by Census state.
+
+    Args:
+        tract_list (list[str]): a list of tracts
+    """
+    if len(tract_list) > 0:
+        _add_text("Those tracts are:\n")
+        # First extract the Census states/territories
+        states_by_tract = []
+        for tract in tract_list:
+            states_by_tract.append(tract[0:2])
+        states = set(states_by_tract)
+        # Now output the grouped tracts
+        for state in sorted(states):
+            tracts_for_state = [
+                item for item in tract_list if item.startswith(state)
+            ]
+            _add_text(
+                f"\t{state} = {len(tracts_for_state)} = {', '.join(tracts_for_state)}\n"
+            )
+
+
+def _compare_score_columns(prod_df: pd.DataFrame, local_df: pd.DataFrame):
+    """
+    Compare the columns between scores.
+
+    Args:
+        prod_df (pd.DataFrame): the production score
+        local_df (pd.DataFrame): the local score
+    """
+    log_info("Comparing columns (production vs local)")
+    _add_text("## Columns\n")
+    local_score_df_columns = sorted(local_df.columns.array.tolist())
+    production_score_df_columns = sorted(prod_df.columns.array.tolist())
+    extra_cols_in_local = set(local_score_df_columns) - set(
+        production_score_df_columns
+    )
+    extra_cols_in_prod = set(production_score_df_columns) - set(
+        local_score_df_columns
+    )
+    if len(extra_cols_in_local) == 0 and len(extra_cols_in_prod) == 0:
+        _add_text("* There are no differences in the column names.\n")
+    else:
+        _add_text(
+            f"* There are {len(extra_cols_in_local)} columns that were added as compared to the production score."
+        )
+        if len(extra_cols_in_local) > 0:
+            _add_text(f" Those colums are:\n{extra_cols_in_local}")
+        _add_text(
+            f"\n* There are {len(extra_cols_in_prod)} columns that were removed as compared to the production score."
+        )
+        if len(extra_cols_in_prod) > 0:
+            _add_text(f" Those colums are:\n{extra_cols_in_prod}")
+
+
+def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
+    """
+    Compare the scores.
+
+    Args:
+        prod_df (pd.DataFrame): the production score
+        local_df (pd.DataFrame): the local score
+    """
+    log_info("Comparing dataframe contents (production vs local)")
+    _add_text("\n\n## Scores\n")
+
+    production_row_count = len(prod_df.index)
+    local_row_count = len(local_df.index)
+
+    # Tract comparison
+    _add_text(
+        f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
+    )
+    if production_row_count == local_row_count:
+        _add_text(" They match!\n")
+    else:
+        _add_text(" They don't match. The differences are:\n")
+        _add_text(
+            "  * New tracts added to the local score are:\n"
+            f"{local_df.index.difference(prod_df.index).to_list()}"
+            "\n  * Tracts removed from the local score are:\n"
+            f"{prod_df.index.difference(local_df.index).to_list()}"
+            "\n"
+        )
+
+    # Population comparison
+    production_total_population = prod_df[field_names.TOTAL_POP_FIELD].sum()
+    local_total_population = local_df[field_names.TOTAL_POP_FIELD].sum()
+
+    _add_text(
+        f"* The total population in all census tracts in the production score is {production_total_population:,}. "
+        f"The total population in all census tracts locally is {local_total_population:,}. "
+    )
+    _add_text(
+        "They match!\n"
+        if production_total_population == local_total_population
+        else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
+    )
+
+    dacs_query = f"`{field_names.FINAL_SCORE_N_BOOLEAN}` == True"
+    production_disadvantaged_tracts_df = prod_df.query(dacs_query)
+    local_disadvantaged_tracts_df = local_df.query(dacs_query)
+
+    production_disadvantaged_tracts_set = set(
+        production_disadvantaged_tracts_df.index.array
+    )
+    local_disadvantaged_tracts_set = set(
+        local_disadvantaged_tracts_df.index.array
+    )
+
+    production_pct_of_population_represented = (
+        production_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
+        / production_total_population
+    )
+    local_pct_of_population_represented = (
+        local_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
+        / local_total_population
+    )
+
+    # DACS comparison
+    _add_text(
+        f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
+        f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
+    )
+    _add_text(
+        f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
+    )
+    _add_text(
+        " The number of tracts match!\n "
+        if len(production_disadvantaged_tracts_set)
+        == len(local_disadvantaged_tracts_set)
+        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
+    )
+
+    removed_tracts = production_disadvantaged_tracts_set.difference(
+        local_disadvantaged_tracts_set
+    )
+    added_tracts = local_disadvantaged_tracts_set.difference(
+        production_disadvantaged_tracts_set
+    )
+    _add_text(
+        f"* There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the production score that are not disadvantaged in the locally"
+        f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
+    )
+    _add_tract_list(removed_tracts)
+
+    _add_text(
+        f"\n* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
+        f" production score (i.e. disadvantaged tracts that were added by the new score). "
+    )
+    _add_tract_list(added_tracts)
+
+    # Grandfathered tracts from v1.0
+    grandfathered_tracts = local_df.loc[
+        local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
+    ].index
+    if len(grandfathered_tracts) > 0:
+        _add_text(
+            f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
+        )
+        _add_tract_list(grandfathered_tracts)
+    else:
+        _add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
+
+
+def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
+    """
+    Generate a delta of scores
+
+    Args:
+        prod_df (pd.DataFrame): the production score
+        local_df (pd.DataFrame): the local score
+    """
+    _add_text("\n## Delta\n")
+    # First we make the columns on two dataframes to be the same to be able to compare
+    local_score_df_columns = local_df.columns.array.tolist()
+    production_score_df_columns = prod_df.columns.array.tolist()
+    extra_cols_in_local = set(local_score_df_columns) - set(
+        production_score_df_columns
+    )
+    extra_cols_in_prod = set(production_score_df_columns) - set(
+        local_score_df_columns
+    )
+    trimmed_prod_df = prod_df.drop(extra_cols_in_prod, axis=1)
+    trimmed_local_df = local_df.drop(extra_cols_in_local, axis=1)
+    try:
+
+        comparison_results_df = trimmed_prod_df.compare(
+            trimmed_local_df, align_axis=1, keep_shape=False, keep_equal=False
+        ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)
+
+        _add_text(
+            "* I compared all values across all census tracts. Note this ignores any columns that have been added or removed."
+            f" There are {len(comparison_results_df.index):,} tracts with at least one difference.\n"
+        )
+
+        comparison_path = WORKING_PATH / "deltas.csv"
+        comparison_results_df.to_csv(path_or_buf=comparison_path)
+
+        _add_text(f"* Wrote comparison results to {comparison_path}")
+
+    except ValueError as e:
+        _add_text(
+            "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
+            " Please examine the logs or run the score comparison locally to find out more.\n"
+        )
+        _add_text(
+            f"Encountered an exception while performing the comparison: {repr(e)}\n"
+        )
+
+
 @click.group()
 def cli():
     """
@@ -101,7 +320,6 @@ def compare_score(
     """
 
     FLOAT_ROUNDING_PLACES = 2
-    WORKING_PATH = constants.TMP_PATH / "Comparator" / "Score"
 
     log_title("Compare Score", "Compare production score to local score")
 
@@ -132,188 +350,21 @@ def compare_score(
     production_score_df = production_score_df.round(FLOAT_ROUNDING_PLACES)
     local_score_df = local_score_df.round(FLOAT_ROUNDING_PLACES)
 
-    local_score_df_columns = sorted(local_score_df.columns.array.tolist())
-    production_score_df_columns = sorted(
-        production_score_df.columns.array.tolist()
-    )
-    extra_cols_in_local = set(local_score_df_columns) - set(
-        production_score_df_columns
-    )
-    extra_cols_in_prod = set(production_score_df_columns) - set(
-        local_score_df_columns
-    )
-
     _add_text("# Score Comparison Summary\n")
     _add_text(
         f"Hi! I'm the Score Comparator. I compared the score in production (version {compare_to_version}) to the"
         " locally calculated score. Here are the results:\n\n"
     )
 
-    #####################
-    # Compare the columns
-    #####################
-    log_info("Comparing columns (production vs local)")
-    _add_text("## Columns\n")
-    if len(extra_cols_in_local) == 0 and len(extra_cols_in_prod) == 0:
-        _add_text("* There are no differences in the column names.\n")
-    else:
-        _add_text(
-            f"* There are {len(extra_cols_in_local)} columns that were added as compared to the production score."
-        )
-        if len(extra_cols_in_local) > 0:
-            _add_text(f" Those colums are:\n{extra_cols_in_local}")
-        _add_text(
-            f"\n* There are {len(extra_cols_in_prod)} columns that were removed as compared to the production score."
-        )
-        if len(extra_cols_in_prod) > 0:
-            _add_text(f" Those colums are:\n{extra_cols_in_prod}")
-
-    ####################
-    # Compare the scores
-    ####################
-    log_info("Comparing dataframe contents (production vs local)")
-    _add_text("\n\n## Scores\n")
-
-    production_row_count = len(production_score_df.index)
-    local_row_count = len(local_score_df.index)
-
-    # Tract comparison
-    _add_text(
-        f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
-    )
-    if production_row_count == local_row_count:
-        _add_text(" They match!\n")
-    else:
-        _add_text(" They don't match. The differences are:\n")
-        _add_text(
-            "  * New tracts added to the local score are:\n"
-            f"{local_score_df.index.difference(production_score_df.index).to_list()}"
-            "\n  * Tracts removed from the local score are:\n"
-            f"{production_score_df.index.difference(local_score_df.index).to_list()}"
-            "\n"
-        )
-
-    # Population comparison
-    production_total_population = production_score_df[
-        field_names.TOTAL_POP_FIELD
-    ].sum()
-    local_total_population = local_score_df[field_names.TOTAL_POP_FIELD].sum()
-
-    _add_text(
-        f"* The total population in all census tracts in the production score is {production_total_population:,}. "
-        f"The total population in all census tracts locally is {local_total_population:,}. "
-    )
-    _add_text(
-        "They match!\n"
-        if production_total_population == local_total_population
-        else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
-    )
-
-    dacs_query = f"`{field_names.FINAL_SCORE_N_BOOLEAN}` == True"
-    production_disadvantaged_tracts_df = production_score_df.query(dacs_query)
-    local_disadvantaged_tracts_df = local_score_df.query(dacs_query)
-
-    production_disadvantaged_tracts_set = set(
-        production_disadvantaged_tracts_df.index.array
-    )
-    local_disadvantaged_tracts_set = set(
-        local_disadvantaged_tracts_df.index.array
-    )
-
-    production_pct_of_population_represented = (
-        production_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
-        / production_total_population
-    )
-    local_pct_of_population_represented = (
-        local_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
-        / local_total_population
-    )
-
-    # DACS comparison
-    _add_text(
-        f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
-        f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
-    )
-    _add_text(
-        f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
-    )
-    _add_text(
-        " The number of tracts match!\n "
-        if len(production_disadvantaged_tracts_set)
-        == len(local_disadvantaged_tracts_set)
-        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
-    )
-
-    removed_tracts = production_disadvantaged_tracts_set.difference(
-        local_disadvantaged_tracts_set
-    )
-    added_tracts = local_disadvantaged_tracts_set.difference(
-        production_disadvantaged_tracts_set
-    )
-    _add_text(
-        f"* There are {len(removed_tracts):,} tract(s) marked as disadvantaged in the production score that are not disadvantaged in the locally"
-        f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
-    )
-    if len(removed_tracts) > 0:
-        _add_text(f"Those tracts are:\n{removed_tracts}")
-
-    _add_text(
-        f"\n* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
-        f" production score (i.e. disadvantaged tracts that were added by the new score). "
-    )
-    if len(added_tracts) > 0:
-        _add_text(f"Those tracts are:\n{added_tracts}\n")
-
-    # Grandfathered tracts from v1.0
-    grandfathered_tracts = local_score_df.loc[
-        local_score_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
-    ].index
-    if len(grandfathered_tracts) > 0:
-        _add_text(
-            f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring. They are:\n"
-            f"{grandfathered_tracts.to_list()}\n"
-        )
-    else:
-        _add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
-
-    ################
-    # Create a delta
-    ################
-    _add_text("\n## Delta\n")
-    # First we make the columns on two dataframes to be the same to be able to compare
-    trimmed_prod_df = production_score_df.drop(extra_cols_in_prod, axis=1)
-    trimmed_local_df = local_score_df.drop(extra_cols_in_local, axis=1)
-    try:
-
-        comparison_results_df = trimmed_prod_df.compare(
-            trimmed_local_df, align_axis=1, keep_shape=False, keep_equal=False
-        ).rename({"self": "Production", "other": "Local"}, axis=1, level=1)
-
-        _add_text(
-            "* I compared all values across all census tracts. Note this ignores any columns that have been added or removed."
-            f" There are {len(comparison_results_df.index):,} tracts with at least one difference.\n"
-        )
-
-        comparison_path = WORKING_PATH / "deltas.csv"
-        comparison_results_df.to_csv(path_or_buf=comparison_path)
-
-        _add_text(f"* Wrote comparison results to {comparison_path}")
-
-    except ValueError as e:
-        _add_text(
-            "* I could not run a full comparison. This is likely because there are column or index (census tract) differences."
-            " Please examine the logs or run the score comparison locally to find out more.\n"
-        )
-        _add_text(
-            f"Encountered an exception while performing the comparison: {repr(e)}\n"
-        )
+    _compare_score_columns(production_score_df, local_score_df)
+    _compare_score_results(production_score_df, local_score_df)
+    _generate_delta(production_score_df, local_score_df)
 
     result_doc = _get_result_doc()
     print(result_doc)
 
     # Write the report
     summary_path = WORKING_PATH / "comparison-summary.md"
-
     with open(summary_path, "w", encoding="utf-8") as f:
         f.write(result_doc)
         log_info(f"Wrote comparison summary to {summary_path}")
diff --git a/data/data-pipeline/poetry.lock b/data/data-pipeline/poetry.lock
index 51b54284..6141851e 100644
--- a/data/data-pipeline/poetry.lock
+++ b/data/data-pipeline/poetry.lock
@@ -5053,4 +5053,4 @@ test = ["mypy", "pre-commit", "pytest", "pytest-asyncio", "websockets (>=10.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "bdce0f2249243262fbfd1e73df3f2525c8ca624df6da458480636a19db26c4fe"
+content-hash = "04639d2eaf33218ba4fef190f76620b00fb2285d86d58458511d85dafd304658"
diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml
index e6fff8e8..f7122078 100644
--- a/data/data-pipeline/pyproject.toml
+++ b/data/data-pipeline/pyproject.toml
@@ -60,6 +60,11 @@ seaborn = "^0.11.2"
 papermill = "^2.3.4"
 jupyterlab = "^3.6.7"
 
+
+[tool.poetry.group.test.dependencies]
+openpyxl = "^3.1.5"
+pytest-snapshot = "^0.9.0"
+
 [build-system]
 build-backend = "poetry.core.masonry.api"
 requires = ["poetry-core>=1.0.0"]