Issue 970: reverse percentiles for AMI and life expectancy (#1018)

* switching to low * fixing score-etl-post * updating comments * fixing comparison * create separate field for clarity * comment fix * removing healthy food * fixing bug in score post * running black and adding comment * Update pickles and add helpful notes to README. Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
2025-09-21 17:01:12 -07:00 · 2021-12-10 10:16:22 -05:00 · 2021-12-10 10:16:22 -05:00 · 7fcecaee42
commit 7fcecaee42
parent 24bac56d9e
11 changed files with 144 additions and 100 deletions
--- a/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb
@@ -8,7 +8,10 @@
   "outputs": [],
   "source": [
    "import IPython\n",
+    "import os\n",
    "import pandas as pd\n",
+    "import pathlib\n",
+    "import sys\n",
    "\n",
    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
    "if module_path not in sys.path:\n",
@@ -28,12 +31,8 @@
   "outputs": [],
   "source": [
    "# Load\n",
-    "path_to_score_file_1 = (\n",
-    "    DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa1.csv\"\n",
-    ")\n",
-    "path_to_score_file_2 = (\n",
-    "        DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa2.csv\"\n",
-    ")\n",
+    "path_to_score_file_1 = DATA_DIR / \"compare_two_score_csvs/usa (pre 970).csv\"\n",
+    "path_to_score_file_2 = DATA_DIR / \"compare_two_score_csvs/usa (post 970).csv\"\n",
    "\n",
    "score_1_df = pd.read_csv(\n",
    "    path_to_score_file_1,\n",
@@ -55,7 +54,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# List columns in one but not the other \n",
+    "# List columns in one but not the other\n",
    "score_2_df.columns.difference(score_1_df.columns)"
   ]
  },
@@ -68,11 +67,16 @@
   "source": [
    "# List rows in one but not the other\n",
    "\n",
-    "if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]):\n",
+    "if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(\n",
+    "    score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
+    "):\n",
    "    print(\"Different lengths!\")\n",
    "\n",
    "print(\"Difference in tract IDs:\")\n",
-    "print(set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]))\n"
+    "print(\n",
+    "    set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME])\n",
+    "    ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME])\n",
+    ")"
   ]
  },
  {
@@ -82,8 +86,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Join \n",
-    "merged_df = score_1_df.merge(score_2_df, how=\"outer\", on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME, suffixes=('_1', '_2'))\n",
+    "# Join\n",
+    "merged_df = score_1_df.merge(\n",
+    "    score_2_df,\n",
+    "    how=\"outer\",\n",
+    "    on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n",
+    "    suffixes=(\"_1\", \"_2\"),\n",
+    ")\n",
    "merged_df"
   ]
  },
@@ -94,14 +103,32 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Check each duplicate column: \n",
+    "# Check each duplicate column:\n",
+    "# Remove the suffix \"_1\"\n",
    "duplicate_columns = [x[:-2] for x in merged_df.columns if \"_1\" in x]\n",
    "\n",
-    "for duplicate_column in duplicate_columns:\n",
-    "    print(f\"Checking duplicate column {duplicate_column}\")\n",
-    "    if not merged_df[f\"{duplicate_column}_1\"].equals(merged_df[f\"{duplicate_column}_2\"]):\n",
-    "        print(merged_df[f\"{duplicate_column}_1\"].compare(merged_df[f\"{duplicate_column}_2\"]))\n",
-    "        raise ValueError(f\"Error! Different values in {duplicate_column}\")"
+    "columns_to_exclude_from_duplicates_check = [\n",
+    "    \"Total threshold criteria exceeded\"\n",
+    "]\n",
+    "\n",
+    "columns_to_check = [column for column in duplicate_columns if column not in columns_to_exclude_from_duplicates_check]\n",
+    "\n",
+    "any_errors_found = False\n",
+    "for column_to_check in columns_to_check:\n",
+    "    print(f\"Checking duplicate column {column_to_check}\")\n",
+    "    if not merged_df[f\"{column_to_check}_1\"].equals(\n",
+    "        merged_df[f\"{column_to_check}_2\"]\n",
+    "    ):\n",
+    "        print(f\"Error! Different values in {column_to_check}\")\n",
+    "        print(\n",
+    "            merged_df[f\"{column_to_check}_1\"].compare(\n",
+    "                merged_df[f\"{column_to_check}_2\"]\n",
+    "            )\n",
+    "        )\n",
+    "        any_errors_found = True\n",
+    "\n",
+    "if any_errors_found:\n",
+    "    raise ValueError(f\"Error! Different values in one or more columns.\")"
   ]
  }
 ],