Creating notebook to compare two score files for differences (#984)

2025-09-10 21:38:18 -07:00 · 2021-12-07 16:20:41 -05:00 · 2021-12-07 16:20:41 -05:00 · 780d1126ff
commit 780d1126ff
parent 9d28f5a4c4
1 changed files with 129 additions and 0 deletions
--- a/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb
@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ef80f77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standard-library modules used below for import-path and data-path handling.\n",
+    "import os\n",
+    "import pathlib\n",
+    "import sys\n",
+    "\n",
+    "import IPython\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Add the directory two levels above the working directory to the import path\n",
+    "# so the `data_pipeline` imports below resolve (assumes the kernel's working\n",
+    "# directory is this notebook's own directory).\n",
+    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "\n",
+    "from data_pipeline.etl.base import ExtractTransformLoad\n",
+    "from data_pipeline.score import field_names\n",
+    "\n",
+    "# Data directory: a sibling of the current working directory named \"data\".\n",
+    "DATA_DIR = pathlib.Path.cwd().parent / \"data\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9838abab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load both score files to compare.\n",
+    "score_csv_dir = DATA_DIR / \"score\" / \"csv\" / \"full\"\n",
+    "path_to_score_file_1 = score_csv_dir / \"usa1.csv\"\n",
+    "path_to_score_file_2 = score_csv_dir / \"usa2.csv\"\n",
+    "\n",
+    "# Read the tract ID column with the pandas \"string\" dtype instead of letting\n",
+    "# `read_csv` infer a type for it.\n",
+    "tract_id_dtype = {ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"}\n",
+    "\n",
+    "score_1_df = pd.read_csv(path_to_score_file_1, dtype=tract_id_dtype)\n",
+    "score_2_df = pd.read_csv(path_to_score_file_2, dtype=tract_id_dtype)\n",
+    "\n",
+    "score_2_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76781a40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List columns that appear in exactly one of the two score files.\n",
+    "# `Index.difference` is one-sided (it would hide columns found only in file 1),\n",
+    "# so use the symmetric difference to report mismatches in either direction.\n",
+    "score_2_df.columns.symmetric_difference(score_1_df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05615567",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Report tract IDs that appear in only one of the two score files.\n",
+    "tract_field = ExtractTransformLoad.GEOID_TRACT_FIELD_NAME\n",
+    "tract_column_1 = score_1_df[tract_field]\n",
+    "tract_column_2 = score_2_df[tract_field]\n",
+    "\n",
+    "# A row-count mismatch is flagged separately from the ID comparison below.\n",
+    "if len(tract_column_2) != len(tract_column_1):\n",
+    "    print(\"Different lengths!\")\n",
+    "\n",
+    "print(\"Difference in tract IDs:\")\n",
+    "print(set(tract_column_2).symmetric_difference(set(tract_column_1)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1407910",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Outer-join the two score frames on the tract ID. Columns present in both\n",
+    "# files come back suffixed \"_1\" (file 1) and \"_2\" (file 2).\n",
+    "merged_df = pd.merge(\n",
+    "    score_1_df,\n",
+    "    score_2_df,\n",
+    "    how=\"outer\",\n",
+    "    on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n",
+    "    suffixes=(\"_1\", \"_2\"),\n",
+    ")\n",
+    "merged_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b4f5bfd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check each duplicate column:\n",
+    "# the merge adds the \"_1\"/\"_2\" suffixes to columns shared by both frames, so a\n",
+    "# duplicated column is one whose name ENDS in \"_1\" and has a matching \"_2\"\n",
+    "# sibling. A substring test (\"_1\" in x) would also pick up names that merely\n",
+    "# contain \"_1\" and then strip the wrong two trailing characters from them.\n",
+    "suffix_1, suffix_2 = \"_1\", \"_2\"\n",
+    "duplicate_columns = [\n",
+    "    x[: -len(suffix_1)]\n",
+    "    for x in merged_df.columns\n",
+    "    if x.endswith(suffix_1) and x[: -len(suffix_1)] + suffix_2 in merged_df.columns\n",
+    "]\n",
+    "\n",
+    "for duplicate_column in duplicate_columns:\n",
+    "    print(f\"Checking duplicate column {duplicate_column}\")\n",
+    "    column_1 = merged_df[duplicate_column + suffix_1]\n",
+    "    column_2 = merged_df[duplicate_column + suffix_2]\n",
+    "    # `Series.equals` treats NaNs in the same position as equal.\n",
+    "    if not column_1.equals(column_2):\n",
+    "        # Show the differing rows, then stop at the first mismatching column.\n",
+    "        print(column_1.compare(column_2))\n",
+    "        raise ValueError(f\"Error! Different values in {duplicate_column}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}