From 780d1126ff3022c65bbf096464b4b2e8709f856c Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown
Date: Tue, 7 Dec 2021 16:20:41 -0500
Subject: [PATCH] Creating notebook to compare two score files for differences (#984)

---
 ...pare_two_score_files_for_differences.ipynb | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb

diff --git a/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb b/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb
new file mode 100644
index 00000000..d6aa454d
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/ipython/compare_two_score_files_for_differences.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ef80f77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pathlib\n",
+    "import sys\n",
+    "\n",
+    "import IPython\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Put the directory two levels up on sys.path so the `data_pipeline` imports below resolve.\n",
+    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "\n",
+    "from data_pipeline.etl.base import ExtractTransformLoad\n",
+    "from data_pipeline.score import field_names\n",
+    "\n",
+    "DATA_DIR = pathlib.Path.cwd().parent / \"data\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9838abab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load both score files, reading the tract ID column as a string\n",
+    "path_to_score_file_1 = (\n",
+    "    DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa1.csv\"\n",
+    ")\n",
+    "path_to_score_file_2 = (\n",
+    "    DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa2.csv\"\n",
+    ")\n",
+    "\n",
+    "score_1_df = pd.read_csv(\n",
+    "    path_to_score_file_1,\n",
+    "    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "\n",
+    "score_2_df = pd.read_csv(\n",
+    "    path_to_score_file_2,\n",
+    "    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "\n",
+    "score_2_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76781a40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List columns that appear in exactly one of the two files (either direction)\n",
+    "score_1_df.columns.symmetric_difference(score_2_df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05615567",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List rows in one but not the other\n",
+    "\n",
+    "if len(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) != len(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]):\n",
+    "    print(\"Different lengths!\")\n",
+    "\n",
+    "print(\"Difference in tract IDs:\")\n",
+    "print(set(score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]) ^ set(score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1407910",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Outer-join the two files on tract ID; columns present in both get `_1` / `_2` suffixes\n",
+    "merged_df = score_1_df.merge(score_2_df, how=\"outer\", on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME, suffixes=('_1', '_2'))\n",
+    "merged_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b4f5bfd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check each column pair the merge suffixed with `_1` / `_2`; stop at the first pair that differs.\n",
+    "# Match on the suffix only, and require the `_2` partner, so names that merely contain \"_1\" are skipped.\n",
+    "duplicate_columns = [\n",
+    "    x[:-2]\n",
+    "    for x in merged_df.columns\n",
+    "    if x.endswith(\"_1\") and f\"{x[:-2]}_2\" in merged_df.columns\n",
+    "]\n",
+    "\n",
+    "for duplicate_column in duplicate_columns:\n",
+    "    print(f\"Checking duplicate column {duplicate_column}\")\n",
+    "    if not merged_df[f\"{duplicate_column}_1\"].equals(merged_df[f\"{duplicate_column}_2\"]):\n",
+    "        print(merged_df[f\"{duplicate_column}_1\"].compare(merged_df[f\"{duplicate_column}_2\"]))\n",
+    "        raise ValueError(f\"Error! Different values in {duplicate_column}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}