Creating notebook to compare two score files for differences (#984)

This commit is contained in:
Lucas Merrill Brown 2021-12-07 16:20:41 -05:00 committed by GitHub
parent 9d28f5a4c4
commit 780d1126ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "6ef80f77",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"import sys\n",
"\n",
"import IPython\n",
"import pandas as pd\n",
"\n",
"# Put the directory two levels above the current working directory on\n",
"# sys.path before the data_pipeline imports below (presumably that is where\n",
"# the package lives relative to this notebook -- confirm if the notebook moves).\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
"    sys.path.append(module_path)\n",
"\n",
"from data_pipeline.etl.base import ExtractTransformLoad\n",
"from data_pipeline.score import field_names\n",
"\n",
"# Data directory: the `data` folder that is a sibling of the current working directory.\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9838abab",
"metadata": {},
"outputs": [],
"source": [
"# Load both score files, reading the tract ID column as a string.\n",
"full_score_dir = DATA_DIR / \"score\" / \"csv\" / \"full\"\n",
"path_to_score_file_1 = full_score_dir / \"usa1.csv\"\n",
"path_to_score_file_2 = full_score_dir / \"usa2.csv\"\n",
"\n",
"tract_id_dtype = {ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"}\n",
"\n",
"score_1_df = pd.read_csv(path_to_score_file_1, dtype=tract_id_dtype)\n",
"score_2_df = pd.read_csv(path_to_score_file_2, dtype=tract_id_dtype)\n",
"\n",
"score_2_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76781a40",
"metadata": {},
"outputs": [],
"source": [
"# List columns in one but not the other.\n",
"# A symmetric difference is used so that columns unique to either file are\n",
"# reported, not only those that appear in file 2 and are missing from file 1.\n",
"score_2_df.columns.symmetric_difference(score_1_df.columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05615567",
"metadata": {},
"outputs": [],
"source": [
"# Report tract IDs that appear in only one of the two score files.\n",
"tract_ids_1 = score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
"tract_ids_2 = score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
"\n",
"# A row-count mismatch is flagged separately from the ID comparison below.\n",
"if len(tract_ids_2) != len(tract_ids_1):\n",
"    print(\"Different lengths!\")\n",
"\n",
"print(\"Difference in tract IDs:\")\n",
"print(set(tract_ids_2) ^ set(tract_ids_1))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1407910",
"metadata": {},
"outputs": [],
"source": [
"# Outer-join the two score files on the tract ID column; columns present in\n",
"# both files are disambiguated with the _1 / _2 suffixes.\n",
"merged_df = score_1_df.merge(\n",
"    score_2_df,\n",
"    how=\"outer\",\n",
"    on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n",
"    suffixes=(\"_1\", \"_2\"),\n",
")\n",
"merged_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b4f5bfd",
"metadata": {},
"outputs": [],
"source": [
"# Check each duplicate column, i.e. each pair of <name>_1 / <name>_2 columns\n",
"# in merged_df, and stop at the first pair whose values differ.\n",
"#\n",
"# The suffix is matched only at the END of the column name: a plain substring\n",
"# test would also select names such as `foo_10` and then strip the wrong two\n",
"# characters. The matching _2 column must exist as well, so a column that\n",
"# merely ends in _1 in a single file cannot trigger a KeyError below.\n",
"duplicate_columns = [\n",
"    x[:-2]\n",
"    for x in merged_df.columns\n",
"    if x.endswith(\"_1\") and f\"{x[:-2]}_2\" in merged_df.columns\n",
"]\n",
"\n",
"for duplicate_column in duplicate_columns:\n",
"    print(f\"Checking duplicate column {duplicate_column}\")\n",
"    column_1 = merged_df[f\"{duplicate_column}_1\"]\n",
"    column_2 = merged_df[f\"{duplicate_column}_2\"]\n",
"    if not column_1.equals(column_2):\n",
"        # Print the differing rows side by side before failing.\n",
"        print(column_1.compare(column_2))\n",
"        raise ValueError(f\"Error! Different values in {duplicate_column}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}