mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
Creating notebook to compare two score files for differences (#984)
This commit is contained in:
parent
9d28f5a4c4
commit
780d1126ff
1 changed files with 129 additions and 0 deletions
|
@ -0,0 +1,129 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6ef80f77",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
"import pathlib\n",
"import sys\n",
"\n",
"import IPython\n",
"import pandas as pd\n",
"\n",
"# Put the directory two levels up on sys.path (once), presumably so the\n",
"# `data_pipeline` imports below resolve when run from this notebook's folder.\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
"    sys.path.append(module_path)\n",
"\n",
"from data_pipeline.etl.base import ExtractTransformLoad\n",
"from data_pipeline.score import field_names\n",
"\n",
"# Data directory is resolved relative to the parent of the current working directory.\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9838abab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Read both score CSVs, forcing the tract ID column to the pandas \"string\" dtype.\n",
"full_score_dir = DATA_DIR / \"score\" / \"csv\" / \"full\"\n",
"path_to_score_file_1 = full_score_dir / \"usa1.csv\"\n",
"path_to_score_file_2 = full_score_dir / \"usa2.csv\"\n",
"\n",
"tract_id_dtype = {ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"}\n",
"score_1_df = pd.read_csv(path_to_score_file_1, dtype=tract_id_dtype)\n",
"score_2_df = pd.read_csv(path_to_score_file_2, dtype=tract_id_dtype)\n",
"\n",
"score_2_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "76781a40",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# List columns present in exactly one of the two files (either direction).\n",
"score_2_df.columns.symmetric_difference(score_1_df.columns)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05615567",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Report tract IDs that appear in only one of the two files.\n",
"tract_ids_1 = score_1_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
"tract_ids_2 = score_2_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
"\n",
"if len(tract_ids_2) != len(tract_ids_1):\n",
"    print(\"Different lengths!\")\n",
"\n",
"print(\"Difference in tract IDs:\")\n",
"print(set(tract_ids_2) ^ set(tract_ids_1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f1407910",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Outer-join the two score frames on tract ID; columns found in both get _1 / _2 suffixes.\n",
"merged_df = score_1_df.merge(\n",
"    score_2_df,\n",
"    how=\"outer\",\n",
"    on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,\n",
"    suffixes=(\"_1\", \"_2\"),\n",
")\n",
"merged_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1b4f5bfd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compare each column present in both files; the merge suffixed them `_1` / `_2`.\n",
"# Match on the suffix with endswith(): a substring test for \"_1\" would also pick up\n",
"# names that merely contain \"_1\", and slicing those gives a column that does not exist.\n",
"duplicate_columns = [\n",
"    column[:-2]\n",
"    for column in merged_df.columns\n",
"    if column.endswith(\"_1\") and f\"{column[:-2]}_2\" in merged_df.columns\n",
"]\n",
"\n",
"for duplicate_column in duplicate_columns:\n",
"    print(f\"Checking duplicate column {duplicate_column}\")\n",
"    column_1 = merged_df[f\"{duplicate_column}_1\"]\n",
"    column_2 = merged_df[f\"{duplicate_column}_2\"]\n",
"    if not column_1.equals(column_2):\n",
"        # Print the differing rows, then stop at the first mismatching column.\n",
"        print(column_1.compare(column_2))\n",
"        raise ValueError(f\"Error! Different values in {duplicate_column}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Add table
Reference in a new issue