Comparison tool refactor & ETL HUD RECAP (#272)

* Refactoring comparison tool and creating two new ETL notebooks
2025-09-30 09:13:17 -07:00 · 2021-07-06 12:10:58 -05:00 · 2021-07-06 12:10:58 -05:00 · 11d13e034e
commit 11d13e034e
parent e8385e1439
6 changed files with 2071 additions and 274 deletions
--- a/score/ipython/score_calc.ipynb
+++ b/score/ipython/score_calc.ipynb
@ -16,6 +16,7 @@
    "import collections\n",
    "import functools\n",
    "from pathlib import Path\n",
+    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import csv\n",
    "import os\n",
@ -363,7 +364,7 @@
   },
   "outputs": [],
   "source": [
-    "# calculate percentiles\n",
+    "# Calculate percentiles for each data set.\n",
    "for data_set in data_sets:\n",
    "    df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n",
    "        data_set.renamed_field\n",
@ -379,7 +380,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# calculate min max\n",
+    "# Calculate min-max for each data set.\n",
    "# Math:\n",
    "# (\n",
    "#     Observed value\n",
@ -410,6 +411,28 @@
    "df.head()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4eec326",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Graph the distribution of each min-max normalized field.\n",
+    "min_max_fields = [\n",
+    "    f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"\n",
+    "    for data_set in data_sets\n",
+    "    if data_set.renamed_field != GEOID_FIELD_NAME\n",
+    "]\n",
+    "df.hist(\n",
+    "    column=min_max_fields, layout=(len(min_max_fields), 1), figsize=(10, 30), bins=30\n",
+    ")\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -476,7 +499,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "fields_to_use_in_score = [\n",
+    "# Calculate scores D and E.\n",
+    "fields_to_use_in_score_d_and_e = [\n",
    "    UNEMPLOYED_FIELD_NAME,\n",
    "    LINGUISTIC_ISOLATION_FIELD_NAME,\n",
    "    HOUSING_BURDEN_FIELD_NAME,\n",
@ -484,9 +508,11 @@
    "    HIGH_SCHOOL_FIELD_NAME,\n",
    "]\n",
    "\n",
-    "fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n",
+    "fields_min_max = [\n",
+    "    f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
+    "]\n",
    "fields_percentile = [\n",
-    "    f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n",
+    "    f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
    "]\n",
    "\n",
    "# Calculate \"Score D\", which uses min-max normalization\n",
@ -498,6 +524,32 @@
    "print(df[\"Score E\"].describe())"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a02e5bac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Graph distributions of the min-max fields used in scores D and E.\n",
+    "df.hist(\n",
+    "    column=fields_min_max, layout=(len(fields_min_max), 1), figsize=(10, 30), bins=30\n",
+    ")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0e608c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate correlations\n",
+    "df[fields_min_max].corr()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,