mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-09-30 09:13:17 -07:00
Comparison tool refactor & ETL HUD RECAP (#272)
* Refactoring comparison tool and creating two new ETL notebooks
This commit is contained in:
parent
e8385e1439
commit
11d13e034e
6 changed files with 2071 additions and 274 deletions
|
@ -16,6 +16,7 @@
|
|||
"import collections\n",
|
||||
"import functools\n",
|
||||
"from pathlib import Path\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import pandas as pd\n",
|
||||
"import csv\n",
|
||||
"import os\n",
|
||||
|
@ -363,7 +364,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# calculate percentiles\n",
|
||||
"# Calculate percentiles for each data set.\n",
|
||||
"for data_set in data_sets:\n",
|
||||
" df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n",
|
||||
" data_set.renamed_field\n",
|
||||
|
@ -379,7 +380,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# calculate min max\n",
|
||||
"# Calculate min-max for each data set.\n",
|
||||
"# Math:\n",
|
||||
"# (\n",
|
||||
"# Observed value\n",
|
||||
|
@ -410,6 +411,28 @@
|
|||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4eec326",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Graph the distribution of each min-max field.\n",
|
||||
"min_max_fields = [\n",
|
||||
" f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"\n",
|
||||
" for data_set in data_sets\n",
|
||||
" if data_set.renamed_field != GEOID_FIELD_NAME\n",
|
||||
"]\n",
|
||||
"df.hist(\n",
|
||||
" column=min_max_fields, layout=(len(min_max_fields), 1), figsize=(10, 30), bins=30\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -476,7 +499,8 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fields_to_use_in_score = [\n",
|
||||
"# Calculate scores D and E.\n",
|
||||
"fields_to_use_in_score_d_and_e = [\n",
|
||||
" UNEMPLOYED_FIELD_NAME,\n",
|
||||
" LINGUISTIC_ISOLATION_FIELD_NAME,\n",
|
||||
" HOUSING_BURDEN_FIELD_NAME,\n",
|
||||
|
@ -484,9 +508,11 @@
|
|||
" HIGH_SCHOOL_FIELD_NAME,\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n",
|
||||
"fields_min_max = [\n",
|
||||
" f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
|
||||
"]\n",
|
||||
"fields_percentile = [\n",
|
||||
" f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n",
|
||||
" f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Calculate \"Score D\", which uses min-max normalization\n",
|
||||
|
@ -498,6 +524,32 @@
|
|||
"print(df[\"Score E\"].describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a02e5bac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Graph distributions\n",
|
||||
"df.hist(\n",
|
||||
" column=fields_min_max, layout=(len(fields_min_max), 1), figsize=(10, 30), bins=30\n",
|
||||
")\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a0e608c8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Calculate correlations\n",
|
||||
"df[fields_min_max].corr()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue