Comparison tool refactor & ETL HUD RECAP (#272)

* Refactoring comparison tool and creating two new ETL notebooks
This commit is contained in:
Lucas Merrill Brown 2021-07-06 12:10:58 -05:00 committed by GitHub
commit 11d13e034e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 2071 additions and 274 deletions

View file

@ -16,6 +16,7 @@
"import collections\n",
"import functools\n",
"from pathlib import Path\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import csv\n",
"import os\n",
@ -363,7 +364,7 @@
},
"outputs": [],
"source": [
"# calculate percentiles\n",
"# Calculate percentiles for each data set.\n",
"for data_set in data_sets:\n",
" df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n",
" data_set.renamed_field\n",
@ -379,7 +380,7 @@
"metadata": {},
"outputs": [],
"source": [
"# calculate min max\n",
"# Calculate min-max for each data set.\n",
"# Math:\n",
"# (\n",
"# Observed value\n",
@ -410,6 +411,28 @@
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4eec326",
"metadata": {},
"outputs": [],
"source": [
"# Plot a histogram of every min-max normalized field, one panel per row.\n",
"# The data set whose renamed field is the GEOID is left out.\n",
"min_max_fields = [\n",
"    f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"\n",
"    for data_set in data_sets\n",
"    if data_set.renamed_field != GEOID_FIELD_NAME\n",
"]\n",
"\n",
"df.hist(\n",
"    column=min_max_fields,\n",
"    layout=(len(min_max_fields), 1),\n",
"    figsize=(10, 30),\n",
"    bins=30,\n",
")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -476,7 +499,8 @@
"metadata": {},
"outputs": [],
"source": [
"fields_to_use_in_score = [\n",
"# Calculate scores D and E.\n",
"fields_to_use_in_score_d_and_e = [\n",
" UNEMPLOYED_FIELD_NAME,\n",
" LINGUISTIC_ISOLATION_FIELD_NAME,\n",
" HOUSING_BURDEN_FIELD_NAME,\n",
@ -484,9 +508,11 @@
" HIGH_SCHOOL_FIELD_NAME,\n",
"]\n",
"\n",
"fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n",
"fields_min_max = [\n",
" f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
"]\n",
"fields_percentile = [\n",
" f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n",
" f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
"]\n",
"\n",
"# Calculate \"Score D\", which uses min-max normalization\n",
@ -498,6 +524,32 @@
"print(df[\"Score E\"].describe())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a02e5bac",
"metadata": {},
"outputs": [],
"source": [
"# Plot a histogram of each min-max normalized input to Scores D and E,\n",
"# one panel per row.\n",
"df.hist(\n",
"    column=fields_min_max,\n",
"    layout=(len(fields_min_max), 1),\n",
"    figsize=(10, 30),\n",
"    bins=30,\n",
")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0e608c8",
"metadata": {},
"outputs": [],
"source": [
"# Pairwise correlation matrix of the min-max normalized inputs to Scores D and E\n",
"# (DataFrame.corr uses its default method here).\n",
"df.loc[:, fields_min_max].corr()"
]
},
{
"cell_type": "code",
"execution_count": null,