Generate Geo-aware scores for all zoom levels (#391)

* generate Geo-aware scores for all zoom levels

* usa high progress

* testing dissolve

* checkpoint

* changing type

* removing breakpoint

* validation notebooks

* quick update

* score validation

* fixes for county merge

* code completed
This commit is contained in:
Jorge Escobar 2021-07-28 16:07:28 -04:00 committed by GitHub
commit b404fdcc43
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 3023 additions and 270 deletions

View file

@ -0,0 +1,567 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "43c5dbee",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f97c95f6",
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b8a2b53e",
"metadata": {},
"outputs": [],
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"ACS_YEAR = \"2019\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0d33e8db",
"metadata": {},
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "01e6dbe3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "341dbcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "eb25d4bf",
"metadata": {},
"outputs": [],
"source": [
"acs_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d4c9d010",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010399620002</td>\n",
" <td>0.077108</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010399618002</td>\n",
" <td>0.126214</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010399616004</td>\n",
" <td>0.133172</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010399616002</td>\n",
" <td>0.028249</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010399616001</td>\n",
" <td>0.063037</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 010399620002 0.077108 \n",
"1 010399618002 0.126214 \n",
"2 010399616004 0.133172 \n",
"3 010399616002 0.028249 \n",
"4 010399616001 0.063037 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "dd390179",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Unemployed civilians (percent) float64\n",
"Linguistic isolation (percent) float64\n",
"dtype: object"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "236eb093",
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" acs_df, on=\"GEOID10\", how=\"left\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "4fff1845",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.030612</td>\n",
" <td>0.065963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.118056</td>\n",
" <td>0.010283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.042373</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.042473</td>\n",
" <td>0.010435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.054358</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 100010414002 0.030612 \n",
"1 100010415002 0.118056 \n",
"2 100010417011 0.042373 \n",
"3 100010417012 0.042473 \n",
"4 100010422011 0.054358 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.065963 \n",
"1 0.010283 \n",
"2 0.000000 \n",
"3 0.010435 \n",
"4 0.000000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "f8903557",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>100019900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>377</th>\n",
" <td>100030169041</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392</th>\n",
" <td>100059900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400</th>\n",
" <td>100039901000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>100039801001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219505</th>\n",
" <td>340057048013</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219508</th>\n",
" <td>340057048024</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219758</th>\n",
" <td>340258047001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219807</th>\n",
" <td>340259900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220134</th>\n",
" <td>340076113001</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1462 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"34 100019900000 NaN \n",
"377 100030169041 NaN \n",
"392 100059900000 NaN \n",
"400 100039901000 NaN \n",
"416 100039801001 NaN \n",
"... ... ... \n",
"219505 340057048013 NaN \n",
"219508 340057048024 NaN \n",
"219758 340258047001 NaN \n",
"219807 340259900000 NaN \n",
"220134 340076113001 NaN \n",
"\n",
" Linguistic isolation (percent) \n",
"34 NaN \n",
"377 NaN \n",
"392 NaN \n",
"400 NaN \n",
"416 NaN \n",
"... ... \n",
"219505 NaN \n",
"219508 NaN \n",
"219758 NaN \n",
"219807 NaN \n",
"220134 0.0 \n",
"\n",
"[1462 rows x 3 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b870a21f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}