Generate Geo-aware scores for all zoom levels (#391)

* generate Geo-aware scores for all zoom levels

* usa high progress

* testing dissolve

* checkpoint

* changing type

* removing breakpoint

* validation notebooks

* quick update

* score validation

* fixes for county merge

* code completed
This commit is contained in:
Jorge Escobar 2021-07-28 16:07:28 -04:00 committed by GitHub
commit b404fdcc43
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 3023 additions and 270 deletions

View file

@ -0,0 +1,567 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "43c5dbee",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f97c95f6",
"metadata": {},
"outputs": [],
"source": [
"# Add the parent of the working directory to sys.path (once) so modules there can be imported.\n",
"module_path = os.path.abspath(\"..\")\n",
"if module_path not in sys.path:\n",
"    sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b8a2b53e",
"metadata": {},
"outputs": [],
"source": [
"# File locations, all resolved relative to the parent of the working directory.\n",
"ACS_YEAR = \"2019\"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"# Despite its name, OUTPUT_PATH is only read from in this notebook (usa.csv is loaded from it).\n",
"OUTPUT_PATH = DATA_PATH.joinpath(\"dataset\", f\"census_acs_{ACS_YEAR}\")\n",
"CENSUS_USA_CSV = DATA_PATH.joinpath(\"census\", \"csv\", \"us.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0d33e8db",
"metadata": {},
"outputs": [],
"source": [
"# header=None: the first row of us.csv is data, not column names.\n",
"# The column is named GEOID10 and read as a string so the identifiers stay textual.\n",
"cbg_usa_df = pd.read_csv(\n",
"    CENSUS_USA_CSV,\n",
"    header=None,\n",
"    names=[\"GEOID10\"],\n",
"    dtype={\"GEOID10\": \"string\"},\n",
"    low_memory=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "01e6dbe3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "341dbcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "eb25d4bf",
"metadata": {},
"outputs": [],
"source": [
"# GEOID10 is read as a string so it has the same dtype as the join key in cbg_usa_df.\n",
"acs_df = pd.read_csv(\n",
"    OUTPUT_PATH.joinpath(\"usa.csv\"),\n",
"    low_memory=False,\n",
"    dtype={\"GEOID10\": \"string\"},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d4c9d010",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010399620002</td>\n",
" <td>0.077108</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010399618002</td>\n",
" <td>0.126214</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010399616004</td>\n",
" <td>0.133172</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010399616002</td>\n",
" <td>0.028249</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010399616001</td>\n",
" <td>0.063037</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 010399620002 0.077108 \n",
"1 010399618002 0.126214 \n",
"2 010399616004 0.133172 \n",
"3 010399616002 0.028249 \n",
"4 010399616001 0.063037 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "dd390179",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Unemployed civilians (percent) float64\n",
"Linguistic isolation (percent) float64\n",
"dtype: object"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "236eb093",
"metadata": {},
"outputs": [],
"source": [
"# Left join: every GEOID10 in cbg_usa_df is kept; rows with no match in acs_df get NaN.\n",
"merged_df = pd.merge(cbg_usa_df, acs_df, how=\"left\", on=\"GEOID10\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "4fff1845",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.030612</td>\n",
" <td>0.065963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.118056</td>\n",
" <td>0.010283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.042373</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.042473</td>\n",
" <td>0.010435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.054358</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 100010414002 0.030612 \n",
"1 100010415002 0.118056 \n",
"2 100010417011 0.042373 \n",
"3 100010417012 0.042473 \n",
"4 100010422011 0.054358 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.065963 \n",
"1 0.010283 \n",
"2 0.000000 \n",
"3 0.010435 \n",
"4 0.000000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "f8903557",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>100019900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>377</th>\n",
" <td>100030169041</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392</th>\n",
" <td>100059900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400</th>\n",
" <td>100039901000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>100039801001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219505</th>\n",
" <td>340057048013</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219508</th>\n",
" <td>340057048024</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219758</th>\n",
" <td>340258047001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219807</th>\n",
" <td>340259900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220134</th>\n",
" <td>340076113001</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1462 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"34 100019900000 NaN \n",
"377 100030169041 NaN \n",
"392 100059900000 NaN \n",
"400 100039901000 NaN \n",
"416 100039801001 NaN \n",
"... ... ... \n",
"219505 340057048013 NaN \n",
"219508 340057048024 NaN \n",
"219758 340258047001 NaN \n",
"219807 340259900000 NaN \n",
"220134 340076113001 NaN \n",
"\n",
" Linguistic isolation (percent) \n",
"34 NaN \n",
"377 NaN \n",
"392 NaN \n",
"400 NaN \n",
"416 NaN \n",
"... ... \n",
"219505 NaN \n",
"219508 NaN \n",
"219758 NaN \n",
"219807 NaN \n",
"220134 0.0 \n",
"\n",
"[1462 rows x 3 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b870a21f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,777 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3ab8f7c1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8c22494f",
"metadata": {},
"outputs": [],
"source": [
"# Add the parent of the working directory to sys.path (once) so modules there can be imported.\n",
"module_path = os.path.abspath(\"..\")\n",
"if module_path not in sys.path:\n",
"    sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "eb31e9a1",
"metadata": {},
"outputs": [],
"source": [
"# File locations, all resolved relative to the parent of the working directory.\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"# Despite its name, OUTPUT_PATH is only read from in this notebook (usa.csv is loaded from it).\n",
"OUTPUT_PATH = DATA_PATH.joinpath(\"score\", \"csv\", \"tiles\")\n",
"CENSUS_USA_CSV = DATA_PATH.joinpath(\"census\", \"csv\", \"us.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "95a5f8d8",
"metadata": {},
"outputs": [],
"source": [
"# header=None: the first row of us.csv is data, not column names.\n",
"# The column is named GEOID10 and read as a string so the identifiers stay textual.\n",
"cbg_usa_df = pd.read_csv(\n",
"    CENSUS_USA_CSV,\n",
"    header=None,\n",
"    names=[\"GEOID10\"],\n",
"    dtype={\"GEOID10\": \"string\"},\n",
"    low_memory=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bdd9ab60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "05a40080",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "114af777",
"metadata": {},
"outputs": [],
"source": [
"# Read both identifier columns as strings. GEOID10 is the join key; GEOID would otherwise be\n",
"# parsed as int64 (losing any leading zero) and then turn into float after the left merge,\n",
"# because unmatched rows introduce NaN.\n",
"score_df = pd.read_csv(\n",
"    OUTPUT_PATH / \"usa.csv\",\n",
"    dtype={\"GEOID10\": \"string\", \"GEOID\": \"string\"},\n",
"    low_memory=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d5f3ebd4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.808889</td>\n",
" <td>True</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.555160</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.272392</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.345686</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.472567</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220256</th>\n",
" <td>340076020004</td>\n",
" <td>0.921941</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220257</th>\n",
" <td>340076017002</td>\n",
" <td>0.934490</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220258</th>\n",
" <td>340076015005</td>\n",
" <td>0.889613</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220259</th>\n",
" <td>340076091032</td>\n",
" <td>0.627822</td>\n",
" <td>False</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220260</th>\n",
" <td>340076053002</td>\n",
" <td>0.762237</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>220261 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"0 100010414002 0.808889 True \n",
"1 100010415002 0.555160 False \n",
"2 100010417011 0.272392 False \n",
"3 100010417012 0.345686 False \n",
"4 100010422011 0.472567 False \n",
"... ... ... ... \n",
"220256 340076020004 0.921941 True \n",
"220257 340076017002 0.934490 True \n",
"220258 340076015005 0.889613 True \n",
"220259 340076091032 0.627822 False \n",
"220260 340076053002 0.762237 True \n",
"\n",
" GEOID State Abbreviation County Name \n",
"0 10001 DE Kent County \n",
"1 10001 DE Kent County \n",
"2 10001 DE Kent County \n",
"3 10001 DE Kent County \n",
"4 10001 DE Kent County \n",
"... ... ... ... \n",
"220256 34007 NJ Camden County \n",
"220257 34007 NJ Camden County \n",
"220258 34007 NJ Camden County \n",
"220259 34007 NJ Camden County \n",
"220260 34007 NJ Camden County \n",
"\n",
"[220261 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f84f9e1d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Score E (percentile) float64\n",
"Score E (top 25th percentile) bool\n",
"GEOID int64\n",
"State Abbreviation object\n",
"County Name object\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8d61e29e",
"metadata": {},
"outputs": [],
"source": [
"# Left join: every GEOID10 in cbg_usa_df is kept; rows with no match in score_df get NaN.\n",
"merged_df = pd.merge(cbg_usa_df, score_df, how=\"left\", on=\"GEOID10\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7e8c2f2a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.808889</td>\n",
" <td>True</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.555160</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.272392</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.345686</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.472567</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220329</th>\n",
" <td>340076020004</td>\n",
" <td>0.921941</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220330</th>\n",
" <td>340076017002</td>\n",
" <td>0.934490</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220331</th>\n",
" <td>340076015005</td>\n",
" <td>0.889613</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220332</th>\n",
" <td>340076091032</td>\n",
" <td>0.627822</td>\n",
" <td>False</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220333</th>\n",
" <td>340076053002</td>\n",
" <td>0.762237</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>220334 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"0 100010414002 0.808889 True \n",
"1 100010415002 0.555160 False \n",
"2 100010417011 0.272392 False \n",
"3 100010417012 0.345686 False \n",
"4 100010422011 0.472567 False \n",
"... ... ... ... \n",
"220329 340076020004 0.921941 True \n",
"220330 340076017002 0.934490 True \n",
"220331 340076015005 0.889613 True \n",
"220332 340076091032 0.627822 False \n",
"220333 340076053002 0.762237 True \n",
"\n",
" GEOID State Abbreviation County Name \n",
"0 10001.0 DE Kent County \n",
"1 10001.0 DE Kent County \n",
"2 10001.0 DE Kent County \n",
"3 10001.0 DE Kent County \n",
"4 10001.0 DE Kent County \n",
"... ... ... ... \n",
"220329 34007.0 NJ Camden County \n",
"220330 34007.0 NJ Camden County \n",
"220331 34007.0 NJ Camden County \n",
"220332 34007.0 NJ Camden County \n",
"220333 34007.0 NJ Camden County \n",
"\n",
"[220334 rows x 6 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e81b1321",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10614</th>\n",
" <td>515150501002</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10615</th>\n",
" <td>515150501003</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10627</th>\n",
" <td>515150501001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10628</th>\n",
" <td>515150501005</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10629</th>\n",
" <td>515150501004</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174140</th>\n",
" <td>040190029031</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174143</th>\n",
" <td>040190027012</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174184</th>\n",
" <td>040190027011</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174242</th>\n",
" <td>040194105021</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174243</th>\n",
" <td>040194105011</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"10614 515150501002 NaN NaN \n",
"10615 515150501003 NaN NaN \n",
"10627 515150501001 NaN NaN \n",
"10628 515150501005 NaN NaN \n",
"10629 515150501004 NaN NaN \n",
"... ... ... ... \n",
"174140 040190029031 NaN NaN \n",
"174143 040190027012 NaN NaN \n",
"174184 040190027011 NaN NaN \n",
"174242 040194105021 NaN NaN \n",
"174243 040194105011 NaN NaN \n",
"\n",
" GEOID State Abbreviation County Name \n",
"10614 NaN NaN NaN \n",
"10615 NaN NaN NaN \n",
"10627 NaN NaN NaN \n",
"10628 NaN NaN NaN \n",
"10629 NaN NaN NaN \n",
"... ... ... ... \n",
"174140 NaN NaN NaN \n",
"174143 NaN NaN NaN \n",
"174184 NaN NaN NaN \n",
"174242 NaN NaN NaN \n",
"174243 NaN NaN NaN \n",
"\n",
"[73 rows x 6 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Score E (percentile)\"].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1a7b71d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -2,7 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
@ -10,24 +12,24 @@
"import pathlib\n",
"import os\n",
"import sys"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
" state_gdf = gpd.read_file(file_name)\n",
@ -100,104 +102,133 @@
" state_bucketed_df = aggregate_buckets(state_tracts, \"mean\")\n",
" compressed = breakup_multipolygons(state_bucketed_df, num_buckets)\n",
" write_to_file(compressed, file_name)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"id": "Ia5bqxS2LJqe"
},
"outputs": [],
"source": [
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
"CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n",
"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
],
"outputs": [],
"metadata": {
"id": "Ia5bqxS2LJqe"
}
"CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"master_df = gpd.GeoDataFrame()"
],
"outputs": [],
"execution_count": 7,
"metadata": {
"id": "Dtf5qD50JvCw"
}
},
"outputs": [],
"source": [
"master_df = gpd.GeoDataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty GeoDataFrame\n",
"Columns: []\n",
"Index: []"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"master_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PNdw8bERJyKk"
},
"outputs": [],
"source": [
"for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
],
"outputs": [],
"metadata": {
"id": "PNdw8bERJyKk"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
],
"outputs": [],
"metadata": {
"id": "B5SS9y2pLwks"
}
},
"outputs": [],
"source": [
"master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"master_df.shape"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(220742, 13)"
]
},
"metadata": {
"tags": []
},
"execution_count": 68
}
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_C6vaR9HQeLa",
"outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4"
}
},
"outputs": [],
"source": [
"master_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"master_df.head(2)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oMoubjqCQiw5",
"outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" STATEFP10 ... geometry\n",
"0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n",
"1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n",
"\n",
"[2 rows x 13 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
@ -268,61 +299,61 @@
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STATEFP10 ... geometry\n",
"0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n",
"1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n",
"\n",
"[2 rows x 13 columns]"
]
},
"execution_count": 69,
"metadata": {
"tags": []
},
"execution_count": 69
"output_type": "execute_result"
}
],
"source": [
"master_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bAMmGSgzVml0"
},
"outputs": [],
"source": [
"usa_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U7M7dExdV2Vh"
},
"outputs": [],
"source": [
"usa_merged = master_df.merge(usa_df, on=\"GEOID10\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oMoubjqCQiw5",
"outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
],
"outputs": [],
"metadata": {
"id": "bAMmGSgzVml0"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_merged = master_df.merge(usa_df, on=\"GEOID10\", how=\"left\")"
],
"outputs": [],
"metadata": {
"id": "U7M7dExdV2Vh"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_merged.head(2)"
],
"id": "Sr25DUkxWVhg",
"outputId": "1e804075-0f7d-4174-82d7-e21b8519c8bf"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" STATEFP10 COUNTYFP10 ... Score E (percentile) Score E (top 25th percentile)\n",
"0 01 005 ... 0.576986 False\n",
"1 01 005 ... 0.670349 False\n",
"\n",
"[2 rows x 98 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
@ -598,292 +629,296 @@
"</table>\n",
"<p>2 rows × 98 columns</p>\n",
"</div>"
],
"text/plain": [
" STATEFP10 COUNTYFP10 ... Score E (percentile) Score E (top 25th percentile)\n",
"0 01 005 ... 0.576986 False\n",
"1 01 005 ... 0.670349 False\n",
"\n",
"[2 rows x 98 columns]"
]
},
"execution_count": 72,
"metadata": {
"tags": []
},
"execution_count": 72
"output_type": "execute_result"
}
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sr25DUkxWVhg",
"outputId": "1e804075-0f7d-4174-82d7-e21b8519c8bf"
}
"source": [
"usa_merged.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs=\"EPSG:4326\")"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ANMlAB8Qmtu8",
"outputId": "44934741-90a9-4664-fab5-2c39b348d2be"
}
},
"outputs": [],
"source": [
"usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs=\"EPSG:4326\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
],
"outputs": [],
"metadata": {
"id": "PBPD9LQctvPJ"
}
},
"outputs": [],
"source": [
"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qAAEr1z-WZAT"
},
"outputs": [],
"source": [
"usa_simplified = usa_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
" ].reset_index(drop=True)"
],
"outputs": [],
"metadata": {
"id": "qAAEr1z-WZAT"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SCNUjEbzWg-o"
},
"outputs": [],
"source": [
"usa_simplified.rename(\n",
" columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
" )"
],
"outputs": [],
"metadata": {
"id": "SCNUjEbzWg-o"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_cbg_compressed = gpd.GeoDataFrame(\n",
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ej70uX0AmW0J",
"outputId": "88908f5e-b62d-494f-f0ea-649089b6652a"
}
},
"outputs": [],
"source": [
"usa_cbg_compressed = gpd.GeoDataFrame(\n",
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
],
"outputs": [],
"metadata": {
"id": "UE12dWmame3I"
}
},
"outputs": [],
"source": [
"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_tracts = aggregate_to_tracts(usa_simplified)"
],
"outputs": [],
"metadata": {
"id": "wWFBduQQXGtM"
}
},
"outputs": [],
"source": [
"usa_tracts = aggregate_to_tracts(usa_simplified)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"source": [
"num_buckets = 10"
],
"outputs": [],
"metadata": {
"id": "L-PTnEWOpDtX"
}
},
"outputs": [],
"source": [
"num_buckets = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"tracts_compressed = gpd.GeoDataFrame(\n",
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kTJafXcqXC01",
"outputId": "bd197952-76b7-4f99-edef-983f20d7acfb"
}
},
"outputs": [],
"source": [
"tracts_compressed = gpd.GeoDataFrame(\n",
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
],
"outputs": [],
"metadata": {
"id": "E2Nh97IlYhCF"
}
},
"outputs": [],
"source": [
"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_bucketed = create_buckets_from_tracts(usa_tracts)"
],
"outputs": [],
"metadata": {
"id": "557zPMWFZC8R"
}
},
"outputs": [],
"source": [
"usa_bucketed = create_buckets_from_tracts(usa_tracts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"usa_aggregated = aggregate_buckets(usa_bucketed, agg_func=\"mean\")"
],
"outputs": [],
"metadata": {
"id": "k6RRdKlsaO0a"
}
},
"outputs": [],
"source": [
"usa_aggregated = aggregate_buckets(usa_bucketed, agg_func=\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": 80,
"source": [
"usa_aggregated.shape"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(10, 2)"
]
},
"metadata": {
"tags": []
},
"execution_count": 80
}
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-cm5eET2pA1Z",
"outputId": "8d5d2e80-ad62-41d5-f1b0-922345f92d62"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"compressed = breakup_multipolygons(usa_aggregated, num_buckets)"
],
"outputs": [],
"metadata": {
"id": "4ZvJra-RaZ4v"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"len(compressed)"
],
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"36836"
"(10, 2)"
]
},
"execution_count": 80,
"metadata": {
"tags": []
},
"execution_count": 82
"output_type": "execute_result"
}
],
"source": [
"usa_aggregated.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4ZvJra-RaZ4v"
},
"outputs": [],
"source": [
"compressed = breakup_multipolygons(usa_aggregated, num_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RDS7Q2WAb4Rx",
"outputId": "dcd28a31-083d-482e-b000-b4cd1046d4c2"
}
},
"outputs": [
{
"data": {
"text/plain": [
"36836"
]
},
"execution_count": 82,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"len(compressed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VXTv8UuXb-qU"
},
"outputs": [],
"source": [
"gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
],
"outputs": [],
"metadata": {
"id": "VXTv8UuXb-qU"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"gdf_compressed.shape"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(36836, 2)"
]
},
"metadata": {
"tags": []
},
"execution_count": 84
}
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5v7TyB_rcRgT",
"outputId": "997625cc-c57a-4335-9b27-a08e4f8ad117"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(36836, 2)"
]
},
"execution_count": 84,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"gdf_compressed.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")"
],
"outputs": [],
"metadata": {
"id": "5eAnPL8McJpn"
}
},
"outputs": [],
"source": [
"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")"
]
}
],
"metadata": {
@ -892,9 +927,9 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3.9.5 ('.venv': venv)",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python395jvsc74a57bd0935cbd69f49565f763db1e6a6adc70b468d078eb4d5856e64428cea33b57a041"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -906,9 +941,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -153,7 +153,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.8.2"
}
},
"nbformat": 4,