This commit is contained in:
lucasmbrown-usds 2022-09-28 17:22:54 -04:00
parent f0804646df
commit bfb08e455e

View file

@ -0,0 +1,506 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "bb24db55",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import pyogrio\n",
"from data_pipeline.etl.sources.census.etl import CensusETL\n",
"from data_pipeline.etl.sources.geocorr_alternatives.etl import GeoCorrAlternativesETL\n",
"\n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel\n",
"from data_pipeline.etl.sources.geo_utils import (\n",
" add_tracts_for_geometries,\n",
" get_tract_geojson,\n",
")\n",
"from data_pipeline.score import field_names\n",
"from data_pipeline.utils import get_module_logger, unzip_file_from_url\n",
"\n",
"logger = get_module_logger(__name__)\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "41bd360f",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# ZCTA_2020_SHAPEFILE_PATH = (\n",
"# \"https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_zcta520_500k.zip\"\n",
"# )\n",
"\n",
"# ZCTA_2010_SHAPEFILE_PATH = (\n",
"# \"https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_zcta510_500k.zip\"\n",
"# )\n",
"\n",
"ZCTA_2020_SHAPEFILE_PATH = (\n",
" \"~/Downloads/cb_2020_us_zcta520_500k\"\n",
")\n",
"\n",
"ZCTA_2010_SHAPEFILE_PATH = (\n",
" \"~/Downloads/cb_2019_us_zcta510_500k\"\n",
")\n",
"\n",
"ZCTA_2010_FIELD = \"ZCTA5CE10\"\n",
"\n",
"PERCENT_OF_2020_in_2010_FIELD = \"percent of 2020 in 2010\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "37ca370a",
"metadata": {},
"outputs": [],
"source": [
"# Read in ZCTA data.\n",
"zcta_2020_gdf = gpd.read_file(\n",
" filename=ZCTA_2020_SHAPEFILE_PATH\n",
")\n",
"zcta_2020_gdf = zcta_2020_gdf.rename(\n",
" columns={GeoCorrAlternativesETL.ZIP_CODE_INPUT_FIELD: field_names.ZIP_CODE},\n",
" errors=\"raise\",\n",
")\n",
"\n",
"\n",
"# Read in ZCTA data.\n",
"zcta_2010_gdf = gpd.read_file(\n",
" filename=ZCTA_2010_SHAPEFILE_PATH\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6178cd2f",
"metadata": {},
"outputs": [],
"source": [
"#switch to projected \n",
"zcta_2020_gdf=zcta_2020_gdf.to_crs(crs=GeoCorrAlternativesETL.CRS_INTEGER)\n",
"zcta_2010_gdf=zcta_2010_gdf.to_crs(crs=GeoCorrAlternativesETL.CRS_INTEGER)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ff3532eb",
"metadata": {},
"outputs": [],
"source": [
"zcta_2020_gdf[\"zcta_2020_area\"] = zcta_2020_gdf.area"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7d95b1ef",
"metadata": {},
"outputs": [],
"source": [
"joined_gdf = gpd.overlay(\n",
" df1=zcta_2020_gdf,\n",
" df2=zcta_2010_gdf,\n",
" how=\"intersection\",\n",
" keep_geom_type=False,\n",
" )\n",
"\n",
"# Calculating the areas of the newly-created overlapping geometries\n",
"joined_gdf[GeoCorrAlternativesETL.AREA_JOINED_FIELD] = joined_gdf.area\n",
"\n",
"# Calculating the areas of the newly-created geometries in relation\n",
"# to the original tract geometries\n",
"joined_gdf[PERCENT_OF_2020_in_2010_FIELD] = (\n",
" joined_gdf[GeoCorrAlternativesETL.AREA_JOINED_FIELD] / joined_gdf[\"zcta_2020_area\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "b206ad11",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.0 164183\n",
"1.0 17832\n",
"0.9 9812\n",
"0.1 8771\n",
"0.8 2787\n",
"0.2 2029\n",
"0.7 1454\n",
"0.3 1146\n",
"0.6 988\n",
"0.4 894\n",
"0.5 800\n",
"Name: percent of 2020 in 2010, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"joined_gdf[PERCENT_OF_2020_in_2010_FIELD].round(decimals=1).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4df18de2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Zip code</th>\n",
" <th>AFFGEOID20</th>\n",
" <th>GEOID20</th>\n",
" <th>NAME20</th>\n",
" <th>LSAD20</th>\n",
" <th>ALAND20</th>\n",
" <th>AWATER20</th>\n",
" <th>zcta_2020_area</th>\n",
" <th>ZCTA5CE10</th>\n",
" <th>AFFGEOID10</th>\n",
" <th>GEOID10</th>\n",
" <th>ALAND10</th>\n",
" <th>AWATER10</th>\n",
" <th>geometry</th>\n",
" <th>area_joined</th>\n",
" <th>percent of 2020 in 2010</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>35768</td>\n",
" <td>860Z200US35768</td>\n",
" <td>35768</td>\n",
" <td>35768</td>\n",
" <td>Z5</td>\n",
" <td>446231990</td>\n",
" <td>3736014</td>\n",
" <td>6.688056e+08</td>\n",
" <td>35776</td>\n",
" <td>8600000US35776</td>\n",
" <td>35776</td>\n",
" <td>234072461</td>\n",
" <td>1041223</td>\n",
" <td>GEOMETRYCOLLECTION (POLYGON ((-9597648.456 411...</td>\n",
" <td>2.193417e+06</td>\n",
" <td>0.003280</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>35769</td>\n",
" <td>860Z200US35769</td>\n",
" <td>35769</td>\n",
" <td>35769</td>\n",
" <td>Z5</td>\n",
" <td>163279214</td>\n",
" <td>57835709</td>\n",
" <td>3.270629e+08</td>\n",
" <td>35776</td>\n",
" <td>8600000US35776</td>\n",
" <td>35776</td>\n",
" <td>234072461</td>\n",
" <td>1041223</td>\n",
" <td>GEOMETRYCOLLECTION (POLYGON ((-9596251.174 410...</td>\n",
" <td>4.423302e+03</td>\n",
" <td>0.000014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>35776</td>\n",
" <td>860Z200US35776</td>\n",
" <td>35776</td>\n",
" <td>35776</td>\n",
" <td>Z5</td>\n",
" <td>268376689</td>\n",
" <td>1277083</td>\n",
" <td>3.994217e+08</td>\n",
" <td>35776</td>\n",
" <td>8600000US35776</td>\n",
" <td>35776</td>\n",
" <td>234072461</td>\n",
" <td>1041223</td>\n",
" <td>GEOMETRYCOLLECTION (POLYGON ((-9609657.158 411...</td>\n",
" <td>3.377086e+08</td>\n",
" <td>0.845494</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>35774</td>\n",
" <td>860Z200US35774</td>\n",
" <td>35774</td>\n",
" <td>35774</td>\n",
" <td>Z5</td>\n",
" <td>36139337</td>\n",
" <td>362969</td>\n",
" <td>5.424829e+07</td>\n",
" <td>35776</td>\n",
" <td>8600000US35776</td>\n",
" <td>35776</td>\n",
" <td>234072461</td>\n",
" <td>1041223</td>\n",
" <td>GEOMETRYCOLLECTION (POLYGON ((-9592577.408 413...</td>\n",
" <td>3.181710e+06</td>\n",
" <td>0.058651</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>35747</td>\n",
" <td>860Z200US35747</td>\n",
" <td>35747</td>\n",
" <td>35747</td>\n",
" <td>Z5</td>\n",
" <td>195112094</td>\n",
" <td>9300885</td>\n",
" <td>3.016341e+08</td>\n",
" <td>35776</td>\n",
" <td>8600000US35776</td>\n",
" <td>35776</td>\n",
" <td>234072461</td>\n",
" <td>1041223</td>\n",
" <td>GEOMETRYCOLLECTION (POLYGON ((-9608245.738 410...</td>\n",
" <td>5.117346e+06</td>\n",
" <td>0.016965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210691</th>\n",
" <td>72046</td>\n",
" <td>860Z200US72046</td>\n",
" <td>72046</td>\n",
" <td>72046</td>\n",
" <td>Z5</td>\n",
" <td>603015221</td>\n",
" <td>20383513</td>\n",
" <td>9.214782e+08</td>\n",
" <td>72037</td>\n",
" <td>8600000US72037</td>\n",
" <td>72037</td>\n",
" <td>388714</td>\n",
" <td>0</td>\n",
" <td>MULTIPOLYGON (((-10226266.281 4102252.880, -10...</td>\n",
" <td>5.753217e+05</td>\n",
" <td>0.000624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210692</th>\n",
" <td>13156</td>\n",
" <td>860Z200US13156</td>\n",
" <td>13156</td>\n",
" <td>13156</td>\n",
" <td>Z5</td>\n",
" <td>76214823</td>\n",
" <td>6028269</td>\n",
" <td>1.521320e+08</td>\n",
" <td>13064</td>\n",
" <td>8600000US13064</td>\n",
" <td>13064</td>\n",
" <td>461830</td>\n",
" <td>2948339</td>\n",
" <td>MULTIPOLYGON (((-8538756.977 5360283.214, -853...</td>\n",
" <td>6.455585e+06</td>\n",
" <td>0.042434</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210693</th>\n",
" <td>06850</td>\n",
" <td>860Z200US06850</td>\n",
" <td>06850</td>\n",
" <td>06850</td>\n",
" <td>Z5</td>\n",
" <td>17563836</td>\n",
" <td>206664</td>\n",
" <td>3.140506e+07</td>\n",
" <td>06856</td>\n",
" <td>8600000US06856</td>\n",
" <td>06856</td>\n",
" <td>9568</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((-8173142.581 5028860.482, -8173156.2...</td>\n",
" <td>1.245927e+04</td>\n",
" <td>0.000397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210694</th>\n",
" <td>99632</td>\n",
" <td>860Z200US99632</td>\n",
" <td>99632</td>\n",
" <td>99632</td>\n",
" <td>Z5</td>\n",
" <td>65153947</td>\n",
" <td>0</td>\n",
" <td>2.965687e+08</td>\n",
" <td>99632</td>\n",
" <td>8600000US99632</td>\n",
" <td>99632</td>\n",
" <td>65153947</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((-18231773.509 8888791.228, -18230845...</td>\n",
" <td>2.959838e+08</td>\n",
" <td>0.998028</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210695</th>\n",
" <td>99658</td>\n",
" <td>860Z200US99658</td>\n",
" <td>99658</td>\n",
" <td>99658</td>\n",
" <td>Z5</td>\n",
" <td>111057651</td>\n",
" <td>318145</td>\n",
" <td>5.059522e+08</td>\n",
" <td>99658</td>\n",
" <td>8600000US99658</td>\n",
" <td>99658</td>\n",
" <td>110490326</td>\n",
" <td>913652</td>\n",
" <td>POLYGON ((-18196204.928 8884024.527, -18158121...</td>\n",
" <td>5.010581e+08</td>\n",
" <td>0.990327</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>210696 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" Zip code AFFGEOID20 GEOID20 NAME20 LSAD20 ALAND20 AWATER20 \\\n",
"0 35768 860Z200US35768 35768 35768 Z5 446231990 3736014 \n",
"1 35769 860Z200US35769 35769 35769 Z5 163279214 57835709 \n",
"2 35776 860Z200US35776 35776 35776 Z5 268376689 1277083 \n",
"3 35774 860Z200US35774 35774 35774 Z5 36139337 362969 \n",
"4 35747 860Z200US35747 35747 35747 Z5 195112094 9300885 \n",
"... ... ... ... ... ... ... ... \n",
"210691 72046 860Z200US72046 72046 72046 Z5 603015221 20383513 \n",
"210692 13156 860Z200US13156 13156 13156 Z5 76214823 6028269 \n",
"210693 06850 860Z200US06850 06850 06850 Z5 17563836 206664 \n",
"210694 99632 860Z200US99632 99632 99632 Z5 65153947 0 \n",
"210695 99658 860Z200US99658 99658 99658 Z5 111057651 318145 \n",
"\n",
" zcta_2020_area ZCTA5CE10 AFFGEOID10 GEOID10 ALAND10 AWATER10 \\\n",
"0 6.688056e+08 35776 8600000US35776 35776 234072461 1041223 \n",
"1 3.270629e+08 35776 8600000US35776 35776 234072461 1041223 \n",
"2 3.994217e+08 35776 8600000US35776 35776 234072461 1041223 \n",
"3 5.424829e+07 35776 8600000US35776 35776 234072461 1041223 \n",
"4 3.016341e+08 35776 8600000US35776 35776 234072461 1041223 \n",
"... ... ... ... ... ... ... \n",
"210691 9.214782e+08 72037 8600000US72037 72037 388714 0 \n",
"210692 1.521320e+08 13064 8600000US13064 13064 461830 2948339 \n",
"210693 3.140506e+07 06856 8600000US06856 06856 9568 0 \n",
"210694 2.965687e+08 99632 8600000US99632 99632 65153947 0 \n",
"210695 5.059522e+08 99658 8600000US99658 99658 110490326 913652 \n",
"\n",
" geometry area_joined \\\n",
"0 GEOMETRYCOLLECTION (POLYGON ((-9597648.456 411... 2.193417e+06 \n",
"1 GEOMETRYCOLLECTION (POLYGON ((-9596251.174 410... 4.423302e+03 \n",
"2 GEOMETRYCOLLECTION (POLYGON ((-9609657.158 411... 3.377086e+08 \n",
"3 GEOMETRYCOLLECTION (POLYGON ((-9592577.408 413... 3.181710e+06 \n",
"4 GEOMETRYCOLLECTION (POLYGON ((-9608245.738 410... 5.117346e+06 \n",
"... ... ... \n",
"210691 MULTIPOLYGON (((-10226266.281 4102252.880, -10... 5.753217e+05 \n",
"210692 MULTIPOLYGON (((-8538756.977 5360283.214, -853... 6.455585e+06 \n",
"210693 POLYGON ((-8173142.581 5028860.482, -8173156.2... 1.245927e+04 \n",
"210694 POLYGON ((-18231773.509 8888791.228, -18230845... 2.959838e+08 \n",
"210695 POLYGON ((-18196204.928 8884024.527, -18158121... 5.010581e+08 \n",
"\n",
" percent of 2020 in 2010 \n",
"0 0.003280 \n",
"1 0.000014 \n",
"2 0.845494 \n",
"3 0.058651 \n",
"4 0.016965 \n",
"... ... \n",
"210691 0.000624 \n",
"210692 0.042434 \n",
"210693 0.000397 \n",
"210694 0.998028 \n",
"210695 0.990327 \n",
"\n",
"[210696 rows x 16 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"joined_gdf"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}