j40-cejst-2/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb
Jorge Escobar b404fdcc43
Generate Geo-aware scores for all zoom levels (#391)
* generate Geo-aware scores for all zoom levels

* usa high progress

* testing dissolve

* checkpoint

* changing type

* removing breakpoint

* validation notebooks

* quick update

* score validation

* fixes for county merge

* code completed
2021-07-28 16:07:28 -04:00

949 lines
28 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"import math\n",
"import pathlib\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Put the parent directory on sys.path (only once) so that modules located\n",
"# one level above this notebook can be imported.\n",
"module_path = os.path.abspath(\"..\")\n",
"if module_path not in sys.path:\n",
"    sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
"    \"\"\"Load one census GeoJSON file and attach the Score D percentile to it.\n",
"\n",
"    Reads `file_name` with geopandas, reprojects it to WGS84 longitude/latitude,\n",
"    left-joins `usa_df` on GEOID10 and returns only the GEOID10, D_SCORE and\n",
"    geometry columns, where D_SCORE is the renamed \"Score D (percentile)\".\n",
"    \"\"\"\n",
"    wgs84 = \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\"\n",
"    joined = gpd.read_file(file_name).to_crs(wgs84).merge(\n",
"        usa_df, on=\"GEOID10\", how=\"left\"\n",
"    )\n",
"    kept_columns = [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
"    return (\n",
"        joined[kept_columns]\n",
"        .reset_index(drop=True)\n",
"        .rename(columns={\"Score D (percentile)\": \"D_SCORE\"})\n",
"    )\n",
"\n",
"\n",
"def aggregate_to_tracts(block_group_df: pd.DataFrame):\n",
"    \"\"\"Dissolve block groups into tracts, averaging the remaining columns.\n",
"\n",
"    The tract identifier is the first 11 characters of GEOID10. The input\n",
"    frame is not modified; the result is indexed by the new \"tract\" key.\n",
"    \"\"\"\n",
"    # Vectorised string slice instead of a row-wise apply; assign() works on a\n",
"    # copy, so the caller's frame does not silently gain a \"tract\" column.\n",
"    with_tract = block_group_df.assign(tract=block_group_df[\"GEOID10\"].str[:11])\n",
"    # GEOID10 is a string identifier, so averaging it is meaningless. Older\n",
"    # pandas dropped such columns from a mean silently; pandas 2.x is expected\n",
"    # to raise instead, so remove the column explicitly before dissolving.\n",
"    state_tracts = with_tract.drop(columns=[\"GEOID10\"]).dissolve(\n",
"        by=\"tract\", aggfunc=\"mean\"\n",
"    )\n",
"    return state_tracts\n",
"\n",
"\n",
"def create_buckets_from_tracts(state_tracts: pd.DataFrame, num_buckets: int):\n",
"    \"\"\"Sort tracts by D_SCORE and label them with ordered buckets.\n",
"\n",
"    `state_tracts` is sorted in place and receives a \"D_SCORE_bucket\" column.\n",
"    Each bucket holds ceil(len / num_buckets) consecutive tracts, so the last\n",
"    bucket may be smaller and fewer than `num_buckets` labels may be produced.\n",
"    The same (mutated) frame is returned.\n",
"    \"\"\"\n",
"    state_tracts.sort_values(\"D_SCORE\", inplace=True)\n",
"    tract_count = len(state_tracts.index)\n",
"    tracts_per_bucket = math.ceil(tract_count / num_buckets)\n",
"    state_tracts[\"D_SCORE_bucket\"] = [\n",
"        position // tracts_per_bucket for position in range(tract_count)\n",
"    ]\n",
"    return state_tracts\n",
"\n",
"\n",
"def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
"    \"\"\"Dissolve tracts into one geometry per D_SCORE_bucket.\n",
"\n",
"    Only D_SCORE, D_SCORE_bucket and geometry are kept; D_SCORE is combined\n",
"    within each bucket using `agg_func`.\n",
"    \"\"\"\n",
"    needed_columns = [\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]\n",
"    bucket_input = state_tracts[needed_columns].reset_index(drop=True)\n",
"    return bucket_input.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
"\n",
"\n",
"def breakup_multipolygons(state_bucketed_df: pd.DataFrame, num_buckets: int):\n",
"    \"\"\"Split each bucket's dissolved geometry into [D_SCORE, part] pairs.\n",
"\n",
"    Looks up bucket labels 0 .. num_buckets - 1 in the index of\n",
"    `state_bucketed_df` and returns a list with one [D_SCORE, geometry part]\n",
"    entry per component of each bucket's geometry.\n",
"    \"\"\"\n",
"    compressed = []\n",
"    for bucket in range(num_buckets):\n",
"        # Bucketing can yield fewer than num_buckets labels (e.g. 11 tracts in\n",
"        # 10 buckets gives labels 0..5), so skip labels that are absent.\n",
"        if bucket not in state_bucketed_df.index:\n",
"            continue\n",
"        score = state_bucketed_df.loc[bucket, \"D_SCORE\"]\n",
"        geometry = state_bucketed_df.loc[bucket, \"geometry\"]\n",
"        # A bucket that dissolves into one contiguous shape is a plain Polygon,\n",
"        # which has no .geoms attribute; treat it as a single part.\n",
"        parts = getattr(geometry, \"geoms\", [geometry])\n",
"        compressed.extend([score, part] for part in parts)\n",
"    return compressed\n",
"\n",
"\n",
"def write_to_file(compressed: pd.DataFrame, file_name: str):\n",
"    \"\"\"Write the [D_SCORE, geometry] records to `<file_name>_low.geojson`.\n",
"\n",
"    The file is created under CENSUS_GEOJSON_DIR, which must be defined in the\n",
"    notebook before this function is called.\n",
"    \"\"\"\n",
"    output_path = CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\"\n",
"    gdf_compressed = gpd.GeoDataFrame(\n",
"        compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
"    )\n",
"    gdf_compressed.to_file(output_path, driver=\"GeoJSON\")\n",
"\n",
"\n",
"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
"    \"\"\"Run the whole pipeline for one file: merge, dissolve, bucket, write.\"\"\"\n",
"    print(f\"Processing file {file_name}...\")\n",
"    block_groups = merge_and_simplify_file(file_name, usa_df)\n",
"    tracts = aggregate_to_tracts(block_groups)\n",
"    bucketed_tracts = create_buckets_from_tracts(tracts, num_buckets)\n",
"    dissolved = aggregate_buckets(bucketed_tracts, \"mean\")\n",
"    write_to_file(breakup_multipolygons(dissolved, num_buckets), file_name)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "Ia5bqxS2LJqe"
},
"outputs": [],
"source": [
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
"CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "Dtf5qD50JvCw"
},
"outputs": [],
"source": [
"master_df = gpd.GeoDataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty GeoDataFrame\n",
"Columns: []\n",
"Index: []"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"master_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PNdw8bERJyKk"
},
"outputs": [],
"source": [
"# DataFrame.append was deprecated and then removed in pandas 2.0, and calling\n",
"# it in a loop re-copies the accumulated frame on every iteration. Read every\n",
"# file first and concatenate once; like append, concat keeps each file's own\n",
"# row index. Files are visited in sorted order so the row order is reproducible.\n",
"state_frames = [\n",
"    gpd.read_file(file_name)\n",
"    for file_name in sorted(CENSUS_GEOJSON_DIR.rglob('*.json'))\n",
"]\n",
"master_df = pd.concat(state_frames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B5SS9y2pLwks"
},
"outputs": [],
"source": [
"master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_C6vaR9HQeLa",
"outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4"
},
"outputs": [],
"source": [
"master_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oMoubjqCQiw5",
"outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFP10</th>\n",
" <th>COUNTYFP10</th>\n",
" <th>TRACTCE10</th>\n",
" <th>BLKGRPCE10</th>\n",
" <th>GEOID10</th>\n",
" <th>NAMELSAD10</th>\n",
" <th>MTFCC10</th>\n",
" <th>FUNCSTAT10</th>\n",
" <th>ALAND10</th>\n",
" <th>AWATER10</th>\n",
" <th>INTPTLAT10</th>\n",
" <th>INTPTLON10</th>\n",
" <th>geometry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>2</td>\n",
" <td>010059505002</td>\n",
" <td>Block Group 2</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>191306077</td>\n",
" <td>605058</td>\n",
" <td>+31.7728221</td>\n",
" <td>-085.3325011</td>\n",
" <td>POLYGON ((-85.17240 31.82508, -85.17334 31.824...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>1</td>\n",
" <td>010059505001</td>\n",
" <td>Block Group 1</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>44574612</td>\n",
" <td>8952734</td>\n",
" <td>+31.7523221</td>\n",
" <td>-085.2009470</td>\n",
" <td>POLYGON ((-85.16283 31.81051, -85.16284 31.813...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" STATEFP10 ... geometry\n",
"0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n",
"1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n",
"\n",
"[2 rows x 13 columns]"
]
},
"execution_count": 69,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"master_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bAMmGSgzVml0"
},
"outputs": [],
"source": [
"usa_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "U7M7dExdV2Vh"
},
"outputs": [],
"source": [
"usa_merged = master_df.merge(usa_df, on=\"GEOID10\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sr25DUkxWVhg",
"outputId": "1e804075-0f7d-4174-82d7-e21b8519c8bf"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFP10</th>\n",
" <th>COUNTYFP10</th>\n",
" <th>TRACTCE10</th>\n",
" <th>BLKGRPCE10</th>\n",
" <th>GEOID10</th>\n",
" <th>NAMELSAD10</th>\n",
" <th>MTFCC10</th>\n",
" <th>FUNCSTAT10</th>\n",
" <th>ALAND10</th>\n",
" <th>AWATER10</th>\n",
" <th>INTPTLAT10</th>\n",
" <th>INTPTLON10</th>\n",
" <th>geometry</th>\n",
" <th>Housing burden (percent)</th>\n",
" <th>Total population</th>\n",
" <th>Air toxics cancer risk</th>\n",
" <th>Respiratory hazard index</th>\n",
" <th>Diesel particulate matter</th>\n",
" <th>Particulate matter (PM2.5)</th>\n",
" <th>Ozone</th>\n",
" <th>Traffic proximity and volume</th>\n",
" <th>Proximity to RMP sites</th>\n",
" <th>Proximity to TSDF sites</th>\n",
" <th>Proximity to NPL sites</th>\n",
" <th>Wastewater discharge</th>\n",
" <th>Percent pre-1960s housing (lead paint indicator)</th>\n",
" <th>Individuals under 5 years old</th>\n",
" <th>Individuals over 64 years old</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" <th>Percent of households in linguistic isolation</th>\n",
" <th>Poverty (Less than 200% of federal poverty line)</th>\n",
" <th>Percent individuals age 25 or over with less than high school degree</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Housing + Transportation Costs % Income for the Regional Typical Household</th>\n",
" <th>GEOID10 (percentile)</th>\n",
" <th>Housing burden (percent) (percentile)</th>\n",
" <th>Total population (percentile)</th>\n",
" <th>Air toxics cancer risk (percentile)</th>\n",
" <th>Respiratory hazard index (percentile)</th>\n",
" <th>Diesel particulate matter (percentile)</th>\n",
" <th>...</th>\n",
" <th>Air toxics cancer risk (min-max normalized)</th>\n",
" <th>Respiratory hazard index (min-max normalized)</th>\n",
" <th>Diesel particulate matter (min-max normalized)</th>\n",
" <th>Particulate matter (PM2.5) (min-max normalized)</th>\n",
" <th>Ozone (min-max normalized)</th>\n",
" <th>Traffic proximity and volume (min-max normalized)</th>\n",
" <th>Proximity to RMP sites (min-max normalized)</th>\n",
" <th>Proximity to TSDF sites (min-max normalized)</th>\n",
" <th>Proximity to NPL sites (min-max normalized)</th>\n",
" <th>Wastewater discharge (min-max normalized)</th>\n",
" <th>Percent pre-1960s housing (lead paint indicator) (min-max normalized)</th>\n",
" <th>Individuals under 5 years old (min-max normalized)</th>\n",
" <th>Individuals over 64 years old (min-max normalized)</th>\n",
" <th>Linguistic isolation (percent) (min-max normalized)</th>\n",
" <th>Percent of households in linguistic isolation (min-max normalized)</th>\n",
" <th>Poverty (Less than 200% of federal poverty line) (min-max normalized)</th>\n",
" <th>Percent individuals age 25 or over with less than high school degree (min-max normalized)</th>\n",
" <th>Unemployed civilians (percent) (min-max normalized)</th>\n",
" <th>Housing + Transportation Costs % Income for the Regional Typical Household (min-max normalized)</th>\n",
" <th>Score A</th>\n",
" <th>Score B</th>\n",
" <th>Socioeconomic Factors</th>\n",
" <th>Sensitive populations</th>\n",
" <th>Environmental effects</th>\n",
" <th>Exposures</th>\n",
" <th>Pollution Burden</th>\n",
" <th>Population Characteristics</th>\n",
" <th>Score C</th>\n",
" <th>Score D</th>\n",
" <th>Score E</th>\n",
" <th>Score A (percentile)</th>\n",
" <th>Score A (top 25th percentile)</th>\n",
" <th>Score B (percentile)</th>\n",
" <th>Score B (top 25th percentile)</th>\n",
" <th>Score C (percentile)</th>\n",
" <th>Score C (top 25th percentile)</th>\n",
" <th>Score D (percentile)</th>\n",
" <th>Score D (top 25th percentile)</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>2</td>\n",
" <td>010059505002</td>\n",
" <td>Block Group 2</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>191306077</td>\n",
" <td>605058</td>\n",
" <td>+31.7728221</td>\n",
" <td>-085.3325011</td>\n",
" <td>POLYGON ((-85.17240 31.82508, -85.17334 31.824...</td>\n",
" <td>0.176565</td>\n",
" <td>923.0</td>\n",
" <td>44.636463</td>\n",
" <td>0.784089</td>\n",
" <td>0.121767</td>\n",
" <td>9.536056</td>\n",
" <td>34.660008</td>\n",
" <td>0.880242</td>\n",
" <td>0.295180</td>\n",
" <td>0.023752</td>\n",
" <td>0.019262</td>\n",
" <td>0.050677</td>\n",
" <td>0.20177</td>\n",
" <td>0.047671</td>\n",
" <td>0.286024</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.276273</td>\n",
" <td>0.181102</td>\n",
" <td>0.159836</td>\n",
" <td>64.0</td>\n",
" <td>0.000631</td>\n",
" <td>0.25485</td>\n",
" <td>0.272930</td>\n",
" <td>0.944257</td>\n",
" <td>0.982043</td>\n",
" <td>0.082062</td>\n",
" <td>...</td>\n",
" <td>0.025691</td>\n",
" <td>0.181789</td>\n",
" <td>0.020039</td>\n",
" <td>0.444097</td>\n",
" <td>0.190363</td>\n",
" <td>0.000023</td>\n",
" <td>0.016043</td>\n",
" <td>0.000054</td>\n",
" <td>0.002143</td>\n",
" <td>1.179715e-07</td>\n",
" <td>0.20177</td>\n",
" <td>0.090801</td>\n",
" <td>0.286024</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.276273</td>\n",
" <td>0.181102</td>\n",
" <td>0.159836</td>\n",
" <td>0.322034</td>\n",
" <td>0.597295</td>\n",
" <td>0.335222</td>\n",
" <td>0.638895</td>\n",
" <td>0.535636</td>\n",
" <td>0.381877</td>\n",
" <td>0.494252</td>\n",
" <td>0.456794</td>\n",
" <td>0.587265</td>\n",
" <td>0.268259</td>\n",
" <td>0.149124</td>\n",
" <td>0.529853</td>\n",
" <td>0.617238</td>\n",
" <td>False</td>\n",
" <td>0.61452</td>\n",
" <td>False</td>\n",
" <td>0.615988</td>\n",
" <td>False</td>\n",
" <td>0.565349</td>\n",
" <td>False</td>\n",
" <td>0.576986</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>01</td>\n",
" <td>005</td>\n",
" <td>950500</td>\n",
" <td>1</td>\n",
" <td>010059505001</td>\n",
" <td>Block Group 1</td>\n",
" <td>G5030</td>\n",
" <td>S</td>\n",
" <td>44574612</td>\n",
" <td>8952734</td>\n",
" <td>+31.7523221</td>\n",
" <td>-085.2009470</td>\n",
" <td>POLYGON ((-85.16283 31.81051, -85.16284 31.813...</td>\n",
" <td>0.176565</td>\n",
" <td>818.0</td>\n",
" <td>44.636463</td>\n",
" <td>0.784089</td>\n",
" <td>0.121767</td>\n",
" <td>9.536056</td>\n",
" <td>34.660008</td>\n",
" <td>60.055410</td>\n",
" <td>0.232153</td>\n",
" <td>0.027767</td>\n",
" <td>0.018079</td>\n",
" <td>0.007115</td>\n",
" <td>0.00000</td>\n",
" <td>0.007335</td>\n",
" <td>0.264059</td>\n",
" <td>0.039261</td>\n",
" <td>0.038369</td>\n",
" <td>0.391198</td>\n",
" <td>0.186147</td>\n",
" <td>0.053125</td>\n",
" <td>80.0</td>\n",
" <td>0.000626</td>\n",
" <td>0.25485</td>\n",
" <td>0.200764</td>\n",
" <td>0.944257</td>\n",
" <td>0.982043</td>\n",
" <td>0.082062</td>\n",
" <td>...</td>\n",
" <td>0.025691</td>\n",
" <td>0.181789</td>\n",
" <td>0.020039</td>\n",
" <td>0.444097</td>\n",
" <td>0.190363</td>\n",
" <td>0.001598</td>\n",
" <td>0.012618</td>\n",
" <td>0.000063</td>\n",
" <td>0.002011</td>\n",
" <td>1.656256e-08</td>\n",
" <td>0.00000</td>\n",
" <td>0.013971</td>\n",
" <td>0.264059</td>\n",
" <td>0.039261</td>\n",
" <td>0.038369</td>\n",
" <td>0.391198</td>\n",
" <td>0.186147</td>\n",
" <td>0.053125</td>\n",
" <td>0.412429</td>\n",
" <td>0.693861</td>\n",
" <td>0.477826</td>\n",
" <td>0.728309</td>\n",
" <td>0.557538</td>\n",
" <td>0.264424</td>\n",
" <td>0.530404</td>\n",
" <td>0.441744</td>\n",
" <td>0.642924</td>\n",
" <td>0.284008</td>\n",
" <td>0.159628</td>\n",
" <td>0.589397</td>\n",
" <td>0.723269</td>\n",
" <td>False</td>\n",
" <td>0.73044</td>\n",
" <td>False</td>\n",
" <td>0.661758</td>\n",
" <td>False</td>\n",
" <td>0.608434</td>\n",
" <td>False</td>\n",
" <td>0.670349</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 98 columns</p>\n",
"</div>"
],
"text/plain": [
" STATEFP10 COUNTYFP10 ... Score E (percentile) Score E (top 25th percentile)\n",
"0 01 005 ... 0.576986 False\n",
"1 01 005 ... 0.670349 False\n",
"\n",
"[2 rows x 98 columns]"
]
},
"execution_count": 72,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"usa_merged.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ANMlAB8Qmtu8",
"outputId": "44934741-90a9-4664-fab5-2c39b348d2be"
},
"outputs": [],
"source": [
"usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs=\"EPSG:4326\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PBPD9LQctvPJ"
},
"outputs": [],
"source": [
"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qAAEr1z-WZAT"
},
"outputs": [],
"source": [
"usa_simplified = usa_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
" ].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SCNUjEbzWg-o"
},
"outputs": [],
"source": [
"usa_simplified.rename(\n",
" columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ej70uX0AmW0J",
"outputId": "88908f5e-b62d-494f-f0ea-649089b6652a"
},
"outputs": [],
"source": [
"usa_cbg_compressed = gpd.GeoDataFrame(\n",
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UE12dWmame3I"
},
"outputs": [],
"source": [
"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wWFBduQQXGtM"
},
"outputs": [],
"source": [
"usa_tracts = aggregate_to_tracts(usa_simplified)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"id": "L-PTnEWOpDtX"
},
"outputs": [],
"source": [
"num_buckets = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kTJafXcqXC01",
"outputId": "bd197952-76b7-4f99-edef-983f20d7acfb"
},
"outputs": [],
"source": [
"tracts_compressed = gpd.GeoDataFrame(\n",
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E2Nh97IlYhCF"
},
"outputs": [],
"source": [
"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "557zPMWFZC8R"
},
"outputs": [],
"source": [
"# create_buckets_from_tracts requires the bucket count as its second argument;\n",
"# omitting it raises a TypeError.\n",
"usa_bucketed = create_buckets_from_tracts(usa_tracts, num_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "k6RRdKlsaO0a"
},
"outputs": [],
"source": [
"usa_aggregated = aggregate_buckets(usa_bucketed, agg_func=\"mean\")"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-cm5eET2pA1Z",
"outputId": "8d5d2e80-ad62-41d5-f1b0-922345f92d62"
},
"outputs": [
{
"data": {
"text/plain": [
"(10, 2)"
]
},
"execution_count": 80,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"usa_aggregated.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4ZvJra-RaZ4v"
},
"outputs": [],
"source": [
"compressed = breakup_multipolygons(usa_aggregated, num_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RDS7Q2WAb4Rx",
"outputId": "dcd28a31-083d-482e-b000-b4cd1046d4c2"
},
"outputs": [
{
"data": {
"text/plain": [
"36836"
]
},
"execution_count": 82,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"len(compressed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VXTv8UuXb-qU"
},
"outputs": [],
"source": [
"gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5v7TyB_rcRgT",
"outputId": "997625cc-c57a-4335-9b27-a08e4f8ad117"
},
"outputs": [
{
"data": {
"text/plain": [
"(36836, 2)"
]
},
"execution_count": 84,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"gdf_compressed.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5eAnPL8McJpn"
},
"outputs": [],
"source": [
"# Plain string: the file name has no placeholders, so no f-prefix is needed.\n",
"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_low.geojson\", driver=\"GeoJSON\")"
]
}
],
"metadata": {
"colab": {
"name": "Score_Dissolve_Script",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}