j40-cejst-2/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import math\n",
    "import pathlib\n",
    "import os\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the parent of the current working directory on sys.path (once) so that\n",
    "# imports can be resolved from there.\n",
    "module_path = os.path.abspath(\"..\")\n",
    "if sys.path.count(module_path) == 0:\n",
    "    sys.path += [module_path]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
    "    \"\"\"Load one geometry file, attach scores, and keep only ID, score and shape.\n",
    "\n",
    "    Args:\n",
    "        file_name: path handed to ``gpd.read_file``.\n",
    "        usa_df: score table with ``GEOID10`` and ``Score D (percentile)`` columns.\n",
    "\n",
    "    Returns:\n",
    "        Frame with columns ``GEOID10``, ``D_SCORE`` and ``geometry`` and a fresh\n",
    "        0..n-1 index. Shapes without a matching score row are kept (left join).\n",
    "    \"\"\"\n",
    "    wanted_columns = [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
    "    # Reproject to longitude/latitude on WGS84 before joining.\n",
    "    shapes = gpd.read_file(file_name).to_crs(\n",
    "        \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\"\n",
    "    )\n",
    "    scored = shapes.merge(usa_df, on=\"GEOID10\", how=\"left\")\n",
    "    trimmed = scored[wanted_columns].reset_index(drop=True)\n",
    "    return trimmed.rename(columns={\"Score D (percentile)\": \"D_SCORE\"})\n",
    "\n",
    "\n",
    "def aggregate_to_tracts(block_group_df: pd.DataFrame):\n",
    "    \"\"\"Dissolve block-group rows into census tracts, averaging numeric columns.\n",
    "\n",
    "    Args:\n",
    "        block_group_df: frame with a string ``GEOID10`` column; it must provide\n",
    "            ``dissolve`` (i.e. be a GeoDataFrame). The frame is not modified.\n",
    "\n",
    "    Returns:\n",
    "        One row per tract, indexed by the 11-character tract identifier.\n",
    "    \"\"\"\n",
    "    # The tract identifier is the first 11 digits of the GEOID. ``.str`` slices the\n",
    "    # whole column at once, and ``assign`` works on a copy so the caller's frame\n",
    "    # does not silently gain a \"tract\" column.\n",
    "    with_tract = block_group_df.assign(tract=block_group_df[\"GEOID10\"].str[0:11])\n",
    "    # GEOID10 is an identifier, not a quantity: averaging it is meaningless, and\n",
    "    # recent pandas versions raise on a \"mean\" of string columns instead of\n",
    "    # dropping them.\n",
    "    with_tract = with_tract.drop(columns=[\"GEOID10\"])\n",
    "    return with_tract.dissolve(by=\"tract\", aggfunc=\"mean\")\n",
    "\n",
    "\n",
    "def create_buckets_from_tracts(state_tracts: pd.DataFrame, num_buckets: int):\n",
    "    \"\"\"Rank tracts by ``D_SCORE`` and label consecutive runs with a bucket number.\n",
    "\n",
    "    Args:\n",
    "        state_tracts: frame with a ``D_SCORE`` column. It is left untouched.\n",
    "        num_buckets: upper bound on the number of buckets; must be at least 1.\n",
    "\n",
    "    Returns:\n",
    "        A copy sorted by ascending ``D_SCORE`` with an added integer\n",
    "        ``D_SCORE_bucket`` column (0 for the lowest scores).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``num_buckets`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if num_buckets < 1:\n",
    "        raise ValueError(f\"num_buckets must be a positive integer, got {num_buckets}\")\n",
    "    # Sort into a new frame so the caller's object keeps its row order.\n",
    "    ranked = state_tracts.sort_values(\"D_SCORE\")\n",
    "    tract_count = len(ranked.index)\n",
    "    # The bucket size is rounded up, so the last bucket may be smaller and fewer\n",
    "    # than num_buckets buckets can result when there are few tracts.\n",
    "    bucket_size = math.ceil(tract_count / num_buckets)\n",
    "    ranked[\"D_SCORE_bucket\"] = [\n",
    "        position // bucket_size for position in range(tract_count)\n",
    "    ]\n",
    "    return ranked\n",
    "\n",
    "\n",
    "def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
    "    \"\"\"Merge all tracts that share a bucket into a single row per bucket.\n",
    "\n",
    "    Only ``D_SCORE``, ``D_SCORE_bucket`` and ``geometry`` are carried into the\n",
    "    dissolve, and ``agg_func`` is forwarded as its ``aggfunc``. The result is\n",
    "    indexed by ``D_SCORE_bucket``.\n",
    "    \"\"\"\n",
    "    kept_columns = [\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]\n",
    "    # The incoming index is discarded; only the bucket label matters from here on.\n",
    "    bucket_input = state_tracts[kept_columns].reset_index(drop=True)\n",
    "    return bucket_input.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
    "\n",
    "\n",
    "def breakup_multipolygons(state_bucketed_df: pd.DataFrame, num_buckets: int):\n",
    "    \"\"\"Flatten each bucket's geometry into ``[D_SCORE, part]`` pairs.\n",
    "\n",
    "    Args:\n",
    "        state_bucketed_df: frame indexed by bucket number with ``D_SCORE`` and\n",
    "            ``geometry`` columns.\n",
    "        num_buckets: bucket numbers ``0 .. num_buckets - 1`` are visited in order;\n",
    "            numbers that are absent from the index are skipped.\n",
    "\n",
    "    Returns:\n",
    "        A list of two-element lists, one per geometry part.\n",
    "    \"\"\"\n",
    "    compressed = []\n",
    "    for bucket in range(num_buckets):\n",
    "        # Bucketing rounds the bucket size up, so high bucket numbers may not exist.\n",
    "        if bucket not in state_bucketed_df.index:\n",
    "            continue\n",
    "        score = state_bucketed_df[\"D_SCORE\"].loc[bucket]\n",
    "        geometry = state_bucketed_df[\"geometry\"].loc[bucket]\n",
    "        # Multi-part geometries expose their members through ``.geoms``; a bucket\n",
    "        # that dissolved into one single-part shape has no such attribute.\n",
    "        parts = getattr(geometry, \"geoms\", None)\n",
    "        if parts is None:\n",
    "            parts = [geometry]\n",
    "        compressed.extend([score, part] for part in parts)\n",
    "    return compressed\n",
    "\n",
    "\n",
    "def write_to_file(compressed: pd.DataFrame, file_name: str):\n",
    "    \"\"\"Save score/geometry records as ``<file_name>_low.geojson``.\n",
    "\n",
    "    ``compressed`` is handed straight to ``gpd.GeoDataFrame`` with the column names\n",
    "    ``D_SCORE`` and ``geometry`` and tagged as EPSG:4326. The file is written into\n",
    "    the notebook-level ``CENSUS_GEOJSON_DIR``, which must exist as a global before\n",
    "    this function is called.\n",
    "    \"\"\"\n",
    "    records = gpd.GeoDataFrame(\n",
    "        compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
    "    )\n",
    "    output_path = CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\"\n",
    "    records.to_file(output_path, driver=\"GeoJSON\")\n",
    "\n",
    "\n",
    "def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
    "    \"\"\"Run the merge, tract, bucket, dissolve and export steps for one file.\n",
    "\n",
    "    Progress is announced on stdout. The result is written by ``write_to_file``;\n",
    "    nothing is returned.\n",
    "    \"\"\"\n",
    "    print(f\"Processing file {file_name}...\")\n",
    "    block_groups = merge_and_simplify_file(file_name, usa_df)\n",
    "    bucketed_tracts = create_buckets_from_tracts(\n",
    "        aggregate_to_tracts(block_groups), num_buckets\n",
    "    )\n",
    "    dissolved = aggregate_buckets(bucketed_tracts, \"mean\")\n",
    "    write_to_file(breakup_multipolygons(dissolved, num_buckets), file_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "id": "Ia5bqxS2LJqe"
   },
   "outputs": [],
   "source": [
    "# All data locations are resolved relative to the parent of the current working directory.\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "# Geometry inputs are globbed from this directory and every GeoJSON output is written to it.\n",
    "CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
    "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
    "# GEOID10 is read as text -- presumably to keep its leading zeros; confirm.\n",
    "# NOTE(review): no later cell visible in this notebook references score_df; usa_df is\n",
    "# loaded from the same path further down.\n",
    "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "id": "Dtf5qD50JvCw"
   },
   "outputs": [],
   "source": [
    "# Start from an empty frame; the geometry files are combined into master_df below.\n",
    "master_df = gpd.GeoDataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty GeoDataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "master_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PNdw8bERJyKk"
   },
   "outputs": [],
   "source": [
    "# Read every file first and concatenate once: DataFrame.append copied the whole\n",
    "# accumulated frame on each call and no longer exists in pandas 2.x.\n",
    "# Sorting the paths makes the row order reproducible between runs, and rebuilding\n",
    "# master_df from scratch keeps this cell safe to re-run.\n",
    "state_frames = [\n",
    "    gpd.read_file(file_name) for file_name in sorted(CENSUS_GEOJSON_DIR.rglob('*.json'))\n",
    "]\n",
    "# pd.concat raises on an empty list, so keep the empty frame when nothing was found.\n",
    "if state_frames:\n",
    "    master_df = pd.concat(state_frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "B5SS9y2pLwks"
   },
   "outputs": [],
   "source": [
    "# Reproject to longitude/latitude coordinates on the WGS84 datum (PROJ string form).\n",
    "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "_C6vaR9HQeLa",
    "outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4"
   },
   "outputs": [],
   "source": [
    "master_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "oMoubjqCQiw5",
    "outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>STATEFP10</th>\n",
       "      <th>COUNTYFP10</th>\n",
       "      <th>TRACTCE10</th>\n",
       "      <th>BLKGRPCE10</th>\n",
       "      <th>GEOID10</th>\n",
       "      <th>NAMELSAD10</th>\n",
       "      <th>MTFCC10</th>\n",
       "      <th>FUNCSTAT10</th>\n",
       "      <th>ALAND10</th>\n",
       "      <th>AWATER10</th>\n",
       "      <th>INTPTLAT10</th>\n",
       "      <th>INTPTLON10</th>\n",
       "      <th>geometry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>01</td>\n",
       "      <td>005</td>\n",
       "      <td>950500</td>\n",
       "      <td>2</td>\n",
       "      <td>010059505002</td>\n",
       "      <td>Block Group 2</td>\n",
       "      <td>G5030</td>\n",
       "      <td>S</td>\n",
       "      <td>191306077</td>\n",
       "      <td>605058</td>\n",
       "      <td>+31.7728221</td>\n",
       "      <td>-085.3325011</td>\n",
       "      <td>POLYGON ((-85.17240 31.82508, -85.17334 31.824...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>01</td>\n",
       "      <td>005</td>\n",
       "      <td>950500</td>\n",
       "      <td>1</td>\n",
       "      <td>010059505001</td>\n",
       "      <td>Block Group 1</td>\n",
       "      <td>G5030</td>\n",
       "      <td>S</td>\n",
       "      <td>44574612</td>\n",
       "      <td>8952734</td>\n",
       "      <td>+31.7523221</td>\n",
       "      <td>-085.2009470</td>\n",
       "      <td>POLYGON ((-85.16283 31.81051, -85.16284 31.813...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  STATEFP10  ...                                           geometry\n",
       "0        01  ...  POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n",
       "1        01  ...  POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n",
       "\n",
       "[2 rows x 13 columns]"
      ]
     },
     "execution_count": 69,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "master_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "bAMmGSgzVml0"
   },
   "outputs": [],
   "source": [
    "# NOTE(review): this reads the same CSV that the configuration cell loads into\n",
    "# score_df (there with low_memory=False); the two loads can probably be merged -- confirm.\n",
    "usa_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "U7M7dExdV2Vh"
   },
   "outputs": [],
   "source": [
    "# Left join on GEOID10: every geometry row is kept, with or without a matching score row.\n",
    "usa_merged = master_df.merge(usa_df, on=\"GEOID10\", how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Sr25DUkxWVhg",
    "outputId": "1e804075-0f7d-4174-82d7-e21b8519c8bf"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>STATEFP10</th>\n",
       "      <th>COUNTYFP10</th>\n",
       "      <th>TRACTCE10</th>\n",
       "      <th>BLKGRPCE10</th>\n",
       "      <th>GEOID10</th>\n",
       "      <th>NAMELSAD10</th>\n",
       "      <th>MTFCC10</th>\n",
       "      <th>FUNCSTAT10</th>\n",
       "      <th>ALAND10</th>\n",
       "      <th>AWATER10</th>\n",
       "      <th>INTPTLAT10</th>\n",
       "      <th>INTPTLON10</th>\n",
       "      <th>geometry</th>\n",
       "      <th>Housing burden (percent)</th>\n",
       "      <th>Total population</th>\n",
       "      <th>Air toxics cancer risk</th>\n",
       "      <th>Respiratory hazard index</th>\n",
       "      <th>Diesel particulate matter</th>\n",
       "      <th>Particulate matter (PM2.5)</th>\n",
       "      <th>Ozone</th>\n",
       "      <th>Traffic proximity and volume</th>\n",
       "      <th>Proximity to RMP sites</th>\n",
       "      <th>Proximity to TSDF sites</th>\n",
       "      <th>Proximity to NPL sites</th>\n",
       "      <th>Wastewater discharge</th>\n",
       "      <th>Percent pre-1960s housing (lead paint indicator)</th>\n",
       "      <th>Individuals under 5 years old</th>\n",
       "      <th>Individuals over 64 years old</th>\n",
       "      <th>Linguistic isolation (percent)</th>\n",
       "      <th>Percent of households in linguistic isolation</th>\n",
       "      <th>Poverty (Less than 200% of federal poverty line)</th>\n",
       "      <th>Percent individuals age 25 or over with less than high school degree</th>\n",
       "      <th>Unemployed civilians (percent)</th>\n",
       "      <th>Housing + Transportation Costs % Income for the Regional Typical Household</th>\n",
       "      <th>GEOID10 (percentile)</th>\n",
       "      <th>Housing burden (percent) (percentile)</th>\n",
       "      <th>Total population (percentile)</th>\n",
       "      <th>Air toxics cancer risk (percentile)</th>\n",
       "      <th>Respiratory hazard index (percentile)</th>\n",
       "      <th>Diesel particulate matter (percentile)</th>\n",
       "      <th>...</th>\n",
       "      <th>Air toxics cancer risk (min-max normalized)</th>\n",
       "      <th>Respiratory hazard index (min-max normalized)</th>\n",
       "      <th>Diesel particulate matter (min-max normalized)</th>\n",
       "      <th>Particulate matter (PM2.5) (min-max normalized)</th>\n",
       "      <th>Ozone (min-max normalized)</th>\n",
       "      <th>Traffic proximity and volume (min-max normalized)</th>\n",
       "      <th>Proximity to RMP sites (min-max normalized)</th>\n",
       "      <th>Proximity to TSDF sites (min-max normalized)</th>\n",
       "      <th>Proximity to NPL sites (min-max normalized)</th>\n",
       "      <th>Wastewater discharge (min-max normalized)</th>\n",
       "      <th>Percent pre-1960s housing (lead paint indicator) (min-max normalized)</th>\n",
       "      <th>Individuals under 5 years old (min-max normalized)</th>\n",
       "      <th>Individuals over 64 years old (min-max normalized)</th>\n",
       "      <th>Linguistic isolation (percent) (min-max normalized)</th>\n",
       "      <th>Percent of households in linguistic isolation (min-max normalized)</th>\n",
       "      <th>Poverty (Less than 200% of federal poverty line) (min-max normalized)</th>\n",
       "      <th>Percent individuals age 25 or over with less than high school degree (min-max normalized)</th>\n",
       "      <th>Unemployed civilians (percent) (min-max normalized)</th>\n",
       "      <th>Housing + Transportation Costs % Income for the Regional Typical Household (min-max normalized)</th>\n",
       "      <th>Score A</th>\n",
       "      <th>Score B</th>\n",
       "      <th>Socioeconomic Factors</th>\n",
       "      <th>Sensitive populations</th>\n",
       "      <th>Environmental effects</th>\n",
       "      <th>Exposures</th>\n",
       "      <th>Pollution Burden</th>\n",
       "      <th>Population Characteristics</th>\n",
       "      <th>Score C</th>\n",
       "      <th>Score D</th>\n",
       "      <th>Score E</th>\n",
       "      <th>Score A (percentile)</th>\n",
       "      <th>Score A (top 25th percentile)</th>\n",
       "      <th>Score B (percentile)</th>\n",
       "      <th>Score B (top 25th percentile)</th>\n",
       "      <th>Score C (percentile)</th>\n",
       "      <th>Score C (top 25th percentile)</th>\n",
       "      <th>Score D (percentile)</th>\n",
       "      <th>Score D (top 25th percentile)</th>\n",
       "      <th>Score E (percentile)</th>\n",
       "      <th>Score E (top 25th percentile)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>01</td>\n",
       "      <td>005</td>\n",
       "      <td>950500</td>\n",
       "      <td>2</td>\n",
       "      <td>010059505002</td>\n",
       "      <td>Block Group 2</td>\n",
       "      <td>G5030</td>\n",
       "      <td>S</td>\n",
       "      <td>191306077</td>\n",
       "      <td>605058</td>\n",
       "      <td>+31.7728221</td>\n",
       "      <td>-085.3325011</td>\n",
       "      <td>POLYGON ((-85.17240 31.82508, -85.17334 31.824...</td>\n",
       "      <td>0.176565</td>\n",
       "      <td>923.0</td>\n",
       "      <td>44.636463</td>\n",
       "      <td>0.784089</td>\n",
       "      <td>0.121767</td>\n",
       "      <td>9.536056</td>\n",
       "      <td>34.660008</td>\n",
       "      <td>0.880242</td>\n",
       "      <td>0.295180</td>\n",
       "      <td>0.023752</td>\n",
       "      <td>0.019262</td>\n",
       "      <td>0.050677</td>\n",
       "      <td>0.20177</td>\n",
       "      <td>0.047671</td>\n",
       "      <td>0.286024</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.276273</td>\n",
       "      <td>0.181102</td>\n",
       "      <td>0.159836</td>\n",
       "      <td>64.0</td>\n",
       "      <td>0.000631</td>\n",
       "      <td>0.25485</td>\n",
       "      <td>0.272930</td>\n",
       "      <td>0.944257</td>\n",
       "      <td>0.982043</td>\n",
       "      <td>0.082062</td>\n",
       "      <td>...</td>\n",
       "      <td>0.025691</td>\n",
       "      <td>0.181789</td>\n",
       "      <td>0.020039</td>\n",
       "      <td>0.444097</td>\n",
       "      <td>0.190363</td>\n",
       "      <td>0.000023</td>\n",
       "      <td>0.016043</td>\n",
       "      <td>0.000054</td>\n",
       "      <td>0.002143</td>\n",
       "      <td>1.179715e-07</td>\n",
       "      <td>0.20177</td>\n",
       "      <td>0.090801</td>\n",
       "      <td>0.286024</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.276273</td>\n",
       "      <td>0.181102</td>\n",
       "      <td>0.159836</td>\n",
       "      <td>0.322034</td>\n",
       "      <td>0.597295</td>\n",
       "      <td>0.335222</td>\n",
       "      <td>0.638895</td>\n",
       "      <td>0.535636</td>\n",
       "      <td>0.381877</td>\n",
       "      <td>0.494252</td>\n",
       "      <td>0.456794</td>\n",
       "      <td>0.587265</td>\n",
       "      <td>0.268259</td>\n",
       "      <td>0.149124</td>\n",
       "      <td>0.529853</td>\n",
       "      <td>0.617238</td>\n",
       "      <td>False</td>\n",
       "      <td>0.61452</td>\n",
       "      <td>False</td>\n",
       "      <td>0.615988</td>\n",
       "      <td>False</td>\n",
       "      <td>0.565349</td>\n",
       "      <td>False</td>\n",
       "      <td>0.576986</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>01</td>\n",
       "      <td>005</td>\n",
       "      <td>950500</td>\n",
       "      <td>1</td>\n",
       "      <td>010059505001</td>\n",
       "      <td>Block Group 1</td>\n",
       "      <td>G5030</td>\n",
       "      <td>S</td>\n",
       "      <td>44574612</td>\n",
       "      <td>8952734</td>\n",
       "      <td>+31.7523221</td>\n",
       "      <td>-085.2009470</td>\n",
       "      <td>POLYGON ((-85.16283 31.81051, -85.16284 31.813...</td>\n",
       "      <td>0.176565</td>\n",
       "      <td>818.0</td>\n",
       "      <td>44.636463</td>\n",
       "      <td>0.784089</td>\n",
       "      <td>0.121767</td>\n",
       "      <td>9.536056</td>\n",
       "      <td>34.660008</td>\n",
       "      <td>60.055410</td>\n",
       "      <td>0.232153</td>\n",
       "      <td>0.027767</td>\n",
       "      <td>0.018079</td>\n",
       "      <td>0.007115</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.007335</td>\n",
       "      <td>0.264059</td>\n",
       "      <td>0.039261</td>\n",
       "      <td>0.038369</td>\n",
       "      <td>0.391198</td>\n",
       "      <td>0.186147</td>\n",
       "      <td>0.053125</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0.000626</td>\n",
       "      <td>0.25485</td>\n",
       "      <td>0.200764</td>\n",
       "      <td>0.944257</td>\n",
       "      <td>0.982043</td>\n",
       "      <td>0.082062</td>\n",
       "      <td>...</td>\n",
       "      <td>0.025691</td>\n",
       "      <td>0.181789</td>\n",
       "      <td>0.020039</td>\n",
       "      <td>0.444097</td>\n",
       "      <td>0.190363</td>\n",
       "      <td>0.001598</td>\n",
       "      <td>0.012618</td>\n",
       "      <td>0.000063</td>\n",
       "      <td>0.002011</td>\n",
       "      <td>1.656256e-08</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.013971</td>\n",
       "      <td>0.264059</td>\n",
       "      <td>0.039261</td>\n",
       "      <td>0.038369</td>\n",
       "      <td>0.391198</td>\n",
       "      <td>0.186147</td>\n",
       "      <td>0.053125</td>\n",
       "      <td>0.412429</td>\n",
       "      <td>0.693861</td>\n",
       "      <td>0.477826</td>\n",
       "      <td>0.728309</td>\n",
       "      <td>0.557538</td>\n",
       "      <td>0.264424</td>\n",
       "      <td>0.530404</td>\n",
       "      <td>0.441744</td>\n",
       "      <td>0.642924</td>\n",
       "      <td>0.284008</td>\n",
       "      <td>0.159628</td>\n",
       "      <td>0.589397</td>\n",
       "      <td>0.723269</td>\n",
       "      <td>False</td>\n",
       "      <td>0.73044</td>\n",
       "      <td>False</td>\n",
       "      <td>0.661758</td>\n",
       "      <td>False</td>\n",
       "      <td>0.608434</td>\n",
       "      <td>False</td>\n",
       "      <td>0.670349</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 98 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  STATEFP10 COUNTYFP10  ... Score E (percentile) Score E (top 25th percentile)\n",
       "0        01        005  ...             0.576986                         False\n",
       "1        01        005  ...             0.670349                         False\n",
       "\n",
       "[2 rows x 98 columns]"
      ]
     },
     "execution_count": 72,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "usa_merged.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ANMlAB8Qmtu8",
    "outputId": "44934741-90a9-4664-fab5-2c39b348d2be"
   },
   "outputs": [],
   "source": [
    "# Wrap the merged frame as a GeoDataFrame tagged EPSG:4326.\n",
    "# NOTE(review): despite the name, no rows or columns are removed at this step.\n",
    "usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs=\"EPSG:4326\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PBPD9LQctvPJ"
   },
   "outputs": [],
   "source": [
    "# Write the full merged frame (geometry plus every score column) as GeoJSON.\n",
    "usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "qAAEr1z-WZAT"
   },
   "outputs": [],
   "source": [
    "# Keep only the ID, the Score D percentile and the geometry, and renumber rows from 0.\n",
    "usa_simplified = usa_merged[\n",
    "        [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
    "    ].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "SCNUjEbzWg-o"
   },
   "outputs": [],
   "source": [
    "# Give the score column the short name that the helper functions expect.\n",
    "usa_simplified = usa_simplified.rename(columns={\"Score D (percentile)\": \"D_SCORE\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Ej70uX0AmW0J",
    "outputId": "88908f5e-b62d-494f-f0ea-649089b6652a"
   },
   "outputs": [],
   "source": [
    "# Build the export frame from D_SCORE and geometry only; GEOID10 is not listed in\n",
    "# columns= and so is not carried over.\n",
    "usa_cbg_compressed = gpd.GeoDataFrame(\n",
    "        usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UE12dWmame3I"
   },
   "outputs": [],
   "source": [
    "# Write the un-aggregated D_SCORE/geometry rows as GeoJSON.\n",
    "usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "wWFBduQQXGtM"
   },
   "outputs": [],
   "source": [
    "# Collapse the rows into census tracts; D_SCORE becomes the per-tract mean.\n",
    "usa_tracts = aggregate_to_tracts(usa_simplified)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "id": "L-PTnEWOpDtX"
   },
   "outputs": [],
   "source": [
    "# Number of D_SCORE buckets the tracts are dissolved into.\n",
    "num_buckets = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "kTJafXcqXC01",
    "outputId": "bd197952-76b7-4f99-edef-983f20d7acfb"
   },
   "outputs": [],
   "source": [
    "# Build the tract-level export frame from D_SCORE and geometry, tagged EPSG:4326.\n",
    "tracts_compressed = gpd.GeoDataFrame(\n",
    "        usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "E2Nh97IlYhCF"
   },
   "outputs": [],
   "source": [
    "# Write the tract-level scores as GeoJSON.\n",
    "tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "557zPMWFZC8R"
   },
   "outputs": [],
   "source": [
    "# create_buckets_from_tracts has no default for num_buckets, so it must be passed\n",
    "# explicitly; calling it with the frame alone raises TypeError.\n",
    "usa_bucketed = create_buckets_from_tracts(usa_tracts, num_buckets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "k6RRdKlsaO0a"
   },
   "outputs": [],
   "source": [
    "# Dissolve all tracts of a bucket into one row; D_SCORE becomes the bucket mean.\n",
    "usa_aggregated = aggregate_buckets(usa_bucketed, agg_func=\"mean\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "-cm5eET2pA1Z",
    "outputId": "8d5d2e80-ad62-41d5-f1b0-922345f92d62"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 2)"
      ]
     },
     "execution_count": 80,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "usa_aggregated.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4ZvJra-RaZ4v"
   },
   "outputs": [],
   "source": [
    "# Split every bucket's geometry into its parts, each paired with the bucket's D_SCORE.\n",
    "compressed = breakup_multipolygons(usa_aggregated, num_buckets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "RDS7Q2WAb4Rx",
    "outputId": "dcd28a31-083d-482e-b000-b4cd1046d4c2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "36836"
      ]
     },
     "execution_count": 82,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(compressed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "VXTv8UuXb-qU"
   },
   "outputs": [],
   "source": [
    "# Turn the list of [D_SCORE, geometry] pairs back into a GeoDataFrame tagged EPSG:4326.\n",
    "gdf_compressed = gpd.GeoDataFrame(\n",
    "        compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5v7TyB_rcRgT",
    "outputId": "997625cc-c57a-4335-9b27-a08e4f8ad117"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(36836, 2)"
      ]
     },
     "execution_count": 84,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gdf_compressed.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5eAnPL8McJpn"
   },
   "outputs": [],
   "source": [
    "# Write the bucketed, split geometries as GeoJSON (plain string: the name has no placeholders).\n",
    "gdf_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_low.geojson\", driver=\"GeoJSON\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Score_Dissolve_Script",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}