{ "cells": [
 { "cell_type": "code", "execution_count": null, "source": [
   "import pandas as pd\n",
   "import geopandas as gpd\n",
   "import math\n",
   "import pathlib\n",
   "import os\n",
   "import sys"
  ], "outputs": [], "metadata": {} },
 { "cell_type": "code", "execution_count": null, "source": [
   "module_path = os.path.abspath(os.path.join(\"..\"))\n",
   "if module_path not in sys.path:\n",
   "    sys.path.append(module_path)"
  ], "outputs": [], "metadata": {} },
 { "cell_type": "code", "execution_count": null, "source": [
   "def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
   "    \"\"\"Load one census geo file and attach the Score D percentile to every shape.\n",
   "\n",
   "    Args:\n",
   "        file_name: path of the geo file; passed unchanged to `gpd.read_file`.\n",
   "        usa_df: score table holding a `GEOID10` key column and a\n",
   "            `Score D (percentile)` column.\n",
   "\n",
   "    Returns:\n",
   "        GeoDataFrame with the columns `GEOID10`, `D_SCORE` and `geometry`,\n",
   "        reprojected to WGS84 longitude/latitude. The join is a left join, so\n",
   "        shapes without a matching score row are kept with a missing `D_SCORE`.\n",
   "    \"\"\"\n",
   "    state_gdf = gpd.read_file(file_name)\n",
   "    state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n",
   "    state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n",
   "    # Chained instead of rename(inplace=True): nothing is mutated after creation.\n",
   "    return (\n",
   "        state_merged[[\"GEOID10\", \"Score D (percentile)\", \"geometry\"]]\n",
   "        .reset_index(drop=True)\n",
   "        .rename(columns={\"Score D (percentile)\": \"D_SCORE\"})\n",
   "    )\n",
   "\n",
   "\n",
   "def aggregate_to_tracts(block_group_df: pd.DataFrame):\n",
   "    \"\"\"Dissolve block-group shapes into tracts, averaging `D_SCORE` per tract.\n",
   "\n",
   "    Args:\n",
   "        block_group_df: GeoDataFrame with `GEOID10` (string), `D_SCORE` and\n",
   "            `geometry` columns. It is not modified.\n",
   "\n",
   "    Returns:\n",
   "        GeoDataFrame indexed by `tract` holding the merged geometry and the\n",
   "        mean `D_SCORE` of the block groups in that tract.\n",
   "    \"\"\"\n",
   "    # The tract identifier is the first 11 digits of the GEOID. The vectorised\n",
   "    # .str slice replaces a Python-level lambda call per row.\n",
   "    # Only D_SCORE goes into the dissolve: GEOID10 is a string column, and\n",
   "    # averaging it fails on pandas >= 2.0, which no longer silently drops\n",
   "    # non-numeric columns from a mean.\n",
   "    tract_df = block_group_df[[\"D_SCORE\", \"geometry\"]].assign(\n",
   "        tract=block_group_df[\"GEOID10\"].str[:11]\n",
   "    )\n",
   "    return tract_df.dissolve(by=\"tract\", aggfunc=\"mean\")\n",
   "\n",
   "\n",
   "def create_buckets_from_tracts(state_tracts: pd.DataFrame, num_buckets: int):\n",
   "    \"\"\"Rank tracts by `D_SCORE` and label them with equally sized buckets.\n",
   "\n",
   "    Args:\n",
   "        state_tracts: frame with a `D_SCORE` column. It is not modified.\n",
   "        num_buckets: upper bound on the number of buckets; must be >= 1.\n",
   "\n",
   "    Returns:\n",
   "        A copy of `state_tracts` sorted by ascending `D_SCORE` with an integer\n",
   "        `D_SCORE_bucket` column. Each bucket holds\n",
   "        ceil(len(state_tracts) / num_buckets) rows except possibly the last,\n",
   "        so fewer than `num_buckets` labels are used when the rows do not\n",
   "        divide evenly (11 rows and 10 buckets give the labels 0..5).\n",
   "\n",
   "    Raises:\n",
   "        ValueError: if `num_buckets` is smaller than 1.\n",
   "    \"\"\"\n",
   "    if num_buckets < 1:\n",
   "        raise ValueError(f\"num_buckets must be at least 1, got {num_buckets}\")\n",
   "    # sort_values returns a new frame, so the caller's frame keeps its order\n",
   "    # and does not silently gain a column.\n",
   "    ordered = state_tracts.sort_values(\"D_SCORE\")\n",
   "    bucket_size = math.ceil(len(ordered.index) / num_buckets)\n",
   "    # Row i of the sorted frame belongs to bucket i // bucket_size. For an\n",
   "    # empty frame the division is never evaluated, so bucket_size == 0 is safe.\n",
   "    ordered[\"D_SCORE_bucket\"] = [\n",
   "        position // bucket_size for position in range(len(ordered.index))\n",
   "    ]\n",
   "    return ordered\n",
   "\n",
   "\n",
   "def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
   "    \"\"\"Dissolve all tracts sharing a `D_SCORE_bucket` into a single shape.\n",
   "\n",
   "    Args:\n",
   "        state_tracts: GeoDataFrame with `D_SCORE`, `D_SCORE_bucket` and\n",
   "            `geometry` columns.\n",
   "        agg_func: aggregation for `D_SCORE` inside a bucket, forwarded to\n",
   "            `dissolve` as `aggfunc` (for example \"mean\").\n",
   "\n",
   "    Returns:\n",
   "        GeoDataFrame indexed by `D_SCORE_bucket` with one merged geometry and\n",
   "        one aggregated `D_SCORE` per bucket.\n",
   "    \"\"\"\n",
   "    # Keep only the columns that take part in the dissolve and drop the old index.\n",
   "    state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n",
   "        drop=True\n",
   "    )\n",
   "    return state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
   "\n",
   "\n",
   "def breakup_multipolygons(state_bucketed_df: pd.DataFrame, num_buckets: int):\n",
   "    \"\"\"Split every bucket's dissolved geometry into its individual parts.\n",
   "\n",
   "    Args:\n",
   "        state_bucketed_df: frame indexed by bucket label with `D_SCORE` and\n",
   "            `geometry` columns, as returned by `aggregate_buckets`.\n",
   "        num_buckets: bucket labels 0 .. num_buckets - 1 are visited; labels\n",
   "            missing from the index are skipped.\n",
   "\n",
   "    Returns:\n",
   "        List of `[D_SCORE, geometry]` pairs, one per geometry part, ordered by\n",
   "        bucket label.\n",
   "    \"\"\"\n",
   "    compressed = []\n",
   "    for bucket in range(num_buckets):\n",
   "        # create_buckets_from_tracts can emit fewer than num_buckets labels,\n",
   "        # so a missing label is expected rather than an error.\n",
   "        if bucket not in state_bucketed_df.index:\n",
   "            continue\n",
   "        score = state_bucketed_df.loc[bucket, \"D_SCORE\"]\n",
   "        geometry = state_bucketed_df.loc[bucket, \"geometry\"]\n",
   "        # A bucket that dissolves into one contiguous shape is a single-part\n",
   "        # geometry without `.geoms`; treat it as a one-element collection.\n",
   "        parts = getattr(geometry, \"geoms\", [geometry])\n",
   "        compressed.extend([score, part] for part in parts)\n",
   "    return compressed\n",
   "\n",
   "\n",
   "def write_to_file(compressed: list, file_name: str):\n",
   "    \"\"\"Write `[D_SCORE, geometry]` pairs to `<file_name>_low.geojson`.\n",
   "\n",
   "    Args:\n",
   "        compressed: rows as produced by `breakup_multipolygons`.\n",
   "        file_name: prefix of the output file name.\n",
   "\n",
   "    The file is written under `CENSUS_GEOJSON_DIR`, a notebook-level constant\n",
   "    that has to be defined before this function is called.\n",
   "    \"\"\"\n",
   "    # NOTE(review): process_file passes its input path here unchanged, so the\n",
   "    # output name is derived from the whole input path -- confirm this is intended.\n",
   "    gdf_compressed = gpd.GeoDataFrame(\n",
   "        compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
   "    )\n",
   "    gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n",
   "\n",
   "\n",
   "def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
   "    \"\"\"Run the full pipeline for one geo file: merge, tract, bucket, dissolve, write.\n",
   "\n",
   "    Args:\n",
   "        file_name: geo file to read; also used as the output name prefix.\n",
   "        usa_df: score table joined on `GEOID10`.\n",
   "        num_buckets: number of `D_SCORE` buckets to dissolve the tracts into.\n",
   "    \"\"\"\n",
   "    print(f\"Processing file {file_name}...\")\n",
   "    state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n",
   "    state_tracts = aggregate_to_tracts(state_merged_simplified)\n",
   "    state_tracts = create_buckets_from_tracts(state_tracts, num_buckets)\n",
   "    state_bucketed_df = aggregate_buckets(state_tracts, \"mean\")\n",
   "    compressed = breakup_multipolygons(state_bucketed_df, num_buckets)\n",
   "    write_to_file(compressed, file_name)"
  ], "outputs": [], "metadata": {} },
 { "cell_type": "code", "execution_count": null, "source": [
   "# All inputs and outputs live under ../data relative to the notebook's working directory.\n",
   "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
   "CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
   "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n",
   "# GEOID10 is read as text so that leading zeros are preserved.\n",
   "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
  ], "outputs": [], "metadata": { "id": "Ia5bqxS2LJqe" } },
 { "cell_type": "code", "execution_count": null, "source": [
"master_df = gpd.GeoDataFrame()" ], "outputs": [], "metadata": { "id": "Dtf5qD50JvCw" } }, { "cell_type": "code", "execution_count": null, "source": [ "for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n", " state_gdf = gpd.read_file(file_name)\n", " master_df = master_df.append(state_gdf)" ], "outputs": [], "metadata": { "id": "PNdw8bERJyKk" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")" ], "outputs": [], "metadata": { "id": "B5SS9y2pLwks" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df.shape" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(220742, 13)" ] }, "metadata": { "tags": [] }, "execution_count": 68 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_C6vaR9HQeLa", "outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df.head(2)" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " STATEFP10 ... geometry\n", "0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n", "1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n", "\n", "[2 rows x 13 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATEFP10COUNTYFP10TRACTCE10BLKGRPCE10GEOID10NAMELSAD10MTFCC10FUNCSTAT10ALAND10AWATER10INTPTLAT10INTPTLON10geometry
0010059505002010059505002Block Group 2G5030S191306077605058+31.7728221-085.3325011POLYGON ((-85.17240 31.82508, -85.17334 31.824...
1010059505001010059505001Block Group 1G5030S445746128952734+31.7523221-085.2009470POLYGON ((-85.16283 31.81051, -85.16284 31.813...
\n", "
" ] }, "metadata": { "tags": [] }, "execution_count": 69 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oMoubjqCQiw5", "outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "# The configuration cell already loaded CEJST_DATA_PATH into score_df with the\n",
   "# same dtype override, so reuse it rather than parsing the large CSV again.\n",
   "usa_df = score_df"
  ], "outputs": [], "metadata": { "id": "bAMmGSgzVml0" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "# Left join: every census shape is kept, including those without a score row.\n",
   "usa_merged = master_df.merge(usa_df, on=\"GEOID10\", how=\"left\")"
  ], "outputs": [], "metadata": { "id": "U7M7dExdV2Vh" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_merged.head(2)"
  ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " STATEFP10 COUNTYFP10 ... Score E (percentile) Score E (top 25th percentile)\n", "0 01 005 ... 0.576986 False\n", "1 01 005 ... 0.670349 False\n", "\n", "[2 rows x 98 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
STATEFP10COUNTYFP10TRACTCE10BLKGRPCE10GEOID10NAMELSAD10MTFCC10FUNCSTAT10ALAND10AWATER10INTPTLAT10INTPTLON10geometryHousing burden (percent)Total populationAir toxics cancer riskRespiratory hazard indexDiesel particulate matterParticulate matter (PM2.5)OzoneTraffic proximity and volumeProximity to RMP sitesProximity to TSDF sitesProximity to NPL sitesWastewater dischargePercent pre-1960s housing (lead paint indicator)Individuals under 5 years oldIndividuals over 64 years oldLinguistic isolation (percent)Percent of households in linguistic isolationPoverty (Less than 200% of federal poverty line)Percent individuals age 25 or over with less than high school degreeUnemployed civilians (percent)Housing + Transportation Costs % Income for the Regional Typical HouseholdGEOID10 (percentile)Housing burden (percent) (percentile)Total population (percentile)Air toxics cancer risk (percentile)Respiratory hazard index (percentile)Diesel particulate matter (percentile)...Air toxics cancer risk (min-max normalized)Respiratory hazard index (min-max normalized)Diesel particulate matter (min-max normalized)Particulate matter (PM2.5) (min-max normalized)Ozone (min-max normalized)Traffic proximity and volume (min-max normalized)Proximity to RMP sites (min-max normalized)Proximity to TSDF sites (min-max normalized)Proximity to NPL sites (min-max normalized)Wastewater discharge (min-max normalized)Percent pre-1960s housing (lead paint indicator) (min-max normalized)Individuals under 5 years old (min-max normalized)Individuals over 64 years old (min-max normalized)Linguistic isolation (percent) (min-max normalized)Percent of households in linguistic isolation (min-max normalized)Poverty (Less than 200% of federal poverty line) (min-max normalized)Percent individuals age 25 or over with less than high school degree (min-max normalized)Unemployed civilians (percent) (min-max normalized)Housing + Transportation Costs % Income for the Regional Typical Household (min-max normalized)Score 
AScore BSocioeconomic FactorsSensitive populationsEnvironmental effectsExposuresPollution BurdenPopulation CharacteristicsScore CScore DScore EScore A (percentile)Score A (top 25th percentile)Score B (percentile)Score B (top 25th percentile)Score C (percentile)Score C (top 25th percentile)Score D (percentile)Score D (top 25th percentile)Score E (percentile)Score E (top 25th percentile)
0010059505002010059505002Block Group 2G5030S191306077605058+31.7728221-085.3325011POLYGON ((-85.17240 31.82508, -85.17334 31.824...0.176565923.044.6364630.7840890.1217679.53605634.6600080.8802420.2951800.0237520.0192620.0506770.201770.0476710.2860240.0000000.0000000.2762730.1811020.15983664.00.0006310.254850.2729300.9442570.9820430.082062...0.0256910.1817890.0200390.4440970.1903630.0000230.0160430.0000540.0021431.179715e-070.201770.0908010.2860240.0000000.0000000.2762730.1811020.1598360.3220340.5972950.3352220.6388950.5356360.3818770.4942520.4567940.5872650.2682590.1491240.5298530.617238False0.61452False0.615988False0.565349False0.576986False
1010059505001010059505001Block Group 1G5030S445746128952734+31.7523221-085.2009470POLYGON ((-85.16283 31.81051, -85.16284 31.813...0.176565818.044.6364630.7840890.1217679.53605634.66000860.0554100.2321530.0277670.0180790.0071150.000000.0073350.2640590.0392610.0383690.3911980.1861470.05312580.00.0006260.254850.2007640.9442570.9820430.082062...0.0256910.1817890.0200390.4440970.1903630.0015980.0126180.0000630.0020111.656256e-080.000000.0139710.2640590.0392610.0383690.3911980.1861470.0531250.4124290.6938610.4778260.7283090.5575380.2644240.5304040.4417440.6429240.2840080.1596280.5893970.723269False0.73044False0.661758False0.608434False0.670349False
\n", "

2 rows × 98 columns

\n", "
" ] }, "metadata": { "tags": [] }, "execution_count": 72 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Sr25DUkxWVhg", "outputId": "1e804075-0f7d-4174-82d7-e21b8519c8bf" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs=\"EPSG:4326\")"
  ], "outputs": [], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ANMlAB8Qmtu8", "outputId": "44934741-90a9-4664-fab5-2c39b348d2be" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
  ], "outputs": [], "metadata": { "id": "PBPD9LQctvPJ" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "# Keep only the key, the score used for bucketing, and the shapes.\n",
   "usa_simplified = usa_merged[\n",
   "    [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
   "].reset_index(drop=True)"
  ], "outputs": [], "metadata": { "id": "qAAEr1z-WZAT" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "# Reassign instead of inplace=True so re-running this cell stays harmless.\n",
   "usa_simplified = usa_simplified.rename(\n",
   "    columns={\"Score D (percentile)\": \"D_SCORE\"}\n",
   ")"
  ], "outputs": [], "metadata": { "id": "SCNUjEbzWg-o" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_cbg_compressed = gpd.GeoDataFrame(\n",
   "    usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
   ")"
  ], "outputs": [], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ej70uX0AmW0J", "outputId": "88908f5e-b62d-494f-f0ea-649089b6652a" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
  ], "outputs": [], "metadata": { "id": "UE12dWmame3I" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_tracts = aggregate_to_tracts(usa_simplified)"
  ], "outputs": [], "metadata": { "id": "wWFBduQQXGtM" } },
 { "cell_type": "code", "execution_count": 76, "source": [
   "num_buckets = 10"
  ], "outputs": [], "metadata": { "id": "L-PTnEWOpDtX" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "tracts_compressed = gpd.GeoDataFrame(\n",
   "    usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
   ")"
  ], "outputs": [], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kTJafXcqXC01", "outputId": "bd197952-76b7-4f99-edef-983f20d7acfb" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
  ], "outputs": [], "metadata": { "id": "E2Nh97IlYhCF" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "# num_buckets is a required argument; without it this cell raises a TypeError\n",
   "# on a fresh Restart & Run All.\n",
   "usa_bucketed = create_buckets_from_tracts(usa_tracts, num_buckets)"
  ], "outputs": [], "metadata": { "id": "557zPMWFZC8R" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "usa_aggregated = aggregate_buckets(usa_bucketed, agg_func=\"mean\")"
  ], "outputs": [], "metadata": { "id": "k6RRdKlsaO0a" } },
 { "cell_type": "code", "execution_count": 80, "source": [
   "usa_aggregated.shape"
  ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(10, 2)" ] }, "metadata": { "tags": [] }, "execution_count": 80 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-cm5eET2pA1Z", "outputId": "8d5d2e80-ad62-41d5-f1b0-922345f92d62" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "compressed = breakup_multipolygons(usa_aggregated, num_buckets)"
  ], "outputs": [], "metadata": { "id": "4ZvJra-RaZ4v" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "len(compressed)"
  ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "36836" ] }, "metadata": { "tags": [] }, "execution_count": 82 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RDS7Q2WAb4Rx", "outputId": "dcd28a31-083d-482e-b000-b4cd1046d4c2" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "gdf_compressed = gpd.GeoDataFrame(\n",
   "    compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
   ")"
  ], "outputs": [], "metadata": { "id": "VXTv8UuXb-qU" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "gdf_compressed.shape"
  ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(36836, 2)" ] }, "metadata": { "tags": [] }, "execution_count": 84 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5v7TyB_rcRgT", "outputId": "997625cc-c57a-4335-9b27-a08e4f8ad117" } },
 { "cell_type": "code", "execution_count": null, "source": [
   "gdf_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_low.geojson\", driver=\"GeoJSON\")"
  ], "outputs": [], "metadata": { "id": "5eAnPL8McJpn" } }
], "metadata": { "colab": { "name": "Score_Dissolve_Script", "provenance": [] }, "kernelspec": { "display_name": "Python 3.9.5 ('.venv': venv)", "language": "python", "name": "python395jvsc74a57bd0935cbd69f49565f763db1e6a6adc70b468d078eb4d5856e64428cea33b57a041" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 2 }