{ "cells": [ { "cell_type": "code", "execution_count": null, "source": [ "import pandas as pd\n", "import geopandas as gpd\n", "import math\n", "import pathlib\n", "import os\n", "import sys" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n", " state_gdf = gpd.read_file(file_name)\n", " state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n", " state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n", " state_merged_simplified = state_merged[\n", " [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n", " ].reset_index(drop=True)\n", " state_merged_simplified.rename(\n", " columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n", " )\n", " return state_merged_simplified\n", "\n", "\n", "def aggregate_to_tracts(block_group_df: pd.DataFrame):\n", " # The tract identifier is the first 11 digits of the GEOID\n", " block_group_df[\"tract\"] = block_group_df.apply(\n", " lambda row: row[\"GEOID10\"][0:11], axis=1\n", " )\n", " state_tracts = block_group_df.dissolve(by=\"tract\", aggfunc=\"mean\")\n", " return state_tracts\n", "\n", "\n", "def create_buckets_from_tracts(state_tracts: pd.DataFrame, num_buckets: int):\n", " # assign tracts to buckets by D_SCORE\n", " state_tracts.sort_values(\"D_SCORE\", inplace=True)\n", " D_SCORE_bucket = []\n", " num_buckets = num_buckets\n", " bucket_size = math.ceil(len(state_tracts.index) / num_buckets)\n", " for i in range(len(state_tracts.index)):\n", " D_SCORE_bucket.extend([math.floor(i / bucket_size)])\n", " state_tracts[\"D_SCORE_bucket\"] = D_SCORE_bucket\n", " return state_tracts\n", "\n", "\n", "def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n", " # dissolve tracts by bucket\n", " state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n", " drop=True\n", " )\n", " state_dissolve = state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n", " return state_dissolve\n", "\n", "\n", "def breakup_multipolygons(state_bucketed_df: pd.DataFrame, num_buckets: int):\n", " compressed = []\n", " for i in range(num_buckets):\n", " for j in range(len(state_bucketed_df[\"geometry\"][i].geoms)):\n", " compressed.append(\n", " [\n", " state_bucketed_df[\"D_SCORE\"][i],\n", " state_bucketed_df[\"geometry\"][i].geoms[j],\n", " ]\n", " )\n", " return compressed\n", "\n", "\n", "def write_to_file(compressed: pd.DataFrame, file_name: str):\n", " gdf_compressed = gpd.GeoDataFrame(\n", " compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n", " )\n", " gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n", "\n", "\n", "def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n", " print(f\"Processing file {file_name}...\")\n", " state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n", " state_tracts = aggregate_to_tracts(state_merged_simplified)\n", " state_tracts = create_buckets_from_tracts(state_tracts, num_buckets)\n", " state_bucketed_df = aggregate_buckets(state_tracts, \"mean\")\n", " compressed = breakup_multipolygons(state_bucketed_df, num_buckets)\n", " write_to_file(compressed, file_name)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", "CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n", "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n", "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})" ], "outputs": [], "metadata": { "id": "Ia5bqxS2LJqe" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df = gpd.GeoDataFrame()" ], "outputs": [], "metadata": { "id": "Dtf5qD50JvCw" } }, { "cell_type": "code", "execution_count": null, "source": [ "for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n", " state_gdf = gpd.read_file(file_name)\n", " master_df = master_df.append(state_gdf)" ], "outputs": [], "metadata": { "id": "PNdw8bERJyKk" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")" ], "outputs": [], "metadata": { "id": "B5SS9y2pLwks" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df.shape" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(220742, 13)" ] }, "metadata": { "tags": [] }, "execution_count": 68 } ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_C6vaR9HQeLa", "outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4" } }, { "cell_type": "code", "execution_count": null, "source": [ "master_df.head(2)" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " STATEFP10 ... geometry\n", "0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n", "1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n", "\n", "[2 rows x 13 columns]" ], "text/html": [ "
\n", " | STATEFP10 | \n", "COUNTYFP10 | \n", "TRACTCE10 | \n", "BLKGRPCE10 | \n", "GEOID10 | \n", "NAMELSAD10 | \n", "MTFCC10 | \n", "FUNCSTAT10 | \n", "ALAND10 | \n", "AWATER10 | \n", "INTPTLAT10 | \n", "INTPTLON10 | \n", "geometry | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "01 | \n", "005 | \n", "950500 | \n", "2 | \n", "010059505002 | \n", "Block Group 2 | \n", "G5030 | \n", "S | \n", "191306077 | \n", "605058 | \n", "+31.7728221 | \n", "-085.3325011 | \n", "POLYGON ((-85.17240 31.82508, -85.17334 31.824... | \n", "
1 | \n", "01 | \n", "005 | \n", "950500 | \n", "1 | \n", "010059505001 | \n", "Block Group 1 | \n", "G5030 | \n", "S | \n", "44574612 | \n", "8952734 | \n", "+31.7523221 | \n", "-085.2009470 | \n", "POLYGON ((-85.16283 31.81051, -85.16284 31.813... | \n", "
\n", " | STATEFP10 | \n", "COUNTYFP10 | \n", "TRACTCE10 | \n", "BLKGRPCE10 | \n", "GEOID10 | \n", "NAMELSAD10 | \n", "MTFCC10 | \n", "FUNCSTAT10 | \n", "ALAND10 | \n", "AWATER10 | \n", "INTPTLAT10 | \n", "INTPTLON10 | \n", "geometry | \n", "Housing burden (percent) | \n", "Total population | \n", "Air toxics cancer risk | \n", "Respiratory hazard index | \n", "Diesel particulate matter | \n", "Particulate matter (PM2.5) | \n", "Ozone | \n", "Traffic proximity and volume | \n", "Proximity to RMP sites | \n", "Proximity to TSDF sites | \n", "Proximity to NPL sites | \n", "Wastewater discharge | \n", "Percent pre-1960s housing (lead paint indicator) | \n", "Individuals under 5 years old | \n", "Individuals over 64 years old | \n", "Linguistic isolation (percent) | \n", "Percent of households in linguistic isolation | \n", "Poverty (Less than 200% of federal poverty line) | \n", "Percent individuals age 25 or over with less than high school degree | \n", "Unemployed civilians (percent) | \n", "Housing + Transportation Costs % Income for the Regional Typical Household | \n", "GEOID10 (percentile) | \n", "Housing burden (percent) (percentile) | \n", "Total population (percentile) | \n", "Air toxics cancer risk (percentile) | \n", "Respiratory hazard index (percentile) | \n", "Diesel particulate matter (percentile) | \n", "... | \n", "Air toxics cancer risk (min-max normalized) | \n", "Respiratory hazard index (min-max normalized) | \n", "Diesel particulate matter (min-max normalized) | \n", "Particulate matter (PM2.5) (min-max normalized) | \n", "Ozone (min-max normalized) | \n", "Traffic proximity and volume (min-max normalized) | \n", "Proximity to RMP sites (min-max normalized) | \n", "Proximity to TSDF sites (min-max normalized) | \n", "Proximity to NPL sites (min-max normalized) | \n", "Wastewater discharge (min-max normalized) | \n", "Percent pre-1960s housing (lead paint indicator) (min-max normalized) | \n", "Individuals under 5 years old (min-max normalized) | \n", "Individuals over 64 years old (min-max normalized) | \n", "Linguistic isolation (percent) (min-max normalized) | \n", "Percent of households in linguistic isolation (min-max normalized) | \n", "Poverty (Less than 200% of federal poverty line) (min-max normalized) | \n", "Percent individuals age 25 or over with less than high school degree (min-max normalized) | \n", "Unemployed civilians (percent) (min-max normalized) | \n", "Housing + Transportation Costs % Income for the Regional Typical Household (min-max normalized) | \n", "Score A | \n", "Score B | \n", "Socioeconomic Factors | \n", "Sensitive populations | \n", "Environmental effects | \n", "Exposures | \n", "Pollution Burden | \n", "Population Characteristics | \n", "Score C | \n", "Score D | \n", "Score E | \n", "Score A (percentile) | \n", "Score A (top 25th percentile) | \n", "Score B (percentile) | \n", "Score B (top 25th percentile) | \n", "Score C (percentile) | \n", "Score C (top 25th percentile) | \n", "Score D (percentile) | \n", "Score D (top 25th percentile) | \n", "Score E (percentile) | \n", "Score E (top 25th percentile) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "01 | \n", "005 | \n", "950500 | \n", "2 | \n", "010059505002 | \n", "Block Group 2 | \n", "G5030 | \n", "S | \n", "191306077 | \n", "605058 | \n", "+31.7728221 | \n", "-085.3325011 | \n", "POLYGON ((-85.17240 31.82508, -85.17334 31.824... | \n", "0.176565 | \n", "923.0 | \n", "44.636463 | \n", "0.784089 | \n", "0.121767 | \n", "9.536056 | \n", "34.660008 | \n", "0.880242 | \n", "0.295180 | \n", "0.023752 | \n", "0.019262 | \n", "0.050677 | \n", "0.20177 | \n", "0.047671 | \n", "0.286024 | \n", "0.000000 | \n", "0.000000 | \n", "0.276273 | \n", "0.181102 | \n", "0.159836 | \n", "64.0 | \n", "0.000631 | \n", "0.25485 | \n", "0.272930 | \n", "0.944257 | \n", "0.982043 | \n", "0.082062 | \n", "... | \n", "0.025691 | \n", "0.181789 | \n", "0.020039 | \n", "0.444097 | \n", "0.190363 | \n", "0.000023 | \n", "0.016043 | \n", "0.000054 | \n", "0.002143 | \n", "1.179715e-07 | \n", "0.20177 | \n", "0.090801 | \n", "0.286024 | \n", "0.000000 | \n", "0.000000 | \n", "0.276273 | \n", "0.181102 | \n", "0.159836 | \n", "0.322034 | \n", "0.597295 | \n", "0.335222 | \n", "0.638895 | \n", "0.535636 | \n", "0.381877 | \n", "0.494252 | \n", "0.456794 | \n", "0.587265 | \n", "0.268259 | \n", "0.149124 | \n", "0.529853 | \n", "0.617238 | \n", "False | \n", "0.61452 | \n", "False | \n", "0.615988 | \n", "False | \n", "0.565349 | \n", "False | \n", "0.576986 | \n", "False | \n", "
1 | \n", "01 | \n", "005 | \n", "950500 | \n", "1 | \n", "010059505001 | \n", "Block Group 1 | \n", "G5030 | \n", "S | \n", "44574612 | \n", "8952734 | \n", "+31.7523221 | \n", "-085.2009470 | \n", "POLYGON ((-85.16283 31.81051, -85.16284 31.813... | \n", "0.176565 | \n", "818.0 | \n", "44.636463 | \n", "0.784089 | \n", "0.121767 | \n", "9.536056 | \n", "34.660008 | \n", "60.055410 | \n", "0.232153 | \n", "0.027767 | \n", "0.018079 | \n", "0.007115 | \n", "0.00000 | \n", "0.007335 | \n", "0.264059 | \n", "0.039261 | \n", "0.038369 | \n", "0.391198 | \n", "0.186147 | \n", "0.053125 | \n", "80.0 | \n", "0.000626 | \n", "0.25485 | \n", "0.200764 | \n", "0.944257 | \n", "0.982043 | \n", "0.082062 | \n", "... | \n", "0.025691 | \n", "0.181789 | \n", "0.020039 | \n", "0.444097 | \n", "0.190363 | \n", "0.001598 | \n", "0.012618 | \n", "0.000063 | \n", "0.002011 | \n", "1.656256e-08 | \n", "0.00000 | \n", "0.013971 | \n", "0.264059 | \n", "0.039261 | \n", "0.038369 | \n", "0.391198 | \n", "0.186147 | \n", "0.053125 | \n", "0.412429 | \n", "0.693861 | \n", "0.477826 | \n", "0.728309 | \n", "0.557538 | \n", "0.264424 | \n", "0.530404 | \n", "0.441744 | \n", "0.642924 | \n", "0.284008 | \n", "0.159628 | \n", "0.589397 | \n", "0.723269 | \n", "False | \n", "0.73044 | \n", "False | \n", "0.661758 | \n", "False | \n", "0.608434 | \n", "False | \n", "0.670349 | \n", "False | \n", "
2 rows × 98 columns
\n", "