In [1]:
import pandas as pd
import geopandas as gpd
import math
import pathlib
import os
import sys

In [2]:
# Put the parent directory on the import search path, but only once.
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):
    """Load one geographic file and attach the Score D percentile to it.

    The file is read with geopandas, reprojected to WGS84 longitude/latitude,
    left-joined to ``usa_df`` on ``GEOID10`` and reduced to three columns.

    Args:
        file_name: path of the geographic file to read.
        usa_df: score table containing at least ``GEOID10`` and
            ``Score D (percentile)``.

    Returns:
        A frame with the columns ``GEOID10``, ``D_SCORE`` and ``geometry``
        and a fresh 0..n-1 index.
    """
    wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
    kept_columns = ["GEOID10", "Score D (percentile)", "geometry"]

    shapes = gpd.read_file(file_name).to_crs(wgs84)
    # Left join: every shape is kept, even when it has no matching score row.
    scored = shapes.merge(usa_df, on="GEOID10", how="left")
    trimmed = scored[kept_columns].reset_index(drop=True)
    return trimmed.rename(columns={"Score D (percentile)": "D_SCORE"})


def aggregate_to_tracts(block_group_df: pd.DataFrame):
    """Dissolve block-group shapes into tracts, averaging their attributes.

    Args:
        block_group_df: frame with a string ``GEOID10`` column; it must
            provide ``dissolve`` (i.e. be a GeoDataFrame).

    Returns:
        The dissolved frame, indexed by ``tract``, with the remaining
        attribute columns aggregated by mean. The input frame is left
        unmodified.
    """
    # The tract identifier is the first 11 digits of the GEOID
    tract_ids = block_group_df["GEOID10"].str[:11]
    # GEOID10 is a string identifier: it cannot be averaged (pandas >= 2.0
    # raises instead of silently dropping it) and the tract key replaces it.
    # drop()/assign() return new frames, so the caller's data is not mutated.
    with_tract = block_group_df.drop(columns="GEOID10").assign(tract=tract_ids)
    return with_tract.dissolve(by="tract", aggfunc="mean")


def create_buckets_from_tracts(state_tracts: pd.DataFrame, num_buckets: int):
    """Sort tracts by ``D_SCORE`` and label them with rank-based buckets.

    Each bucket holds ``ceil(n / num_buckets)`` consecutive rows of the
    sorted frame, so the last bucket may be smaller and fewer than
    ``num_buckets`` distinct buckets can be produced.

    Args:
        state_tracts: frame with a ``D_SCORE`` column.
        num_buckets: requested number of buckets; must be at least 1.

    Returns:
        A copy of ``state_tracts`` sorted by ``D_SCORE`` with an added
        integer ``D_SCORE_bucket`` column. The input frame is not modified.

    Raises:
        ValueError: if ``num_buckets`` is smaller than 1.
    """
    if num_buckets < 1:
        raise ValueError("num_buckets must be a positive integer")

    # sort_values returns a new frame, so the caller's frame keeps its order.
    ranked = state_tracts.sort_values("D_SCORE")
    row_count = len(ranked.index)
    bucket_size = math.ceil(row_count / num_buckets)
    # For an empty frame bucket_size is 0, but the range is empty as well,
    # so no division takes place.
    ranked["D_SCORE_bucket"] = [
        position // bucket_size for position in range(row_count)
    ]
    return ranked


def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):
    """Merge all tracts that share a bucket into one row per bucket.

    Args:
        state_tracts: frame with ``D_SCORE``, ``D_SCORE_bucket`` and
            ``geometry`` columns; it must provide ``dissolve``.
        agg_func: aggregation passed to ``dissolve`` for the non-geometry
            columns (for example ``"mean"``).

    Returns:
        The dissolved frame, indexed by ``D_SCORE_bucket``.
    """
    wanted = ["D_SCORE", "D_SCORE_bucket", "geometry"]
    # The previous index is discarded so only the three selected columns
    # take part in the dissolve.
    bucket_input = state_tracts[wanted].reset_index(drop=True)
    return bucket_input.dissolve(by="D_SCORE_bucket", aggfunc=agg_func)


def breakup_multipolygons(state_bucketed_df: pd.DataFrame, num_buckets: int):
    """Flatten each bucket's geometry into one ``[D_SCORE, part]`` row per part.

    Args:
        state_bucketed_df: frame indexed by bucket label with ``D_SCORE``
            and ``geometry`` columns.
        num_buckets: bucket labels ``0 .. num_buckets - 1`` are visited.

    Returns:
        A list of ``[D_SCORE, geometry_part]`` lists, in bucket order.
    """
    scores = state_bucketed_df["D_SCORE"]
    geometries = state_bucketed_df["geometry"]
    compressed = []
    for bucket in range(num_buckets):
        # Bucketing uses ceil(n / num_buckets) rows per bucket, so fewer than
        # num_buckets labels may exist; skip the missing ones instead of
        # failing with a KeyError.
        if bucket not in geometries.index:
            continue
        score = scores[bucket]
        geometry = geometries[bucket]
        # Multi-part geometries expose their members through ``geoms``; a
        # geometry without that attribute is kept as a single part.
        parts = getattr(geometry, "geoms", None)
        if parts is None:
            parts = [geometry]
        compressed.extend([score, part] for part in parts)
    return compressed


def write_to_file(compressed: pd.DataFrame, file_name: str):
    """Write bucketed polygons to ``<file_name>_low.geojson`` as GeoJSON.

    Args:
        compressed: rows of ``[D_SCORE, geometry]``. The annotation says
            DataFrame, but ``process_file`` passes the list returned by
            ``breakup_multipolygons``.
        file_name: name interpolated into the output file name, which is
            joined onto ``CENSUS_GEOJSON_DIR``.
    """
    frame = gpd.GeoDataFrame(
        compressed,
        columns=["D_SCORE", "geometry"],
        crs="EPSG:4326",
    )
    # NOTE(review): file_name is interpolated as-is, and process_file passes
    # its input path here - confirm the resulting location is the intended one.
    output_path = CENSUS_GEOJSON_DIR / f"{file_name}_low.geojson"
    frame.to_file(output_path, driver="GeoJSON")


def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):
    """Run the full low-resolution pipeline for one geographic file.

    Steps: merge scores onto the shapes, dissolve to tracts, bucket the
    tracts by ``D_SCORE``, dissolve each bucket (mean), split the result
    into single parts and write it out.

    Args:
        file_name: input file; also used to name the output file.
        usa_df: score table joined onto the shapes.
        num_buckets: number of score buckets to create.
    """
    print(f"Processing file {file_name}...")
    tracts = aggregate_to_tracts(merge_and_simplify_file(file_name, usa_df))
    bucketed_tracts = create_buckets_from_tracts(tracts, num_buckets)
    polygons = breakup_multipolygons(
        aggregate_buckets(bucketed_tracts, "mean"), num_buckets
    )
    write_to_file(polygons, file_name)

In [6]:
# Input/output locations, resolved from the parent of the working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")
CENSUS_GEOJSON_DIR = DATA_DIR.joinpath("census", "geojson")
CEJST_DATA_PATH = DATA_DIR.joinpath("score", "csv", "tiles", "usa.csv")
# GEOID10 is read with dtype "object" instead of an inferred type.
score_df = pd.read_csv(
    CEJST_DATA_PATH,
    low_memory=False,
    dtype={"GEOID10": "object"},
)

In [7]:
# Empty GeoDataFrame; a later cell rebinds master_df to the loaded census shapes.
master_df = gpd.GeoDataFrame()

In [8]:
# Preview the first rows of master_df.
master_df.head()

In [None]:
# Read every matching file first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
# Rebuilding from the file list also keeps this cell safe to re-run.
state_frames = [
    gpd.read_file(file_name) for file_name in CENSUS_GEOJSON_DIR.rglob("*.json")
]
# pd.concat raises on an empty list, so master_df is left as-is when no file matched.
if state_frames:
    master_df = pd.concat(state_frames)

In [None]:
# Reproject all shapes to WGS84 longitude/latitude.
master_df = master_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [None]:
# Row and column counts of the combined frame.
master_df.shape

In [None]:
# Preview the first two rows of the combined frame.
master_df.head(2)

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLKGRPCE10,GEOID10,NAMELSAD10,MTFCC10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry
0,1,5,950500,2,10059505002,Block Group 2,G5030,S,191306077,605058,31.7728221,-85.3325011,"POLYGON ((-85.17240 31.82508, -85.17334 31.824..."
1,1,5,950500,1,10059505001,Block Group 1,G5030,S,44574612,8952734,31.7523221,-85.200947,"POLYGON ((-85.16283 31.81051, -85.16284 31.813..."


In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns. GEOID10 is read
# with dtype "object" instead of an inferred type.
usa_df = pd.read_csv(
    CEJST_DATA_PATH, dtype={"GEOID10": "object"}, low_memory=False
)

In [None]:
# Left join on GEOID10: every shape in master_df is kept, even without a score row.
usa_merged = master_df.merge(usa_df, on="GEOID10", how="left")

In [None]:
# Preview the first two rows of the merged frame.
usa_merged.head(2)

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLKGRPCE10,GEOID10,NAMELSAD10,MTFCC10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry,Housing burden (percent),Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter,Particulate matter (PM2.5),Ozone,Traffic proximity and volume,Proximity to RMP sites,Proximity to TSDF sites,Proximity to NPL sites,Wastewater discharge,Percent pre-1960s housing (lead paint indicator),Individuals under 5 years old,Individuals over 64 years old,Linguistic isolation (percent),Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Percent individuals age 25 or over with less than high school degree,Unemployed civilians (percent),Housing + Transportation Costs % Income for the Regional Typical Household,GEOID10 (percentile),Housing burden (percent) (percentile),Total population (percentile),Air toxics cancer risk (percentile),Respiratory hazard index (percentile),Diesel particulate matter (percentile),...,Air toxics cancer risk (min-max normalized),Respiratory hazard index (min-max normalized),Diesel particulate matter (min-max normalized),Particulate matter (PM2.5) (min-max normalized),Ozone (min-max normalized),Traffic proximity and volume (min-max normalized),Proximity to RMP sites (min-max normalized),Proximity to TSDF sites (min-max normalized),Proximity to NPL sites (min-max normalized),Wastewater discharge (min-max normalized),Percent pre-1960s housing (lead paint indicator) (min-max normalized),Individuals under 5 years old (min-max normalized),Individuals over 64 years old (min-max normalized),Linguistic isolation (percent) (min-max normalized),Percent of households in linguistic isolation (min-max normalized),Poverty (Less than 200% of federal poverty line) (min-max normalized),Percent individuals age 25 or over with less than high school degree (min-max normalized),Unemployed civilians (percent) (min-max normalized),Housing + Transportation Costs % 
Income for the Regional Typical Household (min-max normalized),Score A,Score B,Socioeconomic Factors,Sensitive populations,Environmental effects,Exposures,Pollution Burden,Population Characteristics,Score C,Score D,Score E,Score A (percentile),Score A (top 25th percentile),Score B (percentile),Score B (top 25th percentile),Score C (percentile),Score C (top 25th percentile),Score D (percentile),Score D (top 25th percentile),Score E (percentile),Score E (top 25th percentile)
0,1,5,950500,2,10059505002,Block Group 2,G5030,S,191306077,605058,31.7728221,-85.3325011,"POLYGON ((-85.17240 31.82508, -85.17334 31.824...",0.176565,923.0,44.636463,0.784089,0.121767,9.536056,34.660008,0.880242,0.29518,0.023752,0.019262,0.050677,0.20177,0.047671,0.286024,0.0,0.0,0.276273,0.181102,0.159836,64.0,0.000631,0.25485,0.27293,0.944257,0.982043,0.082062,...,0.025691,0.181789,0.020039,0.444097,0.190363,2.3e-05,0.016043,5.4e-05,0.002143,1.179715e-07,0.20177,0.090801,0.286024,0.0,0.0,0.276273,0.181102,0.159836,0.322034,0.597295,0.335222,0.638895,0.535636,0.381877,0.494252,0.456794,0.587265,0.268259,0.149124,0.529853,0.617238,False,0.61452,False,0.615988,False,0.565349,False,0.576986,False
1,1,5,950500,1,10059505001,Block Group 1,G5030,S,44574612,8952734,31.7523221,-85.200947,"POLYGON ((-85.16283 31.81051, -85.16284 31.813...",0.176565,818.0,44.636463,0.784089,0.121767,9.536056,34.660008,60.05541,0.232153,0.027767,0.018079,0.007115,0.0,0.007335,0.264059,0.039261,0.038369,0.391198,0.186147,0.053125,80.0,0.000626,0.25485,0.200764,0.944257,0.982043,0.082062,...,0.025691,0.181789,0.020039,0.444097,0.190363,0.001598,0.012618,6.3e-05,0.002011,1.656256e-08,0.0,0.013971,0.264059,0.039261,0.038369,0.391198,0.186147,0.053125,0.412429,0.693861,0.477826,0.728309,0.557538,0.264424,0.530404,0.441744,0.642924,0.284008,0.159628,0.589397,0.723269,False,0.73044,False,0.661758,False,0.608434,False,0.670349,False


In [None]:
# Wrap the merged frame as a GeoDataFrame tagged EPSG:4326.
# NOTE(review): master_df was reprojected with a PROJ string while "EPSG:4326"
# is passed here - confirm geopandas treats the two as the same CRS.
usa_merged_compressed = gpd.GeoDataFrame(usa_merged, crs="EPSG:4326")

In [None]:
# Write the merged frame to usa_merged.geojson in the census GeoJSON directory.
usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / "usa_merged.geojson", driver="GeoJSON")

In [None]:
# Keep only the identifier, the Score D percentile and the geometry, with a fresh index.
usa_simplified = usa_merged[
        ["GEOID10", "Score D (percentile)", "geometry"]
    ].reset_index(drop=True)

In [None]:
# Shorten the score column name to D_SCORE.
usa_simplified = usa_simplified.rename(
    columns={"Score D (percentile)": "D_SCORE"}
)

In [None]:
# Block-group level output: only D_SCORE and geometry are selected (GEOID10 is left out).
usa_cbg_compressed = gpd.GeoDataFrame(
        usa_simplified, columns=["D_SCORE", "geometry"], crs="EPSG:4326"
    )

In [None]:
# Write the block-group scores to usa_cbg_scoreD.geojson.
usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / "usa_cbg_scoreD.geojson", driver="GeoJSON")

In [None]:
# Dissolve block groups into tracts (tract = first 11 characters of GEOID10).
usa_tracts = aggregate_to_tracts(usa_simplified)

In [76]:
# Number of D_SCORE buckets used by the bucketing cells below.
num_buckets = 10

In [None]:
# Tract level output: only D_SCORE and geometry are selected.
tracts_compressed = gpd.GeoDataFrame(
        usa_tracts, columns=["D_SCORE", "geometry"], crs="EPSG:4326"
    )

In [None]:
# Write the tract scores to usa_tracts_score.geojson.
tracts_compressed.to_file(CENSUS_GEOJSON_DIR / "usa_tracts_score.geojson", driver="GeoJSON")

In [None]:
# create_buckets_from_tracts requires the bucket count; calling it without
# num_buckets raises a TypeError.
usa_bucketed = create_buckets_from_tracts(usa_tracts, num_buckets)

In [None]:
# Dissolve the tracts of each bucket into one row, averaging D_SCORE.
usa_aggregated = aggregate_buckets(usa_bucketed, agg_func="mean")

In [80]:
# Row and column counts of the per-bucket frame.
usa_aggregated.shape

(10, 2)

In [None]:
# Split each bucket's geometry into individual [D_SCORE, part] rows.
compressed = breakup_multipolygons(usa_aggregated, num_buckets)

In [None]:
# Number of [D_SCORE, part] rows produced.
len(compressed)

36836

In [None]:
# Turn the list of [D_SCORE, part] rows into a GeoDataFrame tagged EPSG:4326.
gdf_compressed = gpd.GeoDataFrame(
        compressed, columns=["D_SCORE", "geometry"], crs="EPSG:4326"
    )

In [None]:
# Row and column counts of the final frame.
gdf_compressed.shape

(36836, 2)

In [None]:
# Write the bucketed polygons to usa_low.geojson (plain string: the f-string
# prefix had no placeholders to fill).
gdf_compressed.to_file(CENSUS_GEOJSON_DIR / "usa_low.geojson", driver="GeoJSON")