Data Pipeline performance improvements for Census GeoJSON and Score file

Carlos Felix 2025-01-13 09:28:14 -05:00 committed by Carlos Felix
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions
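
The recurring change across these files swaps text GeoJSON artifacts (us.json, {fips_code}.json) for GeoParquet files (us_geo.parquet, {fips_code}.parquet) read and written through geopandas. A minimal sketch of the adopted pattern, assuming pyarrow is installed (geopandas' Parquet I/O depends on it); the file names below are illustrative, not the pipeline's:

    import geopandas as gpd

    gdf = gpd.read_file("tl_2010_01_tract10.shp")  # parse the shapefile once
    gdf.to_parquet("01.parquet")                   # columnar binary write; the CRS is stored in the file
    same = gpd.read_parquet("01.parquet")          # reload without re-parsing text geometry
    assert same.crs == gdf.crs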


@@ -1,10 +1,9 @@
 import csv
-import json
-import subprocess
 from enum import Enum
 from pathlib import Path
 import geopandas as gpd
+import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
@@ -26,8 +25,8 @@ class CensusETL(ExtractTransformLoad):
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     NATIONAL_TRACT_CSV_PATH = CSV_BASE_PATH / "us.csv"
-    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us.json"
-    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
+    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us_geo.parquet"
+    GEOID_TRACT_FIELD_NAME: str = "GEOID10"

     def __init__(self):
@@ -59,7 +58,7 @@ class CensusETL(ExtractTransformLoad):
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
         elif file_type == GeoFileType.GEOJSON:
-            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.json")
+            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.parquet")
         elif file_type == GeoFileType.CSV:
             file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path
@@ -93,14 +92,8 @@ class CensusETL(ExtractTransformLoad):
         )
         if not geojson_file_path.is_file():
-            cmd = [
-                "ogr2ogr",
-                "-f",
-                "GeoJSON",
-                str(geojson_file_path),
-                str(shp_file_path),
-            ]
-            subprocess.run(cmd, check=True)
+            gdf = gpd.read_file(shp_file_path)
+            gdf.to_parquet(geojson_file_path)

     def _generate_tract_table(self) -> None:
         """Generate Tract CSV table for pandas, load in memory
@@ -110,20 +103,15 @@
         """
         logger.debug("Transforming tracts")
-        for file in self.GEOJSON_BASE_PATH.iterdir():
-            if file.suffix == ".json":
-                logger.debug(f"Adding GEOID10 for file {file.name}")
-                with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
-                    geojson = json.load(f)
-                for feature in geojson["features"]:
-                    tractid10 = feature["properties"]["GEOID10"]
-                    self.TRACT_NATIONAL.append(str(tractid10))
-                    tractid10_state_id = tractid10[:2]
-                    if not self.TRACT_PER_STATE.get(tractid10_state_id):
-                        self.TRACT_PER_STATE[tractid10_state_id] = []
-                    self.TRACT_PER_STATE[tractid10_state_id].append(
-                        tractid10
-                    )
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file in files:
+            logger.debug(f"Adding GEOID10 for file {file.name}")
+            state_df = gpd.read_parquet(file)
+            tract_list = state_df["GEOID10"].to_list()
+            self.TRACT_NATIONAL.extend(tract_list)
+            tractid10_state_id = state_df["STATEFP10"][0]
+            self.TRACT_PER_STATE[tractid10_state_id] = tract_list

     def transform(self) -> None:
         """Download all census shape files from the Census FTP and extract the geojson
@@ -210,18 +198,24 @@ class CensusETL(ExtractTransformLoad):
         usa_df = gpd.GeoDataFrame()
-        for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
             # Read state only files and append them into a MEGA US GPD
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file_name in files:
             logger.debug(f"Adding national GeoJSON file {file_name.name}")
-            state_gdf = gpd.read_file(file_name)
-            usa_df = usa_df.append(state_gdf)
+            state_gdf = gpd.read_parquet(file_name)
+            usa_df = pd.concat([usa_df, state_gdf], ignore_index=True)

+        assert len(usa_df.columns) > 0
         logger.debug("Converting to CRS")
-        usa_df = usa_df.to_crs(
-            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
-        )
+        usa_df = usa_df.to_crs("EPSG:4326")

         logger.debug("Saving national GeoJSON file")
-        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
+        # Convert tract ID to a string
+        usa_df[self.GEOID_TRACT_FIELD_NAME] = usa_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ].astype(str, errors="ignore")
+        usa_df.to_parquet(self.NATIONAL_TRACT_JSON_PATH)

     def load(self) -> None:
         """Create state CSVs, National CSV, and National GeoJSON


@@ -104,7 +104,7 @@ def check_census_data_source(
         )
     else:
         # check if census data is found locally
-        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
+        if not os.path.isfile(census_data_path / "geojson" / "us_geo.parquet"):
             logger.error(
                 "No local census data found. Please use '-s aws` to fetch from AWS"
             )


@@ -507,7 +507,7 @@ class CensusACSETL(ExtractTransformLoad):
         # geojson file for all of the US, this will read it off of S3
         logger.debug("Reading in geojson for the country")
         if not os.path.exists(
-            self.DATA_PATH / "census" / "geojson" / "us.json"
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
         ):
             logger.debug("Fetching Census data from AWS S3")
             unzip_file_from_url(
@@ -515,9 +515,8 @@ class CensusACSETL(ExtractTransformLoad):
                 self.DATA_PATH / "tmp",
                 self.DATA_PATH,
             )
-        self.geo_df = gpd.read_file(
-            self.DATA_PATH / "census" / "geojson" / "us.json",
+        self.geo_df = gpd.read_parquet(
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet",
         )

     def transform(self) -> None:
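
Usage note (an observation about the format, not stated in the commit): read_parquet restores the CRS that to_parquet recorded, so the national file comes back in the projection CensusETL wrote without any extra bookkeeping:

    import geopandas as gpd

    geo_df = gpd.read_parquet("census/geojson/us_geo.parquet")  # path as used above
    print(geo_df.crs)  # EPSG:4326, recorded when CensusETL wrote the file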


@@ -33,7 +33,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         / f"census_decennial_{DECENNIAL_YEAR}"
     )
     CENSUS_GEOJSON_PATH = (
-        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
+        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
     )

     def __get_api_url(
@@ -148,7 +148,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         """Impute income for both income measures."""
         # Merges Census geojson to imput values from.
         logger.debug(f"Reading GeoJSON from {geojson_path}")
-        geo_df = gpd.read_file(geojson_path)
+        geo_df = gpd.read_parquet(geojson_path)
         self.df_all = CensusACSETL.merge_geojson(
             df=self.df_all,
             usa_geo_df=geo_df,


@@ -26,10 +26,7 @@ def get_tract_geojson(
     census_etl.extract()
     census_etl.transform()
     census_etl.load()
-    tract_data = gpd.read_file(
-        GEOJSON_PATH,
-        include_fields=["GEOID10"],
-    )
+    tract_data = gpd.read_parquet(GEOJSON_PATH)
    tract_data = tract_data.rename(
        columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
    )
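
The dropped include_fields=["GEOID10"] narrowed the old GeoJSON read; the Parquet equivalent, if the same selectivity is ever wanted, is the columns argument of read_parquet (a suggestion, not part of this commit; the path stands in for GEOJSON_PATH):

    import geopandas as gpd

    tract_data = gpd.read_parquet(
        "census/geojson/us_geo.parquet",
        columns=["GEOID10", "geometry"],  # keep "geometry" so the result stays a GeoDataFrame
    )
    tract_data = tract_data.rename(columns={"GEOID10": "GEOID10_TRACT"}, errors="raise")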