Data Pipeline performance improvements for Census GeoJSON and Score file

2025-09-12 13:18:17 -07:00 · 2025-01-13 09:28:14 -05:00 · 2025-01-13 09:28:14 -05:00 · c32bd1f363
commit c32bd1f363
parent d5d055864f
37 changed files with 1305 additions and 1413 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -507,7 +507,7 @@ class CensusACSETL(ExtractTransformLoad):
        # geojson file for all of the US, this will read it off of S3
        logger.debug("Reading in geojson for the country")
        if not os.path.exists(
-            self.DATA_PATH / "census" / "geojson" / "us.json"
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
        ):
            logger.debug("Fetching Census data from AWS S3")
            unzip_file_from_url(
@@ -515,9 +515,8 @@ class CensusACSETL(ExtractTransformLoad):
                self.DATA_PATH / "tmp",
                self.DATA_PATH,
            )
-
-        self.geo_df = gpd.read_file(
-            self.DATA_PATH / "census" / "geojson" / "us.json",
+        self.geo_df = gpd.read_parquet(
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet",
        )

    def transform(self) -> None: