Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-22 09:41:26 -08:00)
Data Pipeline performance improvements for Census GeoJSON and Score file
parent d5d055864f
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions
.github/workflows/deploy_backend_main.yml (vendored, 7 changes)

@@ -59,12 +59,6 @@ jobs:
         with:
           path: data/data-pipeline/data_pipeline/data/census
           key: data-census
-      - name: Install GDAL/ogr2ogr
-        if: steps.cache-census.outputs.cache-hit != 'true'
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install gdal-bin
-          ogrinfo --version
       - name: Get Census Data
         if: steps.cache-census.outputs.cache-hit != 'true'
         run: |
@@ -72,7 +66,6 @@ jobs:
       - name: Run ETL
         run: |
           poetry run python3 -m data_pipeline.application etl-run
-          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
       - name: Generate Score
         run: |
           poetry run python3 -m data_pipeline.application score-run
.github/workflows/pr_backend.yml (vendored, 6 changes)

@@ -98,11 +98,6 @@ jobs:
       - name: Install dependencies
         run: poetry add s4cmd && poetry install
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-      - name: Install GDAL/ogr2ogr
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install gdal-bin
-          ogrinfo --version
       - name: Load cached ETL data
         id: cached-etl-data
         uses: actions/cache@v4
@@ -119,7 +114,6 @@ jobs:
         if: steps.cached-etl-data.outputs.cache-hit != 'true'
         run: |
           poetry run python3 -m data_pipeline.application etl-run
-          poetry run python3 -m data_pipeline.application etl-run --dataset tribal
       - name: Generate Score
         run: |
           poetry run python3 -m data_pipeline.application score-run
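With the tribal dataset folded into `DATASET_LIST` (see the constants change further down), a single `etl-run` pass appears to cover tribal data as well, which is why both workflows drop the separate `--dataset tribal` invocation. A minimal sketch of the consolidated call, assuming it is run from `data/data-pipeline` with the Poetry environment installed:

```python
# Minimal sketch, assuming the Poetry environment for data/data-pipeline is installed.
# One etl-run pass now includes the tribal dataset, so no second invocation is needed.
import subprocess

subprocess.run(
    ["poetry", "run", "python3", "-m", "data_pipeline.application", "etl-run"],
    check=True,
)
```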
data/data-pipeline/.vscode/launch.json (vendored, 157 changes)

@@ -4,27 +4,9 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-        {
-            "name": "Score Run",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "score-run"
-            ]
-        },
-        {
-            "name": "Generate Score Post",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-score-post"
-            ]
-        },
         {
             "name": "Data Cleanup",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -33,7 +15,7 @@
         },
         {
             "name": "Census Cleanup",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -42,73 +24,25 @@
         },
         {
             "name": "Download Census",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
-                "census-data-download"
+                "census-data-download", "-u"
             ]
         },
-        {
-            "name": "Score Full Run",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "score-full-run"
-            ]
-        },
-        {
-            "name": "Combine Score and GeoJSON",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "geo-score"
-            ]
-        },
-        {
-            "name": "Generate Score Tiles",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-map-tiles"
-            ]
-        },
-        {
-            "name": "Generate Tribal Tiles",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "generate-map-tiles",
-                "-t"
-            ]
-        },
         {
             "name": "ETL Run",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
                 "etl-run"
             ]
         },
-        {
-            "name": "ETL Run NRI",
-            "type": "python",
-            "request": "launch",
-            "module": "data_pipeline.application",
-            "args": [
-                "etl-run",
-                "--dataset",
-                "national_risk_index"
-            ]
-        },
         {
             "name": "ETL Run Tribal",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
@@ -117,18 +51,91 @@
                 "tribal"
             ]
         },
+        {
+            "name": "Score Run",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "score-run"
+            ]
+        },
+        {
+            "name": "Combine Score and GeoJSON",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "geo-score"
+            ]
+        },
+        {
+            "name": "Generate Score Post",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-score-post"
+            ]
+        },
+        {
+            "name": "Generate Score Tiles",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-map-tiles"
+            ]
+        },
+        {
+            "name": "Generate Tribal Tiles",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "generate-map-tiles",
+                "-t"
+            ]
+        },
+        {
+            "name": "Score Full Run",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "score-full-run"
+            ]
+        },
         {
             "name": "Data Full Run",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "data_pipeline.application",
             "args": [
                 "data-full-run",
             ]
         },
+        {
+            "name": "Comparator",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.comparator",
+            "args": [
+                "compare-score",
+            ]
+        },
+        {
+            "name": "Convert score to CSV",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "data_pipeline.application",
+            "args": [
+                "convert-score",
+            ]
+        },
         {
             "name": "poetry install",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "poetry",
             "args": [
@@ -137,7 +144,7 @@
         },
         {
             "name": "poetry update",
-            "type": "python",
+            "type": "debugpy",
             "request": "launch",
             "module": "poetry",
             "args": [
@@ -58,7 +58,6 @@ The application requires the installation of three 3rd party tools.

 | Tool | Purpose | Link |
 | --------------- | -------------------- | --------------------------------------------------------- |
 | GDAL | Generate census data | [GDAL library](https://github.com/OSGeo/gdal) |
 | libspatialindex | Score generation | [libspatialindex](https://libspatialindex.org/en/latest/) |
 | tippecanoe | Generate map tiles | [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe) |

@@ -66,7 +65,6 @@ The application requires the installation of three 3rd party tools.

 Use Homebrew to install the three tools.

 - GDAL: `brew install gdal`
 - libspatialindex: `brew install spatialindex`
 - tippecanoe: `brew install tippecanoe`

@@ -117,7 +117,7 @@ Begin the process of running the application in your local environment by downlo

 To download census data, run the command `poetry run python3 data_pipeline/application.py census-data-download`.

-If you have a high speed internet connection and don't want to generate the census data or install `GDAL` locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.
+If you have a high speed internet connection and don't want to generate the census data locally, you can download [a zip version of the Census file](https://justice40-data.s3.amazonaws.com/data-sources/census.zip). Unzip and move the contents inside the `data/data-pipeline/data_pipeline/data/census` folder.

 #### Run the Application
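For readers following the README change above, here is a small sketch of the download-and-unzip path it describes. The URL and destination folder are the ones quoted in the README; the `requests` package is an assumption and not part of the documented steps.

```python
# Sketch of the "download the prebuilt census data" path described above.
# Assumes the `requests` package is available; URL and destination come from the README.
import io
import zipfile

import requests

CENSUS_ZIP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/census.zip"
DESTINATION = "data/data-pipeline/data_pipeline/data/census"

response = requests.get(CENSUS_ZIP_URL, timeout=600)
response.raise_for_status()
zipfile.ZipFile(io.BytesIO(response.content)).extractall(DESTINATION)
```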
@@ -1,4 +1,7 @@
 import sys
+import os
+import pandas as pd
+from pathlib import Path
 from subprocess import call
 
 import click
@@ -19,6 +22,7 @@ from data_pipeline.etl.sources.tribal.etl_utils import (
     reset_data_directories as tribal_reset,
 )
 from data_pipeline.tile.generate import generate_tiles
+from data_pipeline.etl.score import constants
 from data_pipeline.utils import check_first_run
 from data_pipeline.utils import data_folder_cleanup
 from data_pipeline.utils import downloadable_cleanup
@@ -330,25 +334,11 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
     temp_folder_cleanup()
     tribal_reset(data_path)
 
-    if data_source == "local":
-        log_info("Downloading census data")
-        etl_runner("census", use_cache)
-
-        log_info("Running all ETLs")
-        etl_runner(use_cache=True)
-
-        log_info("Running tribal ETL")
-        etl_runner("tribal", use_cache)
-
-    else:
-        log_info("Downloading census data")
-        etl_runner("census", use_cache=False)
-
-        log_info("Running all ETLs")
-        etl_runner(use_cache=False)
-
-        log_info("Running tribal ETL")
-        etl_runner("tribal", use_cache=False)
+    log_info("Downloading census data")
+    etl_runner("census", use_cache)
+
+    log_info("Running all ETLs")
+    etl_runner(use_cache)
 
     log_info("Generating score")
     score_generate()
@@ -467,10 +457,41 @@ def full_run(ctx, use_cache):
     ctx.invoke(data_cleanup)
     ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
     ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
     ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
     ctx.invoke(full_post_etl)
 
 
+@cli.command(
+    help="Convert a Pickle or Parquet file to GeoJSON or CSV depending on the contents of the file.",
+)
+@click.option(
+    "--source",
+    "-s",
+    type=click.Path(),
+    # We don't require this option, otherwise the tool will not run when there is no score
+    default=constants.DATA_SCORE_CSV_FULL_FILE_PATH,
+    help="Path to the input file. Defaults to the default location of the local score file.",
+)
+@click.option(
+    "--destination",
+    "-d",
+    type=click.Path(writable=True),
+    default=Path(
+        os.path.splitext(constants.DATA_SCORE_CSV_FULL_FILE_PATH)[0] + ".csv"
+    ),
+    help="Path to the input file. Defaults to the source file with CSV extension.",
+)
+def convert_score(source: Path, destination: Path):
+    """Converts the score file to CSV."""
+    if source.exists():
+        score_df = pd.read_parquet(source)
+        logger.info(f"Saving score as CSV to {destination}")
+        score_df.to_csv(destination, index=False)
+        logger.info("Done.")
+    else:
+        logger.error(f"Error: Unable to read {source}")
+        sys.exit(1)
+
+
 def log_title(title: str, subtitle: str = None):
     """Logs a title in our fancy title format"""
     logger.info("-" * LOG_LINE_WIDTH)
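The new `convert-score` command above is a thin wrapper around a Parquet-to-CSV round trip. A minimal equivalent, with illustrative paths (the command itself defaults to `constants.DATA_SCORE_CSV_FULL_FILE_PATH`):

```python
# Minimal equivalent of the convert-score command above; paths are illustrative,
# the command defaults to constants.DATA_SCORE_CSV_FULL_FILE_PATH.
import pandas as pd

score_df = pd.read_parquet("data_pipeline/data/score/csv/full/usa_score.parquet")
score_df.to_csv("data_pipeline/data/score/csv/full/usa_score.csv", index=False)
```

From the command line this is `poetry run python3 -m data_pipeline.application convert-score`, with `-s`/`-d` to override the source and destination paths.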
@@ -51,12 +51,19 @@ def _read_from_file(file_path: Path):
             "Please generate the score and try again."
         )
         sys.exit(1)
-    return pd.read_csv(
-        file_path,
-        index_col="GEOID10_TRACT",
-        dtype={"GEOID10_TRACT": str},
-        low_memory=False,
-    ).sort_index()
+    df = pd.DataFrame()
+    if file_path.suffix == ".parquet":
+        df = pd.read_parquet(file_path)
+        df.set_index("GEOID10_TRACT", inplace=True)
+    else:
+        df = pd.read_csv(
+            file_path,
+            index_col="GEOID10_TRACT",
+            dtype={"GEOID10_TRACT": str},
+            low_memory=False,
+        )
+
+    return df.sort_index()
 
 
 def _add_tract_list(tract_list: list[str]):
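The hunk above makes the comparator accept either score format. A self-contained sketch of the same logic (the `GEOID10_TRACT` field name is the one used in the diff; the function name here is illustrative):

```python
# Self-contained sketch of the comparator's dual-format score reader shown above:
# Parquet files are read natively, anything else falls through to the CSV path.
from pathlib import Path

import pandas as pd


def read_score(file_path: Path) -> pd.DataFrame:
    if file_path.suffix == ".parquet":
        df = pd.read_parquet(file_path)
        df.set_index("GEOID10_TRACT", inplace=True)
    else:
        df = pd.read_csv(
            file_path,
            index_col="GEOID10_TRACT",
            dtype={"GEOID10_TRACT": str},
            low_memory=False,
        )
    return df.sort_index()
```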
@@ -67,7 +74,7 @@ def _add_tract_list(tract_list: list[str]):
         tract_list (list[str]): a list of tracts
     """
     if len(tract_list) > 0:
-        _add_text("Those tracts are:\n")
+        _add_text(" Those tracts are:\n")
         # First extract the Census states/territories
         states_by_tract = []
         for tract in tract_list:
@@ -125,7 +132,7 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         local_df (pd.DataFrame): the local score
     """
     log_info("Comparing dataframe contents (production vs local)")
-    _add_text("\n\n## Scores\n")
+    _add_text("\n## Scores\n")
 
     production_row_count = len(prod_df.index)
     local_row_count = len(local_df.index)
@@ -189,10 +196,10 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         f" in the locally generated score representing {local_pct_of_population_represented:.1%} of the total population."
     )
     _add_text(
-        " The number of tracts match!\n "
+        " The number of tracts match!\n"
         if len(production_disadvantaged_tracts_set)
         == len(local_disadvantaged_tracts_set)
-        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n "
+        else f" The difference is {abs(len(production_disadvantaged_tracts_set) - len(local_disadvantaged_tracts_set))} tract(s).\n"
     )
 
     removed_tracts = production_disadvantaged_tracts_set.difference(
@@ -213,17 +220,44 @@ def _compare_score_results(prod_df: pd.DataFrame, local_df: pd.DataFrame):
     )
     _add_tract_list(added_tracts)
 
-    # Grandfathered tracts from v1.0
-    log_info("Checking for grandfathered tracks")
-    grandfathered_tracts = local_df.loc[
-        local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
-    ].index
-    if len(grandfathered_tracts) > 0:
-        _add_text(
-            f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
-        )
-        _add_tract_list(grandfathered_tracts)
-    else:
-        _add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
+
+def _check_grandfathered_tracts(
+    prod_df: pd.DataFrame, local_df: pd.DataFrame, compare_to_version: str
+):
+    """
+    Find grandfathered tracts for v1.0 comparisons.
+
+    Args:
+        prod_df (pd.DataFrame): the production score
+        local_df (pd.DataFrame): the local score
+        compare_to_version (str): the compare to version
+    """
+
+    # Set the field we will check for grandfathering.
+    # This allows us to add other fields for other versions.
+    grandfathered_field = (
+        field_names.GRANDFATHERED_N_COMMUNITIES_V1_0
+        if compare_to_version.startswith("1")
+        else None
+    )
+
+    # If there is a grandfathered field then check for those tracts
+    if grandfathered_field:
+        log_info("Checking for grandfathered tracks")
+        grandfathered_tracts = local_df.loc[
+            local_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
+        ].index
+        if len(grandfathered_tracts) > 0:
+            _add_text(
+                f"\n* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring."
+            )
+            _add_tract_list(grandfathered_tracts)
+        else:
+            _add_text(
+                "* There are NO grandfathered tracts from v1.0 scoring.\n"
+            )
+    else:
+        _add_text("\n* There is no grandfathered tract list for this version.")
 
 
 def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
@@ -234,7 +268,7 @@ def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         prod_df (pd.DataFrame): the production score
         local_df (pd.DataFrame): the local score
     """
-    _add_text("\n## Delta\n")
+    _add_text("\n\n## Delta\n")
     # First we make the columns on two dataframes to be the same to be able to compare
     local_score_df_columns = local_df.columns.array.tolist()
     production_score_df_columns = prod_df.columns.array.tolist()
@@ -287,7 +321,7 @@ def cli():
 @click.option(
     "-v",
     "--compare-to-version",
-    default="1.0",
+    default="2.0",
     required=False,
     type=str,
     help="Set the production score version to compare to",
@@ -359,8 +393,10 @@ def compare_score(
 
     _compare_score_columns(production_score_df, local_score_df)
     _compare_score_results(production_score_df, local_score_df)
+    _check_grandfathered_tracts(
+        production_score_df, local_score_df, compare_to_version
+    )
     _generate_delta(production_score_df, local_score_df)
 
     result_doc = _get_result_doc()
     print(result_doc)
@@ -155,7 +155,13 @@ DATASET_LIST = [
         "class_name": "HistoricRedliningETL",
         "is_memory_intensive": False,
     },
-    # This has to come after us.json exists
+    {
+        "name": "tribal",
+        "module_dir": "tribal",
+        "class_name": "TribalETL",
+        "is_memory_intensive": False,
+    },
+    # This has to come after us_geo.parquet exists
     {
         "name": "census_acs",
         "module_dir": "census_acs",
@@ -196,10 +202,3 @@ CENSUS_INFO = {
     "class_name": "CensusETL",
     "is_memory_intensive": False,
 }
-
-TRIBAL_INFO = {
-    "name": "tribal",
-    "module_dir": "tribal",
-    "class_name": "TribalETL",
-    "is_memory_intensive": False,
-}
@@ -1,5 +1,6 @@
 import concurrent.futures
 import importlib
+import time
 import typing
 import os
 
@@ -27,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
         None
     """
     dataset_list = constants.DATASET_LIST
-    etls_to_search = (
-        dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
-    )
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
 
     if dataset_to_run:
         dataset_element = next(
@@ -59,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
 def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     """Runs one etl process."""
 
+    start_time = time.time()
+
     logger.info(f"Running ETL for {dataset['name']}")
     etl_instance = _get_dataset(dataset)
 
@@ -83,6 +84,9 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     etl_instance.cleanup()
 
     logger.info(f"Finished ETL for dataset {dataset['name']}")
+    logger.debug(
+        f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
+    )
 
 
 def etl_runner(
@@ -197,10 +201,14 @@ def score_generate() -> None:
     """
 
     # Score Gen
+    start_time = time.time()
     score_gen = ScoreETL()
     score_gen.extract()
     score_gen.transform()
     score_gen.load()
+    logger.debug(
+        f"Execution time for Score Generation was {time.time() - start_time}s"
+    )
 
 
 def score_post(data_source: str = "local") -> None:
@@ -216,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
         None
     """
     # Post Score Processing
+    start_time = time.time()
     score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
     score_post.cleanup()
+    logger.debug(
+        f"Execution time for Score Post was {time.time() - start_time}s"
+    )
 
 
 def score_geo(data_source: str = "local") -> None:
@@ -237,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
     """
 
     # Score Geo
+    start_time = time.time()
     score_geo = GeoScoreETL(data_source=data_source)
     score_geo.extract()
     score_geo.transform()
     score_geo.load()
+    logger.debug(
+        f"Execution time for Score Geo was {time.time() - start_time}s"
+    )
 
 
 def _find_dataset_index(dataset_list, key, value):
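The runner changes above add per-stage timing at debug level, so normal runs stay quiet while performance work can still be measured. The pattern is the same for the ETL, score generation, post-processing and geo stages; a minimal standalone sketch:

```python
# Minimal sketch of the timing pattern the runner now wraps around each stage.
import logging
import time

logger = logging.getLogger(__name__)

start_time = time.time()
# ... run the stage here, e.g. extract() / transform() / load() ...
logger.debug(
    f"Execution time for Score Generation was {time.time() - start_time}s"
)
```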
@@ -24,7 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
 DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
 DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
 DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
-DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
+DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us_geo.parquet"
 
 # Score paths
 DATA_SCORE_DIR = DATA_PATH / "score"
@@ -32,7 +32,7 @@ DATA_SCORE_DIR = DATA_PATH / "score"
 ## Score CSV Paths
 DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
 DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
-DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa.csv"
+DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
 FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
     DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv"
 )
@@ -727,4 +727,4 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
 
-        self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
+        self.df.to_parquet(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
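The full score now lands on disk as Parquet rather than CSV. One practical upside, and presumably why readers elsewhere in this commit drop their `dtype=` workarounds, is that column dtypes survive the round trip; a quick sketch with made-up data:

```python
# Quick sketch of the Parquet round trip the score file now relies on.
# Unlike CSV, the string dtype of the tract IDs survives, so readers no longer
# need dtype={...: "string"} workarounds. pandas uses pyarrow as its engine.
import pandas as pd

df = pd.DataFrame({"GEOID10_TRACT": ["01001020100"], "Total population": [1500]})
df.to_parquet("usa_score.parquet", index=False)

roundtrip = pd.read_parquet("usa_score.parquet")
print(roundtrip.dtypes)  # GEOID10_TRACT comes back as a string column, not int64
```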
@@ -37,9 +37,7 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
 
-        self.CENSUS_USA_GEOJSON = (
-            self.DATA_PATH / "census" / "geojson" / "us.json"
-        )
+        self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH
 
         # Import the shortened name for Score N to be used on tiles.
         # We should no longer be using PFS
@@ -87,16 +85,14 @@ class GeoScoreETL(ExtractTransformLoad):
             score_data_source=self.DATA_SOURCE,
         )
 
-        logger.info("Reading US GeoJSON (~6 minutes)")
-        full_geojson_usa_df = gpd.read_file(
+        logger.info("Reading US GeoJSON")
+        full_geojson_usa_df = gpd.read_parquet(
             self.CENSUS_USA_GEOJSON,
-            dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[
+            columns=[
                 self.GEOID_FIELD_NAME,
                 self.GEOMETRY_FIELD_NAME,
                 self.LAND_FIELD_NAME,
             ],
-            low_memory=False,
         )
 
         # We only want to keep tracts to visualize that have non-0 land
@@ -104,7 +100,7 @@ class GeoScoreETL(ExtractTransformLoad):
             full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
         ]
 
-        logger.info("Reading score CSV")
+        logger.info("Reading tile score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,
             dtype={
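Reading only the needed columns from the GeoParquet file is what replaces the old "~6 minutes" GeoJSON read in the hunk above. A sketch with illustrative column names (the ETL passes its own GEOID/geometry/land field constants, not these literals):

```python
# Sketch of the pruned GeoParquet read that replaces the GeoJSON load above.
# Column names here are illustrative; GeoScoreETL passes its own field constants.
import geopandas as gpd

usa_gdf = gpd.read_parquet(
    "data_pipeline/data/census/geojson/us_geo.parquet",
    columns=["GEOID10", "geometry", "ALAND10"],
)
```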
@@ -94,12 +94,8 @@ class PostScoreETL(ExtractTransformLoad):
         )
 
     def _extract_score(self, score_path: Path) -> pd.DataFrame:
-        logger.debug("Reading Score CSV")
-        df = pd.read_csv(
-            score_path,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-            low_memory=False,
-        )
+        logger.debug("Reading Score")
+        df = pd.read_parquet(score_path)
 
         # Convert total population to an int
         df["Total population"] = df["Total population"].astype(
@@ -116,8 +112,7 @@ class PostScoreETL(ExtractTransformLoad):
             gpd.GeoDataFrame: the census geo json data
         """
         logger.debug("Reading Census GeoJSON")
-        with open(geo_path, "r", encoding="utf-8") as file:
-            data = gpd.read_file(file)
+        data = gpd.read_parquet(geo_path)
         return data
 
     def extract(self, use_cached_data_sources: bool = False) -> None:
@@ -70,7 +70,7 @@ def state_data_initial(sample_data_dir):
 
 @pytest.fixture()
 def score_data_initial(sample_data_dir):
-    return sample_data_dir / "score_data_initial.csv"
+    return sample_data_dir / "score_data_initial.parquet"
 
 
 @pytest.fixture()
@@ -104,8 +104,8 @@ def states_transformed_expected():
 
 @pytest.fixture()
 def score_transformed_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "score_transformed_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "score_transformed_expected.parquet"
     )
 
 
@@ -122,7 +122,7 @@ def national_tract_df():
 
 @pytest.fixture()
 def score_data_expected():
-    return pd.read_pickle(pytest.SNAPSHOT_DIR / "score_data_expected.pkl")
+    return pd.read_parquet(pytest.SNAPSHOT_DIR / "score_data_expected.parquet")
 
 
 @pytest.fixture()
@@ -144,8 +144,8 @@ def create_tile_data_expected():
 
 @pytest.fixture()
 def downloadable_data_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "downloadable_data_expected.parquet"
    )
File diff suppressed because one or more lines are too long
Binary file not shown (7 files)
@@ -33,8 +33,7 @@ def test_extract_states(etl, state_data_initial):
 
 def test_extract_score(etl, score_data_initial):
     extracted = etl._extract_score(score_data_initial)
-    string_cols = ["GEOID10_TRACT"]
-    assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
     assert len(extracted) > 0
 
 
 # Transform Tests
@@ -107,6 +106,7 @@ def test_create_downloadable_data(
     pdt.assert_frame_equal(
         output_downloadable_df_actual,
         downloadable_data_expected,
+        check_dtype=False,
     )
@@ -1,10 +1,9 @@
 import csv
-import json
-import subprocess
 from enum import Enum
 from pathlib import Path
 
 import geopandas as gpd
+import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
@@ -26,8 +25,8 @@ class CensusETL(ExtractTransformLoad):
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     NATIONAL_TRACT_CSV_PATH = CSV_BASE_PATH / "us.csv"
-    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us.json"
-    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
+    NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us_geo.parquet"
+    GEOID_TRACT_FIELD_NAME: str = "GEOID10"
 
     def __init__(self):
@@ -59,7 +58,7 @@ class CensusETL(ExtractTransformLoad):
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
         elif file_type == GeoFileType.GEOJSON:
-            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.json")
+            file_path = Path(self.GEOJSON_BASE_PATH / f"{fips_code}.parquet")
         elif file_type == GeoFileType.CSV:
             file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path
@@ -93,14 +92,8 @@ class CensusETL(ExtractTransformLoad):
         )
 
         if not geojson_file_path.is_file():
-            cmd = [
-                "ogr2ogr",
-                "-f",
-                "GeoJSON",
-                str(geojson_file_path),
-                str(shp_file_path),
-            ]
-            subprocess.run(cmd, check=True)
+            gdf = gpd.read_file(shp_file_path)
+            gdf.to_parquet(geojson_file_path)
 
     def _generate_tract_table(self) -> None:
         """Generate Tract CSV table for pandas, load in memory
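The hunk above is the core of the GDAL removal: instead of shelling out to `ogr2ogr` to produce per-state GeoJSON, each state shapefile is read with GeoPandas and written as (Geo)Parquet. A standalone sketch with an illustrative input path:

```python
# Standalone sketch of the per-state conversion above (paths are illustrative).
# GeoPandas reads the Census tract shapefile directly, so the ogr2ogr binary
# (and the gdal-bin apt package in CI) is no longer required.
import geopandas as gpd

gdf = gpd.read_file("tl_2010_01_tract10.shp")
gdf.to_parquet("01.parquet")
```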
@@ -110,20 +103,15 @@ class CensusETL(ExtractTransformLoad):
         """
         logger.debug("Transforming tracts")
 
-        for file in self.GEOJSON_BASE_PATH.iterdir():
-            if file.suffix == ".json":
-                logger.debug(f"Adding GEOID10 for file {file.name}")
-                with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
-                    geojson = json.load(f)
-                    for feature in geojson["features"]:
-                        tractid10 = feature["properties"]["GEOID10"]
-                        self.TRACT_NATIONAL.append(str(tractid10))
-                        tractid10_state_id = tractid10[:2]
-                        if not self.TRACT_PER_STATE.get(tractid10_state_id):
-                            self.TRACT_PER_STATE[tractid10_state_id] = []
-                        self.TRACT_PER_STATE[tractid10_state_id].append(
-                            tractid10
-                        )
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file in files:
+            logger.debug(f"Adding GEOID10 for file {file.name}")
+            state_df = gpd.read_parquet(file)
+            tract_list = state_df["GEOID10"].to_list()
+            self.TRACT_NATIONAL.extend(tract_list)
+            tractid10_state_id = state_df["STATEFP10"][0]
+            self.TRACT_PER_STATE[tractid10_state_id] = tract_list
 
     def transform(self) -> None:
         """Download all census shape files from the Census FTP and extract the geojson
@@ -210,18 +198,24 @@ class CensusETL(ExtractTransformLoad):
         usa_df = gpd.GeoDataFrame()
 
-        for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
+        # Read state only files and append them into a MEGA US GPD
+        files = list(self.GEOJSON_BASE_PATH.glob("[0-9]*.parquet"))
+        files.sort()
+        for file_name in files:
             logger.debug(f"Adding national GeoJSON file {file_name.name}")
-            state_gdf = gpd.read_file(file_name)
-            usa_df = usa_df.append(state_gdf)
+            state_gdf = gpd.read_parquet(file_name)
+            usa_df = pd.concat([usa_df, state_gdf], ignore_index=True)
 
         assert len(usa_df.columns) > 0
         logger.debug("Converting to CRS")
-        usa_df = usa_df.to_crs(
-            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
-        )
+        usa_df = usa_df.to_crs("EPSG:4326")
 
         logger.debug("Saving national GeoJSON file")
-        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
+        # Convert tract ID to a string
+        usa_df[self.GEOID_TRACT_FIELD_NAME] = usa_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ].astype(str, errors="ignore")
+        usa_df.to_parquet(self.NATIONAL_TRACT_JSON_PATH)
 
     def load(self) -> None:
         """Create state CSVs, National CSV, and National GeoJSON
@@ -104,7 +104,7 @@ def check_census_data_source(
         )
     else:
         # check if census data is found locally
-        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
+        if not os.path.isfile(census_data_path / "geojson" / "us_geo.parquet"):
             logger.error(
                 "No local census data found. Please use '-s aws` to fetch from AWS"
             )
@@ -507,7 +507,7 @@ class CensusACSETL(ExtractTransformLoad):
         # geojson file for all of the US, this will read it off of S3
         logger.debug("Reading in geojson for the country")
         if not os.path.exists(
-            self.DATA_PATH / "census" / "geojson" / "us.json"
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
         ):
             logger.debug("Fetching Census data from AWS S3")
             unzip_file_from_url(
@@ -515,9 +515,8 @@ class CensusACSETL(ExtractTransformLoad):
                 self.DATA_PATH / "tmp",
                 self.DATA_PATH,
             )
 
-        self.geo_df = gpd.read_file(
-            self.DATA_PATH / "census" / "geojson" / "us.json",
+        self.geo_df = gpd.read_parquet(
+            self.DATA_PATH / "census" / "geojson" / "us_geo.parquet",
         )
 
     def transform(self) -> None:
@@ -33,7 +33,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         / f"census_decennial_{DECENNIAL_YEAR}"
     )
     CENSUS_GEOJSON_PATH = (
-        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
+        ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us_geo.parquet"
     )
 
     def __get_api_url(
@@ -148,7 +148,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         """Impute income for both income measures."""
         # Merges Census geojson to imput values from.
         logger.debug(f"Reading GeoJSON from {geojson_path}")
-        geo_df = gpd.read_file(geojson_path)
+        geo_df = gpd.read_parquet(geojson_path)
         self.df_all = CensusACSETL.merge_geojson(
             df=self.df_all,
             usa_geo_df=geo_df,
@@ -26,10 +26,7 @@ def get_tract_geojson(
         census_etl.extract()
         census_etl.transform()
         census_etl.load()
-    tract_data = gpd.read_file(
-        GEOJSON_PATH,
-        include_fields=["GEOID10"],
-    )
+    tract_data = gpd.read_parquet(GEOJSON_PATH)
     tract_data = tract_data.rename(
         columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
     )
@@ -7,10 +7,13 @@ from data_pipeline.score.field_names import GEOID_TRACT_FIELD
 
 @pytest.fixture(scope="session")
 def final_score_df():
-    return pd.read_csv(
-        settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
-        dtype={GEOID_TRACT_FIELD: str},
-        low_memory=False,
+    return pd.read_parquet(
+        settings.APP_ROOT
+        / "data"
+        / "score"
+        / "csv"
+        / "full"
+        / "usa_score.parquet",
     )
 
 
@@ -173,7 +176,7 @@ def geocorr_urban_rural_df():
 @pytest.fixture()
 def census_decennial_df():
     census_decennial_csv = (
-        constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
+        constants.DATA_PATH / "dataset" / "census_decennial_2020" / "usa.csv"
     )
     return pd.read_csv(
         census_decennial_csv,
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -17,7 +17,7 @@ from data_pipeline.score.utils import (
 @contextmanager
 def patch_calculate_tract_adjacency_scores():
     # Use fixtures for tract data.
-    tract_data_path = Path(__file__).parent / "data" / "us.geojson"
+    tract_data_path = Path(__file__).parent / "data" / "us_geo.parquet"
 
     get_tract_geojson_mock = partial(
         get_tract_geojson, _tract_data_path=tract_data_path
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -68,7 +68,7 @@ def transformed_data_fixture(
     """Load the test data and call the ETL transform"""
     dec = CensusDecennialETL()
     dec.df_all = extracted_data_fixture
-    dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
+    dec.transform(imputed_path_fixture / "census-us-territory-geojson.parquet")
     return dec.df_all
data/data-pipeline/poetry.lock (generated, 2063 changes)
File diff suppressed because it is too large.
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "justice40-data-pipeline"
-version = "1.0.1"
+version = "2.0"
 description = "ETL, Score and Map Generation of Justice 40 Tool"
 authors = ["Justice40 Engineering <j40-engineering@lists.usds.gov>"]
 keywords = ["justice40", "environmental_justice", "python", "etl"]
@@ -42,6 +42,7 @@ pydantic = "^1.9.0"
 Rtree = "^1.0.0"
 fiona = "~1.8.21"
 tenacity = ">=5.0.2"
+pyarrow = "^18.1.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^21"
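`pyarrow` is the Parquet engine behind the `read_parquet`/`to_parquet` calls introduced throughout this commit. A quick environment sanity check, assuming `poetry install` has been run:

```python
# Quick sanity check that the Parquet engine is in place (assumes `poetry install` ran).
import pandas as pd
import pyarrow

print(pyarrow.__version__)  # pinned at ^18.1.0 in pyproject.toml
pd.DataFrame({"ok": [True]}).to_parquet("engine_check.parquet")
```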