Allow for Census Tract search in UI

This commit is contained in:
Carlos Felix 2024-12-04 14:36:46 -05:00 committed by Carlos Felix
commit cf4e35acce
15 changed files with 362 additions and 162 deletions

View file

@ -24,6 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
# Score paths
DATA_SCORE_DIR = DATA_PATH / "score"
@ -46,6 +47,9 @@ DATA_SCORE_JSON_INDEX_FILE_PATH = (
## Tile path
DATA_SCORE_TILES_DIR = DATA_SCORE_DIR / "tiles"
## Tiles search
DATA_TILES_SEARCH_DIR = DATA_SCORE_DIR / "search"
# Downloadable paths
if not os.environ.get("J40_VERSION_LABEL_STRING"):
version_str = "beta"
@ -82,6 +86,7 @@ SCORE_VERSIONING_README_FILE_NAME = f"readme-version-{version_str}.md"
SCORE_VERSIONING_README_FILE_PATH = (
FILES_PATH / SCORE_VERSIONING_README_FILE_NAME
)
SCORE_TRACT_SEARCH_FILE_PATH = DATA_TILES_SEARCH_DIR / "tracts.json"
# For the codebook
CEJST_SCORE_COLUMN_NAME = "score_name"

View file

@ -4,6 +4,7 @@ from pathlib import Path
import numpy as np
from numpy import float64
import pandas as pd
import geopandas as gpd
from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig
@ -42,10 +43,12 @@ class PostScoreETL(ExtractTransformLoad):
self.input_counties_df: pd.DataFrame
self.input_states_df: pd.DataFrame
self.input_score_df: pd.DataFrame
self.input_census_geo_df: gpd.GeoDataFrame
self.output_score_county_state_merged_df: pd.DataFrame
self.output_score_tiles_df: pd.DataFrame
self.output_downloadable_df: pd.DataFrame
self.output_tract_search_df: pd.DataFrame
# Define some constants for the YAML file
# TODO: Implement this as a marshmallow schema.
@ -105,6 +108,18 @@ class PostScoreETL(ExtractTransformLoad):
return df
def _extract_census_geojson(self, geo_path: Path) -> gpd.GeoDataFrame:
    """Read the Census GeoJSON data.

    Args:
        geo_path: path to the Census GeoJSON file on disk.

    Returns:
        gpd.GeoDataFrame: the census geo json data
    """
    logger.debug("Reading Census GeoJSON")
    # geopandas can read a path directly; opening a text-mode file
    # handle ourselves was unnecessary bookkeeping. The explicit
    # encoding is kept so non-ASCII place names decode correctly.
    return gpd.read_file(geo_path, encoding="utf-8")
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
@ -131,6 +146,9 @@ class PostScoreETL(ExtractTransformLoad):
self.input_score_df = self._extract_score(
constants.DATA_SCORE_CSV_FULL_FILE_PATH
)
self.input_census_geo_df = self._extract_census_geojson(
constants.DATA_CENSUS_GEOJSON_FILE_PATH
)
def _transform_counties(
self, initial_counties_df: pd.DataFrame
@ -392,7 +410,23 @@ class PostScoreETL(ExtractTransformLoad):
return final_df
def _create_tract_search_data(
    self, census_geojson: gpd.GeoDataFrame
) -> pd.DataFrame:
    """Build the tract search table: tract ID plus interior-point lat/lon.

    Args:
        census_geojson: census tract data containing the GEOID10 tract
            identifier and interior-point coordinate columns.

    Returns:
        pd.DataFrame: one row per tract with its ID and center lat/lon.
    """
    logger.debug("Creating Census tract search data")
    search_columns = ["GEOID10", "INTPTLAT10", "INTPTLON10"]
    subset = census_geojson[search_columns]
    # Downcast to a plain DataFrame; the search data needs no geometry.
    return pd.DataFrame(subset)
def transform(self) -> None:
self.output_tract_search_df = self._create_tract_search_data(
self.input_census_geo_df
)
transformed_counties = self._transform_counties(self.input_counties_df)
transformed_states = self._transform_states(self.input_states_df)
transformed_score = self._transform_score(self.input_score_df)
@ -409,6 +443,9 @@ class PostScoreETL(ExtractTransformLoad):
self.output_score_county_state_merged_df = (
output_score_county_state_merged_df
)
self.output_tract_search_df = self._create_tract_search_data(
self.input_census_geo_df
)
def _load_score_csv_full(
self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
@ -592,6 +629,13 @@ class PostScoreETL(ExtractTransformLoad):
]
zip_files(version_data_documentation_zip_path, files_to_compress)
def _load_search_tract_data(self, output_path: Path):
    """Write the Census tract search data as a JSON array of records."""
    logger.debug("Writing Census tract search data")
    # Ensure the search directory exists before the write.
    search_dir = output_path.parent
    search_dir.mkdir(parents=True, exist_ok=True)
    # The records orientation produces a plain JSON array of objects,
    # which the front end can import without reshaping.
    self.output_tract_search_df.to_json(output_path, orient="records")
def load(self) -> None:
self._load_score_csv_full(
self.output_score_county_state_merged_df,
@ -600,4 +644,5 @@ class PostScoreETL(ExtractTransformLoad):
self._load_tile_csv(
self.output_score_tiles_df, constants.DATA_SCORE_CSV_TILES_FILE_PATH
)
self._load_search_tract_data(constants.SCORE_TRACT_SEARCH_FILE_PATH)
self._load_downloadable_zip(constants.SCORE_DOWNLOADABLE_DIR)

View file

@ -3,6 +3,7 @@ from importlib import reload
from pathlib import Path
import pandas as pd
import geopandas as gpd
import pytest
from data_pipeline import config
from data_pipeline.etl.score import etl_score_post
@ -144,3 +145,13 @@ def downloadable_data_expected():
return pd.read_pickle(
pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
)
@pytest.fixture()
def census_geojson_sample_data(sample_data_dir) -> gpd.GeoDataFrame:
    """Load the sample census GeoJSON (60 tracts) used by the search tests."""
    with open(
        sample_data_dir / "census_60.geojson", "r", encoding="utf-8"
    ) as file:
        data = gpd.read_file(file)
    # Fix: the original had an unreachable `return None` after the
    # `return data` inside the with-block; the dead statement is removed.
    return data

File diff suppressed because one or more lines are too long

View file

@ -5,9 +5,12 @@ from pathlib import Path
import pandas.api.types as ptypes
import pandas.testing as pdt
import pandas as pd
import geopandas as gpd
from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.etl.score import constants
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.etl.score.etl_score_post import PostScoreETL
# See conftest.py for all fixtures used in these tests
@ -150,3 +153,16 @@ def test_load_downloadable_zip(etl, monkeypatch, score_data_expected):
assert constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH.is_file()
assert constants.SCORE_DOWNLOADABLE_CSV_ZIP_FILE_PATH.is_file()
assert constants.SCORE_DOWNLOADABLE_XLS_ZIP_FILE_PATH.is_file()
def test_create_tract_search_data(census_geojson_sample_data: gpd.GeoDataFrame):
    """The search frame keeps one row per tract and the ID/lat/lon columns."""
    # Fixture sanity check: an empty sample would vacuously pass below.
    assert len(census_geojson_sample_data) > 0
    result = PostScoreETL()._create_tract_search_data(census_geojson_sample_data)
    assert isinstance(result, pd.DataFrame)
    assert not result.columns.empty
    expected_columns = {"GEOID10", "INTPTLAT10", "INTPTLON10"}
    assert expected_columns.issubset(result.columns)
    # No rows are dropped or duplicated by the transformation.
    assert len(result) == len(census_geojson_sample_data)