Allow for Census Tract search in UI

This commit is contained in:
Carlos Felix 2024-12-04 14:36:46 -05:00 committed by Carlos Felix
commit cf4e35acce
15 changed files with 362 additions and 162 deletions

View file

@ -24,6 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
# Score paths
DATA_SCORE_DIR = DATA_PATH / "score"
@ -46,6 +47,9 @@ DATA_SCORE_JSON_INDEX_FILE_PATH = (
## Tile path
DATA_SCORE_TILES_DIR = DATA_SCORE_DIR / "tiles"
## Tiles search
DATA_TILES_SEARCH_DIR = DATA_SCORE_DIR / "search"
# Downloadable paths
if not os.environ.get("J40_VERSION_LABEL_STRING"):
version_str = "beta"
@ -82,6 +86,7 @@ SCORE_VERSIONING_README_FILE_NAME = f"readme-version-{version_str}.md"
SCORE_VERSIONING_README_FILE_PATH = (
FILES_PATH / SCORE_VERSIONING_README_FILE_NAME
)
SCORE_TRACT_SEARCH_FILE_PATH = DATA_TILES_SEARCH_DIR / "tracts.json"
# For the codebook
CEJST_SCORE_COLUMN_NAME = "score_name"

View file

@ -4,6 +4,7 @@ from pathlib import Path
import numpy as np
from numpy import float64
import pandas as pd
import geopandas as gpd
from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig
@ -42,10 +43,12 @@ class PostScoreETL(ExtractTransformLoad):
self.input_counties_df: pd.DataFrame
self.input_states_df: pd.DataFrame
self.input_score_df: pd.DataFrame
self.input_census_geo_df: gpd.GeoDataFrame
self.output_score_county_state_merged_df: pd.DataFrame
self.output_score_tiles_df: pd.DataFrame
self.output_downloadable_df: pd.DataFrame
self.output_tract_search_df: pd.DataFrame
# Define some constants for the YAML file
# TODO: Implement this as a marshmallow schema.
@ -105,6 +108,18 @@ class PostScoreETL(ExtractTransformLoad):
return df
def _extract_census_geojson(self, geo_path: Path) -> gpd.GeoDataFrame:
    """Read the Census GeoJSON data.

    Args:
        geo_path: path to the Census GeoJSON file on disk.

    Returns:
        gpd.GeoDataFrame: the census geo json data
    """
    logger.debug("Reading Census GeoJSON")
    # geopandas can read a path directly; opening a text-mode file
    # handle ourselves was unnecessary bookkeeping. The explicit
    # encoding is kept so non-ASCII place names decode correctly.
    return gpd.read_file(geo_path, encoding="utf-8")
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
@ -131,6 +146,9 @@ class PostScoreETL(ExtractTransformLoad):
self.input_score_df = self._extract_score(
constants.DATA_SCORE_CSV_FULL_FILE_PATH
)
self.input_census_geo_df = self._extract_census_geojson(
constants.DATA_CENSUS_GEOJSON_FILE_PATH
)
def _transform_counties(
self, initial_counties_df: pd.DataFrame
@ -392,7 +410,23 @@ class PostScoreETL(ExtractTransformLoad):
return final_df
def _create_tract_search_data(
    self, census_geojson: gpd.GeoDataFrame
) -> pd.DataFrame:
    """Build the tract search table: tract ID plus interior-point lat/lon.

    Args:
        census_geojson: census tract data containing the GEOID10 tract
            identifier and interior-point coordinate columns.

    Returns:
        pd.DataFrame: one row per tract with its ID and center lat/lon.
    """
    logger.debug("Creating Census tract search data")
    search_columns = ["GEOID10", "INTPTLAT10", "INTPTLON10"]
    subset = census_geojson[search_columns]
    # Downcast to a plain DataFrame; the search data needs no geometry.
    return pd.DataFrame(subset)
def transform(self) -> None:
self.output_tract_search_df = self._create_tract_search_data(
self.input_census_geo_df
)
transformed_counties = self._transform_counties(self.input_counties_df)
transformed_states = self._transform_states(self.input_states_df)
transformed_score = self._transform_score(self.input_score_df)
@ -409,6 +443,9 @@ class PostScoreETL(ExtractTransformLoad):
self.output_score_county_state_merged_df = (
output_score_county_state_merged_df
)
self.output_tract_search_df = self._create_tract_search_data(
self.input_census_geo_df
)
def _load_score_csv_full(
self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
@ -592,6 +629,13 @@ class PostScoreETL(ExtractTransformLoad):
]
zip_files(version_data_documentation_zip_path, files_to_compress)
def _load_search_tract_data(self, output_path: Path):
    """Write the Census tract search data as a JSON array of records."""
    logger.debug("Writing Census tract search data")
    # Ensure the search directory exists before the write.
    search_dir = output_path.parent
    search_dir.mkdir(parents=True, exist_ok=True)
    # The records orientation produces a plain JSON array of objects,
    # which the front end can import without reshaping.
    self.output_tract_search_df.to_json(output_path, orient="records")
def load(self) -> None:
self._load_score_csv_full(
self.output_score_county_state_merged_df,
@ -600,4 +644,5 @@ class PostScoreETL(ExtractTransformLoad):
self._load_tile_csv(
self.output_score_tiles_df, constants.DATA_SCORE_CSV_TILES_FILE_PATH
)
self._load_search_tract_data(constants.SCORE_TRACT_SEARCH_FILE_PATH)
self._load_downloadable_zip(constants.SCORE_DOWNLOADABLE_DIR)

View file

@ -3,6 +3,7 @@ from importlib import reload
from pathlib import Path
import pandas as pd
import geopandas as gpd
import pytest
from data_pipeline import config
from data_pipeline.etl.score import etl_score_post
@ -144,3 +145,13 @@ def downloadable_data_expected():
return pd.read_pickle(
pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
)
@pytest.fixture()
def census_geojson_sample_data(sample_data_dir) -> gpd.GeoDataFrame:
    """Load the sample census GeoJSON (60 tracts) used by the search tests."""
    with open(
        sample_data_dir / "census_60.geojson", "r", encoding="utf-8"
    ) as file:
        data = gpd.read_file(file)
    # Fix: the original had an unreachable `return None` after the
    # `return data` inside the with-block; the dead statement is removed.
    return data

File diff suppressed because one or more lines are too long

View file

@ -5,9 +5,12 @@ from pathlib import Path
import pandas.api.types as ptypes
import pandas.testing as pdt
import pandas as pd
import geopandas as gpd
from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.etl.score import constants
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.etl.score.etl_score_post import PostScoreETL
# See conftest.py for all fixtures used in these tests
@ -150,3 +153,16 @@ def test_load_downloadable_zip(etl, monkeypatch, score_data_expected):
assert constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH.is_file()
assert constants.SCORE_DOWNLOADABLE_CSV_ZIP_FILE_PATH.is_file()
assert constants.SCORE_DOWNLOADABLE_XLS_ZIP_FILE_PATH.is_file()
def test_create_tract_search_data(census_geojson_sample_data: gpd.GeoDataFrame):
    """The search frame keeps one row per tract and the ID/lat/lon columns."""
    # Fixture sanity check: an empty sample would vacuously pass below.
    assert len(census_geojson_sample_data) > 0
    result = PostScoreETL()._create_tract_search_data(census_geojson_sample_data)
    assert isinstance(result, pd.DataFrame)
    assert not result.columns.empty
    expected_columns = {"GEOID10", "INTPTLAT10", "INTPTLON10"}
    assert expected_columns.issubset(result.columns)
    # No rows are dropped or duplicated by the transformation.
    assert len(result) == len(census_geojson_sample_data)