mirror of https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-26 01:51:17 -07:00

Allow for Census Tract search in UI

parent 4130c46aee · commit cf4e35acce
15 changed files with 362 additions and 162 deletions
@@ -24,6 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
 DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
 DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
 DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
+DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
 
 # Score paths
 DATA_SCORE_DIR = DATA_PATH / "score"
@@ -46,6 +47,9 @@ DATA_SCORE_JSON_INDEX_FILE_PATH = (
 ## Tile path
 DATA_SCORE_TILES_DIR = DATA_SCORE_DIR / "tiles"
 
+## Tiles search
+DATA_TILES_SEARCH_DIR = DATA_SCORE_DIR / "search"
+
 # Downloadable paths
 if not os.environ.get("J40_VERSION_LABEL_STRING"):
     version_str = "beta"
@@ -82,6 +86,7 @@ SCORE_VERSIONING_README_FILE_NAME = f"readme-version-{version_str}.md"
 SCORE_VERSIONING_README_FILE_PATH = (
     FILES_PATH / SCORE_VERSIONING_README_FILE_NAME
 )
+SCORE_TRACT_SEARCH_FILE_PATH = DATA_TILES_SEARCH_DIR / "tracts.json"
 
 # For the codebook
 CEJST_SCORE_COLUMN_NAME = "score_name"

@@ -4,6 +4,7 @@ from pathlib import Path
 import numpy as np
 from numpy import float64
 import pandas as pd
+import geopandas as gpd
 
 from data_pipeline.content.schemas.download_schemas import CodebookConfig
 from data_pipeline.content.schemas.download_schemas import CSVConfig
@@ -42,10 +43,12 @@ class PostScoreETL(ExtractTransformLoad):
         self.input_counties_df: pd.DataFrame
         self.input_states_df: pd.DataFrame
         self.input_score_df: pd.DataFrame
+        self.input_census_geo_df: gpd.GeoDataFrame
 
         self.output_score_county_state_merged_df: pd.DataFrame
         self.output_score_tiles_df: pd.DataFrame
         self.output_downloadable_df: pd.DataFrame
+        self.output_tract_search_df: pd.DataFrame
 
         # Define some constants for the YAML file
         # TODO: Implement this as a marshmallow schema.
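A side note on the pattern above: lines like `self.input_census_geo_df: gpd.GeoDataFrame` are bare PEP 526 annotations. They record the intended type for readers and type checkers but assign nothing; the attribute only comes into existence when extract() sets it. A minimal illustration (the `Demo` class is hypothetical, not from this repo):

import pandas as pd

class Demo:
    def __init__(self) -> None:
        # Annotation only: documents the type, creates no attribute.
        self.df: pd.DataFrame

d = Demo()
print(hasattr(d, "df"))  # False — nothing was assigned yet
d.df = pd.DataFrame({"a": [1]})
print(hasattr(d, "df"))  # True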
@@ -105,6 +108,18 @@ class PostScoreETL(ExtractTransformLoad):
 
         return df
 
+    def _extract_census_geojson(self, geo_path: Path) -> gpd.GeoDataFrame:
+        """
+        Read in the Census GeoJSON data.
+
+        Returns:
+            gpd.GeoDataFrame: the Census GeoJSON data
+        """
+        logger.debug("Reading Census GeoJSON")
+        with open(geo_path, "r", encoding="utf-8") as file:
+            data = gpd.read_file(file)
+        return data
+
     def extract(self, use_cached_data_sources: bool = False) -> None:
 
         super().extract(
@@ -131,6 +146,9 @@ class PostScoreETL(ExtractTransformLoad):
         self.input_score_df = self._extract_score(
             constants.DATA_SCORE_CSV_FULL_FILE_PATH
         )
+        self.input_census_geo_df = self._extract_census_geojson(
+            constants.DATA_CENSUS_GEOJSON_FILE_PATH
+        )
 
     def _transform_counties(
         self, initial_counties_df: pd.DataFrame
@@ -392,7 +410,23 @@ class PostScoreETL(ExtractTransformLoad):
 
         return final_df
 
+    def _create_tract_search_data(
+        self, census_geojson: gpd.GeoDataFrame
+    ) -> pd.DataFrame:
+        """
+        Generate a dataframe with only the tract IDs and the center lat/lon of each tract.
+
+        Returns:
+            pd.DataFrame: a dataframe with the tract search data
+        """
+        logger.debug("Creating Census tract search data")
+        columns_to_extract = ["GEOID10", "INTPTLAT10", "INTPTLON10"]
+        return pd.DataFrame(census_geojson[columns_to_extract])
+
     def transform(self) -> None:
+        self.output_tract_search_df = self._create_tract_search_data(
+            self.input_census_geo_df
+        )
         transformed_counties = self._transform_counties(self.input_counties_df)
         transformed_states = self._transform_states(self.input_states_df)
         transformed_score = self._transform_score(self.input_score_df)
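For readers less familiar with the TIGER/Line attribute names used here: GEOID10 is the 2010 tract FIPS identifier, and INTPTLAT10/INTPTLON10 are the Census Bureau's precomputed "internal point" coordinates, roughly a centroid adjusted to fall inside the tract, so no geometry computation is needed. A minimal sketch of the transformation, using made-up tract values rather than data from this repo:

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Hypothetical two-tract input mirroring the columns the method relies on.
census_geojson = gpd.GeoDataFrame(
    {
        "GEOID10": ["01001020100", "01001020200"],
        "INTPTLAT10": ["+32.4771133", "+32.4757357"],
        "INTPTLON10": ["-086.4883347", "-086.4724678"],
    },
    geometry=[Point(-86.488, 32.477), Point(-86.472, 32.476)],
)

columns_to_extract = ["GEOID10", "INTPTLAT10", "INTPTLON10"]
# Wrapping the selection in pd.DataFrame drops the GeoDataFrame subclass
# (and its geometry bookkeeping), leaving a plain three-column frame.
search_df = pd.DataFrame(census_geojson[columns_to_extract])
print(search_df.shape)  # (2, 3)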
@@ -409,6 +443,9 @@ class PostScoreETL(ExtractTransformLoad):
         self.output_score_county_state_merged_df = (
             output_score_county_state_merged_df
         )
+        self.output_tract_search_df = self._create_tract_search_data(
+            self.input_census_geo_df
+        )
 
     def _load_score_csv_full(
         self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
@@ -592,6 +629,13 @@ class PostScoreETL(ExtractTransformLoad):
         ]
         zip_files(version_data_documentation_zip_path, files_to_compress)
 
+    def _load_search_tract_data(self, output_path: Path):
+        """Write the Census tract search data."""
+        logger.debug("Writing Census tract search data")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        # We use the records orientation to easily import the JSON in JS.
+        self.output_tract_search_df.to_json(output_path, orient="records")
+
     def load(self) -> None:
         self._load_score_csv_full(
             self.output_score_county_state_merged_df,
@@ -600,4 +644,5 @@ class PostScoreETL(ExtractTransformLoad):
         self._load_tile_csv(
             self.output_score_tiles_df, constants.DATA_SCORE_CSV_TILES_FILE_PATH
         )
+        self._load_search_tract_data(constants.SCORE_TRACT_SEARCH_FILE_PATH)
         self._load_downloadable_zip(constants.SCORE_DOWNLOADABLE_DIR)

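The records orientation mentioned in the comment above serializes the frame as a plain JSON array of objects, one per tract, which the frontend can ingest directly via JSON.parse or a fetch().json() call. A small illustration with a single made-up row, not data from the pipeline:

import pandas as pd

df = pd.DataFrame(
    {
        "GEOID10": ["01001020100"],
        "INTPTLAT10": ["+32.4771133"],
        "INTPTLON10": ["-086.4883347"],
    }
)
# orient="records" yields a JSON array with one object per row.
print(df.to_json(orient="records"))
# [{"GEOID10":"01001020100","INTPTLAT10":"+32.4771133","INTPTLON10":"-086.4883347"}]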
@@ -3,6 +3,7 @@ from importlib import reload
 from pathlib import Path
 
 import pandas as pd
+import geopandas as gpd
 import pytest
 from data_pipeline import config
 from data_pipeline.etl.score import etl_score_post
@@ -144,3 +145,13 @@ def downloadable_data_expected():
     return pd.read_pickle(
         pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
     )
+
+
+@pytest.fixture()
+def census_geojson_sample_data(sample_data_dir) -> gpd.GeoDataFrame:
+    with open(
+        sample_data_dir / "census_60.geojson", "r", encoding="utf-8"
+    ) as file:
+        data = gpd.read_file(file)
+        return data
+    return None

File diff suppressed because one or more lines are too long
@@ -5,9 +5,12 @@ from pathlib import Path
 
 import pandas.api.types as ptypes
 import pandas.testing as pdt
+import pandas as pd
+import geopandas as gpd
 from data_pipeline.content.schemas.download_schemas import CSVConfig
 from data_pipeline.etl.score import constants
 from data_pipeline.utils import load_yaml_dict_from_file
+from data_pipeline.etl.score.etl_score_post import PostScoreETL
 
 # See conftest.py for all fixtures used in these tests
@@ -150,3 +153,16 @@ def test_load_downloadable_zip(etl, monkeypatch, score_data_expected):
     assert constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH.is_file()
     assert constants.SCORE_DOWNLOADABLE_CSV_ZIP_FILE_PATH.is_file()
     assert constants.SCORE_DOWNLOADABLE_XLS_ZIP_FILE_PATH.is_file()
+
+
+def test_create_tract_search_data(census_geojson_sample_data: gpd.GeoDataFrame):
+    # Sanity check
+    assert len(census_geojson_sample_data) > 0
+
+    result = PostScoreETL()._create_tract_search_data(census_geojson_sample_data)
+    assert isinstance(result, pd.DataFrame)
+    assert not result.columns.empty
+    columns = ["GEOID10", "INTPTLAT10", "INTPTLON10"]
+    for col in columns:
+        assert col in result.columns
+    assert len(census_geojson_sample_data) == len(result)