Data Pipeline performance improvements for Census GeoJSON and Score file

This commit is contained in:
Carlos Felix 2025-01-13 09:28:14 -05:00 committed by Carlos Felix
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions

View file

@@ -7,10 +7,13 @@ from data_pipeline.score.field_names import GEOID_TRACT_FIELD
@pytest.fixture(scope="session")
def final_score_df():
return pd.read_csv(
settings.APP_ROOT / "data" / "score" / "csv" / "full" / "usa.csv",
dtype={GEOID_TRACT_FIELD: str},
low_memory=False,
return pd.read_parquet(
settings.APP_ROOT
/ "data"
/ "score"
/ "csv"
/ "full"
/ "usa_score.parquet",
)
@@ -173,7 +176,7 @@ def geocorr_urban_rural_df():
@pytest.fixture()
def census_decennial_df():
census_decennial_csv = (
constants.DATA_PATH / "dataset" / "census_decennial_2010" / "usa.csv"
constants.DATA_PATH / "dataset" / "census_decennial_2020" / "usa.csv"
)
return pd.read_csv(
census_decennial_csv,

File diff suppressed because one or more lines are too long

View file

@@ -17,7 +17,7 @@ from data_pipeline.score.utils import (
@contextmanager
def patch_calculate_tract_adjacency_scores():
# Use fixtures for tract data.
tract_data_path = Path(__file__).parent / "data" / "us.geojson"
tract_data_path = Path(__file__).parent / "data" / "us_geo.parquet"
get_tract_geojson_mock = partial(
get_tract_geojson, _tract_data_path=tract_data_path

View file

@@ -68,7 +68,7 @@ def transformed_data_fixture(
"""Load the test data and call the ETL transform"""
dec = CensusDecennialETL()
dec.df_all = extracted_data_fixture
dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
dec.transform(imputed_path_fixture / "census-us-territory-geojson.parquet")
return dec.df_all