Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-08-06 02:54:18 -07:00)
Data Pipeline performance improvements for Census GeoJson and Score file
This commit is contained in:
parent d5d055864f
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions
```diff
@@ -24,7 +24,7 @@ DATA_CENSUS_DIR = DATA_PATH / "census"
 DATA_CENSUS_CSV_DIR = DATA_CENSUS_DIR / "csv"
 DATA_CENSUS_CSV_FILE_PATH = DATA_CENSUS_CSV_DIR / "us.csv"
 DATA_CENSUS_CSV_STATE_FILE_PATH = DATA_CENSUS_CSV_DIR / "fips_states_2010.csv"
-DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us.json"
+DATA_CENSUS_GEOJSON_FILE_PATH = DATA_CENSUS_DIR / "geojson" / "us_geo.parquet"

 # Score paths
 DATA_SCORE_DIR = DATA_PATH / "score"
```
```diff
@@ -32,7 +32,7 @@ DATA_SCORE_DIR = DATA_PATH / "score"
 ## Score CSV Paths
 DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
 DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
-DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa.csv"
+DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
 FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
     DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv"
 )
```
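Both constants keep their legacy GeoJSON/CSV-flavored names but now point at Parquet artifacts, so call sites that import the constants pick up the new format without renaming anything. A minimal sketch of the pathlib pattern in play (the DATA_PATH root here is illustrative, not the pipeline's real settings):

```python
from pathlib import Path

DATA_PATH = Path("data")  # illustrative root; the pipeline derives its own

# pathlib's "/" operator composes segments, so changing the on-disk format
# is a one-line edit at the constant definition, not at every reader/writer.
DATA_CENSUS_GEOJSON_FILE_PATH = DATA_PATH / "census" / "geojson" / "us_geo.parquet"

print(DATA_CENSUS_GEOJSON_FILE_PATH)  # data/census/geojson/us_geo.parquet
```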
```diff
@@ -727,4 +727,4 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)

-        self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
+        self.df.to_parquet(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
```
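Switching the score writer from `to_csv` to `to_parquet` stores the schema alongside the data, which is what lets the readers further down drop their defensive `dtype=` arguments. A minimal round-trip sketch (assumes pandas with a Parquet engine such as pyarrow installed; the column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "GEOID10_TRACT": pd.array(["01001020100", "01001020200"], dtype="string"),
        "Total population": [1500, 2300],
    }
)

# index=False drops the RangeIndex, mirroring the index=False that was
# previously passed to to_csv.
df.to_parquet("usa_score.parquet", index=False)

round_trip = pd.read_parquet("usa_score.parquet")
assert round_trip["GEOID10_TRACT"].dtype == "string"  # schema survives the trip
```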
```diff
@@ -37,9 +37,7 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"

-        self.CENSUS_USA_GEOJSON = (
-            self.DATA_PATH / "census" / "geojson" / "us.json"
-        )
+        self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH

         # Import the shortened name for Score N to be used on tiles.
         # We should no longer be using PFS
```
```diff
@@ -87,16 +85,14 @@ class GeoScoreETL(ExtractTransformLoad):
             score_data_source=self.DATA_SOURCE,
         )

-        logger.info("Reading US GeoJSON (~6 minutes)")
-        full_geojson_usa_df = gpd.read_file(
+        logger.info("Reading US GeoJSON")
+        full_geojson_usa_df = gpd.read_parquet(
             self.CENSUS_USA_GEOJSON,
-            dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[
+            columns=[
                 self.GEOID_FIELD_NAME,
                 self.GEOMETRY_FIELD_NAME,
                 self.LAND_FIELD_NAME,
             ],
-            low_memory=False,
         )

         # We only want to keep tracts to visualize that have non-0 land
```
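`gpd.read_parquet` replaces the roughly six-minute GeoJSON parse. Parquet is columnar, so `columns=` prunes unneeded fields at read time, and the `dtype=`/`usecols=`/`low_memory=` arguments disappear because they belong to text-parsing readers. A hedged sketch of the same pattern (the path and field names are illustrative stand-ins for the class attributes):

```python
import geopandas as gpd

# Only the requested columns are deserialized; the geometry column must be
# among them for read_parquet to return a GeoDataFrame.
tracts = gpd.read_parquet(
    "us_geo.parquet",                            # illustrative path
    columns=["GEOID10", "geometry", "ALAND10"],  # illustrative field names
)

# Keep only tracts with non-zero land area, as the ETL does next.
tracts = tracts[tracts["ALAND10"] > 0]
```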
```diff
@@ -104,7 +100,7 @@ class GeoScoreETL(ExtractTransformLoad):
             full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
         ]

-        logger.info("Reading score CSV")
+        logger.info("Reading tile score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,
             dtype={
```
```diff
@@ -94,12 +94,8 @@ class PostScoreETL(ExtractTransformLoad):
         )

     def _extract_score(self, score_path: Path) -> pd.DataFrame:
-        logger.debug("Reading Score CSV")
-        df = pd.read_csv(
-            score_path,
-            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-            low_memory=False,
-        )
+        logger.debug("Reading Score")
+        df = pd.read_parquet(score_path)

         # Convert total population to an int
         df["Total population"] = df["Total population"].astype(
```
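The six-line guarded CSV read collapses to one `read_parquet` call. The `dtype=` guard existed because CSV carries no schema: tract GEOIDs with leading zeros would otherwise be inferred as integers. Parquet stores the type in the file, so the guard is unnecessary. A small demonstration of the failure mode (illustrative data; again assumes a Parquet engine is installed):

```python
import io
import pandas as pd

csv_text = "GEOID10_TRACT,score\n01001020100,0.42\n"

# Without an explicit dtype, CSV inference turns the GEOID into an integer
# and silently drops the leading zero.
naive = pd.read_csv(io.StringIO(csv_text))
print(naive["GEOID10_TRACT"].iloc[0])  # 1001020100

# Writing the correctly typed frame to Parquet bakes the schema in, so a
# plain read_parquet gets the string back with no dtype= argument.
typed = pd.read_csv(io.StringIO(csv_text), dtype={"GEOID10_TRACT": "string"})
typed.to_parquet("score_sample.parquet", index=False)
print(pd.read_parquet("score_sample.parquet")["GEOID10_TRACT"].iloc[0])  # 01001020100
```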
```diff
@@ -116,8 +112,7 @@ class PostScoreETL(ExtractTransformLoad):
             gpd.GeoDataFrame: the census geo json data
         """
         logger.debug("Reading Census GeoJSON")
-        with open(geo_path, "r", encoding="utf-8") as file:
-            data = gpd.read_file(file)
+        data = gpd.read_parquet(geo_path)
         return data

     def extract(self, use_cached_data_sources: bool = False) -> None:
```
```diff
@@ -70,7 +70,7 @@ def state_data_initial(sample_data_dir):

 @pytest.fixture()
 def score_data_initial(sample_data_dir):
-    return sample_data_dir / "score_data_initial.csv"
+    return sample_data_dir / "score_data_initial.parquet"


 @pytest.fixture()
```
```diff
@@ -104,8 +104,8 @@ def states_transformed_expected():

 @pytest.fixture()
 def score_transformed_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "score_transformed_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "score_transformed_expected.parquet"
     )

```
```diff
@@ -122,7 +122,7 @@ def national_tract_df():

 @pytest.fixture()
 def score_data_expected():
-    return pd.read_pickle(pytest.SNAPSHOT_DIR / "score_data_expected.pkl")
+    return pd.read_parquet(pytest.SNAPSHOT_DIR / "score_data_expected.parquet")


 @pytest.fixture()
```
```diff
@@ -144,8 +144,8 @@ def create_tile_data_expected():

 @pytest.fixture()
 def downloadable_data_expected():
-    return pd.read_pickle(
-        pytest.SNAPSHOT_DIR / "downloadable_data_expected.pkl"
+    return pd.read_parquet(
+        pytest.SNAPSHOT_DIR / "downloadable_data_expected.parquet"
     )

```
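These fixture hunks all make the same substitution: pickled snapshot frames become Parquet files, which do not depend on the pandas version that serialized them and match the format the pipeline now emits (the binary snapshot files themselves are the "Binary file not shown" entries below). If snapshots ever need regenerating from old pickles, the one-time conversion is mechanical; a hedged sketch, with an assumed snapshot directory:

```python
from pathlib import Path

import pandas as pd

SNAPSHOT_DIR = Path("tests/snapshots")  # assumption; use the repo's actual dir

# Re-serialize every pickled snapshot DataFrame as Parquet alongside the original.
for pkl in SNAPSHOT_DIR.glob("*.pkl"):
    df = pd.read_pickle(pkl)
    df.to_parquet(pkl.with_suffix(".parquet"))
    print(f"converted {pkl.name} -> {pkl.with_suffix('.parquet').name}")
```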
File diff suppressed because one or more lines are too long
7 binary files not shown.
```diff
@@ -33,8 +33,7 @@ def test_extract_states(etl, state_data_initial):

 def test_extract_score(etl, score_data_initial):
     extracted = etl._extract_score(score_data_initial)
-    string_cols = ["GEOID10_TRACT"]
-    assert all(ptypes.is_string_dtype(extracted[col]) for col in string_cols)
+    assert len(extracted) > 0


 # Transform Tests
```
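The explicit string-dtype assertion is dropped because `_extract_score` no longer casts anything; the dtype now comes from the Parquet schema itself, leaving only a basic non-empty check. If dtype coverage were still wanted, a hypothetical extra test could look like this (assumes the same fixtures and that the column name matches the snapshot):

```python
import pandas.api.types as ptypes


def test_extract_score_preserves_geoid_dtype(etl, score_data_initial):
    # Sketch, not part of the commit: Parquet embeds the schema, so the GEOID
    # column should come back as a string without the reader being told so.
    extracted = etl._extract_score(score_data_initial)
    assert ptypes.is_string_dtype(extracted["GEOID10_TRACT"])
```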
```diff
@@ -107,6 +106,7 @@ def test_create_downloadable_data(
     pdt.assert_frame_equal(
         output_downloadable_df_actual,
         downloadable_data_expected,
+        check_dtype=False,
     )

```
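`check_dtype=False` relaxes the frame comparison to values only, which is likely needed because the Parquet-loaded expected frame can carry equivalent-but-different dtypes (for example nullable `Int64` versus plain `int64`) than the freshly computed output. A minimal illustration:

```python
import pandas as pd
import pandas.testing as pdt

left = pd.DataFrame({"n": pd.array([1, 2], dtype="Int64")})  # nullable ints
right = pd.DataFrame({"n": [1, 2]})                          # numpy int64

# Identical values, different dtypes: the strict comparison raises, while
# check_dtype=False accepts them as equal.
pdt.assert_frame_equal(left, right, check_dtype=False)
```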