mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 14:01:16 -07:00
* Better document based on Lucas's feedback (#1835) * Fix typo (#1835) * Add test to verify GEOJSON matches tiles (#1835) * Remove NOOP line (#1835) * Move GEOJSON generation up for new smoketest (#1835) * Fixup code format (#1835) * Update readme for new smoketest (#1835)
This commit is contained in:
parent
aca226165c
commit
f70f30d610
5 changed files with 108 additions and 23 deletions
|
@ -31,7 +31,7 @@ from .fixtures import (
|
|||
|
||||
|
||||
pytestmark = pytest.mark.smoketest
|
||||
UNMATCHED_TRACK_THRESHOLD = 1000
|
||||
UNMATCHED_TRACT_THRESHOLD = 1000
|
||||
|
||||
|
||||
def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
|
||||
|
@ -254,6 +254,15 @@ def test_data_sources(
|
|||
key: value for key, value in locals().items() if key != "final_score_df"
|
||||
}
|
||||
|
||||
# For each data source that's injected via the fixtures, do the following:
|
||||
# * Ensure at least one column from the source shows up in the score
|
||||
# * Ensure any tracts NOT in the data source are NA/null in the score
|
||||
# * Ensure the data source doesn't have a large number of tract IDs that are not
|
||||
# included in the final score, since that implies the source is using 2020
|
||||
# tract IDs
|
||||
# * Verify that the data from the source that's in the final score output
|
||||
# is the "equal" to the data from the ETL, allowing for the minor
|
||||
# differences that come from floating point comparisons
|
||||
for data_source_name, data_source in data_sources.items():
|
||||
final = "final_"
|
||||
df: pd.DataFrame = final_score_df.merge(
|
||||
|
@ -275,12 +284,12 @@ def test_data_sources(
|
|||
), f"No columns from data source show up in final score in source {data_source_name}"
|
||||
|
||||
# Make sure we have NAs for any tracts in the final data that aren't
|
||||
# covered in the final data
|
||||
# included in the data source
|
||||
assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
|
||||
|
||||
# Make sure the datasource doesn't have a ton of unmatched tracts, implying it
|
||||
# has moved to 2020 tracts
|
||||
assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACK_THRESHOLD
|
||||
assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACT_THRESHOLD
|
||||
|
||||
df = df[df.MERGE == "both"]
|
||||
|
||||
|
@ -293,6 +302,7 @@ def test_data_sources(
|
|||
f"Column {final_column} not equal "
|
||||
f"between {data_source_name} and final score"
|
||||
)
|
||||
# For non-numeric types, we can use the built-in equals from pandas
|
||||
if df[final_column].dtype in [
|
||||
np.dtype(object),
|
||||
np.dtype(bool),
|
||||
|
@ -301,6 +311,8 @@ def test_data_sources(
|
|||
assert df[final_column].equals(
|
||||
df[data_source_column]
|
||||
), error_message
|
||||
# For numeric sources, use np.allclose so we don't get harmed by
|
||||
# float equality weirdness
|
||||
else:
|
||||
assert np.allclose(
|
||||
df[final_column],
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
import numpy as np
|
||||
import pytest
|
||||
from data_pipeline.config import settings
|
||||
|
@ -26,6 +27,13 @@ def tiles_df(scope="session"):
|
|||
)
|
||||
|
||||
|
||||
@pytest.fixture()
def tiles_geojson_df():
    """Load the high-detail USA score GeoJSON as a GeoDataFrame fixture."""
    geojson_path = (
        settings.APP_ROOT / "data" / "score" / "geojson" / "usa-high.json"
    )
    return gpd.read_file(geojson_path)
|
||||
|
||||
|
||||
PERCENTILE_FIELDS = [
|
||||
"DF_PFS",
|
||||
"AF_PFS",
|
||||
|
@ -102,6 +110,19 @@ def test_tract_equality(tiles_df, final_score_df):
|
|||
assert tiles_df.shape[0] == final_score_df.shape[0]
|
||||
|
||||
|
||||
def is_col_fake_bool(col) -> bool:
    """Return True if ``col`` is a float64 "fake boolean" column.

    A fake boolean is a float64 pandas Series whose only values are 1.0,
    0.0, and/or NaN — i.e. a boolean flag that was serialized as floats.
    Columns of any other dtype are never fake booleans.
    """
    if col.dtype == np.dtype("float64"):
        fake_bool = {1.0, 0.0, None}
        # Replace the NaNs among the column's distinct values with None
        # so we can just use issubset below. (An explicit conditional is
        # used here: the `... and val or None` idiom would also collapse
        # the falsy value 0.0 into None.)
        col_values = {
            None if np.isnan(val) else val
            for val in col.value_counts(dropna=False).index
        }
        return len(col_values) <= 3 and col_values.issubset(fake_bool)
    return False
|
||||
|
||||
|
||||
@dataclass
|
||||
class ColumnValueComparison:
|
||||
final_score_column: pd.Series
|
||||
|
@ -110,16 +131,7 @@ class ColumnValueComparison:
|
|||
|
||||
@property
|
||||
def _is_tiles_column_fake_bool(self) -> bool:
|
||||
if self.tiles_column.dtype == np.dtype("float64"):
|
||||
fake_bool = {1.0, 0.0, None}
|
||||
# Replace the nans in the column values with None for
|
||||
# so we can just use issubset below
|
||||
col_values = set(
|
||||
not np.isnan(val) and val or None
|
||||
for val in self.tiles_column.value_counts(dropna=False).index
|
||||
)
|
||||
return len(col_values) <= 3 and col_values.issubset(fake_bool)
|
||||
return False
|
||||
return is_col_fake_bool(self.tiles_column)
|
||||
|
||||
@property
|
||||
def _is_dtype_ok(self) -> bool:
|
||||
|
@ -215,6 +227,49 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
|
|||
assert not errors, error_message
|
||||
|
||||
|
||||
def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
    """Verify the tiles CSV and the high-detail GeoJSON carry identical data.

    Drops the geometry from the GeoJSON frame, aligns its ID column name
    (GEOID10 -> GTF) with the CSV frame, then checks shape, tract IDs,
    column sets, and per-column values for equality.
    """
    tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename(
        columns={"GEOID10": "GTF"}
    )
    assert tiles_df.shape == tiles_geojson_df.shape
    assert tiles_df["GTF"].equals(tiles_geojson_df["GTF"])
    assert sorted(tiles_df.columns) == sorted(tiles_geojson_df.columns)

    # Are all the dtypes and values the same?
    for col_name in tiles_geojson_df.columns:
        # Normalize "fake bool" float columns (only 0.0/1.0/NaN values) to
        # real booleans on both sides so dtype differences can't fail the
        # comparison below.
        if is_col_fake_bool(tiles_df[col_name]):
            tiles_df[col_name] = (
                tiles_df[col_name]
                .astype("float64")
                .replace({0.0: False, 1.0: True})
            )
        if is_col_fake_bool(tiles_geojson_df[col_name]):
            tiles_geojson_df[col_name] = (
                tiles_geojson_df[col_name]
                .astype("float64")
                .replace({0.0: False, 1.0: True})
            )
        # BUG FIX: the original assigned tiles_df[col_name] into the geojson
        # frame here, which made every assertion below compare the CSV column
        # against itself. Normalize the geojson column's Nones instead.
        tiles_geojson_df[col_name] = tiles_geojson_df[col_name].replace(
            {None: np.nan}
        )
        error_message = f"Column {col_name} not equal "
        # For non-numeric types, we can use the built-in equals from pandas
        if tiles_df[col_name].dtype in [
            np.dtype(object),
            np.dtype(bool),
            np.dtype(str),
        ]:
            assert tiles_df[col_name].equals(
                tiles_geojson_df[col_name]
            ), error_message
        # For numeric sources, use np.allclose so we don't get harmed by
        # float equality weirdness
        else:
            assert np.allclose(
                tiles_df[col_name],
                tiles_geojson_df[col_name],
                equal_nan=True,
            ), error_message
|
||||
|
||||
|
||||
def test_for_state_names(tiles_df):
|
||||
states = tiles_df["SF"].value_counts(dropna=False).index
|
||||
assert np.nan not in states
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue