Improve score test documentation based on Lucas's feedback (#1835) (#1914)

* Better document based on Lucas's feedback (#1835)

* Fix typo (#1835)

* Add test to verify GEOJSON matches tiles (#1835)

* Remove NOOP line (#1835)

* Move GEOJSON generation up for new smoketest (#1835)

* Fixup code format (#1835)

* Update readme for new smoketest (#1835)
commit f70f30d610
Matt Bowen, 2022-09-23 13:18:15 -04:00, committed via GitHub
5 changed files with 108 additions and 23 deletions

@@ -31,7 +31,7 @@ from .fixtures import (

pytestmark = pytest.mark.smoketest

-UNMATCHED_TRACK_THRESHOLD = 1000
+UNMATCHED_TRACT_THRESHOLD = 1000


def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
@@ -254,6 +254,15 @@ def test_data_sources(
        key: value for key, value in locals().items() if key != "final_score_df"
    }

+    # For each data source that's injected via the fixtures, do the following:
+    # * Ensure at least one column from the source shows up in the score
+    # * Ensure any tracts NOT in the data source are NA/null in the score
+    # * Ensure the data source doesn't have a large number of tract IDs that
+    #   are not included in the final score, since that implies the source is
+    #   using 2020 tract IDs
+    # * Verify that the data from the source that's in the final score output
+    #   is equal to the data from the ETL, allowing for the minor differences
+    #   that come from floating-point comparisons
    for data_source_name, data_source in data_sources.items():
        final = "final_"
        df: pd.DataFrame = final_score_df.merge(
@@ -275,12 +284,12 @@ def test_data_sources(
        ), f"No columns from data source show up in final score in source {data_source_name}"

        # Make sure we have NAs for any tracts in the final data that aren't
-        # covered in the final data
+        # included in the data source
        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())

        # Make sure the data source doesn't have a ton of unmatched tracts,
        # implying it has moved to 2020 tracts
-        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACK_THRESHOLD
+        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACT_THRESHOLD

        df = df[df.MERGE == "both"]
@@ -293,6 +302,7 @@ def test_data_sources(
                f"Column {final_column} not equal "
                f"between {data_source_name} and final score"
            )
+            # For non-numeric types, we can use the built-in equals from pandas
            if df[final_column].dtype in [
                np.dtype(object),
                np.dtype(bool),
@@ -301,6 +311,8 @@ def test_data_sources(
                assert df[final_column].equals(
                    df[data_source_column]
                ), error_message
+            # For numeric sources, use np.allclose so we don't get harmed
+            # by float-equality weirdness
            else:
                assert np.allclose(
                    df[final_column],

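The first file's checks all hang off pandas' merge indicator: outer-join the ETL output against the final score, then inspect the left_only / right_only / both buckets. Below is a minimal, self-contained sketch of that pattern; the tract IDs and the GEOID10_TRACT / Score / final_Score column names are invented for illustration, not the pipeline's real fixtures.

import numpy as np
import pandas as pd

final_score_df = pd.DataFrame(
    {"GEOID10_TRACT": ["01001020100", "01001020200"], "final_Score": [0.1, 0.9]}
)
source_df = pd.DataFrame(
    {"GEOID10_TRACT": ["01001020200", "99999999999"], "Score": [0.9, 0.5]}
)

df = final_score_df.merge(
    source_df,
    on="GEOID10_TRACT",
    how="outer",
    indicator="MERGE",  # adds a MERGE column: left_only / right_only / both
)

# Tracts that never appeared in the source must be NA in the source's columns
assert np.all(df[df.MERGE == "left_only"]["Score"].isna())
# Too many source-only tracts would suggest mismatched (e.g. 2020) tract IDs
assert len(df[df.MERGE == "right_only"]) < 1000
# Matched rows are compared with np.allclose, not ==, to tolerate float noise
both = df[df.MERGE == "both"]
assert np.allclose(both["final_Score"], both["Score"], equal_nan=True)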

@@ -2,6 +2,7 @@
from dataclasses import dataclass
from typing import Optional
import pandas as pd
+import geopandas as gpd
import numpy as np
import pytest
from data_pipeline.config import settings
@@ -26,6 +27,13 @@ def tiles_df(scope="session"):
    )


+@pytest.fixture()
+def tiles_geojson_df():
+    return gpd.read_file(
+        settings.APP_ROOT / "data" / "score" / "geojson" / "usa-high.json"
+    )
+
+
PERCENTILE_FIELDS = [
    "DF_PFS",
    "AF_PFS",
@@ -102,6 +110,19 @@ def test_tract_equality(tiles_df, final_score_df):
    assert tiles_df.shape[0] == final_score_df.shape[0]


+def is_col_fake_bool(col) -> bool:
+    if col.dtype == np.dtype("float64"):
+        fake_bool = {1.0, 0.0, None}
+        # Replace the nans in the column values with None
+        # so we can just use issubset below
+        col_values = set(
+            None if np.isnan(val) else val
+            for val in col.value_counts(dropna=False).index
+        )
+        return len(col_values) <= 3 and col_values.issubset(fake_bool)
+    return False
+
+
@dataclass
class ColumnValueComparison:
    final_score_column: pd.Series
@@ -110,16 +131,7 @@ class ColumnValueComparison:

    @property
    def _is_tiles_column_fake_bool(self) -> bool:
-        if self.tiles_column.dtype == np.dtype("float64"):
-            fake_bool = {1.0, 0.0, None}
-            # Replace the nans in the column values with None for
-            # so we can just use issubset below
-            col_values = set(
-                not np.isnan(val) and val or None
-                for val in self.tiles_column.value_counts(dropna=False).index
-            )
-            return len(col_values) <= 3 and col_values.issubset(fake_bool)
-        return False
+        return is_col_fake_bool(self.tiles_column)

    @property
    def _is_dtype_ok(self) -> bool:
@@ -215,6 +227,49 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
    assert not errors, error_message


+def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
+    tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename(
+        columns={"GEOID10": "GTF"}
+    )
+    assert tiles_df.shape == tiles_geojson_df.shape
+    assert tiles_df["GTF"].equals(tiles_geojson_df["GTF"])
+    assert sorted(tiles_df.columns) == sorted(tiles_geojson_df.columns)
+
+    # Are all the dtypes and values the same?
+    for col_name in tiles_geojson_df.columns:
+        if is_col_fake_bool(tiles_df[col_name]):
+            tiles_df[col_name] = (
+                tiles_df[col_name]
+                .astype("float64")
+                .replace({0.0: False, 1.0: True})
+            )
+        if is_col_fake_bool(tiles_geojson_df[col_name]):
+            tiles_geojson_df[col_name] = (
+                tiles_geojson_df[col_name]
+                .astype("float64")
+                .replace({0.0: False, 1.0: True})
+            )
+        tiles_geojson_df[col_name] = tiles_geojson_df[col_name].replace(
+            {None: np.nan}
+        )
+        error_message = f"Column {col_name} not equal "
+        # For non-numeric types, we can use the built-in equals from pandas
+        if tiles_df[col_name].dtype in [
+            np.dtype(object),
+            np.dtype(bool),
+            np.dtype(str),
+        ]:
+            assert tiles_df[col_name].equals(
+                tiles_geojson_df[col_name]
+            ), error_message
+        # For numeric sources, use np.allclose so we don't get harmed
+        # by float-equality weirdness
+        else:
+            assert np.allclose(
+                tiles_df[col_name],
+                tiles_geojson_df[col_name],
+                equal_nan=True,
+            ), error_message
+

def test_for_state_names(tiles_df):
    states = tiles_df["SF"].value_counts(dropna=False).index
    assert np.nan not in states
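
A closing note on the fake-bool handling: both the tiles CSV and the GeoJSON encode some boolean fields as 0.0/1.0 floats with NaN for missing values, which is exactly what is_col_fake_bool detects before the new smoketest normalizes those columns to real booleans. A toy walkthrough with invented values:

import numpy as np
import pandas as pd

col = pd.Series([1.0, 0.0, np.nan, 1.0])  # a float column that encodes a boolean

# Mirror is_col_fake_bool: map NaN to None so issubset has a hashable stand-in
col_values = set(
    None if np.isnan(val) else val
    for val in col.value_counts(dropna=False).index
)
assert len(col_values) <= 3 and col_values.issubset({1.0, 0.0, None})

# Mirror the normalization step: real booleans, NaN left untouched
normalized = col.replace({0.0: False, 1.0: True})
print(normalized.tolist())  # [True, False, nan, True]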