Fix and enable smoke tests

This commit is contained in:
Carlos Felix 2025-01-17 12:59:24 -05:00 committed by Carlos Felix
commit 6093ce0f53
9 changed files with 64 additions and 93 deletions

View file

@ -130,8 +130,7 @@ jobs:
- name: Generate Score Geo - name: Generate Score Geo
run: | run: |
poetry run python3 -m data_pipeline.application geo-score poetry run python3 -m data_pipeline.application geo-score
- name: Run smoketest for 1.0 - name: Run smoketests
if: ${{ env.J40_VERSION_LABEL_STRING == '1.0' }}
run: | run: |
poetry run pytest data_pipeline/ -m smoketest poetry run pytest data_pipeline/ -m smoketest
- name: Set timezone for tippecanoe - name: Set timezone for tippecanoe

View file

@ -34,12 +34,12 @@ DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full" DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet" DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = ( FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv" DATA_SCORE_CSV_FULL_DIR / "usa_counties.parquet"
) )
# Score Tile CSV source path # Score Tile CSV source path
DATA_SCORE_CSV_TILES_PATH = DATA_SCORE_CSV_DIR / "tiles" DATA_SCORE_CSV_TILES_PATH = DATA_SCORE_CSV_DIR / "tiles"
DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.csv" DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.parquet"
DATA_SCORE_JSON_INDEX_FILE_PATH = ( DATA_SCORE_JSON_INDEX_FILE_PATH = (
DATA_SCORE_CSV_TILES_PATH / "tile_indexes.json" DATA_SCORE_CSV_TILES_PATH / "tile_indexes.json"
) )

View file

@ -35,7 +35,6 @@ class GeoScoreETL(ExtractTransformLoad):
self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp" self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH
@ -100,13 +99,9 @@ class GeoScoreETL(ExtractTransformLoad):
full_geojson_usa_df[self.LAND_FIELD_NAME] > 0 full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
] ]
logger.info("Reading tile score CSV") logger.info("Reading tile score")
self.score_usa_df = pd.read_csv( self.score_usa_df = pd.read_parquet(
self.TILE_SCORE_CSV, constants.DATA_SCORE_CSV_TILES_FILE_PATH,
dtype={
self.TRACT_SHORT_FIELD: str,
},
low_memory=False,
) )
def transform(self) -> None: def transform(self) -> None:

View file

@ -442,15 +442,14 @@ class PostScoreETL(ExtractTransformLoad):
self.input_census_geo_df self.input_census_geo_df
) )
def _load_score_csv_full( def _load_score_full(
self, score_county_state_merged: pd.DataFrame, score_csv_path: Path self, score_county_state_merged: pd.DataFrame, score_path: Path
) -> None: ) -> None:
logger.debug("Saving Full Score CSV with County Information") logger.debug("Saving Full Score CSV with County Information")
score_csv_path.parent.mkdir(parents=True, exist_ok=True) score_path.parent.mkdir(parents=True, exist_ok=True)
score_county_state_merged.to_csv( score_county_state_merged.to_parquet(
score_csv_path, score_path,
index=False, index=False,
encoding="utf-8-sig", # windows compat https://stackoverflow.com/a/43684587
) )
def _load_excel_from_df( def _load_excel_from_df(
@ -514,12 +513,12 @@ class PostScoreETL(ExtractTransformLoad):
return excel_csv_config return excel_csv_config
def _load_tile_csv( def _load_tile_score(
self, score_tiles_df: pd.DataFrame, tile_score_path: Path self, score_tiles_df: pd.DataFrame, tile_score_path: Path
) -> None: ) -> None:
logger.debug("Saving Tile Score CSV") logger.debug("Saving Tile Score")
tile_score_path.parent.mkdir(parents=True, exist_ok=True) tile_score_path.parent.mkdir(parents=True, exist_ok=True)
score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8") score_tiles_df.to_parquet(tile_score_path, index=False)
def _load_downloadable_zip(self, downloadable_info_path: Path) -> None: def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
downloadable_info_path.mkdir(parents=True, exist_ok=True) downloadable_info_path.mkdir(parents=True, exist_ok=True)
@ -631,11 +630,11 @@ class PostScoreETL(ExtractTransformLoad):
self.output_tract_search_df.to_json(output_path, orient="records") self.output_tract_search_df.to_json(output_path, orient="records")
def load(self) -> None: def load(self) -> None:
self._load_score_csv_full( self._load_score_full(
self.output_score_county_state_merged_df, self.output_score_county_state_merged_df,
constants.FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH, constants.FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH,
) )
self._load_tile_csv( self._load_tile_score(
self.output_score_tiles_df, constants.DATA_SCORE_CSV_TILES_FILE_PATH self.output_score_tiles_df, constants.DATA_SCORE_CSV_TILES_FILE_PATH
) )
self._load_search_tract_data(constants.SCORE_TRACT_SEARCH_FILE_PATH) self._load_search_tract_data(constants.SCORE_TRACT_SEARCH_FILE_PATH)

View file

@ -43,17 +43,17 @@ def check_score_data_source(
settings.AWS_JUSTICE40_DATAPIPELINE_URL settings.AWS_JUSTICE40_DATAPIPELINE_URL
+ "/data/score/csv/tiles/usa.csv" + "/data/score/csv/tiles/usa.csv"
) )
TILE_SCORE_CSV = score_csv_data_path / "tiles" / "usa.csv" TILE_SCORE_FILE = constants.DATA_SCORE_CSV_TILES_FILE_PATH
# download from s3 if census_data_source is aws # download from s3 if census_data_source is aws
if score_data_source == "aws": if score_data_source == "aws":
logger.debug("Fetching Score Tile data from AWS S3") logger.debug("Fetching Score Tile data from AWS S3")
Downloader.download_file_from_url( Downloader.download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_FILE
) )
else: else:
# check if score data is found locally # check if score data is found locally
if not os.path.isfile(TILE_SCORE_CSV): if not os.path.isfile(TILE_SCORE_FILE):
logger.warning( logger.warning(
"No local score tiles data found. Please use '-s aws` to fetch from AWS" "No local score tiles data found. Please use '-s aws` to fetch from AWS"
) )

View file

@ -110,9 +110,9 @@ def test_create_downloadable_data(
) )
def test_load_score_csv_full(etl, score_data_expected): def test_load_score_full(etl, score_data_expected):
reload(constants) reload(constants)
etl._load_score_csv_full( etl._load_score_full(
score_data_expected, score_data_expected,
constants.FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH, constants.FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH,
) )
@ -121,7 +121,7 @@ def test_load_score_csv_full(etl, score_data_expected):
def test_load_tile_csv(etl, tile_data_expected): def test_load_tile_csv(etl, tile_data_expected):
reload(constants) reload(constants)
etl._load_score_csv_full( etl._load_score_full(
tile_data_expected, constants.DATA_SCORE_CSV_TILES_FILE_PATH tile_data_expected, constants.DATA_SCORE_CSV_TILES_FILE_PATH
) )
assert constants.DATA_SCORE_CSV_TILES_FILE_PATH.is_file() assert constants.DATA_SCORE_CSV_TILES_FILE_PATH.is_file()

View file

@ -970,9 +970,8 @@ class CensusACSETL(ExtractTransformLoad):
# Then the imputed field should have no nulls # Then the imputed field should have no nulls
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
] ]
.isna() .notna()
.sum() .all()
== 0
), "Error: not all values were filled..." ), "Error: not all values were filled..."
logger.debug("Renaming columns...") logger.debug("Renaming columns...")

View file

@ -207,28 +207,34 @@ def test_max_40_percent_DAC(final_score_df):
def test_donut_hole_addition_to_score_n(final_score_df): def test_donut_hole_addition_to_score_n(final_score_df):
score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN dacs_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
score_col = field_names.SCORE_N_COMMUNITIES dacs_col = field_names.SCORE_N_COMMUNITIES
donut_hole_score_only = ( donut_hole_community_col = (
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
) )
count_donuts = final_score_df[donut_hole_score_only].sum() # Number of donuts found regardless of other scoring.
count_n = final_score_df[score_col].sum() num_donuts = final_score_df[donut_hole_community_col].sum()
count_n_with_donuts = final_score_df[score_col_with_donuts].sum()
new_donuts = final_score_df[ # Number of DACS not including adjacency.
final_score_df[donut_hole_score_only] & ~final_score_df[score_col] num_dacs = final_score_df[dacs_col].sum()
# Number of DACS including adjacency.
num_dacs_with_donuts = final_score_df[dacs_col_with_donuts].sum()
# Number of DACS that are donuts.
num_dacs_due_to_donuts = final_score_df[
final_score_df[donut_hole_community_col] & ~final_score_df[dacs_col]
].shape[0] ].shape[0]
assert ( assert num_dacs_due_to_donuts <= num_dacs_with_donuts
new_donuts + count_n == count_n_with_donuts assert num_dacs_with_donuts >= num_dacs
), "The math doesn't work! The number of new donut hole tracts plus score tracts (base) does not equal the total number of tracts identified"
assert ( assert (
count_donuts < count_n num_donuts < num_dacs
), "There are more donut hole tracts than base tracts. How can it be?" ), "There are more donut hole tracts than base tracts. How can it be?"
assert ( assert (
new_donuts > 0 num_dacs_due_to_donuts > 0
), "FYI: The adjacency index is doing nothing. Consider removing it?" ), "FYI: The adjacency index is doing nothing. Consider removing it?"
@ -429,30 +435,6 @@ def test_all_tracts_have_scores(final_score_df):
def test_imputed_tracts(final_score_df): def test_imputed_tracts(final_score_df):
# Make sure that any tracts with zero population have null imputed income
tracts_with_zero_population_df = final_score_df[
final_score_df[field_names.TOTAL_POP_FIELD] == 0
]
assert (
tracts_with_zero_population_df[
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
]
.isna()
.all()
)
# Make sure that any tracts with null population have null imputed income
tracts_with_null_population_df = final_score_df[
final_score_df[field_names.TOTAL_POP_FIELD].isnull()
]
assert (
tracts_with_null_population_df[
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
]
.isna()
.all()
)
# Make sure that no tracts with population have null imputed income # Make sure that no tracts with population have null imputed income
# We DO NOT impute income for island areas, so remove those from the test # We DO NOT impute income for island areas, so remove those from the test
is_island_area = ( is_island_area = (

View file

@ -8,11 +8,6 @@ import pandas as pd
import pytest import pytest
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.score import constants from data_pipeline.etl.score import constants
from data_pipeline.etl.score.constants import THRESHOLD_COUNT_TO_SHOW_FIELD_NAME
from data_pipeline.etl.score.constants import TILES_SCORE_COLUMNS
from data_pipeline.etl.score.constants import (
USER_INTERFACE_EXPERIENCE_FIELD_NAME,
)
from data_pipeline.score import field_names from data_pipeline.score import field_names
from .fixtures import final_score_df # pylint: disable=unused-import from .fixtures import final_score_df # pylint: disable=unused-import
@ -22,10 +17,8 @@ pytestmark = pytest.mark.smoketest
@pytest.fixture @pytest.fixture
def tiles_df(scope="session"): def tiles_df(scope="session"):
return pd.read_csv( return pd.read_parquet(
settings.APP_ROOT / "data" / "score" / "csv" / "tiles" / "usa.csv", constants.DATA_SCORE_CSV_TILES_FILE_PATH,
dtype={"GTF": str},
low_memory=False,
) )
@ -73,7 +66,6 @@ def test_percentiles(tiles_df):
assert (tiles_df[col].median() >= 0.4) & ( assert (tiles_df[col].median() >= 0.4) & (
tiles_df[col].median() <= 0.6 tiles_df[col].median() <= 0.6
), f"Percentile distribution for {col} is decidedly not uniform" ), f"Percentile distribution for {col} is decidedly not uniform"
return True
def test_count_of_fips_codes(tiles_df, final_score_df): def test_count_of_fips_codes(tiles_df, final_score_df):
@ -91,19 +83,19 @@ def test_count_of_fips_codes(tiles_df, final_score_df):
def test_column_presence(tiles_df): def test_column_presence(tiles_df):
expected_column_names = set(TILES_SCORE_COLUMNS.values()) | { expected_column_names = set(constants.TILES_SCORE_COLUMNS.values()) | {
THRESHOLD_COUNT_TO_SHOW_FIELD_NAME, constants.THRESHOLD_COUNT_TO_SHOW_FIELD_NAME,
USER_INTERFACE_EXPERIENCE_FIELD_NAME, constants.USER_INTERFACE_EXPERIENCE_FIELD_NAME,
} }
actual_column_names = set(tiles_df.columns) actual_column_names = set(tiles_df.columns)
extra_columns = actual_column_names - expected_column_names extra_columns = actual_column_names - expected_column_names
missing_columns = expected_column_names - expected_column_names missing_columns = expected_column_names - expected_column_names
assert not ( assert not (
extra_columns extra_columns
), f"tiles/usa.csv has columns not specified in TILE_SCORE_COLUMNS: {extra_columns}" ), f"tiles score has columns not specified in TILE_SCORE_COLUMNS: {extra_columns}"
assert not ( assert not (
missing_columns missing_columns
), f"tiles/usa.csv is missing columns from TILE_SCORE_COLUMNS: {missing_columns}" ), f"tiles score is missing columns from TILE_SCORE_COLUMNS: {missing_columns}"
def test_tract_equality(tiles_df, final_score_df): def test_tract_equality(tiles_df, final_score_df):
@ -189,12 +181,17 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
# every tile column # every tile column
# * Because tiles use rounded floats, we use close with a tolerance # * Because tiles use rounded floats, we use close with a tolerance
assert ( assert (
set(TILES_SCORE_COLUMNS.values()) - set(tiles_df.columns) == set() set(constants.TILES_SCORE_COLUMNS.values()) - set(tiles_df.columns)
== set()
), "Some TILES_SCORE_COLUMNS are missing from the tiles dataframe" ), "Some TILES_SCORE_COLUMNS are missing from the tiles dataframe"
# Keep only the tiles score columns in the final score data # Keep only the tiles score columns in the final score data
final_score_df = final_score_df.rename(columns=TILES_SCORE_COLUMNS).drop( final_score_df = final_score_df.rename(
final_score_df.columns.difference(TILES_SCORE_COLUMNS.values()), columns=constants.TILES_SCORE_COLUMNS
).drop(
final_score_df.columns.difference(
constants.TILES_SCORE_COLUMNS.values()
),
axis=1, axis=1,
errors="ignore", errors="ignore",
) )
@ -227,7 +224,7 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
assert not errors, error_message assert not errors, error_message
def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df): def test_for_geojson_fidelity_from_tiles_score(tiles_df, tiles_geojson_df):
tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename( tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename(
columns={"GEOID10": "GTF"} columns={"GEOID10": "GTF"}
) )
@ -252,11 +249,11 @@ def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
tiles_geojson_df[col_name] = tiles_df[col_name].replace({None: np.nan}) tiles_geojson_df[col_name] = tiles_df[col_name].replace({None: np.nan})
error_message = f"Column {col_name} not equal " error_message = f"Column {col_name} not equal "
# For non-numeric types, we can use the built-in equals from pandas # For non-numeric types, we can use the built-in equals from pandas
if tiles_df[col_name].dtype in [ if (
np.dtype(object), pd.api.types.is_bool_dtype(tiles_df[col_name])
np.dtype(bool), or pd.api.types.is_object_dtype(tiles_df[col_name])
np.dtype(str), or pd.api.types.is_string_dtype(tiles_df[col_name])
]: ):
assert tiles_df[col_name].equals( assert tiles_df[col_name].equals(
tiles_geojson_df[col_name] tiles_geojson_df[col_name]
), error_message ), error_message