Fix and enable smoke tests

2025-08-24 15:41:39 -07:00 · 2025-01-17 12:59:24 -05:00 · 2025-01-17 12:59:24 -05:00 · 6093ce0f53
commit 6093ce0f53
parent 0f184a63f2
9 changed files with 64 additions and 93 deletions
--- a/.github/workflows/pr_backend.yml
+++ b/.github/workflows/pr_backend.yml
@ -130,8 +130,7 @@ jobs:
      - name: Generate Score Geo
        run: |
          poetry run python3 -m data_pipeline.application geo-score
-      - name: Run smoketest for 1.0
-        if: ${{ env.J40_VERSION_LABEL_STRING == '1.0' }}
+      - name: Run smoketests
        run: |
          poetry run pytest data_pipeline/ -m smoketest
      - name: Set timezone for tippecanoe
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -34,12 +34,12 @@ DATA_SCORE_CSV_DIR = DATA_SCORE_DIR / "csv"
 DATA_SCORE_CSV_FULL_DIR = DATA_SCORE_CSV_DIR / "full"
 DATA_SCORE_CSV_FULL_FILE_PATH = DATA_SCORE_CSV_FULL_DIR / "usa_score.parquet"
 FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH = (
-    DATA_SCORE_CSV_FULL_DIR / "usa_counties.csv"
+    DATA_SCORE_CSV_FULL_DIR / "usa_counties.parquet"
 )

 # Score Tile CSV source path
 DATA_SCORE_CSV_TILES_PATH = DATA_SCORE_CSV_DIR / "tiles"
-DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.csv"
+DATA_SCORE_CSV_TILES_FILE_PATH = DATA_SCORE_CSV_TILES_PATH / "usa.parquet"
 DATA_SCORE_JSON_INDEX_FILE_PATH = (
    DATA_SCORE_CSV_TILES_PATH / "tile_indexes.json"
 )
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@ -35,7 +35,6 @@ class GeoScoreETL(ExtractTransformLoad):
        self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"

        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
-        self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"

        self.CENSUS_USA_GEOJSON = constants.DATA_CENSUS_GEOJSON_FILE_PATH

@ -100,13 +99,9 @@ class GeoScoreETL(ExtractTransformLoad):
            full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
        ]

-        logger.info("Reading tile score CSV")
-        self.score_usa_df = pd.read_csv(
-            self.TILE_SCORE_CSV,
-            dtype={
-                self.TRACT_SHORT_FIELD: str,
-            },
-            low_memory=False,
+        logger.info("Reading tile score")
+        self.score_usa_df = pd.read_parquet(
+            constants.DATA_SCORE_CSV_TILES_FILE_PATH,
        )

    def transform(self) -> None:
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@ -442,15 +442,14 @@ class PostScoreETL(ExtractTransformLoad):
            self.input_census_geo_df
        )

-    def _load_score_csv_full(
-        self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
+    def _load_score_full(
+        self, score_county_state_merged: pd.DataFrame, score_path: Path
    ) -> None:
        logger.debug("Saving Full Score CSV with County Information")
-        score_csv_path.parent.mkdir(parents=True, exist_ok=True)
-        score_county_state_merged.to_csv(
-            score_csv_path,
+        score_path.parent.mkdir(parents=True, exist_ok=True)
+        score_county_state_merged.to_parquet(
+            score_path,
            index=False,
-            encoding="utf-8-sig",  # windows compat https://stackoverflow.com/a/43684587
        )

    def _load_excel_from_df(
@ -514,12 +513,12 @@ class PostScoreETL(ExtractTransformLoad):

        return excel_csv_config

-    def _load_tile_csv(
+    def _load_tile_score(
        self, score_tiles_df: pd.DataFrame, tile_score_path: Path
    ) -> None:
-        logger.debug("Saving Tile Score CSV")
+        logger.debug("Saving Tile Score")
        tile_score_path.parent.mkdir(parents=True, exist_ok=True)
-        score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")
+        score_tiles_df.to_parquet(tile_score_path, index=False)

    def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
        downloadable_info_path.mkdir(parents=True, exist_ok=True)
@ -631,11 +630,11 @@ class PostScoreETL(ExtractTransformLoad):
        self.output_tract_search_df.to_json(output_path, orient="records")

    def load(self) -> None:
-        self._load_score_csv_full(
+        self._load_score_full(
            self.output_score_county_state_merged_df,
            constants.FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH,
        )
-        self._load_tile_csv(
+        self._load_tile_score(
            self.output_score_tiles_df, constants.DATA_SCORE_CSV_TILES_FILE_PATH
        )
        self._load_search_tract_data(constants.SCORE_TRACT_SEARCH_FILE_PATH)
--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@ -43,17 +43,17 @@ def check_score_data_source(
        settings.AWS_JUSTICE40_DATAPIPELINE_URL
        + "/data/score/csv/tiles/usa.csv"
    )
-    TILE_SCORE_CSV = score_csv_data_path / "tiles" / "usa.csv"
+    TILE_SCORE_FILE = constants.DATA_SCORE_CSV_TILES_FILE_PATH

    # download from s3 if census_data_source is aws
    if score_data_source == "aws":
        logger.debug("Fetching Score Tile data from AWS S3")
        Downloader.download_file_from_url(
-            file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
+            file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_FILE
        )
    else:
        # check if score data is found locally
-        if not os.path.isfile(TILE_SCORE_CSV):
+        if not os.path.isfile(TILE_SCORE_FILE):
            logger.warning(
                "No local score tiles data found. Please use '-s aws` to fetch from AWS"
            )
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py
@ -110,9 +110,9 @@ def test_create_downloadable_data(
    )


-def test_load_score_csv_full(etl, score_data_expected):
+def test_load_score_full(etl, score_data_expected):
    reload(constants)
-    etl._load_score_csv_full(
+    etl._load_score_full(
        score_data_expected,
        constants.FULL_SCORE_CSV_FULL_PLUS_COUNTIES_FILE_PATH,
    )
@ -121,7 +121,7 @@ def test_load_score_csv_full(etl, score_data_expected):

 def test_load_tile_csv(etl, tile_data_expected):
    reload(constants)
-    etl._load_score_csv_full(
+    etl._load_score_full(
        tile_data_expected, constants.DATA_SCORE_CSV_TILES_FILE_PATH
    )
    assert constants.DATA_SCORE_CSV_TILES_FILE_PATH.is_file()
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@ -970,9 +970,8 @@ class CensusACSETL(ExtractTransformLoad):
                # Then the imputed field should have no nulls
                self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
            ]
-            .isna()
-            .sum()
-            == 0
+            .notna()
+            .all()
        ), "Error: not all values were filled..."

        logger.debug("Renaming columns...")
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@ -207,28 +207,34 @@ def test_max_40_percent_DAC(final_score_df):


 def test_donut_hole_addition_to_score_n(final_score_df):
-    score_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
-    score_col = field_names.SCORE_N_COMMUNITIES
-    donut_hole_score_only = (
+    dacs_col_with_donuts = field_names.FINAL_SCORE_N_BOOLEAN
+    dacs_col = field_names.SCORE_N_COMMUNITIES
+    donut_hole_community_col = (
        field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
    )
-    count_donuts = final_score_df[donut_hole_score_only].sum()
-    count_n = final_score_df[score_col].sum()
-    count_n_with_donuts = final_score_df[score_col_with_donuts].sum()
-    new_donuts = final_score_df[
-        final_score_df[donut_hole_score_only] & ~final_score_df[score_col]
+    # Number of donuts found regardless of other scoring.
+    num_donuts = final_score_df[donut_hole_community_col].sum()
+
+    # Number of DACS not including adjacency.
+    num_dacs = final_score_df[dacs_col].sum()
+
+    # Number of DACS including adjacency.
+    num_dacs_with_donuts = final_score_df[dacs_col_with_donuts].sum()
+
+    # Number of DACS that are donuts.
+    num_dacs_due_to_donuts = final_score_df[
+        final_score_df[donut_hole_community_col] & ~final_score_df[dacs_col]
    ].shape[0]

-    assert (
-        new_donuts + count_n == count_n_with_donuts
-    ), "The math doesn't work! The number of new donut hole tracts plus score tracts (base) does not equal the total number of tracts identified"
+    assert num_dacs_due_to_donuts <= num_dacs_with_donuts
+    assert num_dacs_with_donuts >= num_dacs

    assert (
-        count_donuts < count_n
+        num_donuts < num_dacs
    ), "There are more donut hole tracts than base tracts. How can it be?"

    assert (
-        new_donuts > 0
+        num_dacs_due_to_donuts > 0
    ), "FYI: The adjacency index is doing nothing. Consider removing it?"


@ -429,30 +435,6 @@ def test_all_tracts_have_scores(final_score_df):


 def test_imputed_tracts(final_score_df):
-    # Make sure that any tracts with zero population have null imputed income
-    tracts_with_zero_population_df = final_score_df[
-        final_score_df[field_names.TOTAL_POP_FIELD] == 0
-    ]
-    assert (
-        tracts_with_zero_population_df[
-            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
-        ]
-        .isna()
-        .all()
-    )
-
-    # Make sure that any tracts with null population have null imputed income
-    tracts_with_null_population_df = final_score_df[
-        final_score_df[field_names.TOTAL_POP_FIELD].isnull()
-    ]
-    assert (
-        tracts_with_null_population_df[
-            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
-        ]
-        .isna()
-        .all()
-    )
-
    # Make sure that no tracts with population have null imputed income
    # We DO NOT impute income for island areas, so remove those from the test
    is_island_area = (
--- a/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py
@ -8,11 +8,6 @@ import pandas as pd
 import pytest
 from data_pipeline.config import settings
 from data_pipeline.etl.score import constants
-from data_pipeline.etl.score.constants import THRESHOLD_COUNT_TO_SHOW_FIELD_NAME
-from data_pipeline.etl.score.constants import TILES_SCORE_COLUMNS
-from data_pipeline.etl.score.constants import (
-    USER_INTERFACE_EXPERIENCE_FIELD_NAME,
-)
 from data_pipeline.score import field_names

 from .fixtures import final_score_df  # pylint: disable=unused-import
@ -22,10 +17,8 @@ pytestmark = pytest.mark.smoketest

@pytest.fixture
 def tiles_df(scope="session"):
-    return pd.read_csv(
-        settings.APP_ROOT / "data" / "score" / "csv" / "tiles" / "usa.csv",
-        dtype={"GTF": str},
-        low_memory=False,
+    return pd.read_parquet(
+        constants.DATA_SCORE_CSV_TILES_FILE_PATH,
    )


@ -73,7 +66,6 @@ def test_percentiles(tiles_df):
        assert (tiles_df[col].median() >= 0.4) & (
            tiles_df[col].median() <= 0.6
        ), f"Percentile distribution for {col} is decidedly not uniform"
-    return True


 def test_count_of_fips_codes(tiles_df, final_score_df):
@ -91,19 +83,19 @@ def test_count_of_fips_codes(tiles_df, final_score_df):


 def test_column_presence(tiles_df):
-    expected_column_names = set(TILES_SCORE_COLUMNS.values()) | {
-        THRESHOLD_COUNT_TO_SHOW_FIELD_NAME,
-        USER_INTERFACE_EXPERIENCE_FIELD_NAME,
+    expected_column_names = set(constants.TILES_SCORE_COLUMNS.values()) | {
+        constants.THRESHOLD_COUNT_TO_SHOW_FIELD_NAME,
+        constants.USER_INTERFACE_EXPERIENCE_FIELD_NAME,
    }
    actual_column_names = set(tiles_df.columns)
    extra_columns = actual_column_names - expected_column_names
    missing_columns = expected_column_names - expected_column_names
    assert not (
        extra_columns
-    ), f"tiles/usa.csv has columns not specified in TILE_SCORE_COLUMNS: {extra_columns}"
+    ), f"tiles score has columns not specified in TILE_SCORE_COLUMNS: {extra_columns}"
    assert not (
        missing_columns
-    ), f"tiles/usa.csv is missing columns from TILE_SCORE_COLUMNS: {missing_columns}"
+    ), f"tiles score is missing columns from TILE_SCORE_COLUMNS: {missing_columns}"


 def test_tract_equality(tiles_df, final_score_df):
@ -189,12 +181,17 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
    #   every tile column
    #   * Because tiles use rounded floats, we use close with a tolerance
    assert (
-        set(TILES_SCORE_COLUMNS.values()) - set(tiles_df.columns) == set()
+        set(constants.TILES_SCORE_COLUMNS.values()) - set(tiles_df.columns)
+        == set()
    ), "Some TILES_SCORE_COLUMNS are missing from the tiles dataframe"

    # Keep only the tiles score columns in the final score data
-    final_score_df = final_score_df.rename(columns=TILES_SCORE_COLUMNS).drop(
-        final_score_df.columns.difference(TILES_SCORE_COLUMNS.values()),
+    final_score_df = final_score_df.rename(
+        columns=constants.TILES_SCORE_COLUMNS
+    ).drop(
+        final_score_df.columns.difference(
+            constants.TILES_SCORE_COLUMNS.values()
+        ),
        axis=1,
        errors="ignore",
    )
@ -227,7 +224,7 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
    assert not errors, error_message


-def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
+def test_for_geojson_fidelity_from_tiles_score(tiles_df, tiles_geojson_df):
    tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename(
        columns={"GEOID10": "GTF"}
    )
@ -252,11 +249,11 @@ def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
        tiles_geojson_df[col_name] = tiles_df[col_name].replace({None: np.nan})
        error_message = f"Column {col_name} not equal "
        # For non-numeric types, we can use the built-in equals from pandas
-        if tiles_df[col_name].dtype in [
-            np.dtype(object),
-            np.dtype(bool),
-            np.dtype(str),
-        ]:
+        if (
+            pd.api.types.is_bool_dtype(tiles_df[col_name])
+            or pd.api.types.is_object_dtype(tiles_df[col_name])
+            or pd.api.types.is_string_dtype(tiles_df[col_name])
+        ):
            assert tiles_df[col_name].equals(
                tiles_geojson_df[col_name]
            ), error_message