diff --git a/.github/workflows/deploy_be_staging.yml b/.github/workflows/deploy_be_staging.yml index 8a10cf38..d33fe9e3 100644 --- a/.github/workflows/deploy_be_staging.yml +++ b/.github/workflows/deploy_be_staging.yml @@ -61,7 +61,10 @@ jobs: poetry run python3 data_pipeline/application.py score-full-run - name: Generate Score Post run: | - poetry run python3 data_pipeline/application.py generate-score-post -s aws + poetry run python3 data_pipeline/application.py generate-score-post + - name: Generate Score Geo + run: | + poetry run python3 data_pipeline/application.py geo-score - name: Run Smoketests run: | poetry run pytest data_pipeline/ -m smoketest @@ -100,9 +103,6 @@ jobs: mkdir -p /usr/local/bin cp tippecanoe /usr/local/bin/tippecanoe tippecanoe -v - - name: Generate Score Geo - run: | - poetry run python3 data_pipeline/application.py geo-score - name: Generate Tiles run: | poetry run python3 data_pipeline/application.py generate-map-tiles diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index d61b1713..d553f3b9 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -12,11 +12,14 @@ - [2. Extract-Transform-Load (ETL) the data](#2-extract-transform-load-etl-the-data) - [3. Combined dataset](#3-combined-dataset) - [4. Tileset](#4-tileset) + - [5. Shapefiles](#5-shapefiles) - [Score generation and comparison workflow](#score-generation-and-comparison-workflow) - [Workflow Diagram](#workflow-diagram) - [Step 0: Set up your environment](#step-0-set-up-your-environment) - [Step 1: Run the script to download census data or download from the Justice40 S3 URL](#step-1-run-the-script-to-download-census-data-or-download-from-the-justice40-s3-url) - [Step 2: Run the ETL script for each data source](#step-2-run-the-etl-script-for-each-data-source) + - [Table of commands](#table-of-commands) + - [ETL steps](#etl-steps) - [Step 3: Calculate the Justice40 score experiments](#step-3-calculate-the-justice40-score-experiments) - [Step 4: Compare the Justice40 score experiments to other indices](#step-4-compare-the-justice40-score-experiments-to-other-indices) - [Data Sources](#data-sources) @@ -26,21 +29,27 @@ - [MacOS](#macos) - [Windows Users](#windows-users) - [Setting up Poetry](#setting-up-poetry) - - [Downloading Census Block Groups GeoJSON and Generating CBG CSVs](#downloading-census-block-groups-geojson-and-generating-cbg-csvs) + - [Running tox](#running-tox) + - [The Application entrypoint](#the-application-entrypoint) + - [Downloading Census Block Groups GeoJSON and Generating CBG CSVs (not normally required)](#downloading-census-block-groups-geojson-and-generating-cbg-csvs-not-normally-required) + - [Run all ETL, score and map generation processes](#run-all-etl-score-and-map-generation-processes) + - [Run both ETL and score generation processes](#run-both-etl-and-score-generation-processes) + - [Run all ETL processes](#run-all-etl-processes) - [Generating Map Tiles](#generating-map-tiles) - [Serve the map locally](#serve-the-map-locally) - [Running Jupyter notebooks](#running-jupyter-notebooks) - [Activating variable-enabled Markdown for Jupyter notebooks](#activating-variable-enabled-markdown-for-jupyter-notebooks) - - [Miscellaneous](#miscellaneous) - [Testing](#testing) - [Background](#background) - - [Configuration / Fixtures](#configuration--fixtures) + - [Score and post-processing tests](#score-and-post-processing-tests) - [Updating Pickles](#updating-pickles) - - [Future Enchancements](#future-enchancements) - - [ETL Unit 
Tests](#etl-unit-tests)
     - [Extract Tests](#extract-tests)
     - [Transform Tests](#transform-tests)
     - [Load Tests](#load-tests)
+  - [Smoketests](#smoketests)
@@ -496,3 +505,13 @@ See above [Fixtures](#configuration--fixtures) section for information about whe
 These make use of [tmp_path_factory](https://docs.pytest.org/en/latest/how-to/tmp_path.html) to create a file-system located under `temp_dir`, and validate whether the correct files are written to the correct locations.
 
 Additional future modifications could include the use of Pandera and/or other schema validation tools, and or a more explicit test that the data written to file can be read back in and yield the same dataframe.
+
+### Smoketests
+
+To ensure the score and tiles are generated correctly, there is a suite of "smoke tests" that can be run after the ETL and score data have been generated and outputs like the frontend GEOJSON have been created.
+These tests are implemented as pytest tests, but are skipped by default. To run them:
+
+1. Generate a full score with `poetry run python3 data_pipeline/application.py score-full-run`
+2. Generate the tile data with `poetry run python3 data_pipeline/application.py generate-score-post`
+3. Generate the frontend GEOJSON with `poetry run python3 data_pipeline/application.py geo-score`
+4. Select the smoke tests for pytest with `poetry run pytest data_pipeline/tests -m smoketest`
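+
+The smoke tests are selected via a pytest marker rather than by test name. As a minimal sketch of how a module opts in (this mirrors the module-level `pytestmark` already used in `data_pipeline/tests/score/test_output.py`, and assumes the `smoketest` marker is registered in the pytest configuration):
+
+```python
+import pytest
+
+# Marks every test in this module as a smoke test, so that
+# `pytest -m smoketest` selects all of them at once.
+pytestmark = pytest.mark.smoketest
+```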
\ No newline at end of file
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index 31eacbe1..48320678 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -41,7 +41,6 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
 
-        self.DATA_SOURCE = data_source
         self.CENSUS_USA_GEOJSON = (
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 0945fb9e..f88011bc 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -31,7 +31,7 @@ from .fixtures import (
 
 pytestmark = pytest.mark.smoketest
 
-UNMATCHED_TRACK_THRESHOLD = 1000
+UNMATCHED_TRACT_THRESHOLD = 1000
 
 
 def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
@@ -254,6 +254,15 @@ def test_data_sources(
         key: value for key, value in locals().items() if key != "final_score_df"
     }
 
+    # For each data source that's injected via the fixtures, do the following:
+    # * Ensure at least one column from the source shows up in the score
+    # * Ensure any tracts NOT in the data source are NA/null in the score
+    # * Ensure the data source doesn't have a large number of tract IDs that are not
+    #   included in the final score, since that implies the source is using 2020
+    #   tract IDs
+    # * Verify that the data from the source that's in the final score output
+    #   is "equal" to the data from the ETL, allowing for the minor
+    #   differences that come from floating point comparisons
     for data_source_name, data_source in data_sources.items():
         final = "final_"
         df: pd.DataFrame = final_score_df.merge(
@@ -275,12 +284,12 @@ def test_data_sources(
         ), f"No columns from data source show up in final score in source {data_source_name}"
 
         # Make sure we have NAs for any tracts in the final data that aren't
-        # covered in the final data
+        # included in the data source
         assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
 
         # Make sure the datasource doesn't have a ton of unmatched tracts, implying it
         # has moved to 2020 tracts
-        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACK_THRESHOLD
+        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACT_THRESHOLD
 
         df = df[df.MERGE == "both"]
 
@@ -293,6 +302,7 @@ def test_data_sources(
                 f"Column {final_column} not equal "
                 f"between {data_source_name} and final score"
             )
+            # For non-numeric types, we can use the built-in equals from pandas
            if df[final_column].dtype in [
                 np.dtype(object),
                 np.dtype(bool),
@@ -301,6 +311,8 @@ def test_data_sources(
                 assert df[final_column].equals(
                     df[data_source_column]
                 ), error_message
+            # For numeric sources, use np.allclose so we don't get harmed by
+            # float equality weirdness
             else:
                 assert np.allclose(
                     df[final_column],
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py b/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py
index 4bc84c4f..3f662f71 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 from typing import Optional
 
 import pandas as pd
+import geopandas as gpd
 import numpy as np
 import pytest
 from data_pipeline.config import settings
@@ -26,6 +27,13 @@ def tiles_df(scope="session"):
     )
 
 
+@pytest.fixture()
+def tiles_geojson_df():
+    return gpd.read_file(
+        settings.APP_ROOT / "data" / "score" / "geojson" / "usa-high.json"
+    )
+
+
 PERCENTILE_FIELDS = [
     "DF_PFS",
     "AF_PFS",
@@ -102,6 +110,19 @@ def test_tract_equality(tiles_df, final_score_df):
     assert tiles_df.shape[0] == final_score_df.shape[0]
 
 
+def is_col_fake_bool(col) -> bool:
+    if col.dtype == np.dtype("float64"):
+        fake_bool = {1.0, 0.0, None}
+        # Replace the nans in the column values with None
+        # so we can just use issubset below
+        col_values = set(
+            not np.isnan(val) and val or None
+            for val in col.value_counts(dropna=False).index
+        )
+        return len(col_values) <= 3 and col_values.issubset(fake_bool)
+    return False
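+
+
+# A quick illustration of the helper above (hypothetical values, not pipeline data):
+#   is_col_fake_bool(pd.Series([1.0, 0.0, np.nan]))  # -> True: only 1/0/NaN values
+#   is_col_fake_bool(pd.Series([1.0, 2.0, np.nan]))  # -> False: 2.0 is not bool-like
+#   is_col_fake_bool(pd.Series(["a", "b"]))          # -> False: not a float64 column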
+
+
 @dataclass
 class ColumnValueComparison:
     final_score_column: pd.Series
@@ -110,16 +131,7 @@ class ColumnValueComparison:
 
     @property
     def _is_tiles_column_fake_bool(self) -> bool:
-        if self.tiles_column.dtype == np.dtype("float64"):
-            fake_bool = {1.0, 0.0, None}
-            # Replace the nans in the column values with None for
-            # so we can just use issubset below
-            col_values = set(
-                not np.isnan(val) and val or None
-                for val in self.tiles_column.value_counts(dropna=False).index
-            )
-            return len(col_values) <= 3 and col_values.issubset(fake_bool)
-        return False
+        return is_col_fake_bool(self.tiles_column)
 
     @property
     def _is_dtype_ok(self) -> bool:
@@ -215,6 +227,49 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
     assert not errors, error_message
 
 
+def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
+    tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename(
+        columns={"GEOID10": "GTF"}
+    )
+    assert tiles_df.shape == tiles_geojson_df.shape
+    assert tiles_df["GTF"].equals(tiles_geojson_df["GTF"])
+    assert sorted(tiles_df.columns) == sorted(tiles_geojson_df.columns)
+
+    # Are all the dtypes and values the same?
+    for col_name in tiles_geojson_df.columns:
+        if is_col_fake_bool(tiles_df[col_name]):
+            tiles_df[col_name] = (
+                tiles_df[col_name]
+                .astype("float64")
+                .replace({0.0: False, 1.0: True})
+            )
+        if is_col_fake_bool(tiles_geojson_df[col_name]):
+            tiles_geojson_df[col_name] = (
+                tiles_geojson_df[col_name]
+                .astype("float64")
+                .replace({0.0: False, 1.0: True})
+            )
+        tiles_geojson_df[col_name] = tiles_geojson_df[col_name].replace(
+            {None: np.nan}
+        )
+        error_message = f"Column {col_name} not equal "
+        # For non-numeric types, we can use the built-in equals from pandas
+        if tiles_df[col_name].dtype in [
+            np.dtype(object),
+            np.dtype(bool),
+            np.dtype(str),
+        ]:
+            assert tiles_df[col_name].equals(
+                tiles_geojson_df[col_name]
+            ), error_message
+        # For numeric sources, use np.allclose so we don't get harmed by
+        # float equality weirdness
+        else:
+            assert np.allclose(
+                tiles_df[col_name],
+                tiles_geojson_df[col_name],
+                equal_nan=True,
+            ), error_message
+
+
 def test_for_state_names(tiles_df):
     states = tiles_df["SF"].value_counts(dropna=False).index
     assert np.nan not in states
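
A note on the comparison pattern used in both test files above: `pandas.Series.equals` is used for object/bool columns (it treats aligned NaNs as equal), while float columns go through `np.allclose(..., equal_nan=True)` so exact float equality doesn't cause spurious failures. A minimal standalone sketch of why (hypothetical values, not pipeline data):

```python
import numpy as np
import pandas as pd

a = pd.Series([0.1 + 0.2, np.nan])
b = pd.Series([0.3, np.nan])

# Exact comparison fails: 0.1 + 0.2 == 0.30000000000000004 in IEEE-754 doubles
assert not a.equals(b)

# Tolerance-based comparison, with NaN == NaN allowed, passes
assert np.allclose(a, b, equal_nan=True)
```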