mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 01:54:18 -08:00
* Better document based on Lucas's feedback (#1835)
* Fix typo (#1835)
* Add test to verify GEOJSON matches tiles (#1835)
* Remove NOOP line (#1835)
* Move GEOJSON generation up for new smoketest (#1835)
* Fixup code format (#1835)
* Update readme for new smoketest (#1835)
parent aca226165c, commit f70f30d610
5 changed files with 108 additions and 23 deletions
.github/workflows/deploy_be_staging.yml (vendored, 8 changes)

@@ -61,7 +61,10 @@ jobs:
           poetry run python3 data_pipeline/application.py score-full-run
       - name: Generate Score Post
         run: |
-          poetry run python3 data_pipeline/application.py generate-score-post -s aws
+          poetry run python3 data_pipeline/application.py generate-score-post
+      - name: Generate Score Geo
+        run: |
+          poetry run python3 data_pipeline/application.py geo-score
       - name: Run Smoketests
         run: |
           poetry run pytest data_pipeline/ -m smoketest
@@ -100,9 +103,6 @@ jobs:
           mkdir -p /usr/local/bin
           cp tippecanoe /usr/local/bin/tippecanoe
           tippecanoe -v
-      - name: Generate Score Geo
-        run: |
-          poetry run python3 data_pipeline/application.py geo-score
       - name: Generate Tiles
         run: |
           poetry run python3 data_pipeline/application.py generate-map-tiles
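The reordering above moves GEOJSON generation ahead of the smoketests, so the new GEOJSON/tiles comparison test has its input available when it runs. As a rough local equivalent of the reordered job steps (a sketch assuming only the CLI commands shown in the diff, run from the pipeline directory):

```python
# Sketch: replay the staging job's step order locally. The commands are
# taken from the workflow diff above; everything else is illustrative.
import subprocess

steps = [
    # generate-score-post no longer passes "-s aws" in staging
    ["poetry", "run", "python3", "data_pipeline/application.py", "generate-score-post"],
    # geo-score now runs BEFORE the smoketests, so the frontend GEOJSON
    # exists when the new GEOJSON fidelity smoketest executes
    ["poetry", "run", "python3", "data_pipeline/application.py", "geo-score"],
    ["poetry", "run", "pytest", "data_pipeline/", "-m", "smoketest"],
    ["poetry", "run", "python3", "data_pipeline/application.py", "generate-map-tiles"],
]

for step in steps:
    subprocess.run(step, check=True)  # fail fast, mirroring CI step semantics
```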
@@ -12,11 +12,14 @@
 - [2. Extract-Transform-Load (ETL) the data](#2-extract-transform-load-etl-the-data)
 - [3. Combined dataset](#3-combined-dataset)
 - [4. Tileset](#4-tileset)
+- [5. Shapefiles](#5-shapefiles)
 - [Score generation and comparison workflow](#score-generation-and-comparison-workflow)
 - [Workflow Diagram](#workflow-diagram)
 - [Step 0: Set up your environment](#step-0-set-up-your-environment)
 - [Step 1: Run the script to download census data or download from the Justice40 S3 URL](#step-1-run-the-script-to-download-census-data-or-download-from-the-justice40-s3-url)
 - [Step 2: Run the ETL script for each data source](#step-2-run-the-etl-script-for-each-data-source)
+- [Table of commands](#table-of-commands)
+- [ETL steps](#etl-steps)
 - [Step 3: Calculate the Justice40 score experiments](#step-3-calculate-the-justice40-score-experiments)
 - [Step 4: Compare the Justice40 score experiments to other indices](#step-4-compare-the-justice40-score-experiments-to-other-indices)
 - [Data Sources](#data-sources)
@@ -26,21 +29,27 @@
 - [MacOS](#macos)
 - [Windows Users](#windows-users)
 - [Setting up Poetry](#setting-up-poetry)
-- [Downloading Census Block Groups GeoJSON and Generating CBG CSVs](#downloading-census-block-groups-geojson-and-generating-cbg-csvs)
+- [Running tox](#running-tox)
+- [The Application entrypoint](#the-application-entrypoint)
+- [Downloading Census Block Groups GeoJSON and Generating CBG CSVs (not normally required)](#downloading-census-block-groups-geojson-and-generating-cbg-csvs-not-normally-required)
+- [Run all ETL, score and map generation processes](#run-all-etl-score-and-map-generation-processes)
+- [Run both ETL and score generation processes](#run-both-etl-and-score-generation-processes)
+- [Run all ETL processes](#run-all-etl-processes)
 - [Generating Map Tiles](#generating-map-tiles)
 - [Serve the map locally](#serve-the-map-locally)
 - [Running Jupyter notebooks](#running-jupyter-notebooks)
 - [Activating variable-enabled Markdown for Jupyter notebooks](#activating-variable-enabled-markdown-for-jupyter-notebooks)
-- [Miscellaneous](#miscellaneous)
 - [Testing](#testing)
 - [Background](#background)
-- [Configuration / Fixtures](#configuration--fixtures)
+- [Score and post-processing tests](#score-and-post-processing-tests)
 - [Updating Pickles](#updating-pickles)
-- [Future Enchancements](#future-enchancements)
-- [ETL Unit Tests](#etl-unit-tests)
+- [Future Enhancements](#future-enhancements)
+- [Fixtures used in ETL "snapshot tests"](#fixtures-used-in-etl-snapshot-tests)
+- [Other ETL Unit Tests](#other-etl-unit-tests)
 - [Extract Tests](#extract-tests)
 - [Transform Tests](#transform-tests)
 - [Load Tests](#load-tests)
+- [Smoketests](#smoketests)
 
 <!-- /TOC -->
 
@@ -496,3 +505,13 @@ See above [Fixtures](#configuration--fixtures) section for information about whe
 These make use of [tmp_path_factory](https://docs.pytest.org/en/latest/how-to/tmp_path.html) to create a file-system located under `temp_dir`, and validate whether the correct files are written to the correct locations.
 
 Additional future modifications could include the use of Pandera and/or other schema validation tools, and/or a more explicit test that the data written to file can be read back in and yield the same dataframe.
+
+### Smoketests
+
+To ensure the score and tiles process correctly, there is a suite of "smoke tests" that can be run after the ETL and score data have been run, and outputs like the frontend GEOJSON have been created.
+These tests are implemented as pytest tests, but are skipped by default. To run them:
+
+1. Generate a full score with `poetry run python3 data_pipeline/application.py score-full-run`
+2. Generate the tile data with `poetry run python3 data_pipeline/application.py generate-score-post`
+3. Generate the frontend GEOJSON with `poetry run python3 data_pipeline/application.py geo-score`
+4. Select the smoke tests for pytest with `poetry run pytest data_pipeline/tests -k smoketest`
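The tests are opt-in via the `pytest.mark.smoketest` marker that appears in the test-file diff below. The repo's pytest configuration is not part of this commit, so as a hedged sketch, one conventional way to register such a marker and deselect it by default looks like this (a hypothetical `conftest.py`, not this repository's actual file):

```python
# conftest.py -- hypothetical sketch, not taken from this repository.
import pytest

def pytest_configure(config):
    # Register the marker so pytest doesn't warn about an unknown mark.
    config.addinivalue_line(
        "markers", "smoketest: end-to-end checks that need score/GEOJSON outputs"
    )

def pytest_collection_modifyitems(config, items):
    # If the user passed an explicit -m expression (e.g. -m smoketest),
    # respect it; otherwise skip every smoketest-marked item.
    if config.getoption("markexpr"):
        return
    skip = pytest.mark.skip(reason="needs full score/GEOJSON outputs; run with -m smoketest")
    for item in items:
        if "smoketest" in item.keywords:
            item.add_marker(skip)
```

With a setup like this, `poetry run pytest data_pipeline/ -m smoketest` (as the staging workflow runs it) selects exactly the marked tests; the README's `-k smoketest` variant instead matches on test and file names that contain "smoketest".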
@@ -41,7 +41,6 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
 
-        self.DATA_SOURCE = data_source
         self.CENSUS_USA_GEOJSON = (
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )
@@ -31,7 +31,7 @@ from .fixtures import (
 
 
 pytestmark = pytest.mark.smoketest
-UNMATCHED_TRACK_THRESHOLD = 1000
+UNMATCHED_TRACT_THRESHOLD = 1000
 
 
 def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
@@ -254,6 +254,15 @@ def test_data_sources(
        key: value for key, value in locals().items() if key != "final_score_df"
    }
 
+    # For each data source that's injected via the fixtures, do the following:
+    # * Ensure at least one column from the source shows up in the score
+    # * Ensure any tracts NOT in the data source are NA/null in the score
+    # * Ensure the data source doesn't have a large number of tract IDs that are not
+    #   included in the final score, since that implies the source is using 2020
+    #   tract IDs
+    # * Verify that the data from the source that's in the final score output
+    #   is "equal" to the data from the ETL, allowing for the minor
+    #   differences that come from floating point comparisons
    for data_source_name, data_source in data_sources.items():
        final = "final_"
        df: pd.DataFrame = final_score_df.merge(
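The `MERGE` column the assertions below rely on comes from pandas' merge indicator. The exact merge call sits outside this hunk, so the following is a sketch of the pattern under assumed column names (`GEOID10_TRACT` and the toy values are illustrative):

```python
# Sketch of the merge-indicator pattern the assertions below depend on;
# GEOID10_TRACT and the toy values are illustrative, not pipeline data.
import pandas as pd

final_score_df = pd.DataFrame({"GEOID10_TRACT": ["A", "B"], "final_col": [1.0, 2.0]})
source_df = pd.DataFrame({"GEOID10_TRACT": ["B", "C"], "source_col": [20.0, 30.0]})

# indicator="MERGE" adds a categorical column whose values are
# "left_only", "right_only", or "both".
df = final_score_df.merge(source_df, on="GEOID10_TRACT", how="outer", indicator="MERGE")

print(df[df.MERGE == "left_only"])   # tract A: in the score, not the source
print(df[df.MERGE == "right_only"])  # tract C: in the source, not the score
print(df[df.MERGE == "both"])        # tract B: present in both
```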
@@ -275,12 +284,12 @@ def test_data_sources(
        ), f"No columns from data source show up in final score in source {data_source_name}"
 
        # Make sure we have NAs for any tracts in the final data that aren't
-        # covered in the final data
+        # included in the data source
        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
 
        # Make sure the datasource doesn't have a ton of unmatched tracts, implying it
        # has moved to 2020 tracts
-        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACK_THRESHOLD
+        assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACT_THRESHOLD
 
        df = df[df.MERGE == "both"]
 
@@ -293,6 +302,7 @@ def test_data_sources(
                f"Column {final_column} not equal "
                f"between {data_source_name} and final score"
            )
+            # For non-numeric types, we can use the built-in equals from pandas
            if df[final_column].dtype in [
                np.dtype(object),
                np.dtype(bool),
@@ -301,6 +311,8 @@ def test_data_sources(
                assert df[final_column].equals(
                    df[data_source_column]
                ), error_message
+            # For numeric sources, use np.allclose so we don't get harmed by
+            # float equality weirdness
            else:
                assert np.allclose(
                    df[final_column],
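The two comparison branches above differ deliberately. A small self-contained illustration of why `Series.equals` is used for object/bool columns while `np.allclose(..., equal_nan=True)` is used for floats (toy data, not from the pipeline):

```python
import numpy as np
import pandas as pd

a = pd.Series([0.1 + 0.2, np.nan])
b = pd.Series([0.3, np.nan])

# equals() demands exact values, so float round-off fails it...
print(a.equals(b))                        # False: 0.30000000000000004 != 0.3
# ...while allclose tolerates round-off and can treat NaNs as equal.
print(np.allclose(a, b, equal_nan=True))  # True

# For object columns, equals() is the right tool (allclose would raise).
s = pd.Series(["CA", None])
print(s.equals(pd.Series(["CA", None])))  # True
```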
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from typing import Optional
 import pandas as pd
+import geopandas as gpd
 import numpy as np
 import pytest
 from data_pipeline.config import settings
@@ -26,6 +27,13 @@ def tiles_df(scope="session"):
     )
 
 
+@pytest.fixture()
+def tiles_geojson_df():
+    return gpd.read_file(
+        settings.APP_ROOT / "data" / "score" / "geojson" / "usa-high.json"
+    )
+
+
 PERCENTILE_FIELDS = [
     "DF_PFS",
     "AF_PFS",
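`gpd.read_file` returns a GeoDataFrame: the tract shapes land in a `geometry` column and the tile attributes (including the `GEOID10` tract ID) arrive as ordinary columns. A standalone sketch of what the fixture loads; the relative path is an assumption about where `settings.APP_ROOT` points, and the file only exists after `geo-score` has run:

```python
# Sketch: inspect the frontend GEOJSON the new fixture reads.
# The usa-high.json filename comes from the diff above; the relative
# path below is an assumption, not taken from this commit.
import geopandas as gpd

gdf = gpd.read_file("data_pipeline/data/score/geojson/usa-high.json")
print(gdf.shape)                                          # one row per tract
print(gdf.geometry.geom_type.value_counts())              # tract polygons
print([c for c in gdf.columns if c != "geometry"][:10])   # attribute columns
```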
@@ -102,6 +110,19 @@ def test_tract_equality(tiles_df, final_score_df):
    assert tiles_df.shape[0] == final_score_df.shape[0]
 
 
+def is_col_fake_bool(col) -> bool:
+    if col.dtype == np.dtype("float64"):
+        fake_bool = {1.0, 0.0, None}
+        # Replace the nans in the column values with None
+        # so we can just use issubset below
+        col_values = set(
+            not np.isnan(val) and val or None
+            for val in col.value_counts(dropna=False).index
+        )
+        return len(col_values) <= 3 and col_values.issubset(fake_bool)
+    return False
+
+
 @dataclass
 class ColumnValueComparison:
    final_score_column: pd.Series
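A quick illustration of what `is_col_fake_bool` treats as a "fake bool" column (toy Series, not pipeline data). Note that in the generator expression, `0.0` is falsy, so both `0.0` and NaN collapse to `None`, which is why `None` appears in the target set:

```python
import numpy as np
import pandas as pd

# is_col_fake_bool exactly as added in the hunk above
def is_col_fake_bool(col) -> bool:
    if col.dtype == np.dtype("float64"):
        fake_bool = {1.0, 0.0, None}
        col_values = set(
            not np.isnan(val) and val or None
            for val in col.value_counts(dropna=False).index
        )
        return len(col_values) <= 3 and col_values.issubset(fake_bool)
    return False

print(is_col_fake_bool(pd.Series([1.0, 0.0, np.nan])))  # True: only 0/1/NaN
print(is_col_fake_bool(pd.Series([1.0, 2.0, np.nan])))  # False: 2.0 present
print(is_col_fake_bool(pd.Series(["a", "b"])))          # False: not float64
```

Hoisting this out of `ColumnValueComparison` lets the new GEOJSON fidelity test below reuse the same detection logic on both the tiles CSV and the GeoJSON columns.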
@@ -110,16 +131,7 @@ class ColumnValueComparison:
 
    @property
    def _is_tiles_column_fake_bool(self) -> bool:
-        if self.tiles_column.dtype == np.dtype("float64"):
-            fake_bool = {1.0, 0.0, None}
-            # Replace the nans in the column values with None
-            # so we can just use issubset below
-            col_values = set(
-                not np.isnan(val) and val or None
-                for val in self.tiles_column.value_counts(dropna=False).index
-            )
-            return len(col_values) <= 3 and col_values.issubset(fake_bool)
-        return False
+        return is_col_fake_bool(self.tiles_column)
 
    @property
    def _is_dtype_ok(self) -> bool:
@@ -215,6 +227,49 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
    assert not errors, error_message
 
 
+def test_for_geojson_fidelity_from_tiles_csv(tiles_df, tiles_geojson_df):
+    tiles_geojson_df = tiles_geojson_df.drop(columns=["geometry"]).rename(
+        columns={"GEOID10": "GTF"}
+    )
+    assert tiles_df.shape == tiles_geojson_df.shape
+    assert tiles_df["GTF"].equals(tiles_geojson_df["GTF"])
+    assert sorted(tiles_df.columns) == sorted(tiles_geojson_df.columns)
+
+    # Are all the dtypes and values the same?
+    for col_name in tiles_geojson_df.columns:
+        if is_col_fake_bool(tiles_df[col_name]):
+            tiles_df[col_name] = (
+                tiles_df[col_name]
+                .astype("float64")
+                .replace({0.0: False, 1.0: True})
+            )
+        if is_col_fake_bool(tiles_geojson_df[col_name]):
+            tiles_geojson_df[col_name] = (
+                tiles_geojson_df[col_name]
+                .astype("float64")
+                .replace({0.0: False, 1.0: True})
+            )
+        tiles_geojson_df[col_name] = tiles_geojson_df[col_name].replace({None: np.nan})
+        error_message = f"Column {col_name} not equal "
+        # For non-numeric types, we can use the built-in equals from pandas
+        if tiles_df[col_name].dtype in [
+            np.dtype(object),
+            np.dtype(bool),
+            np.dtype(str),
+        ]:
+            assert tiles_df[col_name].equals(
+                tiles_geojson_df[col_name]
+            ), error_message
+        # For numeric sources, use np.allclose so we don't get harmed by
+        # float equality weirdness
+        else:
+            assert np.allclose(
+                tiles_df[col_name],
+                tiles_geojson_df[col_name],
+                equal_nan=True,
+            ), error_message
+
+
 def test_for_state_names(tiles_df):
    states = tiles_df["SF"].value_counts(dropna=False).index
    assert np.nan not in states
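To see the alignment the new test performs, here is a toy version of the CSV-vs-GeoJSON comparison. Only the `GTF`/`GEOID10` column names and the normalization steps come from the diff above; the data and geometry are synthetic (assumes shapely, a geopandas dependency):

```python
# Toy demonstration of the tiles-CSV vs GeoJSON alignment; all values
# are synthetic, not pipeline outputs.
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

tiles_df = pd.DataFrame({"GTF": ["06001400100"], "DF_PFS": [0.5]})
tiles_geojson_df = gpd.GeoDataFrame(
    {"GEOID10": ["06001400100"], "DF_PFS": [0.5]},
    geometry=[Point(0, 0)],
)

# The same normalization the test applies: drop geometry, align the ID name.
flat = tiles_geojson_df.drop(columns=["geometry"]).rename(columns={"GEOID10": "GTF"})
assert tiles_df.shape == flat.shape
assert sorted(tiles_df.columns) == sorted(flat.columns)
assert tiles_df["GTF"].equals(flat["GTF"])
```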