Data pipeline and client fixes

ericiwamoto 2025-01-06 10:41:37 -08:00 committed by Carlos Felix
parent 7722b830e2
commit 9e33932600
8 changed files with 45 additions and 30 deletions

View file

@@ -21,8 +21,24 @@ $ PIPELINE_CMD="data_pipeline.application full-run" docker compose up
```
The above command will build and spin up three containers: A data pipeline container, a data server, and a web server.
-The data pipeline container can run the entire data pipeline, or any individual step. Because running the entire pipeline is a time-consuming process, the application command has been turned into a variable so individual parts of the pipeline can be run by docker compose. Once the full-run has been completed, you can change the PIPELINE_CMD environment variable to any other valid parameter for future runs. For example setting `PIPELINE_CMD="full-run --help"` would show the options for the full-run command. This would be helpful if you didn't want to run the data pipeline but merely wanted to see front end changes.
+The data pipeline container can run the entire data pipeline, or any individual step. Because running the entire pipeline is a time-consuming process, the application command has been turned into a variable so individual parts of the pipeline can be run by docker compose. Once the full-run has been completed, you can change the PIPELINE_CMD environment variable to any other valid parameter for future runs. For example, setting `PIPELINE_CMD="data_pipeline.application full-run --help"` would show the options for the full-run command. This would be helpful if you didn't want to run the data pipeline but merely wanted to see front-end changes.
-The data server will make the files created by the data pipeline container available to the web server. The data pipeline container mounts the local repo directories to read and write files. The data server presents the local files to the webserver to render the map and downloadables.
+The data server will make the files created by the data pipeline container available to the web server. The data pipeline container mounts the local repo directories to read and write files. The data server presents the local files to the web server to render the map and downloadable files.
The web server will run the application website. After it initializes, you should be able to open the web server in your browser at [`http://localhost:8000`](http://localhost:8000). If the data pipeline container is set to run the full data pipeline, the website will not pick up the changes until the pipeline completes.
In order for Docker to pick up code changes, the images will need to be rebuilt. If there are code changes in the data folder, the data pipeline image should be rebuilt. If there are code changes in the client folder, the web server image should be rebuilt. The data server image should never have to be rebuilt.
Command to rebuild the data pipeline image:
```sh
$ docker build ./data/data-pipeline -t 'j40_data_pipeline'
```
Command to rebuild the web server image:
```sh
$ docker build ./client -t 'j40_website'
```
Once one or both images are rebuilt, you can re-run the docker compose command.
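For instance, once the images are rebuilt you can bring the stack back up with any valid application command in `PIPELINE_CMD`; reusing the `--help` example from above:
```sh
$ PIPELINE_CMD="data_pipeline.application full-run --help" docker compose up
```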

View file

@@ -28,7 +28,6 @@ GATSBY_FILE_DL_PATH_1_0_SHAPE_FILE_ZIP=downloadable/1.0-shapefile-codebook.zip
GATSBY_FILE_DL_PATH_1_0_DATA_DOC=downloadable/1.0-data-documentation.zip
GATSBY_FILE_DL_PATH_BETA_TRAINING_SLIDES_PPT=downloadable/technical-training-slides.pptx
GATSBY_FILE_DL_PATH_2_0_COMMUNITIES_LIST_XLS=downloadable/2.0-communities.xlsx
GATSBY_FILE_DL_PATH_2_0_COMMUNITIES_LIST_CSV=downloadable/2.0-communities.csv
GATSBY_FILE_DL_PATH_2_0_COMMUNITIES_LIST_PDF=downloadable/2.0-communities-list.pdf
@@ -44,9 +43,4 @@ GATSBY_FILE_DL_PATH_2_0_M_23_09_SIGNED_PDF=downloadable/M-23-09_Signed_CEQ_CPO_e
GATSBY_FILE_DL_PATH_TSD_ES_PDF=downloadable/cejst-technical-support-document.pdf
GATSBY_FILE_DL_PATH_HOW_TO_COMMUNITIES_PDF=downloadable/draft-communities-list.pdf
+GATSBY_MAP_TILES_PATH=tiles
-# If you want the map to render a MapBox base map (as opposed to the
-# open source one from CartoDB), please create your own API TOKEN from
-# your MapBox account and add the token here:
-MAPBOX_STYLES_READ_TOKEN=''
-GATSBY_MAP_TILES_PATH=tiles

View file

@@ -4,6 +4,7 @@ import React from 'react';
import {defineMessages} from 'react-intl';
import * as COMMON_COPY from './common';
import {VERSION_NUMBER, VERSIONS} from './methodology';
+import {TILE_BASE_URL} from '../constants';
export const PAGE_INTRO = defineMessages({
PAGE_TILE: {
@@ -29,19 +30,26 @@ export const PAGE_INTRO = defineMessages({
});
export const getDownloadFileUrl = (filePath: string | undefined, version: VERSIONS = VERSIONS.V2_0) => {
-  const scorePath = version === VERSIONS.BETA ?
-    process.env.GATSBY_BETA_SCORE_PATH :
-    version === VERSIONS.V1_0 ?
-      process.env.GATSBY_1_0_SCORE_PATH :
-      process.env.GATSBY_2_0_SCORE_PATH;
+  let scorePath;
+  if (process.env.DATA_SOURCE === 'local') {
+    scorePath = process.env.GATSBY_DATA_PIPELINE_SCORE_PATH_LOCAL;
+  } else {
+    scorePath = version === VERSIONS.BETA ?
+      process.env.GATSBY_BETA_SCORE_PATH :
+      version === VERSIONS.V1_0 ?
+        process.env.GATSBY_1_0_SCORE_PATH :
+        process.env.GATSBY_2_0_SCORE_PATH;
+  }
return [
-    process.env.GATSBY_CDN_TILES_BASE_URL,
+    TILE_BASE_URL,
scorePath,
filePath,
].join('/');
};
-// Define meta data on dowload files
+// Define meta data on download files
export const DOWNLOAD_FILES = {
NARWAL: {
COMMUNITIES_LIST_XLS: {
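The new `DATA_SOURCE === 'local'` branch above lets the client read score files produced by a local pipeline run instead of the CDN. A minimal sketch of exercising it, assuming the Gatsby dev server is started from the client folder; the path value and the `npm start` invocation are illustrative assumptions:
```sh
$ DATA_SOURCE=local GATSBY_DATA_PIPELINE_SCORE_PATH_LOCAL=data_pipeline/data/score npm start
```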

View file

@@ -48,7 +48,7 @@ The detailed steps for performing [local environment installation can be found i
## Running the Data Pipeline and Scoring Application
-The Justice40 Data Pipeline and Scoring Application is a multistep process that,
+The Justice40 Data Pipeline and Scoring Application is a multi-step process that,
1. Retrieves input data sources (extract), standardizes those input data sources' data into an intermediate format (transform), and saves the results to the file system (load). It performs those steps for each configured input data source (found at [`data_pipeline/etl/sources`](data_pipeline/etl/sources))
2. Calculates a score

View file

@@ -29,8 +29,6 @@ from data_pipeline.utils import geo_score_folder_cleanup
logger = get_module_logger(__name__)
-dataset_cli_help = "Grab the data from either 'local' for local access or 'aws' to retrieve from Justice40 S3 repository"
LOG_LINE_WIDTH = 60
use_cache_option = click.option(
@@ -38,7 +36,7 @@ use_cache_option = click.option(
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
help="When set, will check for cached data sources to use before downloading new ones.",
)
dataset_option = click.option(
@@ -46,7 +44,7 @@ dataset_option = click.option(
"--dataset",
required=False,
type=str,
-    help=dataset_cli_help,
+    help="Name of dataset to run. If not provided, all datasets will be run.",
)
data_source_option = click.option(
@@ -55,7 +53,7 @@ data_source_option = click.option(
default="local",
required=False,
type=str,
-    help=dataset_cli_help,
+    help="Grab the data from either 'local' for local access or 'aws' to retrieve from Justice40 S3 repository. Default is 'local'.",
)
@@ -290,10 +288,10 @@ def generate_map_tiles(generate_tribal_layer):
@data_source_option
@use_cache_option
def data_full_run(check: bool, data_source: str, use_cache: bool):
"""CLI command to run ETL, score, JSON combine and generate tiles in one command
"""CLI command to run ETL, score, JSON combine and generate tiles including tribal layer in one command
Args:
-        check (bool): Run the full data run only if the first run sempahore file is not set (optional)
+        check (bool): Run the full data run only if the first run semaphore file is not set (optional)
data_source (str): Source for the census data (optional)
Options:
- local: fetch census and score data from the local data directory
@@ -445,7 +443,7 @@ def clear_data_source_cache(dataset: str):
)
@click.pass_context
def full_post_etl(ctx):
"""Generate scoring and tiles"""
"""Generate scoring and tiles including tribal layer"""
ctx.invoke(score_run)
ctx.invoke(generate_score_post, data_source=None)
ctx.invoke(geo_score, data_source=None)
@@ -459,7 +457,7 @@ def full_post_etl(ctx):
@use_cache_option
@click.pass_context
def full_run(ctx, use_cache):
"""Run all downloads, ETLs, and generate scores and tiles"""
"""Run all downloads, ETLs, and generate scores and tiles including tribal layer"""
if not use_cache:
ctx.invoke(data_cleanup)
ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
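With the reworded options above, a second full run that reuses cached source data might look like the following sketch, assuming the CLI is invoked through the `data_pipeline.application` module shown in the docker compose example:
```sh
$ python3 -m data_pipeline.application full-run --use-cache
```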

View file

@@ -52,7 +52,7 @@ DATA_TILES_SEARCH_DIR = DATA_SCORE_DIR / "search"
# Downloadable paths
if not os.environ.get("J40_VERSION_LABEL_STRING"):
version_str = "beta"
version_str = "2.0"
else:
version_str = os.environ.get("J40_VERSION_LABEL_STRING")
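Since the fallback version label is now `2.0`, generating downloadable paths for another release means setting the environment variable explicitly before running the pipeline; a sketch, with the value purely illustrative:
```sh
$ export J40_VERSION_LABEL_STRING="beta"
```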

View file

@@ -517,7 +517,6 @@ class PostScoreETL(ExtractTransformLoad):
num_cols = len(excel_df.columns)
worksheet.set_column(0, num_cols - 1, num_excel_cols_width)
-        writer.save()
return excel_csv_config
def _load_tile_csv(

View file

@@ -1,6 +1,6 @@
[default]
AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
-AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-pipeline"
+AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-versions/2.0"
DATASOURCE_RETRIEVAL_FROM_AWS = true
[development]