Fix docker

2025-07-28 14:11:17 -07:00 · 2024-12-23 08:05:18 -08:00 · 2024-12-23 08:05:18 -08:00 · 682b2d34a7
commit 682b2d34a7
parent aa88249f37
10 changed files with 111 additions and 54 deletions
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@ -14,10 +14,10 @@ Install [`docker`](https://docs.docker.com/get-docker/). See [Install Docker](IN
 > _Important_: To be able to run the entire application, you may need to increase the memory allocated for docker to at least 8096 MB. See [this post](https://stackoverflow.com/a/44533437) for more details.
-Use `docker-compose` to run the application:
+Use `docker compose` to run the application:
 ```sh
-$ docker-compose up
+$ docker compose up
 ```
 > Note: This may take a while – possibly even an hour or two – since it has to build the containers and then download and process all the data.
--- a/client/.dockerignore
+++ b/client/.dockerignore
@ -1 +1,8 @@
 .git
 .gitignore
 *Dockerfile*
 *docker-compose*
 .cache
 public
 node_modules
 npm-debug.log
--- a/client/Dockerfile
+++ b/client/Dockerfile
@ -4,17 +4,14 @@ FROM node:14
 # this working directory
 WORKDIR /client
 # Copy the package.json and package-lock.json files from local to the docker image / container
 COPY package*.json ./
 # install all packages as a layer in the docker image / container
 RUN npm install
 # copy all local files from the working directory to the docker image/container; however, we must use
 # dockerignore to ignore node_modules so that the image can use what was just installed from the above
 # step.
 COPY . .
 # install all packages as a layer in the docker image / container
 RUN npm ci
 ENV PORT=6000
 EXPOSE 6000
--- a/data/data-pipeline/.dockerignore
+++ b/data/data-pipeline/.dockerignore
@ -0,0 +1,17 @@
 ./data_pipeline/data/census/csv/*
 ./data_pipeline/data/census/geojson/*
 ./data_pipeline/data/census/shp/*
 ./data_pipeline/data/dataset/*
 ./data_pipeline/data/score/csv/*
 ./data_pipeline/data/score/downloadable/*
 ./data_pipeline/data/score/geojson/*
 ./data_pipeline/data/score/search/*
 ./data_pipeline/data/score/shapefile/*
 ./data_pipeline/data/score/tiles/*
 ./data_pipeline/data/sources/*
 ./data_pipeline/data/tmp/*
 ./data_pipeline/data/tribal/csv/*
 ./data_pipeline/data/tribal/geographic_data/*
 ./data_pipeline/data/tribal/geojson/*
 ./data_pipeline/data/tribal/tiles/*
--- a/data/data-pipeline/Dockerfile
+++ b/data/data-pipeline/Dockerfile
@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:22.04
 ENV TZ=America/Los_Angeles
@ -10,13 +10,13 @@ RUN apt-get update && TZ=America/Los_Angeles DEBIAN_FRONTEND=noninteractive apt-
    git \
    unzip \
    wget \
-    python3-dev \
+    software-properties-common \
-    python3-pip \
+    libsqlite3-dev \
-    gdal-bin
+    zlib1g-dev
-# tippeanoe
+# tippecanoe
 RUN apt-get update
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 RUN apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
 RUN apt-add-repository -y ppa:git-core/ppa
 RUN mkdir -p /tmp/tippecanoe-src && git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
 WORKDIR /tmp/tippecanoe-src
@ -24,26 +24,35 @@ RUN /bin/sh -c make && make install
 ## gdal
 RUN add-apt-repository ppa:ubuntugis/ppa
-RUN apt-get -y install gdal-bin
+RUN apt-get -y install gdal-bin libgdal-dev
 # Install python3.10
 RUN add-apt-repository ppa:deadsnakes/ppa 
 RUN apt install -y python3.10-dev
 RUN apt install -y python3-pip
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 RUN update-alternatives --config python3
 # Copy all project files into the container
 COPY . /data-pipeline
 WORKDIR /data-pipeline
 # Python package installation using poetry. See:
 # https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
 ENV PYTHONFAULTHANDLER=1 \
-  PYTHONUNBUFFERED=1 \
+    PYTHONUNBUFFERED=1 \
-  PYTHONHASHSEED=random \
+    PYTHONHASHSEED=random \
-  PIP_NO_CACHE_DIR=off \
+    PIP_NO_CACHE_DIR=off \
-  PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
-  PIP_DEFAULT_TIMEOUT=100 \
+    PIP_DEFAULT_TIMEOUT=100 \
-  POETRY_VERSION=1.1.12
+    POETRY_VERSION=1.8.4
 WORKDIR /data-pipeline
 COPY . /data-pipeline
 RUN pip install "poetry==$POETRY_VERSION"
 RUN poetry config virtualenvs.create false \
-  && poetry config virtualenvs.in-project false \
+    && poetry config virtualenvs.in-project false \
-  && poetry install --no-dev --no-interaction --no-ansi
+    && poetry install --only main --no-interaction --no-ansi
 RUN pip install openpyxl
-# Copy all project files into the container
+# Default behavior is to output the options for "full-run". This prevents the entire pipeline from running unintentionally.
-
+ENTRYPOINT [ "poetry", "run", "python3", "-m", "data_pipeline.application"]  
-CMD python3 -m data_pipeline.application data-full-run --check -s aws
+CMD ["full-run", "--help"]
--- a/data/data-pipeline/data_pipeline/application.py
+++ b/data/data-pipeline/data_pipeline/application.py
@ -88,10 +88,11 @@ def data_cleanup():
    log_info("Cleaning up all data folders")
    census_reset(data_path)
    data_folder_cleanup()
-    tribal_reset(data_path)
+    downloadable_cleanup()
    score_folder_cleanup()
    temp_folder_cleanup()
    geo_score_folder_cleanup()
    tribal_reset(data_path)
    log_goodbye()
@ -304,45 +305,67 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
    log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles")
    data_path = settings.APP_ROOT / "data"
    first_run = False
    if check:
        if not check_first_run():
            # check if the data full run has been run before
-            log_info("The data full run was already executed")
+            first_run = True
-            sys.exit()
+
    if first_run:
        log_info("The data full run was already executed")
        sys.exit()
    else:
-        # census directories
+        # Directory cleanup
        log_info("Cleaning up data folders")
        census_reset(data_path)
        data_folder_cleanup()
        downloadable_cleanup()
        score_folder_cleanup()
        geo_score_folder_cleanup()
        temp_folder_cleanup()
        tribal_reset(data_path)
        if data_source == "local":
            log_info("Downloading census data")
            etl_runner("census", use_cache)
-        log_info("Running all ETLs")
+            log_info("Running all ETLs")
-        etl_runner(use_cache=use_cache)
+            etl_runner(use_cache=True)
            log_info("Running tribal ETL")
            etl_runner("tribal", use_cache)
        else:
            log_info("Downloading census data")
            etl_runner("census", use_cache=False)
            log_info("Running all ETLs")
            etl_runner(use_cache=False)
            log_info("Running tribal ETL")
            etl_runner("tribal", use_cache=False)
        log_info("Generating score")
        score_generate()
        log_info("Running post score")
        downloadable_cleanup()
        score_post(data_source)
-    log_info("Combining score with census GeoJSON")
+        log_info("Combining score with census GeoJSON")
-    score_geo(data_source)
+        score_geo(data_source)
-    log_info("Generating map tiles")
+        log_info("Generating map tiles")
-    generate_tiles(data_path, True)
+        generate_tiles(data_path, False)
-    log_info("Completing pipeline")
+        log_info("Generating tribal map tiles")
-    file = "first_run.txt"
+        generate_tiles(data_path, True)
-    cmd = f"touch {data_path}/{file}"
+
-    call(cmd, shell=True)
+        log_info("Completing pipeline")
        file = "first_run.txt"
        cmd = f"touch {data_path}/{file}"
        call(cmd, shell=True)
    log_goodbye()
@ -427,6 +450,7 @@ def full_post_etl(ctx):
    ctx.invoke(generate_score_post, data_source=None)
    ctx.invoke(geo_score, data_source=None)
    ctx.invoke(generate_map_tiles, generate_tribal_layer=False)
    ctx.invoke(generate_map_tiles, generate_tribal_layer=True)
@cli.command(
@ -440,6 +464,7 @@ def full_run(ctx, use_cache):
        ctx.invoke(data_cleanup)
    ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
    ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
    ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
    ctx.invoke(full_post_etl)
--- a/data/data-pipeline/data_pipeline/data/tribal/geojson/init.py
+++ b/data/data-pipeline/data_pipeline/data/tribal/geojson/init.py
--- a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl_utils.py
@ -21,5 +21,9 @@ def reset_data_directories(
    )
    # geojson
-    geojson_path = tribal_data_path / "geojson"
+    geojson_path = tribal_data_path / "geographic_data"
    remove_all_from_dir(geojson_path)
    # tiles
    tiles_path = tribal_data_path / "tiles"
    remove_all_from_dir(tiles_path)
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@ -218,6 +218,7 @@ def score_folder_cleanup() -> None:
    remove_all_from_dir(data_path / "score" / "geojson")
    remove_all_from_dir(data_path / "score" / "tiles")
    remove_all_from_dir(data_path / "score" / "shapefile")
    remove_all_from_dir(data_path / "score" / "search")
    downloadable_cleanup()
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,4 +1,3 @@
 version: "3.4"
 services:
  # The j40_data_pipeline service runs the ETL pipeline to create the score
  score:
@ -15,6 +14,9 @@ services:
      ENV_FOR_DYNACONF: development
      PYTHONUNBUFFERED: 1
      TZ: America/Los_Angeles
    # The argument (and options) for the scoring step you want to run. Replace "full-run" with "etl-run" or whatever step you want to run
    # To add arguments follow this example: command: ["generate-map-tiles", "--arg1", "value1", "--arg2", "value2"]
    command: ["full-run", "--help"]
  # The score_server serves the data-pipeline volume as a URL
  j40_score_server:
@ -23,26 +25,21 @@ services:
    build: data/data-serve/.
    volumes:
      - ./data/data-pipeline/data_pipeline/data/score:/data/data-pipeline/data_pipeline/data/score
      - ./data/data-pipeline/data_pipeline/data/tribal:/data/data-pipeline/data_pipeline/data/tribal
    ports:
      - 5000:8080
    environment:
      TZ: America/Los_Angeles
  # The j40_website service runs the web app / map / site
  j40_website:
    image: j40_website
    container_name: j40_website_1
-    build: ./client
+    build: client
    environment:
    # See the client readme for more info on environment variables:
    # https://github.com/usds/justice40-tool/blob/main/client/README.md
      DATA_SOURCE: local
      # If you want the map to render a MapBox base map (as opposed to the
      # open source one from CartoDB), please create your own API TOKEN from
      # your MapBox account and add the token here:
      MAPBOX_STYLES_READ_TOKEN: ""
      TZ: America/Los_Angeles
    volumes:
      - ./client/src:/client/src