Fix docker

ericiwamoto 2024-12-23 08:05:18 -08:00 committed by Carlos Felix
parent aa88249f37
commit 682b2d34a7
10 changed files with 111 additions and 54 deletions

View file

@@ -14,10 +14,10 @@ Install [`docker`](https://docs.docker.com/get-docker/). See [Install Docker](IN
> _Important_: To be able to run the entire application, you may need to increase the memory allocated for Docker to at least 8096 MB. See [this post](https://stackoverflow.com/a/44533437) for more details.
Use `docker-compose` to run the application:
Use `docker compose` to run the application:
```sh
$ docker-compose up
$ docker compose up
```
> Note: This may take a while, possibly even an hour or two, since it has to build the containers and then download and process all the data.
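To run a single pipeline step instead of bringing up the whole stack, you can override the score service's default command. A hedged example; the `score` service name and the `etl-run` subcommand come from this commit's compose file and CLI, but treat the exact invocation as a sketch:
```sh
# One-off container run of a specific pipeline subcommand
$ docker compose run --rm score etl-run --help
```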

View file

@@ -1 +1,8 @@
node_modules
.git
.gitignore
*Dockerfile*
*docker-compose*
.cache
public
node_modules
npm-debug.log

View file

@@ -4,17 +4,14 @@ FROM node:14
# this working directory
WORKDIR /client
# Copy the package.json and package-lock.json files from local to the docker image / container
COPY package*.json ./
# install all packages as a layer in the docker image / container
RUN npm install
# copy all local files from the working directory to the docker image/container; we must use
# .dockerignore to exclude node_modules so that the image uses what was just installed in the
# step above.
COPY . .
# install all packages as a layer in the docker image / container
RUN npm ci
ENV PORT=6000
EXPOSE 6000
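For reference, a minimal sketch of building and running this client image on its own. The tag name is illustrative, port 6000 matches the EXPOSE above, and it assumes the image's default command starts the dev server:
```sh
# Build the client image and map the exposed port to the host
$ docker build -t j40_website ./client
$ docker run --rm -p 6000:6000 j40_website
```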

View file

@@ -0,0 +1,17 @@
./data_pipeline/data/census/csv/*
./data_pipeline/data/census/geojson/*
./data_pipeline/data/census/shp/*
./data_pipeline/data/dataset/*
./data_pipeline/data/score/csv/*
./data_pipeline/data/score/downloadable/*
./data_pipeline/data/score/geojson/*
./data_pipeline/data/score/search/*
./data_pipeline/data/score/shapefile/*
./data_pipeline/data/score/tiles/*
./data_pipeline/data/sources/*
./data_pipeline/data/tmp/*
./data_pipeline/data/tribal/csv/*
./data_pipeline/data/tribal/geographic_data/*
./data_pipeline/data/tribal/geojson/*
./data_pipeline/data/tribal/tiles/*

View file

@@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04
ENV TZ=America/Los_Angeles
@@ -10,13 +10,13 @@ RUN apt-get update && TZ=America/Los_Angeles DEBIAN_FRONTEND=noninteractive apt-
git \
unzip \
wget \
python3-dev \
python3-pip \
gdal-bin
software-properties-common \
libsqlite3-dev \
zlib1g-dev
# tippeanoe
# tippecanoe
RUN apt-get update
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
RUN apt-add-repository -y ppa:git-core/ppa
RUN mkdir -p /tmp/tippecanoe-src && git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
WORKDIR /tmp/tippecanoe-src
@@ -24,26 +24,35 @@ RUN /bin/sh -c make && make install
## gdal
RUN add-apt-repository ppa:ubuntugis/ppa
RUN apt-get -y install gdal-bin
RUN apt-get -y install gdal-bin libgdal-dev
# Install python3.10
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt install -y python3.10-dev
RUN apt install -y python3-pip
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
RUN update-alternatives --config python3
# Copy all project files into the container
COPY . /data-pipeline
WORKDIR /data-pipeline
# Python package installation using poetry. See:
# https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
ENV PYTHONFAULTHANDLER=1 \
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.1.12
WORKDIR /data-pipeline
COPY . /data-pipeline
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.8.4
RUN pip install "poetry==$POETRY_VERSION"
RUN poetry config virtualenvs.create false \
&& poetry config virtualenvs.in-project false \
&& poetry install --no-dev --no-interaction --no-ansi
&& poetry config virtualenvs.in-project false \
&& poetry install --only main --no-interaction --no-ansi
RUN pip install openpyxl
# Copy all project files into the container
CMD python3 -m data_pipeline.application data-full-run --check -s aws
# Default behavior is to output the options for "full-run". This prevents the entire pipeline from running unintentionally.
ENTRYPOINT [ "poetry", "run", "python3", "-m", "data_pipeline.application"]
CMD ["full-run", "--help"]

View file

@@ -88,10 +88,11 @@ def data_cleanup():
log_info("Cleaning up all data folders")
census_reset(data_path)
data_folder_cleanup()
tribal_reset(data_path)
downloadable_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
geo_score_folder_cleanup()
tribal_reset(data_path)
log_goodbye()
@@ -304,45 +305,67 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles")
data_path = settings.APP_ROOT / "data"
first_run = False
if check:
if not check_first_run():
# check if the data full run has been run before
log_info("The data full run was already executed")
sys.exit()
first_run = True
if first_run:
log_info("The data full run was already executed")
sys.exit()
else:
# census directories
# Directory cleanup
log_info("Cleaning up data folders")
census_reset(data_path)
data_folder_cleanup()
downloadable_cleanup()
score_folder_cleanup()
geo_score_folder_cleanup()
temp_folder_cleanup()
tribal_reset(data_path)
if data_source == "local":
log_info("Downloading census data")
etl_runner("census", use_cache)
log_info("Running all ETLs")
etl_runner(use_cache=use_cache)
log_info("Running all ETLs")
etl_runner(use_cache=True)
log_info("Running tribal ETL")
etl_runner("tribal", use_cache)
else:
log_info("Downloading census data")
etl_runner("census", use_cache=False)
log_info("Running all ETLs")
etl_runner(use_cache=False)
log_info("Running tribal ETL")
etl_runner("tribal", use_cache=False)
log_info("Generating score")
score_generate()
log_info("Running post score")
downloadable_cleanup()
score_post(data_source)
log_info("Combining score with census GeoJSON")
score_geo(data_source)
log_info("Combining score with census GeoJSON")
score_geo(data_source)
log_info("Generating map tiles")
generate_tiles(data_path, True)
log_info("Generating map tiles")
generate_tiles(data_path, False)
log_info("Completing pipeline")
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
log_info("Generating tribal map tiles")
generate_tiles(data_path, True)
log_info("Completing pipeline")
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
log_goodbye()
@@ -427,6 +450,7 @@ def full_post_etl(ctx):
ctx.invoke(generate_score_post, data_source=None)
ctx.invoke(geo_score, data_source=None)
ctx.invoke(generate_map_tiles, generate_tribal_layer=False)
ctx.invoke(generate_map_tiles, generate_tribal_layer=True)
@cli.command(
@@ -440,6 +464,7 @@ def full_run(ctx, use_cache):
ctx.invoke(data_cleanup)
ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
ctx.invoke(full_post_etl)
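The `--check` guard above hinges on a sentinel file: the run exits early when `first_run.txt` already exists, and the file is touched once the pipeline completes. In shell terms the guard amounts to something like this, with the data path illustrative:
```sh
# Rough equivalent of the --check sentinel logic
if [ -f data_pipeline/data/first_run.txt ]; then
    echo "The data full run was already executed"
    exit 0
fi
```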

View file

@@ -21,5 +21,9 @@ def reset_data_directories(
)
# geojson
geojson_path = tribal_data_path / "geojson"
geojson_path = tribal_data_path / "geographic_data"
remove_all_from_dir(geojson_path)
# tiles
tiles_path = tribal_data_path / "tiles"
remove_all_from_dir(tiles_path)

View file

@@ -218,6 +218,7 @@ def score_folder_cleanup() -> None:
remove_all_from_dir(data_path / "score" / "geojson")
remove_all_from_dir(data_path / "score" / "tiles")
remove_all_from_dir(data_path / "score" / "shapefile")
remove_all_from_dir(data_path / "score" / "search")
downloadable_cleanup()

View file

@@ -1,4 +1,3 @@
version: "3.4"
services:
# The j40_data_pipeline service runs the ETL pipeline to create the score
score:
@@ -15,6 +14,9 @@ services:
ENV_FOR_DYNACONF: development
PYTHONUNBUFFERED: 1
TZ: America/Los_Angeles
# The argument (and options) for the scoring step you want to run. Replace "full-run" with "etl-run" or whichever step you want to run.
# To add arguments, follow this example: command: ["generate-map-tiles", "--arg1", "value1", "--arg2", "value2"]
command: ["full-run", "--help"]
# The score_server serves the data-pipeline volume as a URL
j40_score_server:
@@ -23,26 +25,21 @@ services:
build: data/data-serve/.
volumes:
- ./data/data-pipeline/data_pipeline/data/score:/data/data-pipeline/data_pipeline/data/score
- ./data/data-pipeline/data_pipeline/data/tribal:/data/data-pipeline/data_pipeline/data/tribal
ports:
- 5000:8080
environment:
TZ: America/Los_Angeles
# The j40_website service runs the web app / map / site
j40_website:
image: j40_website
container_name: j40_website_1
build: ./client
build: client
environment:
# See the client readme for more info on environment variables:
# https://github.com/usds/justice40-tool/blob/main/client/README.md
DATA_SOURCE: local
# If you want the map to render a MapBox base map (as opposed to the
# open source one from CartoDB), please create your own API TOKEN from
# your MapBox account and add the token here:
MAPBOX_STYLES_READ_TOKEN: ""
TZ: America/Los_Angeles
volumes:
- ./client/src:/client/src
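Individual services can also be brought up on their own; for example, to work on the site without running the pipeline (service name taken from this file):
```sh
# Start only the website service
$ docker compose up j40_website
```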