Fix docker

ericiwamoto 2024-12-23 08:05:18 -08:00 committed by Carlos Felix
parent aa88249f37
commit 682b2d34a7
10 changed files with 111 additions and 54 deletions

View file

@@ -14,10 +14,10 @@ Install [`docker`](https://docs.docker.com/get-docker/). See [Install Docker](IN
> _Important_: To be able to run the entire application, you may need to increase the memory allocated for Docker to at least 8096 MB. See [this post](https://stackoverflow.com/a/44533437) for more details.
Use `docker-compose` to run the application:
Use `docker compose` to run the application:
```sh
$ docker-compose up
$ docker compose up
```
> Note: This may take a while, possibly even an hour or two, since it has to build the containers and then download and process all the data.
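To run a single pipeline step instead of bringing up the whole stack, you can override the score service's default command. A hedged example; the `score` service name and the `etl-run` subcommand come from this commit's compose file and CLI, but treat the exact invocation as a sketch:
```sh
# One-off container run of a specific pipeline subcommand
$ docker compose run --rm score etl-run --help
```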

View file

@@ -1 +1,8 @@
node_modules
.git
.gitignore
*Dockerfile*
*docker-compose*
.cache
public
node_modules
npm-debug.log

View file

@@ -4,17 +4,14 @@ FROM node:14
# this working directory
WORKDIR /client
# Copy the package.json and package-lock.json files from local to the docker image / container
COPY package*.json ./
# install all packages as a layer in the docker image / container
RUN npm install
# copy all local files from the working directory to the docker image/container; we must use
# .dockerignore to exclude node_modules so that the image uses what was just installed in the
# step above.
COPY . .
# install all packages as a layer in the docker image / container
RUN npm ci
ENV PORT=6000
EXPOSE 6000
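For reference, a minimal sketch of building and running this client image on its own. The tag name is illustrative, port 6000 matches the EXPOSE above, and it assumes the image's default command starts the dev server:
```sh
# Build the client image and map the exposed port to the host
$ docker build -t j40_website ./client
$ docker run --rm -p 6000:6000 j40_website
```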

View file

@@ -0,0 +1,17 @@
./data_pipeline/data/census/csv/*
./data_pipeline/data/census/geojson/*
./data_pipeline/data/census/shp/*
./data_pipeline/data/dataset/*
./data_pipeline/data/score/csv/*
./data_pipeline/data/score/downloadable/*
./data_pipeline/data/score/geojson/*
./data_pipeline/data/score/search/*
./data_pipeline/data/score/shapefile/*
./data_pipeline/data/score/tiles/*
./data_pipeline/data/sources/*
./data_pipeline/data/tmp/*
./data_pipeline/data/tribal/csv/*
./data_pipeline/data/tribal/geographic_data/*
./data_pipeline/data/tribal/geojson/*
./data_pipeline/data/tribal/tiles/*

View file

@@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04
ENV TZ=America/Los_Angeles
@@ -10,13 +10,13 @@ RUN apt-get update && TZ=America/Los_Angeles DEBIAN_FRONTEND=noninteractive apt-
git \
unzip \
wget \
python3-dev \
python3-pip \
gdal-bin
software-properties-common \
libsqlite3-dev \
zlib1g-dev
# tippeanoe
# tippecanoe
RUN apt-get update
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
RUN apt-add-repository -y ppa:git-core/ppa
RUN mkdir -p /tmp/tippecanoe-src && git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
WORKDIR /tmp/tippecanoe-src
@@ -24,26 +24,35 @@ RUN /bin/sh -c make && make install
## gdal
RUN add-apt-repository ppa:ubuntugis/ppa
RUN apt-get -y install gdal-bin
RUN apt-get -y install gdal-bin libgdal-dev
# Install python3.10
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt install -y python3.10-dev
RUN apt install -y python3-pip
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
RUN update-alternatives --config python3
# Copy all project files into the container
COPY . /data-pipeline
WORKDIR /data-pipeline
# Python package installation using poetry. See:
# https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
ENV PYTHONFAULTHANDLER=1 \
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.1.12
WORKDIR /data-pipeline
COPY . /data-pipeline
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.8.4
RUN pip install "poetry==$POETRY_VERSION"
RUN poetry config virtualenvs.create false \
&& poetry config virtualenvs.in-project false \
&& poetry install --no-dev --no-interaction --no-ansi
&& poetry config virtualenvs.in-project false \
&& poetry install --only main --no-interaction --no-ansi
RUN pip install openpyxl
# Copy all project files into the container
CMD python3 -m data_pipeline.application data-full-run --check -s aws
# Default behavior is to output the options for "full-run". This prevents the entire pipeline from running unintentionally.
ENTRYPOINT [ "poetry", "run", "python3", "-m", "data_pipeline.application"]
CMD ["full-run", "--help"]

View file

@@ -88,10 +88,11 @@ def data_cleanup():
log_info("Cleaning up all data folders")
census_reset(data_path)
data_folder_cleanup()
tribal_reset(data_path)
downloadable_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
geo_score_folder_cleanup()
tribal_reset(data_path)
log_goodbye()
@@ -304,45 +305,67 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles")
data_path = settings.APP_ROOT / "data"
first_run = False
if check:
if not check_first_run():
# check if the data full run has been run before
log_info("The data full run was already executed")
sys.exit()
first_run = True
if first_run:
log_info("The data full run was already executed")
sys.exit()
else:
# census directories
# Directory cleanup
log_info("Cleaning up data folders")
census_reset(data_path)
data_folder_cleanup()
downloadable_cleanup()
score_folder_cleanup()
geo_score_folder_cleanup()
temp_folder_cleanup()
tribal_reset(data_path)
if data_source == "local":
log_info("Downloading census data")
etl_runner("census", use_cache)
log_info("Running all ETLs")
etl_runner(use_cache=use_cache)
log_info("Running all ETLs")
etl_runner(use_cache=True)
log_info("Running tribal ETL")
etl_runner("tribal", use_cache)
else:
log_info("Downloading census data")
etl_runner("census", use_cache=False)
log_info("Running all ETLs")
etl_runner(use_cache=False)
log_info("Running tribal ETL")
etl_runner("tribal", use_cache=False)
log_info("Generating score")
score_generate()
log_info("Running post score")
downloadable_cleanup()
score_post(data_source)
log_info("Combining score with census GeoJSON")
score_geo(data_source)
log_info("Combining score with census GeoJSON")
score_geo(data_source)
log_info("Generating map tiles")
generate_tiles(data_path, True)
log_info("Generating map tiles")
generate_tiles(data_path, False)
log_info("Completing pipeline")
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
log_info("Generating tribal map tiles")
generate_tiles(data_path, True)
log_info("Completing pipeline")
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
log_goodbye()
@@ -427,6 +450,7 @@ def full_post_etl(ctx):
ctx.invoke(generate_score_post, data_source=None)
ctx.invoke(geo_score, data_source=None)
ctx.invoke(generate_map_tiles, generate_tribal_layer=False)
ctx.invoke(generate_map_tiles, generate_tribal_layer=True)
@cli.command(
@@ -440,6 +464,7 @@ def full_run(ctx, use_cache):
ctx.invoke(data_cleanup)
ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
ctx.invoke(full_post_etl)
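The `--check` guard above hinges on a sentinel file: the run exits early when `first_run.txt` already exists, and the file is touched once the pipeline completes. In shell terms the guard amounts to something like this, with the data path illustrative:
```sh
# Rough equivalent of the --check sentinel logic
if [ -f data_pipeline/data/first_run.txt ]; then
    echo "The data full run was already executed"
    exit 0
fi
```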

View file

@@ -21,5 +21,9 @@ def reset_data_directories(
)
# geojson
geojson_path = tribal_data_path / "geojson"
geojson_path = tribal_data_path / "geographic_data"
remove_all_from_dir(geojson_path)
# tiles
tiles_path = tribal_data_path / "tiles"
remove_all_from_dir(tiles_path)

View file

@@ -218,6 +218,7 @@ def score_folder_cleanup() -> None:
remove_all_from_dir(data_path / "score" / "geojson")
remove_all_from_dir(data_path / "score" / "tiles")
remove_all_from_dir(data_path / "score" / "shapefile")
remove_all_from_dir(data_path / "score" / "search")
downloadable_cleanup()

View file

@@ -1,4 +1,3 @@
version: "3.4"
services:
# The j40_data_pipeline service runs the ETL pipeline to create the score
score:
@@ -15,6 +14,9 @@ services:
ENV_FOR_DYNACONF: development
PYTHONUNBUFFERED: 1
TZ: America/Los_Angeles
# The argument (and options) for the scoring step you want to run. Replace "full-run" with "etl-run" or whichever step you want to run.
# To add arguments, follow this example: command: ["generate-map-tiles", "--arg1", "value1", "--arg2", "value2"]
command: ["full-run", "--help"]
# The score_server serves the data-pipeline volume as a URL
j40_score_server:
@@ -23,26 +25,21 @@ services:
build: data/data-serve/.
volumes:
- ./data/data-pipeline/data_pipeline/data/score:/data/data-pipeline/data_pipeline/data/score
- ./data/data-pipeline/data_pipeline/data/tribal:/data/data-pipeline/data_pipeline/data/tribal
ports:
- 5000:8080
environment:
TZ: America/Los_Angeles
# The j40_website service runs the web app / map / site
j40_website:
image: j40_website
container_name: j40_website_1
build: ./client
build: client
environment:
# See the client readme for more info on environment variables:
# https://github.com/usds/justice40-tool/blob/main/client/README.md
DATA_SOURCE: local
# If you want the map to render a MapBox base map (as opposed to the
# open source one from CartoDB), please create your own API TOKEN from
# your MapBox account and add the token here:
MAPBOX_STYLES_READ_TOKEN: ""
TZ: America/Los_Angeles
volumes:
- ./client/src:/client/src
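Individual services can also be brought up on their own; for example, to work on the site without running the pipeline (service name taken from this file):
```sh
# Start only the website service
$ docker compose up j40_website
```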