Fix docker

ericiwamoto 2024-12-23 08:05:18 -08:00 committed by Carlos Felix
commit 682b2d34a7
10 changed files with 111 additions and 54 deletions

View file

@@ -0,0 +1,17 @@
./data_pipeline/data/census/csv/*
./data_pipeline/data/census/geojson/*
./data_pipeline/data/census/shp/*
./data_pipeline/data/dataset/*
./data_pipeline/data/score/csv/*
./data_pipeline/data/score/downloadable/*
./data_pipeline/data/score/geojson/*
./data_pipeline/data/score/search/*
./data_pipeline/data/score/shapefile/*
./data_pipeline/data/score/tiles/*
./data_pipeline/data/sources/*
./data_pipeline/data/tmp/*
./data_pipeline/data/tribal/csv/*
./data_pipeline/data/tribal/geographic_data/*
./data_pipeline/data/tribal/geojson/*
./data_pipeline/data/tribal/tiles/*
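
These patterns keep the pipeline's generated data out of the Docker build context, which keeps builds fast and images small. A minimal sketch, assuming it is run from the repository root, for estimating how much data the patterns exclude (the grouping by parent directory is an illustrative simplification, not part of the commit):

from pathlib import Path

# Parent directories of the ignore patterns above (coarser grouping for brevity).
IGNORED = [
    "data_pipeline/data/census",
    "data_pipeline/data/dataset",
    "data_pipeline/data/score",
    "data_pipeline/data/sources",
    "data_pipeline/data/tmp",
    "data_pipeline/data/tribal",
]

def dir_size_mb(path: Path) -> float:
    """Total size of all files under path, in megabytes."""
    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file()) / 1e6

for name in IGNORED:
    path = Path(name)
    if path.exists():
        print(f"{name}: {dir_size_mb(path):.1f} MB kept out of the build context")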

View file

@@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04
ENV TZ=America/Los_Angeles
@@ -10,13 +10,13 @@ RUN apt-get update && TZ=America/Los_Angeles DEBIAN_FRONTEND=noninteractive apt-
git \
unzip \
wget \
python3-dev \
python3-pip \
gdal-bin
software-properties-common \
libsqlite3-dev \
zlib1g-dev
# tippeanoe
# tippecanoe
RUN apt-get update
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
RUN apt-add-repository -y ppa:git-core/ppa
RUN mkdir -p /tmp/tippecanoe-src && git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
WORKDIR /tmp/tippecanoe-src
@@ -24,26 +24,35 @@ RUN /bin/sh -c make && make install
## gdal
RUN add-apt-repository ppa:ubuntugis/ppa
RUN apt-get -y install gdal-bin
RUN apt-get -y install gdal-bin libgdal-dev
# Install python3.10
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt install -y python3.10-dev
RUN apt install -y python3-pip
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
RUN update-alternatives --config python3
# Copy all project files into the container
COPY . /data-pipeline
WORKDIR /data-pipeline
# Python package installation using poetry. See:
# https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
ENV PYTHONFAULTHANDLER=1 \
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.1.12
WORKDIR /data-pipeline
COPY . /data-pipeline
PYTHONUNBUFFERED=1 \
PYTHONHASHSEED=random \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.8.4
RUN pip install "poetry==$POETRY_VERSION"
RUN poetry config virtualenvs.create false \
&& poetry config virtualenvs.in-project false \
&& poetry install --no-dev --no-interaction --no-ansi
&& poetry config virtualenvs.in-project false \
&& poetry install --only main --no-interaction --no-ansi
RUN pip install openpyxl
# Copy all project files into the container
CMD python3 -m data_pipeline.application data-full-run --check -s aws
# Default behavior is to output the options for "full-run". This prevents the entire pipeline from running unintentionally.
ENTRYPOINT [ "poetry", "run", "python3", "-m", "data_pipeline.application"]
CMD ["full-run", "--help"]

View file

@@ -88,10 +88,11 @@ def data_cleanup():
log_info("Cleaning up all data folders")
census_reset(data_path)
data_folder_cleanup()
tribal_reset(data_path)
downloadable_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
geo_score_folder_cleanup()
tribal_reset(data_path)
log_goodbye()
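
data_cleanup now also clears the downloadable, geo-score and search outputs and resets the tribal data before a run. A hypothetical way to trigger just this step from the CLI (the dash-style command name is an assumption, inferred from the function name and from the command names visible in the Dockerfile CMD):

import subprocess

# Assumed command name: click typically exposes data_cleanup as "data-cleanup".
subprocess.run(
    ["poetry", "run", "python3", "-m", "data_pipeline.application", "data-cleanup"],
    check=True,
)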
@@ -304,45 +305,67 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles")
data_path = settings.APP_ROOT / "data"
first_run = False
if check:
if not check_first_run():
# check if the data full run has been run before
log_info("The data full run was already executed")
sys.exit()
first_run = True
if first_run:
log_info("The data full run was already executed")
sys.exit()
else:
# census directories
# Directory cleanup
log_info("Cleaning up data folders")
census_reset(data_path)
data_folder_cleanup()
downloadable_cleanup()
score_folder_cleanup()
geo_score_folder_cleanup()
temp_folder_cleanup()
tribal_reset(data_path)
if data_source == "local":
log_info("Downloading census data")
etl_runner("census", use_cache)
log_info("Running all ETLs")
etl_runner(use_cache=use_cache)
log_info("Running all ETLs")
etl_runner(use_cache=True)
log_info("Running tribal ETL")
etl_runner("tribal", use_cache)
else:
log_info("Downloading census data")
etl_runner("census", use_cache=False)
log_info("Running all ETLs")
etl_runner(use_cache=False)
log_info("Running tribal ETL")
etl_runner("tribal", use_cache=False)
log_info("Generating score")
score_generate()
log_info("Running post score")
downloadable_cleanup()
score_post(data_source)
log_info("Combining score with census GeoJSON")
score_geo(data_source)
log_info("Combining score with census GeoJSON")
score_geo(data_source)
log_info("Generating map tiles")
generate_tiles(data_path, True)
log_info("Generating map tiles")
generate_tiles(data_path, False)
log_info("Completing pipeline")
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
log_info("Generating tribal map tiles")
generate_tiles(data_path, True)
log_info("Completing pipeline")
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
log_goodbye()
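
The --check guard at the top of data_full_run and the touch at the end implement a sentinel-file pattern: a completed run leaves first_run.txt under the data path, and later checked runs exit early while it exists. A minimal sketch of that pattern, assuming the sentinel lives directly under the data directory (this is not the project's check_first_run implementation):

from pathlib import Path

DATA_PATH = Path("data_pipeline/data")  # assumption: mirrors settings.APP_ROOT / "data"
SENTINEL = DATA_PATH / "first_run.txt"

def check_first_run() -> bool:
    """True if no completed full run has been recorded yet."""
    return not SENTINEL.exists()

def mark_run_complete() -> None:
    """Python equivalent of the shell `touch {data_path}/{file}` call above."""
    SENTINEL.parent.mkdir(parents=True, exist_ok=True)
    SENTINEL.touch()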
@@ -427,6 +450,7 @@ def full_post_etl(ctx):
ctx.invoke(generate_score_post, data_source=None)
ctx.invoke(geo_score, data_source=None)
ctx.invoke(generate_map_tiles, generate_tribal_layer=False)
ctx.invoke(generate_map_tiles, generate_tribal_layer=True)
@cli.command(
@@ -440,6 +464,7 @@ def full_run(ctx, use_cache):
ctx.invoke(data_cleanup)
ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
ctx.invoke(full_post_etl)
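
full_run and full_post_etl compose the pipeline by chaining the individual click commands with ctx.invoke, and this commit appends the tribal ETL and tribal tile generation to those chains. A self-contained sketch of the pattern with hypothetical commands (not the project's):

import click

@click.group()
def cli():
    """Hypothetical pipeline CLI."""

@cli.command()
@click.option("--use-cache", is_flag=True)
def etl_run(use_cache):
    click.echo(f"etl_run(use_cache={use_cache})")

@cli.command()
@click.option("--generate-tribal-layer", is_flag=True)
def generate_map_tiles(generate_tribal_layer):
    click.echo(f"generate_map_tiles(tribal={generate_tribal_layer})")

@cli.command()
@click.option("--use-cache", is_flag=True)
@click.pass_context
def full_run(ctx, use_cache):
    # Invoke the other commands in order, forwarding parameters explicitly.
    ctx.invoke(etl_run, use_cache=use_cache)
    ctx.invoke(generate_map_tiles, generate_tribal_layer=False)
    ctx.invoke(generate_map_tiles, generate_tribal_layer=True)

if __name__ == "__main__":
    cli()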

View file

@@ -21,5 +21,9 @@ def reset_data_directories(
)
# geojson
geojson_path = tribal_data_path / "geojson"
geojson_path = tribal_data_path / "geographic_data"
remove_all_from_dir(geojson_path)
# tiles
tiles_path = tribal_data_path / "tiles"
remove_all_from_dir(tiles_path)

View file

@@ -218,6 +218,7 @@ def score_folder_cleanup() -> None:
remove_all_from_dir(data_path / "score" / "geojson")
remove_all_from_dir(data_path / "score" / "tiles")
remove_all_from_dir(data_path / "score" / "shapefile")
remove_all_from_dir(data_path / "score" / "search")
downloadable_cleanup()
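
Both cleanup hunks above clear directories with remove_all_from_dir. A minimal sketch of what such a helper presumably does, i.e. emptying a directory without deleting the directory itself (not the project's implementation):

import shutil
from pathlib import Path

def remove_all_from_dir(path: Path) -> None:
    """Delete everything inside path while keeping the directory itself."""
    if not path.is_dir():
        return
    for entry in path.iterdir():
        if entry.is_dir():
            shutil.rmtree(entry)
        else:
            entry.unlink()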