From 682b2d34a75278e43bdd161ad6a167f3b6b7dcca Mon Sep 17 00:00:00 2001
From: ericiwamoto <100735505+ericiwamoto@users.noreply.github.com>
Date: Mon, 23 Dec 2024 08:05:18 -0800
Subject: [PATCH 1/6] Fix docker

---
 QUICKSTART.md                              |  4 +-
 client/.dockerignore                       |  9 ++-
 client/Dockerfile                          |  9 +--
 data/data-pipeline/.dockerignore           | 17 ++++++
 data/data-pipeline/Dockerfile              | 51 ++++++++++-------
 .../data_pipeline/application.py           | 55 ++++++++++++++-----
 .../data/tribal/geojson/__init__.py        |  0
 .../etl/sources/tribal/etl_utils.py        |  6 +-
 data/data-pipeline/data_pipeline/utils.py  |  1 +
 docker-compose.yml                         | 13 ++---
 10 files changed, 111 insertions(+), 54 deletions(-)
 create mode 100644 data/data-pipeline/.dockerignore
 delete mode 100644 data/data-pipeline/data_pipeline/data/tribal/geojson/__init__.py

diff --git a/QUICKSTART.md b/QUICKSTART.md
index ca7e5c5c..12cbc154 100644
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@@ -14,10 +14,10 @@ Install [`docker`](https://docs.docker.com/get-docker/). See [Install Docker](IN
 
 > _Important_: To be able to run the entire application, you may need to increase the memory allocated for docker to at least 8096 MB. See [this post](https://stackoverflow.com/a/44533437) for more details.
 
-Use `docker-compose` to run the application:
+Use `docker compose` to run the application:
 
 ```sh
-$ docker-compose up
+$ docker compose up
 ```
 
 > Note: This may take a while – possibly even an hour or two – since it has to build the containers and then download and process all the data.
diff --git a/client/.dockerignore b/client/.dockerignore
index b512c09d..c9713701 100644
--- a/client/.dockerignore
+++ b/client/.dockerignore
@@ -1 +1,8 @@
-node_modules
\ No newline at end of file
+.git
+.gitignore
+*Dockerfile*
+*docker-compose*
+.cache
+public
+node_modules
+npm-debug.log
\ No newline at end of file
diff --git a/client/Dockerfile b/client/Dockerfile
index 71c6024e..8ae0f912 100644
--- a/client/Dockerfile
+++ b/client/Dockerfile
@@ -4,17 +4,14 @@ FROM node:14
 
 # this working directory
 WORKDIR /client
 
-# Copy the package.json and package_lock.json files from local to the docker image / container
-COPY package*.json ./
-
-# install all packages as a layer in the docker image / container
-RUN npm install
-
 # copy all local files from the working directory to the docker image/container however we must use
 # dockerignore to ignore node_modules so that the image can use what was just installed from the above
 # step.
 COPY . .
+# install all packages as a layer in the docker image / container
+RUN npm ci
+
 ENV PORT=6000
 
 EXPOSE 6000
diff --git a/data/data-pipeline/.dockerignore b/data/data-pipeline/.dockerignore
new file mode 100644
index 00000000..fe68a7b1
--- /dev/null
+++ b/data/data-pipeline/.dockerignore
@@ -0,0 +1,17 @@
+./data_pipeline/data/census/csv/*
+./data_pipeline/data/census/geojson/*
+./data_pipeline/data/census/shp/*
+./data_pipeline/data/dataset/*
+./data_pipeline/data/score/csv/*
+./data_pipeline/data/score/downloadable/*
+./data_pipeline/data/score/geojson/*
+./data_pipeline/data/score/search/*
+./data_pipeline/data/score/shapefile/*
+./data_pipeline/data/score/tiles/*
+./data_pipeline/data/sources/*
+./data_pipeline/data/tmp/*
+./data_pipeline/data/tribal/csv/*
+./data_pipeline/data/tribal/geographic_data/*
+./data_pipeline/data/tribal/geojson/*
+./data_pipeline/data/tribal/tiles/*
+
diff --git a/data/data-pipeline/Dockerfile b/data/data-pipeline/Dockerfile
index 04bbca22..91016639 100644
--- a/data/data-pipeline/Dockerfile
+++ b/data/data-pipeline/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:22.04
 
 ENV TZ=America/Los_Angeles
 
@@ -10,13 +10,13 @@ RUN apt-get update && TZ=America/Los_Angeles DEBIAN_FRONTEND=noninteractive apt-
     git \
     unzip \
     wget \
-    python3-dev \
-    python3-pip \
-    gdal-bin
+    software-properties-common \
+    libsqlite3-dev \
+    zlib1g-dev
 
-# tippeanoe
+# tippecanoe
+RUN apt-get update
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-RUN apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
 RUN apt-add-repository -y ppa:git-core/ppa
 RUN mkdir -p /tmp/tippecanoe-src && git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
 WORKDIR /tmp/tippecanoe-src
@@ -24,26 +24,35 @@ RUN /bin/sh -c make && make install
 
 ## gdal
 RUN add-apt-repository ppa:ubuntugis/ppa
-RUN apt-get -y install gdal-bin
+RUN apt-get -y install gdal-bin libgdal-dev
+
+# Install python3.10
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt install -y python3.10-dev
+RUN apt install -y python3-pip
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+RUN update-alternatives --config python3
+
+# Copy all project files into the container
+COPY . /data-pipeline
+WORKDIR /data-pipeline
 
 # Python package installation using poetry. See:
 # https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
 ENV PYTHONFAULTHANDLER=1 \
-    PYTHONUNBUFFERED=1 \
-    PYTHONHASHSEED=random \
-    PIP_NO_CACHE_DIR=off \
-    PIP_DISABLE_PIP_VERSION_CHECK=on \
-    PIP_DEFAULT_TIMEOUT=100 \
-    POETRY_VERSION=1.1.12
-
-WORKDIR /data-pipeline
-COPY . /data-pipeline
+    PYTHONUNBUFFERED=1 \
+    PYTHONHASHSEED=random \
+    PIP_NO_CACHE_DIR=off \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    POETRY_VERSION=1.8.4
 
 RUN pip install "poetry==$POETRY_VERSION"
 RUN poetry config virtualenvs.create false \
-    && poetry config virtualenvs.in-project false \
-    && poetry install --no-dev --no-interaction --no-ansi
+    && poetry config virtualenvs.in-project false \
+    && poetry install --only main --no-interaction --no-ansi
+RUN pip install openpyxl
 
-# Copy all project files into the container
-
-CMD python3 -m data_pipeline.application data-full-run --check -s aws
+# Default behavior is to output the options for "full-run". This prevents the entire pipeline from running unintentionally.
+ENTRYPOINT [ "poetry", "run", "python3", "-m", "data_pipeline.application"]
+CMD ["full-run", "--help"]
\ No newline at end of file
diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py
index e405d9ec..2dc8d206 100644
--- a/data/data-pipeline/data_pipeline/application.py
+++ b/data/data-pipeline/data_pipeline/application.py
@@ -88,10 +88,11 @@ def data_cleanup():
     log_info("Cleaning up all data folders")
     census_reset(data_path)
     data_folder_cleanup()
-    tribal_reset(data_path)
+    downloadable_cleanup()
     score_folder_cleanup()
     temp_folder_cleanup()
     geo_score_folder_cleanup()
+    tribal_reset(data_path)
 
     log_goodbye()
 
@@ -304,45 +305,67 @@ def data_full_run(check: bool, data_source: str, use_cache: bool):
     log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles")
 
     data_path = settings.APP_ROOT / "data"
+    first_run = False
 
     if check:
         if not check_first_run():
             # check if the data full run has been run before
-            log_info("The data full run was already executed")
-            sys.exit()
+            first_run = True
+
+    if first_run:
+        log_info("The data full run was already executed")
+        sys.exit()
 
     else:
-        # census directories
+        # Directory cleanup
        log_info("Cleaning up data folders")
         census_reset(data_path)
         data_folder_cleanup()
+        downloadable_cleanup()
         score_folder_cleanup()
+        geo_score_folder_cleanup()
         temp_folder_cleanup()
+        tribal_reset(data_path)
 
     if data_source == "local":
         log_info("Downloading census data")
         etl_runner("census", use_cache)
 
-    log_info("Running all ETLs")
-    etl_runner(use_cache=use_cache)
+        log_info("Running all ETLs")
+        etl_runner(use_cache=True)
+
+        log_info("Running tribal ETL")
+        etl_runner("tribal", use_cache)
+
+    else:
+        log_info("Downloading census data")
+        etl_runner("census", use_cache=False)
+
+        log_info("Running all ETLs")
+        etl_runner(use_cache=False)
+
+        log_info("Running tribal ETL")
+        etl_runner("tribal", use_cache=False)
 
     log_info("Generating score")
     score_generate()
 
     log_info("Running post score")
-    downloadable_cleanup()
     score_post(data_source)
 
-    log_info("Combining score with census GeoJSON")
-    score_geo(data_source)
+    log_info("Combining score with census GeoJSON")
+    score_geo(data_source)
 
-    log_info("Generating map tiles")
-    generate_tiles(data_path, True)
+    log_info("Generating map tiles")
+    generate_tiles(data_path, False)
 
-    log_info("Completing pipeline")
-    file = "first_run.txt"
-    cmd = f"touch {data_path}/{file}"
-    call(cmd, shell=True)
+    log_info("Generating tribal map tiles")
+    generate_tiles(data_path, True)
+
+    log_info("Completing pipeline")
+    file = "first_run.txt"
+    cmd = f"touch {data_path}/{file}"
+    call(cmd, shell=True)
 
     log_goodbye()
 
@@ -427,6 +450,7 @@ def full_post_etl(ctx):
     ctx.invoke(generate_score_post, data_source=None)
     ctx.invoke(geo_score, data_source=None)
     ctx.invoke(generate_map_tiles, generate_tribal_layer=False)
+    ctx.invoke(generate_map_tiles, generate_tribal_layer=True)
 
 
 @cli.command(
@@ -440,6 +464,7 @@ def full_run(ctx, use_cache):
     ctx.invoke(data_cleanup)
     ctx.invoke(census_data_download, zip_compress=False, use_cache=use_cache)
     ctx.invoke(etl_run, dataset=None, use_cache=use_cache)
+    ctx.invoke(etl_run, dataset="tribal", use_cache=use_cache)
     ctx.invoke(full_post_etl)
 
 
diff --git a/data/data-pipeline/data_pipeline/data/tribal/geojson/__init__.py b/data/data-pipeline/data_pipeline/data/tribal/geojson/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl_utils.py
index ea97db1c..6bbb5b8b 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl_utils.py
@@ -21,5 +21,9 @@ def reset_data_directories(
     )
 
     # geojson
-    geojson_path = tribal_data_path / "geojson"
+    geojson_path = tribal_data_path / "geographic_data"
     remove_all_from_dir(geojson_path)
+
+    # tiles
+    tiles_path = tribal_data_path / "tiles"
+    remove_all_from_dir(tiles_path)
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index fd5ff5d7..8e5fe8fa 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -218,6 +218,7 @@ def score_folder_cleanup() -> None:
     remove_all_from_dir(data_path / "score" / "geojson")
     remove_all_from_dir(data_path / "score" / "tiles")
     remove_all_from_dir(data_path / "score" / "shapefile")
+    remove_all_from_dir(data_path / "score" / "search")
     downloadable_cleanup()
 
 
diff --git a/docker-compose.yml b/docker-compose.yml
index b3f109c7..8af8f990 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,3 @@
-version: "3.4"
 services:
   # The j40_data_pipeline service runs the ETL pipeline to create the score
   score:
@@ -15,6 +14,9 @@ services:
       ENV_FOR_DYNACONF: development
       PYTHONUNBUFFERED: 1
       TZ: America/Los_Angeles
+      # The argument (and options) for the scoring step you want to run. Replace "full-run" with "etl-run" or whichever step you need.
+      # To add arguments, follow this example: command: ["generate-map-tiles", "--arg1", "value1", "--arg2", "value2"]
+      command: ["full-run", "--help"]
 
   # The score_server serves the data-pipeline volume as a URL
   j40_score_server:
@@ -23,26 +25,21 @@ services:
     build: data/data-serve/.
     volumes:
       - ./data/data-pipeline/data_pipeline/data/score:/data/data-pipeline/data_pipeline/data/score
+      - ./data/data-pipeline/data_pipeline/data/tribal:/data/data-pipeline/data_pipeline/data/tribal
     ports:
       - 5000:8080
     environment:
       TZ: America/Los_Angeles
-
   #The j40_website service runs the web app / map / site
   j40_website:
     image: j40_website
     container_name: j40_website_1
-    build: ./client
+    build: client
    environment:
       # See the client readme for more info on environment variables:
       # https://github.com/usds/justice40-tool/blob/main/client/README.md
       DATA_SOURCE: local
-
-      # If you want the map to render a MapBox base map (as opposed to the
-      # open source one from CartoDB), please create your own API TOKEN from
-      # your MapBox account and add the token here:
-      MAPBOX_STYLES_READ_TOKEN: ""
       TZ: America/Los_Angeles
     volumes:
       - ./client/src:/client/src

From 7af92f575b99e4c43c481265ad3f2adb930290ec Mon Sep 17 00:00:00 2001
From: ericiwamoto <100735505+ericiwamoto@users.noreply.github.com>
Date: Thu, 26 Dec 2024 08:35:22 -0800
Subject: [PATCH 2/6] Add retry logic to downloader

---
 .linkspector.yml                                   |  4 ++++
 QUICKSTART.md                                      |  7 +++++--
 data/data-pipeline/data_pipeline/etl/downloader.py |  9 +++++++++
 data/data-pipeline/pyproject.toml                  |  1 +
 4 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 .linkspector.yml

diff --git a/.linkspector.yml b/.linkspector.yml
new file mode 100644
index 00000000..0202c8c1
--- /dev/null
+++ b/.linkspector.yml
@@ -0,0 +1,4 @@
+dirs:
+  - .
+ignorePatterns:
+  - pattern: '^http://localhost.*$'
\ No newline at end of file
diff --git a/QUICKSTART.md b/QUICKSTART.md
index 12cbc154..22f953ac 100644
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@@ -19,7 +19,10 @@ Use `docker compose` to run the application:
 ```sh
 $ docker compose up
 ```
+Docker Compose will spin up three containers: a data pipeline container, a data server, and a web server.
 
-> Note: This may take a while – possibly even an hour or two – since it has to build the containers and then download and process all the data.
+The data pipeline container can run the entire data pipeline or any individual step. By default, it simply displays the options for the full pipeline run. To have it actually run the pipeline, remove `, "--help"` from the `command` entry in the `docker-compose.yml` file before launch. Note that a full pipeline run can take an hour or more. Furthermore, the data container mounts your local repo directory to read and write files, so if you've previously run the pipeline manually on your local system, your score and map tile files will be overwritten.
 
-After it initializes, you should be able to open the application in your browser at [http://localhost:8000](http://localhost:8000).
+The data server makes the files created by the data pipeline container available to the web server.
+
+The web server runs the application website. After it initializes, you should be able to open the application in your browser at [http://localhost:8000](http://localhost:8000).
\ No newline at end of file
diff --git a/data/data-pipeline/data_pipeline/etl/downloader.py b/data/data-pipeline/data_pipeline/etl/downloader.py
index fd0fec50..4cc4f83e 100644
--- a/data/data-pipeline/data_pipeline/etl/downloader.py
+++ b/data/data-pipeline/data_pipeline/etl/downloader.py
@@ -7,6 +7,7 @@ import shutil
 from pathlib import Path
 from data_pipeline.config import settings
 from data_pipeline.utils import get_module_logger
+from tenacity import retry, stop_after_attempt, wait_exponential
 
 logger = get_module_logger(__name__)
 
@@ -15,6 +16,10 @@ class Downloader:
     """A simple class to encapsulate the download capabilities of the application"""
 
     @classmethod
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+    )
     def download_file_from_url(
         cls,
         file_url: str,
@@ -58,6 +63,10 @@ class Downloader:
         return download_file_name
 
     @classmethod
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+    )
     def download_zip_file_from_url(
         cls,
         file_url: str,
diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml
index f7122078..47b676dd 100644
--- a/data/data-pipeline/pyproject.toml
+++ b/data/data-pipeline/pyproject.toml
@@ -41,6 +41,7 @@ xlsxwriter = "^2.0.0"
 pydantic = "^1.9.0"
 Rtree = "^1.0.0"
 fiona = "~1.8.21"
+tenacity = ">=5.0.2"
 
 [tool.poetry.group.dev.dependencies]
 black = "^21"

From aa7d12b5708ed5d0d8a8dcd98ba553ba2bece423 Mon Sep 17 00:00:00 2001
From: Carlos Felix <63804190+carlosfelix2@users.noreply.github.com>
Date: Fri, 27 Dec 2024 11:19:38 -0500
Subject: [PATCH 3/6] Update to comparator tool to create output folder if it does not exist

---
 data/data-pipeline/data_pipeline/comparator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/data-pipeline/data_pipeline/comparator.py b/data/data-pipeline/data_pipeline/comparator.py
index 97e512c3..4d0cd47f 100644
--- a/data/data-pipeline/data_pipeline/comparator.py
+++ b/data/data-pipeline/data_pipeline/comparator.py
@@ -257,6 +257,7 @@ def _generate_delta(prod_df: pd.DataFrame, local_df: pd.DataFrame):
         f" There are {len(comparison_results_df.index):,} tracts with at least one difference.\n"
     )
 
+    WORKING_PATH.mkdir(parents=True, exist_ok=True)
     comparison_path = WORKING_PATH / "deltas.csv"
     comparison_results_df.to_csv(path_or_buf=comparison_path)
 

From f14eeab61d07cbb393cffc3bcbd44ffdb6bdbd2f Mon Sep 17 00:00:00 2001
From: Carlos Felix <63804190+carlosfelix2@users.noreply.github.com>
Date: Fri, 27 Dec 2024 11:20:17 -0500
Subject: [PATCH 4/6] Content updates as requested post launch

---
 .../__snapshots__/datasetContainer.test.tsx.snap |  8 ++-
 client/src/data/copy/explore.tsx                 |  4 +-
 client/src/data/copy/faqs.tsx                    |  4 +-
 client/src/data/copy/methodology.tsx             | 59 +++----------
 client/src/intl/en.json                          | 20 +++----
 client/src/intl/es.json                          | 14 ++---
 .../freqAskedQuestions.test.tsx.snap             |  4 +-
 .../__snapshots__/methodology.test.tsx.snap      | 14 +++--
 8 files changed, 45 insertions(+), 82 deletions(-)

diff --git a/client/src/components/DatasetContainer/tests/__snapshots__/datasetContainer.test.tsx.snap b/client/src/components/DatasetContainer/tests/__snapshots__/datasetContainer.test.tsx.snap
index ca1692bb..d22b249d 100644
--- a/client/src/components/DatasetContainer/tests/__snapshots__/datasetContainer.test.tsx.snap
+++ b/client/src/components/DatasetContainer/tests/__snapshots__/datasetContainer.test.tsx.snap
@@ -978,7 +978,7 @@ exports[`rendering of the DatasetContainer checks if various text fields are vis
       rel="noreferrer"
       target="_blank"
     >
-      The Trust for Public Lands
+      The Trust for Public Land
     and
-      Share of homes built before 1960, which indicates potential lead paint exposure. Tracts with extremely high home values (i.e. median home values above the 90th percentile) that are less likely to face health risks from lead paint exposure are not included.
+      Share of homes built before 1960, which indicates potential lead paint exposure. Tracts with extremely high home values (i.e., median home values above the 90th percentile) that are less likely to face health risks from lead paint exposure are not included.
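
For reviewers who want to exercise the retry behavior PATCH 2 introduces without running the pipeline, here is a minimal standalone sketch. It assumes `tenacity >= 5.0.2` (the floor pinned in `pyproject.toml`) and uses `requests` as a stand-in HTTP client; `fetch` is a hypothetical helper for illustration, not a function in the data pipeline.

```python
import requests
from tenacity import retry, stop_after_attempt, wait_exponential


# Same decorator arguments PATCH 2 applies to both Downloader methods:
# at most 3 attempts, with exponential backoff clamped between 4 and 10 seconds.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
def fetch(url: str) -> bytes:
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # surface HTTP errors so tenacity retries them
    return response.content


if __name__ == "__main__":
    # A transient failure is retried twice; only a third consecutive
    # failure propagates to the caller.
    print(len(fetch("https://example.com")))
```

Because the decorator sits on the download entry points, a flaky source URL no longer aborts an hours-long pipeline run on the first dropped connection.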