Merge branch 'main' into emma-nechamkin/release/score-narwhal

This commit is contained in:
matt bowen 2022-10-26 15:17:19 -04:00
commit e51af9d67e
12 changed files with 292 additions and 11 deletions

View file

@ -0,0 +1,105 @@
name: Create Score Version
on:
workflow_dispatch:
inputs:
score_version:
description: "Which version of the score are you generating?"
required: true
default: 'beta'
type: choice
options:
- beta
- 1.0
score_date:
description: "What is today's date, in YYYY-MM-DD format?"
required: true
type: string
env:
CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }}
J40_VERSION_LABEL_STRING: ${{ inputs.score_version }}
J40_VERSION_DATE_STRING: ${{ inputs.score_date }}
jobs:
generate-score-tiles:
runs-on: ubuntu-latest
defaults:
run:
working-directory: data/data-pipeline
strategy:
matrix:
python-version: [3.9]
steps:
- name: Checkout source
uses: actions/checkout@v2
- name: Print variables to help debug
uses: hmarr/debug-action@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Load cached Poetry installation
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: ~/.cache/pypoetry/virtualenvs
key: env-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/create-score-version.yml') }}
- name: Install poetry
uses: snok/install-poetry@v1
- name: Print Poetry settings
run: poetry show -v
- name: Install dependencies
run: poetry add s4cmd && poetry install
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Generate Score
run: |
poetry run python3 data_pipeline/application.py score-full-run
- name: Generate Score Post
run: |
poetry run python3 data_pipeline/application.py generate-score-post -s aws
- name: Deploy Score to Geoplatform AWS
run: |
poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-versions/${{J40_VERSION_LABEL_STRING}}/data/score/csv --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/files/ s3://justice40-data/data-versions/${{J40_VERSION_LABEL_STRING}}/data/score/downloadable --recursive --force --API-ACL=public-read
# - name: Install GDAL/ogr2ogr
# run: |
# sudo add-apt-repository ppa:ubuntugis/ppa
# sudo apt-get update
# sudo apt-get -y install gdal-bin
# ogrinfo --version
# - name: Set timezone for tippecanoe
# uses: szenius/set-timezone@v1.0
# with:
# timezoneLinux: "America/Los_Angeles"
# - name: Get tippecanoe
# run: |
# sudo apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
# sudo apt-add-repository -y ppa:git-core/ppa
# sudo mkdir -p /tmp/tippecanoe-src
# sudo git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
# - name: Make tippecanoe
# working-directory: /tmp/tippecanoe-src
# run: |
# sudo /usr/bin/bash -c make
# mkdir -p /usr/local/bin
# cp tippecanoe /usr/local/bin/tippecanoe
# tippecanoe -v
# - name: Generate Score Geo
# run: |
# poetry run python3 data_pipeline/application.py geo-score
# - name: Generate Tiles
# run: |
# poetry run python3 data_pipeline/application.py generate-map-tiles
# - name: Deploy Map to Geoplatform AWS
# run: |
# poetry run s4cmd put ./data_pipeline/data/score/geojson/ s3://justice40-data/data-versions/${{J40_VERSION_LABEL_STRING}}/data/score/geojson --recursive --force --API-ACL=public-read --num-threads=250
# poetry run s4cmd put ./data_pipeline/data/score/shapefile/ s3://justice40-data/data-versions/${{J40_VERSION_LABEL_STRING}}/${{env.SHA_NUMBER}}/data/score/shapefile --recursive --force --API-ACL=public-read
# poetry run s4cmd put ./data_pipeline/data/score/tiles/ s3://justice40-data/data-versions/${{J40_VERSION_LABEL_STRING}}/data/score/tiles --recursive --force --API-ACL=public-read --num-threads=250
# poetry run s4cmd put ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-versions/${{J40_VERSION_LABEL_STRING}}/data/score/downloadable --recursive --force --API-ACL=public-read

View file

@ -71,7 +71,6 @@ jobs:
- name: Deploy Score to Geoplatform AWS
run: |
poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/files/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
- name: Update PR with deployed Score URLs
@ -111,6 +110,7 @@ jobs:
poetry run s4cmd put ./data_pipeline/data/score/geojson/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/geojson --recursive --force --API-ACL=public-read --num-threads=250
poetry run s4cmd put ./data_pipeline/data/score/shapefile/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/shapefile --recursive --force --API-ACL=public-read
poetry run s4cmd put ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/tiles --recursive --force --API-ACL=public-read --num-threads=250
poetry run s4cmd put ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/downloadable --recursive --force --API-ACL=public-read
- name: Update PR with deployed Map URL
uses: mshick/add-pr-comment@v1
with:

View file

@ -15,7 +15,7 @@ GATSBY_FILE_DL_PATH_TSD_PDF=downloadable/cejst_technical_support_document.pdf
GATSBY_FILE_DL_PATH_TSD_ES_PDF=downloadable/cejst-technical-support-document-es.pdf
GATSBY_FILE_DL_PATH_COMMUNITIES_LIST_XLS=downloadable/communities-2022-05-12-1914GMT.xlsx
GATSBY_FILE_DL_PATH_COMMUNITIES_LIST_CSV=downloadable/communities-2022-05-12-1914GMT.csv
GATSBY_FILE_DL_PATH_HOW_TO_COMMUNITIES_PDF=downloadable/draft_communities_list.pdf
GATSBY_FILE_DL_PATH_HOW_TO_COMMUNITIES_PDF=downloadable/draft-communities-list.pdf
GATSBY_MAP_TILES_PATH=tiles

View file

@ -13,6 +13,6 @@ GATSBY_FILE_DL_PATH_TSD_PDF=downloadable/cejst_technical_support_document.pdf
GATSBY_FILE_DL_PATH_TSD_ES_PDF=downloadable/cejst-technical-support-document-es.pdf
GATSBY_FILE_DL_PATH_COMMUNITIES_LIST_XLS=downloadable/communities-2022-05-31-1915GMT.xlsx
GATSBY_FILE_DL_PATH_COMMUNITIES_LIST_CSV=downloadable/communities-2022-05-31-1915GMT.csv
GATSBY_FILE_DL_PATH_HOW_TO_COMMUNITIES_PDF=downloadable/Draft_Communities_List.pdf
GATSBY_FILE_DL_PATH_HOW_TO_COMMUNITIES_PDF=downloadable/draft-communities-list.pdf
GATSBY_MAP_TILES_PATH=tiles

View file

@ -59,8 +59,10 @@ else:
version_str = os.environ.get("J40_VERSION_LABEL_STRING")
SCORE_DOWNLOADABLE_DIR = DATA_SCORE_DIR / "downloadable"
SCORE_DOWNLOADABLE_PDF_FILE_NAME = "draft_communities_list.pdf"
SCORE_DOWNLOADABLE_PDF_FILE_NAME = "draft-communities-list.pdf"
SCORE_DOWNLOADABLE_PDF_FILE_PATH = FILES_PATH / SCORE_DOWNLOADABLE_PDF_FILE_NAME
SCORE_DOWNLOADABLE_TSD_FILE_NAME = "cejst-technical-support-document.pdf"
SCORE_DOWNLOADABLE_TSD_FILE_PATH = FILES_PATH / SCORE_DOWNLOADABLE_TSD_FILE_NAME
SCORE_DOWNLOADABLE_CSV_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR / f"{version_str}-communities-{timestamp_str}.csv"
)
@ -78,6 +80,18 @@ SCORE_DOWNLOADABLE_XLS_ZIP_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR
/ f"{version_str}-communities-xls-{timestamp_str}.zip"
)
SCORE_VERSIONING_DATA_DOCUMENTATION_ZIP_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR
/ f"{version_str}-data-documentation-{timestamp_str}.zip"
)
SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR
/ f"{version_str}-shapefile-codebook-{timestamp_str}.zip"
)
SCORE_VERSIONING_README_FILE_NAME = f"README-version-{version_str}.md"
SCORE_VERSIONING_README_FILE_PATH = (
FILES_PATH / SCORE_VERSIONING_README_FILE_NAME
)
# For the codebook
CEJST_SCORE_COLUMN_NAME = "score_name"

View file

@ -378,6 +378,37 @@ class GeoScoreETL(ExtractTransformLoad):
zip_files(arcgis_zip_file_path, arcgis_files)
logger.info("Completed zipping shapefiles")
# Per #1557:
# Zip file that contains the shapefiles, codebook and checksum file.
# Normally we get the codebook file path using this constant:
# - codebook_path = constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH
# However since we generate it on a separate script (etl_score_post)
# the time stamp can be generated again, and thus the file is not found.
# So we grab it from the downloadable dir and if we don't find it, it
# means we haven't run etl_score_post, and continue
logger.info("Getting codebook from downloadable dir")
codebook_path = None
for file in os.listdir(constants.SCORE_DOWNLOADABLE_DIR):
if "codebook" in file:
codebook_path = constants.SCORE_DOWNLOADABLE_DIR / file
if codebook_path:
version_shapefile_codebook_zip_path = (
constants.SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH
)
readme_path = constants.SCORE_VERSIONING_README_FILE_PATH
logger.info("Compressing shapefile and codebook files")
files_to_compress = [
arcgis_zip_file_path,
codebook_path,
readme_path,
]
zip_files(
version_shapefile_codebook_zip_path, files_to_compress
)
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(task)

View file

@ -486,8 +486,18 @@ class PostScoreETL(ExtractTransformLoad):
csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
codebook_path = constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH
readme_path = constants.SCORE_VERSIONING_README_FILE_PATH
csv_zip_path = constants.SCORE_DOWNLOADABLE_CSV_ZIP_FILE_PATH
xls_zip_path = constants.SCORE_DOWNLOADABLE_XLS_ZIP_FILE_PATH
score_downloadable_pdf_file_path = (
constants.SCORE_DOWNLOADABLE_PDF_FILE_PATH
)
score_downloadable_tsd_file_path = (
constants.SCORE_DOWNLOADABLE_TSD_FILE_PATH
)
version_data_documentation_zip_path = (
constants.SCORE_VERSIONING_DATA_DOCUMENTATION_ZIP_FILE_PATH
)
logger.info("Writing downloadable excel")
excel_config = self._load_excel_from_df(
@ -552,19 +562,26 @@ class PostScoreETL(ExtractTransformLoad):
# load codebook to disk
codebook_df.to_csv(codebook_path, index=False)
# zip assets
logger.info("Compressing csv files")
files_to_compress = [
csv_path,
codebook_path,
]
files_to_compress = [csv_path, codebook_path, readme_path]
zip_files(csv_zip_path, files_to_compress)
logger.info("Compressing xls files")
files_to_compress = [excel_path, codebook_path, readme_path]
zip_files(xls_zip_path, files_to_compress)
# Per #1557
# zip file that contains the .xls, .csv, .pdf, tech support document, checksum file
logger.info("Compressing data and documentation files")
files_to_compress = [
excel_path,
codebook_path,
csv_path,
score_downloadable_pdf_file_path,
score_downloadable_tsd_file_path,
readme_path,
]
zip_files(xls_zip_path, files_to_compress)
zip_files(version_data_documentation_zip_path, files_to_compress)
def load(self) -> None:
self._load_score_csv_full(

View file

@ -74,7 +74,7 @@ def score_data_initial(sample_data_dir):
@pytest.fixture()
def score_pdf_initial(sample_data_dir):
return sample_data_dir / "draft_communities_list.pdf"
return sample_data_dir / "draft-communities-list.pdf"
@pytest.fixture()

View file

@ -133,6 +133,16 @@ def test_load_downloadable_zip(etl, monkeypatch, score_data_expected):
"SCORE_DOWNLOADABLE_PDF_FILE_PATH",
static_files_path / constants.SCORE_DOWNLOADABLE_PDF_FILE_NAME,
)
monkeypatch.setattr(
constants,
"SCORE_VERSIONING_README_FILE_PATH",
static_files_path / constants.SCORE_VERSIONING_README_FILE_NAME,
)
monkeypatch.setattr(
constants,
"SCORE_DOWNLOADABLE_TSD_FILE_PATH",
static_files_path / constants.SCORE_DOWNLOADABLE_TSD_FILE_NAME,
)
etl.output_score_county_state_merged_df = score_data_expected
etl._load_downloadable_zip(constants.SCORE_DOWNLOADABLE_DIR)
assert constants.SCORE_DOWNLOADABLE_DIR.is_dir()

View file

@ -0,0 +1,104 @@
Draft, Deliberative and Pre-Decisional
**Release Notes for the Climate and Economic Justice Screening Tool (CEJST) version 1.0**
**Changes between version 1.0 and the beta version**
**Release update - DATE**
- Added Federally Recognized Tribal Lands and Alaska Native Villages data from the Bureau of Indian Affairs at the Department of the Interior to the tool and map
- Added new data for indicators of burden
- Climate change
- Projected flood risk
- Projected wildfire risk
- Housing
- Lack of plumbing
- Lack of green space
- Historic underinvestment (due to redlining)
- Legacy pollution
- Abandoned mine lands
- Formerly used defense sites
- Transportation
- Transportation barriers
- Water
- Leaking underground storage tanks
- Added an adjacency indicator to methodology to include low income communities that had been completely surrounded by other disadvantaged communities, but which had just missed the thresholds in the beta version.
- Made technical changes to enhance accuracy of the tool
- Removed income data for students enrolled in higher education in the low income indicator
- Imputed income for census tracts missing that data that have a population greater than zero
- Used transportation barriers and population loss data indicators only for census tracts with populations of 20 or more
- Improved the user interface
- Added demographics (race & age) for census tracts to map side panel
- Improved the design of the map side panel
- Updated the site copy of the website and associated Spanish translations
- Added new data for all the U.S. Territories, and also made a small methodology change for Puerto Rico
- USVI
- Included data from 2010 decennial census for US Virgin Islands
- New data from EJScreen 2.1
- Sustainable Housing:
- Lead paint
- Legacy pollution:
- Proximity to Superfund (National Priorities List (NPL)) sites
- Proximity to Risk Management Plan (RMP) facilities
- Proximity to hazardous waste facilities
- Leaking underground storage tanks
- Guam
- Included data from 2010 decennial census for Guam
- New data from EJScreen 2.1
- Sustainable Housing:
- Lead paint
- Legacy pollution:
- Proximity to Superfund (National Priorities List (NPL)) sites
- Proximity to Risk Management Plan (RMP) facilities
- Proximity to hazardous waste facilities
- Leaking underground storage tanks
- American Samoa
- New data from EJScreen 2.1
- Sustainable Housing:
- Lead paint
- Legacy pollution:
- Proximity to Risk Management Plan (RMP) facilities
- Leaking underground storage tanks
- Mariana Islands
- New data from EJScreen 2.1
- Sustainable Housing:
- Lead paint
- Legacy pollution:
- Leaking underground storage tanks
- Puerto Rico
- Removed linguistic isolation as an indicator in the methodology for Puerto Rico
- Imported additional available data for Puerto Rico
- Energy cost
- Housing cost
- Abandoned mine lands
- Proximity to hazardous waste sites
- Proximity to Superfund (National Priorities List (NPL)) sites
- Proximity to Risk Management Plan (RMP) sites
- Updated data from EJScreen 2.1 across the entire tool:
- Sustainable Housing:
- Lead paint - 2016-2020
- Legacy pollution:
- Proximity to Superfund (National Priorities List (NPL)) sites - 2022
- Proximity to Risk Management Plans (RMP) facilities - 2022
- Proximity to hazardous waste facilities - 2022
- Water and wastewater
- Leaking underground storage tanks - 2022
- Enhanced the technical files:
- Added all new data indicators and demographics to .xls, .csv, and shapefiles
- Added versions page to house beta version of download files
- Updated codebook
- Updated Technical Support Document
- Improved the way that users provide feedback on the tool:
- Data survey
- Site experience survey
- Census tract feedback
- General contact form
- Added a link to sign up for email mailing list managed by CEQ
- Fixes:
- Bug fix: loaded missing life expectancy data for Maine and Wisconsin
- Bug fix: Census tracts that have 0 land should not be included on the map