Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-07-29 19:21:16 -07:00)
AWS Sync Public Read (#508)
* adding layer to mvts
* small fix for GHA
* AWS Sync Public Read
* removed temp file
* updated state median income ftp
parent 1c5d5de82b · commit 773c035493
5 changed files with 38 additions and 70 deletions
.github/workflows/build_deploy.yml (vendored, 4 changes)

```diff
@@ -96,7 +96,7 @@ jobs:
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: us-east-1
       - name: Deploy to Geoplatform AWS
-        run: aws s3 sync ./public/ s3://usds-geoplatform-justice40-website/justice40-tool/${{env.DESTINATION_FOLDER}} --delete
+        run: aws s3 sync ./public/ s3://usds-geoplatform-justice40-website/justice40-tool/${{env.DESTINATION_FOLDER}} --acl public-read --delete
       - name: Update PR with deployed URL
         uses: mshick/add-pr-comment@v1
         if: github.event_name == 'pull_request' && github.event.action == 'opened' || github.event_name == 'push' # Only comment if the PR has been opened or a push has updated it
@@ -111,4 +111,4 @@ jobs:
         run: |
           echo "Github pages: https://usds.github.io/justice40-tool/$DESTINATION_FOLDER/en"
           echo "Standard S3 bucket version (http only) : http://usds-geoplatform-justice40-website.s3-website-us-east-1.amazonaws.com/justice40-tool/$DESTINATION_FOLDER/en"
           echo "Cloudfront https: https://d2zjid6n5ja2pt.cloudfront.net/justice40-tool/$DESTINATION_FOLDER/en"
```
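The only functional change here is the `--acl public-read` flag on the sync: each uploaded object now carries the canned public-read ACL, which is what lets the S3 website and CloudFront endpoints echoed at the end of the workflow serve the files to anonymous visitors. A minimal sketch of the deploy step plus a spot check; the `main` folder value and the `index.html` key are illustrative, since the real `DESTINATION_FOLDER` is computed elsewhere in the workflow:

```sh
# Mirror of the workflow's deploy step: sync and attach the canned
# public-read ACL to every uploaded object.
DESTINATION_FOLDER=main   # illustrative; the workflow sets this value itself
aws s3 sync ./public/ "s3://usds-geoplatform-justice40-website/justice40-tool/$DESTINATION_FOLDER" \
  --acl public-read --delete

# Spot-check one object's ACL; a READ grant to the AllUsers group means public.
aws s3api get-object-acl \
  --bucket usds-geoplatform-justice40-website \
  --key "justice40-tool/$DESTINATION_FOLDER/index.html"
```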
.github/workflows/generate-score.yml (vendored, 6 changes)

```diff
@@ -49,9 +49,9 @@ jobs:
           aws-region: us-east-1
       - name: Deploy to Geoplatform AWS
         run: |
-          aws s3 sync ./data_pipeline/data/dataset/ s3://justice40-data/data-pipeline/data/dataset --delete
-          aws s3 sync ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline/data/score/csv --delete
-          aws s3 sync ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline/data/score/downloadable --delete
+          aws s3 sync ./data_pipeline/data/dataset/ s3://justice40-data/data-pipeline/data/dataset --acl public-read --delete
+          aws s3 sync ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline/data/score/csv --acl public-read --delete
+          aws s3 sync ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline/data/score/downloadable --acl public-read --delete
       - name: Update PR with Comment about deployment
         uses: mshick/add-pr-comment@v1
         with:
```
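The same flag is added to all three dataset syncs, so the generated score data becomes world-readable as well. One way to verify is an anonymous HTTPS request against the bucket, which should return 200 rather than 403 once the ACLs are in place (the object key below is illustrative, not taken from this commit):

```sh
# HEAD request with no credentials; HTTP 200 confirms the object is
# publicly readable, 403 means the public-read ACL did not take effect.
curl -I "https://justice40-data.s3.amazonaws.com/data-pipeline/data/score/csv/usa.csv"
```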
README (tile-generation instructions)

```diff
@@ -193,7 +193,7 @@ If you want to run tile generation, please install TippeCanoe [following these i
 - Start a terminal
 - Change to the package directory (i.e. `cd data/data-pipeline/data_pipeline`)
 - Then run `poetry run generate_tiles`
-- If you have S3 keys, you can sync to the dev repo by doing `aws s3 sync ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline/data/score/tiles --delete`
+- If you have S3 keys, you can sync to the dev repo by doing `aws s3 sync ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline/data/score/tiles --acl public-read --delete`
 
 ### Serve the map locally
 
```
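Collected into one runnable sequence, the README's tile steps look like this. A sketch assuming TippeCanoe, Poetry, and AWS credentials are already configured; it also assumes the sync path is relative to the `data/data-pipeline` project root rather than the inner package directory, which is how the README appears to write it:

```sh
cd data/data-pipeline/data_pipeline
poetry run generate_tiles

# Back up to the project root, since the sync path below starts at
# ./data_pipeline/ (an assumption based on how the README writes the path).
cd ..
aws s3 sync ./data_pipeline/data/score/tiles/ \
  s3://justice40-data/data-pipeline/data/score/tiles \
  --acl public-read --delete
```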
2014_to_2019_state_median_income.csv (file deleted; previously checked in under needs_to_be_moved_to_s3, per the ETL diff below)

```diff
@@ -1,53 +0,0 @@
-GEOID2,Median household income (State)
-01,50536
-02,77640
-04,58945
-05,47597
-06,75235
-08,72331
-09,78444
-10,68287
-11,86420
-12,55660
-13,58700
-15,81275
-16,55785
-17,65886
-18,56303
-19,60523
-20,59597
-21,50589
-22,49469
-23,57918
-24,84805
-25,81215
-26,57144
-27,71306
-28,45081
-29,55461
-30,54970
-31,61439
-32,60365
-33,76768
-34,82545
-35,49754
-36,68486
-37,54602
-38,64894
-39,56602
-40,52919
-41,62818
-42,61744
-44,67167
-45,53199
-46,58275
-47,53320
-48,61874
-49,71621
-50,61973
-51,74222
-53,73775
-54,46711
-55,61747
-56,64049
-72,20539
```
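This is the "temp file" the commit message removes: the state-level median income table no longer ships in the repo. Per the ETL change below, the pipeline now downloads it at extract time from `settings.AWS_JUSTICE40_DATASOURCES_URL` and reads the CSV out of its temp directory. A rough manual equivalent, assuming `$DATASOURCES_URL` stands in for that configured base URL (its value is not shown on this page) and that the zip contains the CSV at its top level:

```sh
curl -L -o /tmp/2014_to_2019_state_median_income.zip \
  "$DATASOURCES_URL/2014_to_2019_state_median_income.zip"
unzip -o /tmp/2014_to_2019_state_median_income.zip -d /tmp

# First line should be the header the deleted file had:
# GEOID2,Median household income (State)
head -n 3 /tmp/2014_to_2019_state_median_income.csv
```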
Census ACS ETL (Python, `CensusACSETL`)

```diff
@@ -4,6 +4,7 @@ import censusdata
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
+from data_pipeline.config import settings
 
 logger = get_module_logger(__name__)
 
@@ -11,10 +12,14 @@ logger = get_module_logger(__name__)
 class CensusACSETL(ExtractTransformLoad):
     def __init__(self):
         self.ACS_YEAR = 2019
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
+        self.OUTPUT_PATH = (
+            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
+        )
         self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
         self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
-        self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)"
+        self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = (
+            "Linguistic isolation (total)"
+        )
         self.LINGUISTIC_ISOLATION_FIELDS = [
             "C16002_001E",
             "C16002_004E",
@@ -23,7 +28,9 @@ class CensusACSETL(ExtractTransformLoad):
             "C16002_013E",
         ]
         self.MEDIAN_INCOME_FIELD = "B19013_001E"
-        self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months"
+        self.MEDIAN_INCOME_FIELD_NAME = (
+            "Median household income in the past 12 months"
+        )
         self.MEDIAN_INCOME_STATE_FIELD_NAME = "Median household income (State)"
         self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
             "Median household income (% of state median household income)"
@@ -32,22 +39,32 @@ class CensusACSETL(ExtractTransformLoad):
         self.df: pd.DataFrame
         self.state_median_income_df: pd.DataFrame
 
-        # TODO: refactor this to put this file on s3 and download it from there
+        self.STATE_MEDIAN_INCOME_FTP_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/2014_to_2019_state_median_income.zip"
+        )
         self.STATE_MEDIAN_INCOME_FILE_PATH = (
-            self.DATA_PATH
-            / "needs_to_be_moved_to_s3"
-            / "2014_to_2019_state_median_income.csv"
+            self.TMP_PATH / "2014_to_2019_state_median_income.csv"
         )
 
-    def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
+    def _fips_from_censusdata_censusgeo(
+        self, censusgeo: censusdata.censusgeo
+    ) -> str:
         """Create a FIPS code from the proprietary censusgeo index."""
         fips = "".join([value for (key, value) in censusgeo.params()])
         return fips
 
     def extract(self) -> None:
+        # Extract state median income
+        super().extract(
+            self.STATE_MEDIAN_INCOME_FTP_URL,
+            self.TMP_PATH,
+        )
         dfs = []
         for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.info(f"Downloading data for state/territory with FIPS code {fips}")
+            logger.info(
+                f"Downloading data for state/territory with FIPS code {fips}"
+            )
 
             dfs.append(
                 censusdata.download(
@@ -82,7 +99,9 @@ class CensusACSETL(ExtractTransformLoad):
         logger.info("Starting Census ACS Transform")
 
         # Rename median income
-        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD]
+        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
+            self.MEDIAN_INCOME_FIELD
+        ]
 
         # TODO: handle null values for CBG median income, which are `-666666666`.
 
@@ -104,7 +123,9 @@ class CensusACSETL(ExtractTransformLoad):
 
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
-        self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
+        self.df[self.UNEMPLOYED_FIELD_NAME] = (
+            self.df.B23025_005E / self.df.B23025_003E
+        )
 
         # Calculate linguistic isolation.
         individual_limited_english_fields = [
```