From 773c0354936f3d9a80b3c0ea620ac0cb20f728a2 Mon Sep 17 00:00:00 2001 From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Date: Thu, 12 Aug 2021 14:17:25 -0400 Subject: [PATCH] AWS Sync Public Read (#508) * adding layer to mvts * small fix for GHA * AWS Sync Public Read * removed temp file * updated state median income ftp --- .github/workflows/build_deploy.yml | 4 +- .github/workflows/generate-score.yml | 6 +-- data/data-pipeline/README.md | 2 +- .../2014_to_2019_state_median_income.csv | 53 ------------------- .../etl/sources/census_acs/etl.py | 43 +++++++++++---- 5 files changed, 38 insertions(+), 70 deletions(-) delete mode 100644 data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv diff --git a/.github/workflows/build_deploy.yml b/.github/workflows/build_deploy.yml index 0c72e67b..987fc425 100644 --- a/.github/workflows/build_deploy.yml +++ b/.github/workflows/build_deploy.yml @@ -96,7 +96,7 @@ jobs: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - name: Deploy to Geoplatform AWS - run: aws s3 sync ./public/ s3://usds-geoplatform-justice40-website/justice40-tool/${{env.DESTINATION_FOLDER}} --delete + run: aws s3 sync ./public/ s3://usds-geoplatform-justice40-website/justice40-tool/${{env.DESTINATION_FOLDER}} --acl public-read --delete - name: Update PR with deployed URL uses: mshick/add-pr-comment@v1 if: github.event_name == 'pull_request' && github.event.action == 'opened' || github.event_name == 'push' # Only comment if the PR has been opened or a push has updated it @@ -111,4 +111,4 @@ jobs: run: | echo "Github pages: https://usds.github.io/justice40-tool/$DESTINATION_FOLDER/en" echo "Standard S3 bucket version (http only) : http://usds-geoplatform-justice40-website.s3-website-us-east-1.amazonaws.com/justice40-tool/$DESTINATION_FOLDER/en" - echo "Cloudfront https: https://d2zjid6n5ja2pt.cloudfront.net/justice40-tool/$DESTINATION_FOLDER/en" \ No newline at end
of file + echo "Cloudfront https: https://d2zjid6n5ja2pt.cloudfront.net/justice40-tool/$DESTINATION_FOLDER/en" diff --git a/.github/workflows/generate-score.yml b/.github/workflows/generate-score.yml index eaa67ed2..14055122 100644 --- a/.github/workflows/generate-score.yml +++ b/.github/workflows/generate-score.yml @@ -49,9 +49,9 @@ jobs: aws-region: us-east-1 - name: Deploy to Geoplatform AWS run: | - aws s3 sync ./data_pipeline/data/dataset/ s3://justice40-data/data-pipeline/data/dataset --delete - aws s3 sync ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline/data/score/csv --delete - aws s3 sync ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline/data/score/downloadable --delete + aws s3 sync ./data_pipeline/data/dataset/ s3://justice40-data/data-pipeline/data/dataset --acl public-read --delete + aws s3 sync ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline/data/score/csv --acl public-read --delete + aws s3 sync ./data_pipeline/data/score/downloadable/ s3://justice40-data/data-pipeline/data/score/downloadable --acl public-read --delete - name: Update PR with Comment about deployment uses: mshick/add-pr-comment@v1 with: diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index caa8865e..05a4d06e 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -193,7 +193,7 @@ If you want to run tile generation, please install TippeCanoe [following these i - Start a terminal - Change to the package directory (i.e. 
`cd data/data-pipeline/data_pipeline`) - Then run `poetry run generate_tiles` -- If you have S3 keys, you can sync to the dev repo by doing `aws s3 sync ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline/data/score/tiles --delete` +- If you have S3 keys, you can sync to the dev repo by doing `aws s3 sync ./data_pipeline/data/score/tiles/ s3://justice40-data/data-pipeline/data/score/tiles --acl public-read --delete` ### Serve the map locally diff --git a/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv b/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv deleted file mode 100644 index 6f0cea70..00000000 --- a/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv +++ /dev/null @@ -1,53 +0,0 @@ -GEOID2,Median household income (State) -01,50536 -02,77640 -04,58945 -05,47597 -06,75235 -08,72331 -09,78444 -10,68287 -11,86420 -12,55660 -13,58700 -15,81275 -16,55785 -17,65886 -18,56303 -19,60523 -20,59597 -21,50589 -22,49469 -23,57918 -24,84805 -25,81215 -26,57144 -27,71306 -28,45081 -29,55461 -30,54970 -31,61439 -32,60365 -33,76768 -34,82545 -35,49754 -36,68486 -37,54602 -38,64894 -39,56602 -40,52919 -41,62818 -42,61744 -44,67167 -45,53199 -46,58275 -47,53320 -48,61874 -49,71621 -50,61973 -51,74222 -53,73775 -54,46711 -55,61747 -56,64049 -72,20539 diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index 103e4572..a948bd12 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -4,6 +4,7 @@ import censusdata from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import get_module_logger +from data_pipeline.config import settings logger = 
get_module_logger(__name__) @@ -11,10 +12,14 @@ logger = get_module_logger(__name__) class CensusACSETL(ExtractTransformLoad): def __init__(self): self.ACS_YEAR = 2019 - self.OUTPUT_PATH = self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}" + self.OUTPUT_PATH = ( + self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}" + ) self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)" self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)" - self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = "Linguistic isolation (total)" + self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = ( + "Linguistic isolation (total)" + ) self.LINGUISTIC_ISOLATION_FIELDS = [ "C16002_001E", "C16002_004E", @@ -23,7 +28,9 @@ class CensusACSETL(ExtractTransformLoad): "C16002_013E", ] self.MEDIAN_INCOME_FIELD = "B19013_001E" - self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months" + self.MEDIAN_INCOME_FIELD_NAME = ( + "Median household income in the past 12 months" + ) self.MEDIAN_INCOME_STATE_FIELD_NAME = "Median household income (State)" self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = ( "Median household income (% of state median household income)" @@ -32,22 +39,32 @@ class CensusACSETL(ExtractTransformLoad): self.df: pd.DataFrame self.state_median_income_df: pd.DataFrame - # TODO: refactor this to put this file on s3 and download it from there + self.STATE_MEDIAN_INCOME_FTP_URL = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/2014_to_2019_state_median_income.zip" + ) self.STATE_MEDIAN_INCOME_FILE_PATH = ( - self.DATA_PATH - / "needs_to_be_moved_to_s3" - / "2014_to_2019_state_median_income.csv" + self.TMP_PATH / "2014_to_2019_state_median_income.csv" ) - def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str: + def _fips_from_censusdata_censusgeo( + self, censusgeo: censusdata.censusgeo + ) -> str: """Create a FIPS code from the proprietary censusgeo index.""" fips = "".join([value for (key, value) in censusgeo.params()]) 
return fips def extract(self) -> None: + # Extract state median income + super().extract( + self.STATE_MEDIAN_INCOME_FTP_URL, + self.TMP_PATH, + ) dfs = [] for fips in get_state_fips_codes(self.DATA_PATH): - logger.info(f"Downloading data for state/territory with FIPS code {fips}") + logger.info( + f"Downloading data for state/territory with FIPS code {fips}" + ) dfs.append( censusdata.download( @@ -82,7 +99,9 @@ class CensusACSETL(ExtractTransformLoad): logger.info("Starting Census ACS Transform") # Rename median income - self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD] + self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[ + self.MEDIAN_INCOME_FIELD + ] # TODO: handle null values for CBG median income, which are `-666666666`. @@ -104,7 +123,9 @@ class CensusACSETL(ExtractTransformLoad): # Calculate percent unemployment. # TODO: remove small-sample data that should be `None` instead of a high-variance fraction. - self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E + self.df[self.UNEMPLOYED_FIELD_NAME] = ( + self.df.B23025_005E / self.df.B23025_003E + ) # Calculate linguistic isolation. individual_limited_english_fields = [