S3 Parallel Upload and Deletions (#1410)

* installation step

* trigger action

* installing to home dir

* dry-run

* pyenv

* py 2.8

* trying s4cmd

* removing pyenv

* poetry s4cmd

* num-threads (see the s4cmd sketch below)

* public read

* poetry cache

* s4cmd all around

* poetry cache

* poetry cache

* install poetry packages

* poetry echo

* let's do this

* s4cmd install on run

* s4cmd

* ad aws back

* add aws back

* testing census api key and poetry caching

* census api key

* census api

* census api key #3

* 250

* poetry update

* poetry change

* check census api key

* force flag

* update score gen and tilefy; remove cached fips

* small gdal update

* invalidation

* missing cache ids
Commit 7b05ee9c76 by Jorge Escobar, 2022-03-17 23:19:23 -04:00, committed by GitHub.
8 changed files with 307 additions and 197 deletions
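
The s4cmd and num-threads commits above are what give this PR its name: score and tile artifacts are pushed to, and pruned from, S3 in parallel rather than one object at a time. The exact invocation lives in the CI workflow and is not part of the diff below, but a minimal sketch of the idea, driven from Python with placeholder paths, bucket prefix, and thread count, could look like this:

```python
import subprocess

# Minimal sketch only: the local directory, S3 prefix, and thread count are
# placeholders, not the values this PR uses; the real call sits in the workflow.
def push_directory_to_s3(local_dir: str, s3_prefix: str, threads: int = 40) -> None:
    """Recursively upload a directory to S3 in parallel with s4cmd."""
    subprocess.run(
        ["s4cmd", "put", "--recursive", f"--num-threads={threads}", local_dir, s3_prefix],
        check=True,
    )

def delete_s3_prefix(s3_prefix: str, threads: int = 40) -> None:
    """Delete every object under an S3 prefix, also using parallel threads."""
    subprocess.run(
        ["s4cmd", "del", "--recursive", f"--num-threads={threads}", s3_prefix],
        check=True,
    )
```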

View file

@@ -72,7 +72,7 @@ class ExtractTransformLoad:
     # Eleven digits in a census tract ID.
     EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH: int = 11
-    # TODO: investigate. Census says there are only 74,134 tracts in the US,
+    # TODO: investigate. Census says there are only 74,134 tracts in the United States,
     # Puerto Rico, and island areas. This might be from tracts at different time
     # periods. https://github.com/usds/justice40-tool/issues/964
     EXPECTED_MAX_CENSUS_TRACTS: int = 74160
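
The two constants in this hunk are sanity bounds on census tract data. As a hypothetical illustration of the kind of check they support (not the repository's actual validation code; the DataFrame and column name are placeholders):

```python
import pandas as pd

# Hypothetical illustration only: the DataFrame and column name are placeholders,
# and the default bounds mirror the two class constants shown above.
def validate_tract_ids(
    df: pd.DataFrame,
    tract_id_column: str,
    expected_length: int = 11,  # EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH
    expected_max_tracts: int = 74160,  # EXPECTED_MAX_CENSUS_TRACTS
) -> None:
    """Raise if tract IDs are malformed or there are more tracts than expected."""
    if not (df[tract_id_column].str.len() == expected_length).all():
        raise ValueError("Every census tract ID should be exactly 11 characters long")
    if df[tract_id_column].nunique() > expected_max_tracts:
        raise ValueError("More unique census tracts than expected for the US and territories")
```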

View file

@@ -41,14 +41,12 @@ def get_state_fips_codes(data_path: Path) -> list:
     """Returns a list with state data"""
     fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
-    # check if file exists
-    if not os.path.isfile(fips_csv_path):
-        logger.info("Downloading fips from S3 repository")
-        unzip_file_from_url(
-            settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
-            data_path / "tmp",
-            data_path / "census" / "csv",
-        )
+    logger.info("Downloading fips from S3 repository")
+    unzip_file_from_url(
+        settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
+        data_path / "tmp",
+        data_path / "census" / "csv",
+    )
     fips_state_list = []
     with open(fips_csv_path, encoding="utf-8") as csv_file:
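
With the file-exists check removed (the "remove cached fips" commit above), the FIPS zip is re-downloaded from S3 on every run instead of being read from a possibly stale local copy. A hypothetical call site for the function shown in this hunk, with a placeholder data path, would simply be:

```python
from pathlib import Path

# Placeholder path; the pipeline passes its own configured data directory.
fips_codes = get_state_fips_codes(Path("data_pipeline/data"))
for fips in fips_codes:
    print(f"Will process state/territory with FIPS code {fips}")
```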

View file

@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from typing import List
 import censusdata
@@ -33,8 +34,11 @@ def retrieve_census_acs_data(
                 f"Skipping download for state/territory with FIPS code {fips}"
             )
         else:
+            census_api_key = ""
+            if os.environ.get("CENSUS_API_KEY"):
+                census_api_key = "with API key"
             logger.info(
-                f"Downloading data for state/territory with FIPS code {fips}"
+                f"Downloading data for state/territory with FIPS code {fips} {census_api_key}"
             )
             try:
@@ -45,6 +49,7 @@
                     [("state", fips), ("county", "*"), ("tract", "*")]
                 ),
                 var=variables,
+                key=os.environ.get("CENSUS_API_KEY"),
             )
             dfs.append(response)
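
The hunks above wire an optional Census API key through to censusdata.download: when CENSUS_API_KEY is set in the environment it is forwarded as the key argument (and mentioned in the log line); when it is unset, key is None and the request is made without a key. A self-contained sketch of the same pattern, with a placeholder year and variable rather than the pipeline's actual ones:

```python
import os
import censusdata

# Same pattern as the diff: the key comes from the environment and is passed
# straight through to censusdata.download(); the year and variable here are
# illustrative placeholders, not the pipeline's actual inputs.
response = censusdata.download(
    src="acs5",
    year=2019,
    geo=censusdata.censusgeo([("state", "01"), ("county", "*"), ("tract", "*")]),
    var=["B01003_001E"],  # total population, as an example variable
    key=os.environ.get("CENSUS_API_KEY"),  # None when the env var is unset
)
print(response.head())
```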