S3 Parallel Upload and Deletions (#1410)

* installation step

* trigger action

* installing to home dir

* dry-run

* pyenv

* py 2.8

* trying s4cmd

* removing pyenv

* poetry s4cmd

* num-threads (see the s4cmd sketch below)

* public read

* poetry cache

* s4cmd all around

* poetry cache

* poetry cache

* install poetry packages

* poetry echo

* let's do this

* s4cmd install on run

* s4cmd

* ad aws back

* add aws back

* testing census api key and poetry caching

* census api key

* census api

* census api key #3

* 250

* poetry update

* poetry change

* check census api key

* force flag

* update score gen and tilefy; remove cached fips

* small gdal update

* invalidation

* missing cache ids
Commit 7b05ee9c76 by Jorge Escobar, 2022-03-17 23:19:23 -04:00, committed by GitHub.
8 changed files with 307 additions and 197 deletions
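
The s4cmd and num-threads commits above are what give this PR its name: score and tile artifacts are pushed to, and pruned from, S3 in parallel rather than one object at a time. The exact invocation lives in the CI workflow and is not part of the diff below, but a minimal sketch of the idea, driven from Python with placeholder paths, bucket prefix, and thread count, could look like this:

```python
import subprocess

# Minimal sketch only: the local directory, S3 prefix, and thread count are
# placeholders, not the values this PR uses; the real call sits in the workflow.
def push_directory_to_s3(local_dir: str, s3_prefix: str, threads: int = 40) -> None:
    """Recursively upload a directory to S3 in parallel with s4cmd."""
    subprocess.run(
        ["s4cmd", "put", "--recursive", f"--num-threads={threads}", local_dir, s3_prefix],
        check=True,
    )

def delete_s3_prefix(s3_prefix: str, threads: int = 40) -> None:
    """Delete every object under an S3 prefix, also using parallel threads."""
    subprocess.run(
        ["s4cmd", "del", "--recursive", f"--num-threads={threads}", s3_prefix],
        check=True,
    )
```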

View file

@@ -72,7 +72,7 @@ class ExtractTransformLoad:
     # Eleven digits in a census tract ID.
     EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH: int = 11
-    # TODO: investigate. Census says there are only 74,134 tracts in the US,
+    # TODO: investigate. Census says there are only 74,134 tracts in the United States,
     # Puerto Rico, and island areas. This might be from tracts at different time
     # periods. https://github.com/usds/justice40-tool/issues/964
     EXPECTED_MAX_CENSUS_TRACTS: int = 74160
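
The two constants in this hunk are sanity bounds on census tract data. As a hypothetical illustration of the kind of check they support (not the repository's actual validation code; the DataFrame and column name are placeholders):

```python
import pandas as pd

# Hypothetical illustration only: the DataFrame and column name are placeholders,
# and the default bounds mirror the two class constants shown above.
def validate_tract_ids(
    df: pd.DataFrame,
    tract_id_column: str,
    expected_length: int = 11,  # EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH
    expected_max_tracts: int = 74160,  # EXPECTED_MAX_CENSUS_TRACTS
) -> None:
    """Raise if tract IDs are malformed or there are more tracts than expected."""
    if not (df[tract_id_column].str.len() == expected_length).all():
        raise ValueError("Every census tract ID should be exactly 11 characters long")
    if df[tract_id_column].nunique() > expected_max_tracts:
        raise ValueError("More unique census tracts than expected for the US and territories")
```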

View file

@@ -41,14 +41,12 @@ def get_state_fips_codes(data_path: Path) -> list:
     """Returns a list with state data"""
     fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
-    # check if file exists
-    if not os.path.isfile(fips_csv_path):
-        logger.info("Downloading fips from S3 repository")
-        unzip_file_from_url(
-            settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
-            data_path / "tmp",
-            data_path / "census" / "csv",
-        )
+    logger.info("Downloading fips from S3 repository")
+    unzip_file_from_url(
+        settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
+        data_path / "tmp",
+        data_path / "census" / "csv",
+    )
     fips_state_list = []
     with open(fips_csv_path, encoding="utf-8") as csv_file:
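
With the file-exists check removed (the "remove cached fips" commit above), the FIPS zip is re-downloaded from S3 on every run instead of being read from a possibly stale local copy. A hypothetical call site for the function shown in this hunk, with a placeholder data path, would simply be:

```python
from pathlib import Path

# Placeholder path; the pipeline passes its own configured data directory.
fips_codes = get_state_fips_codes(Path("data_pipeline/data"))
for fips in fips_codes:
    print(f"Will process state/territory with FIPS code {fips}")
```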

View file

@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from typing import List
 import censusdata
@@ -33,8 +34,11 @@ def retrieve_census_acs_data(
                 f"Skipping download for state/territory with FIPS code {fips}"
             )
         else:
+            census_api_key = ""
+            if os.environ.get("CENSUS_API_KEY"):
+                census_api_key = "with API key"
             logger.info(
-                f"Downloading data for state/territory with FIPS code {fips}"
+                f"Downloading data for state/territory with FIPS code {fips} {census_api_key}"
             )
             try:
@@ -45,6 +49,7 @@
                     [("state", fips), ("county", "*"), ("tract", "*")]
                 ),
                 var=variables,
+                key=os.environ.get("CENSUS_API_KEY"),
             )
             dfs.append(response)
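
The hunks above wire an optional Census API key through to censusdata.download: when CENSUS_API_KEY is set in the environment it is forwarded as the key argument (and mentioned in the log line); when it is unset, key is None and the request is made without a key. A self-contained sketch of the same pattern, with a placeholder year and variable rather than the pipeline's actual ones:

```python
import os
import censusdata

# Same pattern as the diff: the key comes from the environment and is passed
# straight through to censusdata.download(); the year and variable here are
# illustrative placeholders, not the pipeline's actual inputs.
response = censusdata.download(
    src="acs5",
    year=2019,
    geo=censusdata.censusgeo([("state", "01"), ("county", "*"), ("tract", "*")]),
    var=["B01003_001E"],  # total population, as an example variable
    key=os.environ.get("CENSUS_API_KEY"),  # None when the env var is unset
)
print(response.head())
```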