Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ready

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
This commit is contained in:
Travis Newby 2023-03-03 12:26:24 -06:00 committed by GitHub
commit 6f39033dde
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
52 changed files with 1787 additions and 686 deletions

View file

@ -8,7 +8,8 @@ import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -20,7 +21,7 @@ class GeoFileType(Enum):
class CensusETL(ExtractTransformLoad):
SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
# SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def __init__(self):
self.shape_file_path = self.get_sources_path() / "shp"
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
file_path: Path
if file_type == GeoFileType.SHP:
file_path = Path(
self.SHP_BASE_PATH
self.shape_file_path
/ fips_code
/ f"tl_2010_{fips_code}_tract10.shp"
)
@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
return file_path
def _extract_shp(self, fips_code: str) -> None:
"""Download the SHP file for the provided FIPS code
def get_data_sources(self) -> [DataSource]:
Args:
fips_code (str): the FIPS code for the region of interest
sources = []
Returns:
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
for fips_code in self.STATE_FIPS_CODES:
# check if file exists
if not shp_file_path.is_file():
tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
unzip_file_from_url(
tract_state_url,
self.TMP_PATH,
self.DATA_PATH / "census" / "shp" / fips_code,
destination_path = self.shape_file_path / fips_code
sources.append(
ZIPDataSource(
source=tract_state_url, destination=destination_path
)
)
def extract(self) -> None:
logger.debug("Extracting census data")
for index, fips_code in enumerate(self.STATE_FIPS_CODES):
logger.debug(
f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
)
self._extract_shp(fips_code)
return sources
def _transform_to_geojson(self, fips_code: str) -> None:
"""Convert the downloaded SHP file for the associated FIPS to geojson

View file

@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
else:
fips = row[0].strip()
fips_state_list.append(fips)
return fips_state_list