From 0a21fc6b12b007f137b9bb63fe275e70d1001c42 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Tue, 16 Nov 2021 10:05:09 -0500
Subject: [PATCH] Add territory boundary data (#885)

* Add territory boundary data

* housing and transp

* lint

* lint

* lint
---
 data/data-pipeline/data_pipeline/etl/base.py   |  2 +-
 .../etl/sources/census/etl_utils.py            |  8 ++++++--
 .../etl/sources/census_acs/etl.py              | 12 +++++++++---
 .../sources/housing_and_transportation/etl.py  | 14 +++++++++-----
 data/data-pipeline/data_pipeline/utils.py      | 18 ++++++++++++------
 5 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
index 4936ab1f..ca636fa9 100644
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -33,7 +33,7 @@ class ExtractTransformLoad:
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
-    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
+    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
     EXPECTED_MAX_CENSUS_TRACTS: int = 73076
 
     def __init__(self, config_path: Path) -> None:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py
index 9ef74d71..d77d1874 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py
@@ -16,13 +16,17 @@ from data_pipeline.utils import (
 logger = get_module_logger(__name__)
 
 
-def reset_data_directories(data_path: Path) -> None:
+def reset_data_directories(
+    data_path: Path,
+) -> None:
     """Empties all census folders"""
     census_data_path = data_path / "census"
 
     # csv
     csv_path = census_data_path / "csv"
-    remove_files_from_dir(csv_path, ".csv")
+    remove_files_from_dir(
+        csv_path, ".csv", exception_list=["fips_states_2010.csv"]
+    )
 
     # geojson
     geojson_path = census_data_path / "geojson"
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index dda15015..79ba1258 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
                 f"Downloading data for state/territory with FIPS code {fips}"
             )
 
-            dfs.append(
-                censusdata.download(
+            try:
+                response = censusdata.download(
                     src="acs5",
                     year=self.ACS_YEAR,
                     geo=censusdata.censusgeo(
@@ -91,7 +91,13 @@
                     + self.LINGUISTIC_ISOLATION_FIELDS
                     + self.POVERTY_FIELDS,
                 )
-            )
+            except ValueError:
+                logger.error(
+                    f"Could not download data for state/territory with FIPS code {fips}"
+                )
+                continue
+
+            dfs.append(response)
 
         self.df = pd.concat(dfs)
 
diff --git a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
index 6e054df6..9e8986a8 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
@@ -1,4 +1,5 @@
 import pandas as pd
+from pandas.errors import EmptyDataError
 
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
                 f"Downloading housing data for state/territory with FIPS code {fips}"
             )
 
-            # Puerto Rico has no data, so skip
-            if fips == "72":
-                continue
-
             unzip_file_from_url(
                 f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
             )
@@ -38,7 +35,14 @@
             tmp_csv_file_path = (
                 zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
             )
-            tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+
+            try:
+                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            except EmptyDataError:
+                logger.error(
+                    f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
+                )
+                continue
 
             dfs.append(tmp_df)
 
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index 5edce8fa..58761a9f 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -46,25 +46,31 @@ def get_module_logger(module_name: str) -> logging.Logger:
 logger = get_module_logger(__name__)
 
 
-def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
+def remove_files_from_dir(
+    files_path: Path, extension: str = None, exception_list: list = None
+) -> None:
     """Removes all files from a specific directory with the exception
     of __init__.py files or files with a specific extension
 
     Args:
         files_path (pathlib.Path): Name of the directory where the files will be deleted
         extension (str): Extension of the file pattern to delete, example "json" (optional)
+        exception_list (list): List of files to not remove (optional)
 
     Returns:
         None
 
     """
     for file in os.listdir(files_path):
-        if extension:
-            if not file.endswith(extension):
+        # don't remove __init__ files as they preserve dir structure
+        if file == "__init__.py":
+            continue
+
+        if exception_list:
+            if file in exception_list:
                 continue
-        else:
-            # don't rempove __init__ files as they conserve dir structure
-            if file == "__init__.py":
+        elif extension:
+            if not file.endswith(extension):
                 continue
         os.remove(files_path / file)
         logger.info(f"Removing {file}")
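
The shape of the fix in both ETLs is the same: attempt the per-FIPS download, log and skip the territory on failure, and concatenate whatever succeeded. A minimal, self-contained sketch of that pattern follows; fetch_blockgroups and the sample FIPS codes are hypothetical stand-ins for censusdata.download and the pipeline's state list, not part of this codebase.

import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def fetch_blockgroups(fips: str) -> pd.DataFrame:
    """Hypothetical stand-in for censusdata.download: raises ValueError
    when a territory has no ACS data available."""
    if fips == "69":  # pretend this territory has no data
        raise ValueError(f"no data for FIPS {fips}")
    return pd.DataFrame({"GEOID10": [f"{fips}0010001001"], "value": [1.0]})


def load_all(fips_codes: list) -> pd.DataFrame:
    dfs = []
    for fips in fips_codes:
        try:
            response = fetch_blockgroups(fips)
        except ValueError:
            # Log and skip; without the continue, `response` would be
            # unbound (or stale from the prior iteration) at append time.
            logger.error("Could not download data for FIPS code %s", fips)
            continue
        dfs.append(response)
    return pd.concat(dfs)


if __name__ == "__main__":
    print(load_all(["01", "69", "72"]))  # FIPS 69 is logged and skipped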
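
The reworked remove_files_from_dir checks, in order: always keep __init__.py, then honor exception_list, and only then apply the extension filter, so a truthy exception_list bypasses the extension check entirely. The sketch below exercises that precedence on a throwaway directory; the helper is inlined here (minus the logging) purely for illustration, while the real one lives in data_pipeline/utils.py.

import os
import tempfile
from pathlib import Path


def remove_files_from_dir(
    files_path: Path, extension: str = None, exception_list: list = None
) -> None:
    # Inlined copy of the patched helper, without logging.
    for file in os.listdir(files_path):
        # don't remove __init__ files as they preserve dir structure
        if file == "__init__.py":
            continue

        if exception_list:
            if file in exception_list:
                continue
        elif extension:
            if not file.endswith(extension):
                continue
        os.remove(files_path / file)


with tempfile.TemporaryDirectory() as tmp:
    tmp_path = Path(tmp)
    for name in ["a.csv", "fips_states_2010.csv", "__init__.py", "b.json"]:
        (tmp_path / name).touch()
    remove_files_from_dir(
        tmp_path, ".csv", exception_list=["fips_states_2010.csv"]
    )
    # b.json is removed too: once exception_list is set, extension is ignored.
    print(sorted(os.listdir(tmp_path)))  # ['__init__.py', 'fips_states_2010.csv']

One consequence worth noting: the call in reset_data_directories therefore clears every non-excepted file from the census csv directory, not only the .csv files.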