Add territory boundary data (#885)

* Add territory boundary data

* housing and transportation

* lint

* lint

* lint
Jorge Escobar, 2021-11-16 10:05:09 -05:00, committed by GitHub
parent f00cc5f7b2
commit 0a21fc6b12
5 changed files with 35 additions and 17 deletions


@@ -33,7 +33,7 @@ class ExtractTransformLoad:
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
-    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
+    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
     EXPECTED_MAX_CENSUS_TRACTS: int = 73076

     def __init__(self, config_path: Path) -> None:
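Raising the ceiling from 220,405 to 250,000 leaves headroom for the block groups the territories add. A minimal sketch of how such a constant might back a sanity check after concatenating per-state frames; validate_block_group_count and the direct "GEOID10" column access are illustrative assumptions, not part of this commit:

import pandas as pd

EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000

def validate_block_group_count(df: pd.DataFrame) -> None:
    # Fail fast if the combined frame exceeds the expected ceiling.
    count = df["GEOID10"].nunique()
    if count > EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
        raise ValueError(
            f"Found {count} census block groups; expected at most "
            f"{EXPECTED_MAX_CENSUS_BLOCK_GROUPS}"
        )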


@@ -16,13 +16,17 @@ from data_pipeline.utils import (

 logger = get_module_logger(__name__)


-def reset_data_directories(data_path: Path) -> None:
+def reset_data_directories(
+    data_path: Path,
+) -> None:
     """Empties all census folders"""

     census_data_path = data_path / "census"

     # csv
     csv_path = census_data_path / "csv"
-    remove_files_from_dir(csv_path, ".csv")
+    remove_files_from_dir(
+        csv_path, ".csv", exception_list=["fips_states_2010.csv"]
+    )

     # geojson
     geojson_path = census_data_path / "geojson"
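With the new exception_list, clearing the census csv folder now spares the state FIPS lookup that later steps presumably read back (for example via get_state_fips_codes). A hedged usage sketch; the data root path is illustrative:

from pathlib import Path

from data_pipeline.utils import remove_files_from_dir

# Clear downloaded census CSVs but keep the FIPS lookup file.
csv_path = Path("data") / "census" / "csv"  # illustrative root
remove_files_from_dir(
    csv_path, ".csv", exception_list=["fips_states_2010.csv"]
)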


@@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
                 f"Downloading data for state/territory with FIPS code {fips}"
             )

-            dfs.append(
-                censusdata.download(
+            try:
+                response = censusdata.download(
                     src="acs5",
                     year=self.ACS_YEAR,
                     geo=censusdata.censusgeo(
@@ -91,8 +91,13 @@ class CensusACSETL(ExtractTransformLoad):
                     + self.LINGUISTIC_ISOLATION_FIELDS
                     + self.POVERTY_FIELDS,
                 )
-            )
+            except ValueError:
+                logger.error(
+                    f"Could not download data for state/territory with FIPS code {fips}"
+                )
+
+            dfs.append(response)

         self.df = pd.concat(dfs)
         self.df[self.GEOID_FIELD_NAME] = self.df.index.to_series().apply(
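Note one caveat in the new error path: when censusdata.download raises ValueError, execution still falls through to dfs.append(response), which re-appends the previous iteration's frame, or raises NameError if the very first download fails. A minimal sketch of a more defensive loop, assuming a placeholder download_for_fips in place of the full censusdata.download(...) call above:

import pandas as pd

def concat_state_frames(fips_codes, download_for_fips) -> pd.DataFrame:
    # download_for_fips is a hypothetical stand-in for censusdata.download(...).
    dfs = []
    for fips in fips_codes:
        try:
            response = download_for_fips(fips)
        except ValueError:
            # Skip this FIPS code entirely rather than fall through and
            # append a stale or unbound `response`.
            print(f"Could not download data for FIPS code {fips}")
            continue
        dfs.append(response)
    return pd.concat(dfs)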


@@ -1,4 +1,5 @@
 import pandas as pd
+from pandas.errors import EmptyDataError

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
                 f"Downloading housing data for state/territory with FIPS code {fips}"
             )

-            # Puerto Rico has no data, so skip
-            if fips == "72":
-                continue
-
             unzip_file_from_url(
                 f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
             )
@@ -38,7 +35,13 @@ class HousingTransportationETL(ExtractTransformLoad):
             tmp_csv_file_path = (
                 zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
             )
-            tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            try:
+                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            except EmptyDataError:
+                logger.error(
+                    f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
+                )

             dfs.append(tmp_df)
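The same fall-through exists here: after an EmptyDataError, dfs.append(tmp_df) still runs, appending the previous state's frame (or hitting NameError on the first FIPS code). A hedged sketch of a guarded read that returns None for the caller to filter out; read_hta_csv is a hypothetical helper:

from pathlib import Path
from typing import Optional

import pandas as pd
from pandas.errors import EmptyDataError

def read_hta_csv(tmp_csv_file_path: Path, fips: str) -> Optional[pd.DataFrame]:
    # Return None for empty downloads so the caller can skip them, e.g.
    # dfs = [df for df in frames if df is not None]
    try:
        return pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
    except EmptyDataError:
        print(
            "Could not read Housing and Transportation data for "
            f"state/territory with FIPS code {fips}"
        )
        return None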


@@ -46,26 +46,32 @@ def get_module_logger(module_name: str) -> logging.Logger:

 logger = get_module_logger(__name__)


-def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
+def remove_files_from_dir(
+    files_path: Path, extension: str = None, exception_list: list = None
+) -> None:
     """Removes all files from a specific directory with the exception of __init__.py
     files or files with a specific extension

     Args:
         files_path (pathlib.Path): Name of the directory where the files will be deleted
         extension (str): Extension of the file pattern to delete, example "json" (optional)
+        exception_list (list): List of files to not remove (optional)

     Returns:
         None

     """
     for file in os.listdir(files_path):
-        if extension:
-            if not file.endswith(extension):
-                continue
-        else:
-            # don't remove __init__ files as they conserve dir structure
-            if file == "__init__.py":
-                continue
+        # don't remove __init__ files as they conserve dir structure
+        if file == "__init__.py":
+            continue
+
+        if exception_list:
+            if file in exception_list:
+                continue
+        elif extension:
+            if not file.endswith(extension):
+                continue
+
         os.remove(files_path / file)
         logger.info(f"Removing {file}")
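Worth noting: because of the elif, passing exception_list bypasses the extension filter entirely, so every file except the listed names (and __init__.py) is deleted. A small behavior sketch in a throwaway directory, assuming the data_pipeline.utils import path:

import tempfile
from pathlib import Path

from data_pipeline.utils import remove_files_from_dir

with tempfile.TemporaryDirectory() as tmp:
    tmp_path = Path(tmp)
    for name in ["a.csv", "b.json", "fips_states_2010.csv", "__init__.py"]:
        (tmp_path / name).touch()

    remove_files_from_dir(
        tmp_path, extension=".csv", exception_list=["fips_states_2010.csv"]
    )

    # Survivors: __init__.py and fips_states_2010.csv. Note that b.json is
    # also removed, because the extension filter is skipped whenever
    # exception_list is provided.
    print(sorted(p.name for p in tmp_path.iterdir()))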