From 0a21fc6b12b007f137b9bb63fe275e70d1001c42 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Tue, 16 Nov 2021 10:05:09 -0500
Subject: [PATCH] Add territory boundary data (#885)

* Add territory boundary data

* housing and transp

* lint

* lint

* lint
---
 data/data-pipeline/data_pipeline/etl/base.py   |  2 +-
 .../etl/sources/census/etl_utils.py            |  8 ++++++--
 .../etl/sources/census_acs/etl.py              | 12 +++++++++---
 .../sources/housing_and_transportation/etl.py  | 14 +++++++++-----
 data/data-pipeline/data_pipeline/utils.py      | 18 ++++++++++++------
 5 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
index 4936ab1f..ca636fa9 100644
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -33,7 +33,7 @@ class ExtractTransformLoad:
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
-    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
+    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
     EXPECTED_MAX_CENSUS_TRACTS: int = 73076
 
     def __init__(self, config_path: Path) -> None:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py
index 9ef74d71..d77d1874 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py
@@ -16,13 +16,17 @@ from data_pipeline.utils import (
 logger = get_module_logger(__name__)
 
 
-def reset_data_directories(data_path: Path) -> None:
+def reset_data_directories(
+    data_path: Path,
+) -> None:
     """Empties all census folders"""
     census_data_path = data_path / "census"
 
     # csv
     csv_path = census_data_path / "csv"
-    remove_files_from_dir(csv_path, ".csv")
+    remove_files_from_dir(
+        csv_path, ".csv", exception_list=["fips_states_2010.csv"]
+    )
 
     # geojson
     geojson_path = census_data_path / "geojson"
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index dda15015..79ba1258 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
                 f"Downloading data for state/territory with FIPS code {fips}"
             )
 
-            dfs.append(
-                censusdata.download(
+            try:
+                response = censusdata.download(
                     src="acs5",
                     year=self.ACS_YEAR,
                     geo=censusdata.censusgeo(
@@ -91,7 +91,13 @@
                     + self.LINGUISTIC_ISOLATION_FIELDS
                     + self.POVERTY_FIELDS,
                 )
-            )
+            except ValueError:
+                logger.error(
+                    f"Could not download data for state/territory with FIPS code {fips}"
+                )
+                continue
+
+            dfs.append(response)
 
         self.df = pd.concat(dfs)
 
diff --git a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
index 6e054df6..9e8986a8 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
@@ -1,4 +1,5 @@
 import pandas as pd
+from pandas.errors import EmptyDataError
 
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
                 f"Downloading housing data for state/territory with FIPS code {fips}"
             )
 
-            # Puerto Rico has no data, so skip
-            if fips == "72":
-                continue
-
             unzip_file_from_url(
                 f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
             )
@@ -38,7 +35,14 @@
             tmp_csv_file_path = (
                 zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
             )
-            tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+
+            try:
+                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            except EmptyDataError:
+                logger.error(
+                    f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
+                )
+                continue
 
             dfs.append(tmp_df)
 
diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py
index 5edce8fa..58761a9f 100644
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -46,25 +46,31 @@ def get_module_logger(module_name: str) -> logging.Logger:
 logger = get_module_logger(__name__)
 
 
-def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
+def remove_files_from_dir(
+    files_path: Path, extension: str = None, exception_list: list = None
+) -> None:
     """Removes all files from a specific directory with the exception
     of __init__.py files or files with a specific extension
 
     Args:
         files_path (pathlib.Path): Name of the directory where the files will be deleted
         extension (str): Extension of the file pattern to delete, example "json" (optional)
+        exception_list (list): List of files to not remove (optional)
 
     Returns:
         None
 
     """
     for file in os.listdir(files_path):
-        if extension:
-            if not file.endswith(extension):
+        # don't remove __init__ files as they preserve dir structure
+        if file == "__init__.py":
+            continue
+
+        if exception_list:
+            if file in exception_list:
                 continue
-        else:
-            # don't rempove __init__ files as they conserve dir structure
-            if file == "__init__.py":
+        elif extension:
+            if not file.endswith(extension):
                 continue
         os.remove(files_path / file)
         logger.info(f"Removing {file}")
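
The shape of the fix in both ETLs is the same: attempt the per-FIPS download, log and skip the territory on failure, and concatenate whatever succeeded. A minimal, self-contained sketch of that pattern follows; fetch_blockgroups and the sample FIPS codes are hypothetical stand-ins for censusdata.download and the pipeline's state list, not part of this codebase.

import logging

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def fetch_blockgroups(fips: str) -> pd.DataFrame:
    """Hypothetical stand-in for censusdata.download: raises ValueError
    when a territory has no ACS data available."""
    if fips == "69":  # pretend this territory has no data
        raise ValueError(f"no data for FIPS {fips}")
    return pd.DataFrame({"GEOID10": [f"{fips}0010001001"], "value": [1.0]})


def load_all(fips_codes: list) -> pd.DataFrame:
    dfs = []
    for fips in fips_codes:
        try:
            response = fetch_blockgroups(fips)
        except ValueError:
            # Log and skip; without the continue, `response` would be
            # unbound (or stale from the prior iteration) at append time.
            logger.error("Could not download data for FIPS code %s", fips)
            continue
        dfs.append(response)
    return pd.concat(dfs)


if __name__ == "__main__":
    print(load_all(["01", "69", "72"]))  # FIPS 69 is logged and skipped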
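
The reworked remove_files_from_dir checks, in order: always keep __init__.py, then honor exception_list, and only then apply the extension filter, so a truthy exception_list bypasses the extension check entirely. The sketch below exercises that precedence on a throwaway directory; the helper is inlined here (minus the logging) purely for illustration, while the real one lives in data_pipeline/utils.py.

import os
import tempfile
from pathlib import Path


def remove_files_from_dir(
    files_path: Path, extension: str = None, exception_list: list = None
) -> None:
    # Inlined copy of the patched helper, without logging.
    for file in os.listdir(files_path):
        # don't remove __init__ files as they preserve dir structure
        if file == "__init__.py":
            continue

        if exception_list:
            if file in exception_list:
                continue
        elif extension:
            if not file.endswith(extension):
                continue
        os.remove(files_path / file)


with tempfile.TemporaryDirectory() as tmp:
    tmp_path = Path(tmp)
    for name in ["a.csv", "fips_states_2010.csv", "__init__.py", "b.json"]:
        (tmp_path / name).touch()
    remove_files_from_dir(
        tmp_path, ".csv", exception_list=["fips_states_2010.csv"]
    )
    # b.json is removed too: once exception_list is set, extension is ignored.
    print(sorted(os.listdir(tmp_path)))  # ['__init__.py', 'fips_states_2010.csv']

One consequence worth noting: the call in reset_data_directories therefore clears every non-excepted file from the census csv directory, not only the .csv files.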