Add territory boundary data (#885)

* Add territory boundary data

* housing and transp

* lint

* lint

* lint
This commit is contained in:
Jorge Escobar 2021-11-16 10:05:09 -05:00 committed by GitHub
parent f00cc5f7b2
commit 0a21fc6b12
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 35 additions and 17 deletions

View file

@@ -33,7 +33,7 @@ class ExtractTransformLoad:
GEOID_FIELD_NAME: str = "GEOID10"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
EXPECTED_MAX_CENSUS_TRACTS: int = 73076
def __init__(self, config_path: Path) -> None:

View file

@@ -16,13 +16,17 @@ from data_pipeline.utils import (
logger = get_module_logger(__name__)
def reset_data_directories(data_path: Path) -> None:
def reset_data_directories(
data_path: Path,
) -> None:
"""Empties all census folders"""
census_data_path = data_path / "census"
# csv
csv_path = census_data_path / "csv"
remove_files_from_dir(csv_path, ".csv")
remove_files_from_dir(
csv_path, ".csv", exception_list=["fips_states_2010.csv"]
)
# geojson
geojson_path = census_data_path / "geojson"

View file

@@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
f"Downloading data for state/territory with FIPS code {fips}"
)
dfs.append(
censusdata.download(
try:
response = censusdata.download(
src="acs5",
year=self.ACS_YEAR,
geo=censusdata.censusgeo(
@@ -91,7 +91,12 @@ class CensusACSETL(ExtractTransformLoad):
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS,
)
)
except ValueError:
logger.error(
f"Could not download data for state/territory with FIPS code {fips}"
)
dfs.append(response)
self.df = pd.concat(dfs)

View file

@@ -1,4 +1,5 @@
import pandas as pd
from pandas.errors import EmptyDataError
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
f"Downloading housing data for state/territory with FIPS code {fips}"
)
# Puerto Rico has no data, so skip
if fips == "72":
continue
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
)
@@ -38,7 +35,13 @@
tmp_csv_file_path = (
zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
)
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
try:
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
except EmptyDataError:
logger.error(
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
)
dfs.append(tmp_df)

View file

@@ -46,25 +46,31 @@ def get_module_logger(module_name: str) -> logging.Logger:
logger = get_module_logger(__name__)
def remove_files_from_dir(
    files_path: Path, extension: str = None, exception_list: list = None
) -> None:
    """Removes all files from a specific directory with the exception of __init__.py
    files or files with a specific extension

    Args:
        files_path (pathlib.Path): Name of the directory where the files will be deleted
        extension (str): Extension of the file pattern to delete, example "json" (optional)
        exception_list (list): List of files to not remove (optional)

    Returns:
        None
    """
    for file in os.listdir(files_path):
        # don't remove __init__ files as they conserve dir structure
        if file == "__init__.py":
            continue
        if exception_list:
            # an explicit keep-list takes precedence; note that when
            # exception_list is provided the extension filter is NOT applied
            if file in exception_list:
                continue
        elif extension:
            # when filtering by extension, leave non-matching files in place
            if not file.endswith(extension):
                continue
        os.remove(files_path / file)
        logger.info(f"Removing {file}")