Add territory boundary data (#885)

* Add territory boundary data

* housing and transportation

* lint

* lint

* lint
Jorge Escobar, 2021-11-16 10:05:09 -05:00, committed by GitHub
parent f00cc5f7b2
commit 0a21fc6b12
5 changed files with 35 additions and 17 deletions


@@ -33,7 +33,7 @@ class ExtractTransformLoad:
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
-    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
+    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
     EXPECTED_MAX_CENSUS_TRACTS: int = 73076

     def __init__(self, config_path: Path) -> None:
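Raising the ceiling from 220,405 to 250,000 leaves headroom for the block groups the territories add. A minimal sketch of how such a constant might back a sanity check after concatenating per-state frames; validate_block_group_count and the direct "GEOID10" column access are illustrative assumptions, not part of this commit:

import pandas as pd

EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000

def validate_block_group_count(df: pd.DataFrame) -> None:
    # Fail fast if the combined frame exceeds the expected ceiling.
    count = df["GEOID10"].nunique()
    if count > EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
        raise ValueError(
            f"Found {count} census block groups; expected at most "
            f"{EXPECTED_MAX_CENSUS_BLOCK_GROUPS}"
        )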


@@ -16,13 +16,17 @@ from data_pipeline.utils import (

 logger = get_module_logger(__name__)


-def reset_data_directories(data_path: Path) -> None:
+def reset_data_directories(
+    data_path: Path,
+) -> None:
     """Empties all census folders"""

     census_data_path = data_path / "census"

     # csv
     csv_path = census_data_path / "csv"
-    remove_files_from_dir(csv_path, ".csv")
+    remove_files_from_dir(
+        csv_path, ".csv", exception_list=["fips_states_2010.csv"]
+    )

     # geojson
     geojson_path = census_data_path / "geojson"
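With the new exception_list, clearing the census csv folder now spares the state FIPS lookup that later steps presumably read back (for example via get_state_fips_codes). A hedged usage sketch; the data root path is illustrative:

from pathlib import Path

from data_pipeline.utils import remove_files_from_dir

# Clear downloaded census CSVs but keep the FIPS lookup file.
csv_path = Path("data") / "census" / "csv"  # illustrative root
remove_files_from_dir(
    csv_path, ".csv", exception_list=["fips_states_2010.csv"]
)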


@@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
                 f"Downloading data for state/territory with FIPS code {fips}"
             )

-            dfs.append(
-                censusdata.download(
+            try:
+                response = censusdata.download(
                     src="acs5",
                     year=self.ACS_YEAR,
                     geo=censusdata.censusgeo(
@@ -91,8 +91,13 @@ class CensusACSETL(ExtractTransformLoad):
                     + self.LINGUISTIC_ISOLATION_FIELDS
                     + self.POVERTY_FIELDS,
                 )
-            )
+            except ValueError:
+                logger.error(
+                    f"Could not download data for state/territory with FIPS code {fips}"
+                )
+
+            dfs.append(response)

         self.df = pd.concat(dfs)
         self.df[self.GEOID_FIELD_NAME] = self.df.index.to_series().apply(
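Note one caveat in the new error path: when censusdata.download raises ValueError, execution still falls through to dfs.append(response), which re-appends the previous iteration's frame, or raises NameError if the very first download fails. A minimal sketch of a more defensive loop, assuming a placeholder download_for_fips in place of the full censusdata.download(...) call above:

import pandas as pd

def concat_state_frames(fips_codes, download_for_fips) -> pd.DataFrame:
    # download_for_fips is a hypothetical stand-in for censusdata.download(...).
    dfs = []
    for fips in fips_codes:
        try:
            response = download_for_fips(fips)
        except ValueError:
            # Skip this FIPS code entirely rather than fall through and
            # append a stale or unbound `response`.
            print(f"Could not download data for FIPS code {fips}")
            continue
        dfs.append(response)
    return pd.concat(dfs)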


@@ -1,4 +1,5 @@
 import pandas as pd
+from pandas.errors import EmptyDataError

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
                 f"Downloading housing data for state/territory with FIPS code {fips}"
             )

-            # Puerto Rico has no data, so skip
-            if fips == "72":
-                continue
-
             unzip_file_from_url(
                 f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
             )
@@ -38,7 +35,13 @@ class HousingTransportationETL(ExtractTransformLoad):
             tmp_csv_file_path = (
                 zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
             )
-            tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            try:
+                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            except EmptyDataError:
+                logger.error(
+                    f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
+                )

             dfs.append(tmp_df)
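The same fall-through exists here: after an EmptyDataError, dfs.append(tmp_df) still runs, appending the previous state's frame (or hitting NameError on the first FIPS code). A hedged sketch of a guarded read that returns None for the caller to filter out; read_hta_csv is a hypothetical helper:

from pathlib import Path
from typing import Optional

import pandas as pd
from pandas.errors import EmptyDataError

def read_hta_csv(tmp_csv_file_path: Path, fips: str) -> Optional[pd.DataFrame]:
    # Return None for empty downloads so the caller can skip them, e.g.
    # dfs = [df for df in frames if df is not None]
    try:
        return pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
    except EmptyDataError:
        print(
            "Could not read Housing and Transportation data for "
            f"state/territory with FIPS code {fips}"
        )
        return None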


@@ -46,26 +46,32 @@ def get_module_logger(module_name: str) -> logging.Logger:

 logger = get_module_logger(__name__)


-def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
+def remove_files_from_dir(
+    files_path: Path, extension: str = None, exception_list: list = None
+) -> None:
     """Removes all files from a specific directory with the exception of __init__.py
     files or files with a specific extension

     Args:
         files_path (pathlib.Path): Name of the directory where the files will be deleted
         extension (str): Extension of the file pattern to delete, example "json" (optional)
+        exception_list (list): List of files to not remove (optional)

     Returns:
         None

     """
     for file in os.listdir(files_path):
-        if extension:
-            if not file.endswith(extension):
-                continue
-        else:
-            # don't remove __init__ files as they conserve dir structure
-            if file == "__init__.py":
-                continue
+        # don't remove __init__ files as they conserve dir structure
+        if file == "__init__.py":
+            continue
+
+        if exception_list:
+            if file in exception_list:
+                continue
+        elif extension:
+            if not file.endswith(extension):
+                continue
+
         os.remove(files_path / file)
         logger.info(f"Removing {file}")
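Worth noting: because of the elif, passing exception_list bypasses the extension filter entirely, so every file except the listed names (and __init__.py) is deleted. A small behavior sketch in a throwaway directory, assuming the data_pipeline.utils import path:

import tempfile
from pathlib import Path

from data_pipeline.utils import remove_files_from_dir

with tempfile.TemporaryDirectory() as tmp:
    tmp_path = Path(tmp)
    for name in ["a.csv", "b.json", "fips_states_2010.csv", "__init__.py"]:
        (tmp_path / name).touch()

    remove_files_from_dir(
        tmp_path, extension=".csv", exception_list=["fips_states_2010.csv"]
    )

    # Survivors: __init__.py and fips_states_2010.csv. Note that b.json is
    # also removed, because the extension filter is skipped whenever
    # exception_list is provided.
    print(sorted(p.name for p in tmp_path.iterdir()))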