mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 09:41:26 -08:00
Add territory boundary data (#885)
* Add territory boundary data * housing and transp * lint * lint * lint
This commit is contained in:
parent
f00cc5f7b2
commit
0a21fc6b12
5 changed files with 35 additions and 17 deletions
|
@ -33,7 +33,7 @@ class ExtractTransformLoad:
|
|||
GEOID_FIELD_NAME: str = "GEOID10"
|
||||
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
|
||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
|
||||
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
|
||||
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
|
||||
EXPECTED_MAX_CENSUS_TRACTS: int = 73076
|
||||
|
||||
def __init__(self, config_path: Path) -> None:
|
||||
|
|
|
@ -16,13 +16,17 @@ from data_pipeline.utils import (
|
|||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
def reset_data_directories(data_path: Path) -> None:
|
||||
def reset_data_directories(
|
||||
data_path: Path,
|
||||
) -> None:
|
||||
"""Empties all census folders"""
|
||||
census_data_path = data_path / "census"
|
||||
|
||||
# csv
|
||||
csv_path = census_data_path / "csv"
|
||||
remove_files_from_dir(csv_path, ".csv")
|
||||
remove_files_from_dir(
|
||||
csv_path, ".csv", exception_list=["fips_states_2010.csv"]
|
||||
)
|
||||
|
||||
# geojson
|
||||
geojson_path = census_data_path / "geojson"
|
||||
|
|
|
@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
f"Downloading data for state/territory with FIPS code {fips}"
|
||||
)
|
||||
|
||||
dfs.append(
|
||||
censusdata.download(
|
||||
try:
|
||||
response = censusdata.download(
|
||||
src="acs5",
|
||||
year=self.ACS_YEAR,
|
||||
geo=censusdata.censusgeo(
|
||||
|
@ -91,7 +91,12 @@ class CensusACSETL(ExtractTransformLoad):
|
|||
+ self.LINGUISTIC_ISOLATION_FIELDS
|
||||
+ self.POVERTY_FIELDS,
|
||||
)
|
||||
)
|
||||
except ValueError:
|
||||
logger.error(
|
||||
f"Could not download data for state/territory with FIPS code {fips}"
|
||||
)
|
||||
|
||||
dfs.append(response)
|
||||
|
||||
self.df = pd.concat(dfs)
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import pandas as pd
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||
|
@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
|
|||
f"Downloading housing data for state/territory with FIPS code {fips}"
|
||||
)
|
||||
|
||||
# Puerto Rico has no data, so skip
|
||||
if fips == "72":
|
||||
continue
|
||||
|
||||
unzip_file_from_url(
|
||||
f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
|
||||
)
|
||||
|
@ -38,7 +35,13 @@ class HousingTransportationETL(ExtractTransformLoad):
|
|||
tmp_csv_file_path = (
|
||||
zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
|
||||
)
|
||||
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
|
||||
|
||||
try:
|
||||
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
|
||||
except EmptyDataError:
|
||||
logger.error(
|
||||
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
|
||||
)
|
||||
|
||||
dfs.append(tmp_df)
|
||||
|
||||
|
|
|
@ -46,25 +46,31 @@ def get_module_logger(module_name: str) -> logging.Logger:
|
|||
logger = get_module_logger(__name__)
|
||||
|
||||
|
||||
def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
|
||||
def remove_files_from_dir(
|
||||
files_path: Path, extension: str = None, exception_list: list = None
|
||||
) -> None:
|
||||
"""Removes all files from a specific directory with the exception of __init__.py
|
||||
files or files with a specific extension
|
||||
|
||||
Args:
|
||||
files_path (pathlib.Path): Name of the directory where the files will be deleted
|
||||
extension (str): Extension of the file pattern to delete, example "json" (optional)
|
||||
exception_list (list): List of files to not remove (optional)
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
"""
|
||||
for file in os.listdir(files_path):
|
||||
if extension:
|
||||
if not file.endswith(extension):
|
||||
# don't remove __init__ files as they conserve dir structure
|
||||
if file == "__init__.py":
|
||||
continue
|
||||
|
||||
if exception_list:
|
||||
if file in exception_list:
|
||||
continue
|
||||
else:
|
||||
# don't remove __init__ files as they conserve dir structure
|
||||
if file == "__init__.py":
|
||||
elif extension:
|
||||
if not file.endswith(extension):
|
||||
continue
|
||||
os.remove(files_path / file)
|
||||
logger.info(f"Removing {file}")
|
||||
|
|
Loading…
Add table
Reference in a new issue