Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 01:54:18 -08:00)
Add territory boundary data (#885)
* Add territory boundary data
* housing and transp
* lint
* lint
* lint
parent f00cc5f7b2
commit 0a21fc6b12
5 changed files with 35 additions and 17 deletions
@@ -33,7 +33,7 @@ class ExtractTransformLoad:
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
-    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
+    EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 250000
     EXPECTED_MAX_CENSUS_TRACTS: int = 73076

     def __init__(self, config_path: Path) -> None:
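The raised ceiling leaves headroom for block groups coming from the territories on top of the states. A minimal sketch of how a cap like this can serve as a post-extract sanity check; the validate_block_group_count helper and the bare df argument are illustrative assumptions, not part of this diff:

EXPECTED_MAX_CENSUS_BLOCK_GROUPS = 250000

def validate_block_group_count(df) -> None:
    # Hypothetical guard: fail loudly if more block groups arrive than expected,
    # e.g. because a geography was duplicated during extraction.
    if len(df) > EXPECTED_MAX_CENSUS_BLOCK_GROUPS:
        raise ValueError(
            f"Too many census block groups: {len(df)} > {EXPECTED_MAX_CENSUS_BLOCK_GROUPS}"
        )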
@@ -16,13 +16,17 @@ from data_pipeline.utils import (
 logger = get_module_logger(__name__)


-def reset_data_directories(data_path: Path) -> None:
+def reset_data_directories(
+    data_path: Path,
+) -> None:
     """Empties all census folders"""
     census_data_path = data_path / "census"

     # csv
     csv_path = census_data_path / "csv"
-    remove_files_from_dir(csv_path, ".csv")
+    remove_files_from_dir(
+        csv_path, ".csv", exception_list=["fips_states_2010.csv"]
+    )

     # geojson
     geojson_path = census_data_path / "geojson"
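With exception_list in place, clearing the census CSV directory keeps the state/territory FIPS lookup that later steps need. A small sketch of the effect; the scratch directory and file names are illustrative, and remove_files_from_dir is imported from data_pipeline.utils as in the hunk header above:

from pathlib import Path

from data_pipeline.utils import remove_files_from_dir

# Illustrative scratch layout standing in for the census/csv directory.
csv_path = Path("census_demo") / "csv"
csv_path.mkdir(parents=True, exist_ok=True)
(csv_path / "fips_states_2010.csv").touch()
(csv_path / "01.csv").touch()

# Everything except the FIPS lookup (and any __init__.py) is removed.
remove_files_from_dir(
    csv_path, ".csv", exception_list=["fips_states_2010.csv"]
)
print(sorted(p.name for p in csv_path.iterdir()))  # ['fips_states_2010.csv']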
@@ -72,8 +72,8 @@ class CensusACSETL(ExtractTransformLoad):
                 f"Downloading data for state/territory with FIPS code {fips}"
             )

-            dfs.append(
-                censusdata.download(
+            try:
+                response = censusdata.download(
                     src="acs5",
                     year=self.ACS_YEAR,
                     geo=censusdata.censusgeo(
@@ -91,7 +91,12 @@ class CensusACSETL(ExtractTransformLoad):
                     + self.LINGUISTIC_ISOLATION_FIELDS
                     + self.POVERTY_FIELDS,
                 )
-            )
+            except ValueError:
+                logger.error(
+                    f"Could not download data for state/territory with FIPS code {fips}"
+                )

+            dfs.append(response)
+
         self.df = pd.concat(dfs)
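Together with the previous hunk, each state or territory download is now wrapped in try/except so a geography the ACS does not cover logs an error instead of aborting the whole extract. A standalone sketch of the same pattern; the FIPS codes, year, and variable list are illustrative assumptions, and a continue is added here so only successful downloads are appended:

import censusdata
import pandas as pd

dfs = []
for fips in ["01", "60"]:  # illustrative: one state, one territory
    try:
        response = censusdata.download(
            src="acs5",
            year=2019,
            geo=censusdata.censusgeo(
                [("state", fips), ("county", "*"), ("block group", "*")]
            ),
            var=["B23025_005E"],  # illustrative ACS variable
        )
    except ValueError:
        # ValueError is what the hunk above catches when a download is unavailable.
        print(f"Could not download data for state/territory with FIPS code {fips}")
        continue
    dfs.append(response)

df = pd.concat(dfs) if dfs else pd.DataFrame()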
@@ -1,4 +1,5 @@
 import pandas as pd
+from pandas.errors import EmptyDataError

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
@@ -26,10 +27,6 @@ class HousingTransportationETL(ExtractTransformLoad):
                 f"Downloading housing data for state/territory with FIPS code {fips}"
             )

-            # Puerto Rico has no data, so skip
-            if fips == "72":
-                continue
-
             unzip_file_from_url(
                 f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
             )
@@ -38,7 +35,13 @@ class HousingTransportationETL(ExtractTransformLoad):
             tmp_csv_file_path = (
                 zip_file_dir / f"htaindex_data_blkgrps_{fips}.csv"
             )
-            tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+
+            try:
+                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+            except EmptyDataError:
+                logger.error(
+                    f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
+                )

             dfs.append(tmp_df)
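The Housing and Transportation loop gets the same treatment: an empty CSV for a territory raises EmptyDataError inside pd.read_csv, which is now logged rather than fatal. A minimal sketch of that failure path, using a throwaway empty file as an illustrative stand-in and skipping the append when the read fails:

from pathlib import Path

import pandas as pd
from pandas.errors import EmptyDataError

# Illustrative: an empty CSV standing in for a territory with no H+T data.
empty_csv = Path("htaindex_data_blkgrps_demo.csv")
empty_csv.touch()

dfs = []
try:
    tmp_df = pd.read_csv(filepath_or_buffer=empty_csv)
except EmptyDataError:
    # pandas raises EmptyDataError when a file has no columns to parse.
    print(f"Could not read Housing and Transportation data from {empty_csv}")
else:
    dfs.append(tmp_df)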
@@ -46,25 +46,31 @@ def get_module_logger(module_name: str) -> logging.Logger:
 logger = get_module_logger(__name__)


-def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
+def remove_files_from_dir(
+    files_path: Path, extension: str = None, exception_list: list = None
+) -> None:
     """Removes all files from a specific directory with the exception of __init__.py
     files or files with a specific extension

     Args:
         files_path (pathlib.Path): Name of the directory where the files will be deleted
         extension (str): Extension of the file pattern to delete, example "json" (optional)
+        exception_list (list): List of files to not remove (optional)

     Returns:
         None

     """
     for file in os.listdir(files_path):
-        if extension:
-            if not file.endswith(extension):
-                continue
-        else:
-            # don't rempove __init__ files as they conserve dir structure
-            if file == "__init__.py":
-                continue
+        # don't rempove __init__ files as they conserve dir structure
+        if file == "__init__.py":
+            continue
+
+        if exception_list:
+            if file in exception_list:
+                continue
+        elif extension:
+            if not file.endswith(extension):
+                continue
         os.remove(files_path / file)
         logger.info(f"Removing {file}")
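As rewritten, remove_files_from_dir always skips __init__.py, and a supplied exception_list takes precedence over extension: the elif means the extension filter is not applied at all in that case. A short usage sketch, assuming the function above is in scope and using an illustrative scratch directory:

import os
from pathlib import Path

# Illustrative scratch directory.
demo_dir = Path("remove_files_demo")
demo_dir.mkdir(exist_ok=True)
for name in ["__init__.py", "keep.csv", "drop.csv", "notes.txt"]:
    (demo_dir / name).touch()

# Because exception_list is given, the ".csv" filter is skipped entirely:
# notes.txt is removed too, and only keep.csv and __init__.py remain.
remove_files_from_dir(demo_dir, extension=".csv", exception_list=["keep.csv"])
print(sorted(os.listdir(demo_dir)))  # ['__init__.py', 'keep.csv']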