Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 01:54:18 -08:00)
User Story 2152 – Clean up logging (#2155)
Update logging messages and improve message consistency. This update changes the level of many log messages: rather than logging everything at the info level, it differentiates between debug, info, warning, and error messages. It also changes the default log level to info to avoid much of the noise previously in the logs, removes many redundant log messages, and adds decorative title banners at the beginning of each pipeline run.
parent 7cfb56476e
commit 03a6d3c660
63 changed files with 307 additions and 339 deletions
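For orientation, the banner-style output comes from three small helpers (log_title, log_info, log_goodbye) that the commit adds to the CLI module; their full definitions and call sites appear in the first file of the diff below. A minimal sketch, condensed from that diff, of what a pipeline command now logs:

# Condensed from the application diff below; `logger` is the module logger
# returned by get_module_logger(__name__), and LOG_LINE_WIDTH = 60.
def log_title(title: str, subtitle: str = None):
    """Logs a title in our fancy title format"""
    logger.info("-" * LOG_LINE_WIDTH)
    logger.info("")
    logger.info(f"{title}")
    if subtitle:
        logger.info(f"{subtitle}")
    logger.info("")
    logger.info("-" * LOG_LINE_WIDTH)
    logger.info("")


def log_info(info: str):
    """Logs a general informational message"""
    logger.info(f"- {info}")


# A command such as census_cleanup then starts with:
#     log_title("Clean Up Census Data")
#     ...
#     log_info("Cleaning up all census data")
# which, ignoring the log formatter's prefix, prints a dashed banner followed
# by "- Cleaning up all census data".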
@@ -28,6 +28,8 @@ logger = get_module_logger(__name__)
 
 dataset_cli_help = "Grab the data from either 'local' for local access or 'aws' to retrieve from Justice40 S3 repository"
 
+LOG_LINE_WIDTH = 60
+
 
 @click.group()
 def cli():
@@ -37,23 +39,26 @@ def cli():
 @cli.command(help="Clean up all census data folders")
 def census_cleanup():
     """CLI command to clean up the census data folder"""
+    log_title("Clean Up Census Data")
 
     data_path = settings.APP_ROOT / "data"
 
     # census directories
-    logger.info("Initializing all census data")
+    log_info("Cleaning up all census data")
     census_reset(data_path)
 
-    logger.info("Cleaned up all census data files")
+    log_goodbye()
     sys.exit()
 
 
 @cli.command(help="Clean up all data folders")
 def data_cleanup():
     """CLI command to clean up the all the data folders"""
+    log_title("Clean Up Data ")
 
     data_path = settings.APP_ROOT / "data"
 
+    log_info("Cleaning up all data folders")
     census_reset(data_path)
     data_folder_cleanup()
     tribal_reset(data_path)
@@ -61,7 +66,7 @@ data_cleanup():
     temp_folder_cleanup()
     geo_score_folder_cleanup()
 
-    logger.info("Cleaned up all data folders")
+    log_goodbye()
     sys.exit()
 
 
@@ -77,19 +82,19 @@ def data_cleanup():
 def census_data_download(zip_compress):
     """CLI command to download all census shape files from the Census FTP and extract the geojson
     to generate national and by state Census Block Group CSVs"""
+    log_title("Download Census Data ")
-    logger.info("Initializing all census data")
 
     data_path = settings.APP_ROOT / "data"
     census_reset(data_path)
 
-    logger.info("Downloading census data")
+    log_info("Downloading census data")
     etl_runner("census")
 
     if zip_compress:
+        log_info("Zipping census data")
         zip_census_data()
 
-    logger.info("Completed downloading census data")
+    log_goodbye()
     sys.exit()
 
 
@@ -103,10 +108,14 @@ def census_data_download(zip_compress):
     help=dataset_cli_help,
 )
 def pull_census_data(data_source: str):
-    logger.info("Pulling census data from %s", data_source)
+    log_title("Pull Census Data")
 
+    log_info(f"Pulling census data from {data_source}")
     data_path = settings.APP_ROOT / "data" / "census"
     check_census_data_source(data_path, data_source)
-    logger.info("Finished pulling census data")
+    log_goodbye()
     sys.exit()
 
 
@@ -129,8 +138,12 @@ def etl_run(dataset: str):
     Returns:
         None
     """
+    log_title("Run ETL")
 
+    log_info("Running dataset(s)")
     etl_runner(dataset)
 
+    log_goodbye()
     sys.exit()
 
 
@@ -139,9 +152,15 @@ def etl_run(dataset: str):
 )
 def score_run():
     """CLI command to generate the score"""
+    log_title("Score", "Generate Score")
 
+    log_info("Cleaning up data folders")
     score_folder_cleanup()
 
+    log_info("Generating score")
     score_generate()
 
+    log_goodbye()
     sys.exit()
 
 
@@ -150,63 +169,25 @@ def score_run():
 )
 def score_full_run():
     """CLI command to run ETL and generate the score in one command"""
+    log_title("Score Full Run", "Run ETL and Generate Score (no tiles)")
 
+    log_info("Cleaning up data folders")
     data_folder_cleanup()
     score_folder_cleanup()
     temp_folder_cleanup()
 
+    log_info("Running all ETLs")
     etl_runner()
 
+    log_info("Generating score")
     score_generate()
-    sys.exit()
 
+    log_goodbye()
-@cli.command(help="Generate Geojson files with scores baked in")
-@click.option(
-    "-s",
-    "--data-source",
-    default="local",
-    required=False,
-    type=str,
-    help=dataset_cli_help,
-)
-def geo_score(data_source: str):
-    """CLI command to combine score with GeoJSON data and generate low and high files
-
-    Args:
-        data_source (str): Source for the census data (optional)
-        Options:
-        - local: fetch census and score data from the local data directory
-        - aws: fetch census and score from AWS S3 J40 data repository
-
-    Returns:
-        None
-    """
-
-    geo_score_folder_cleanup()
-    score_geo(data_source=data_source)
     sys.exit()
 
 
 @cli.command(
-    help="Generate map tiles. Pass -t to generate tribal layer as well.",
+    help="Run etl_score_post to create score csv, tile csv, and downloadable zip"
-)
-@click.option(
-    "-t",
-    "--generate-tribal-layer",
-    default=False,
-    required=False,
-    is_flag=True,
-    type=bool,
-)
-def generate_map_tiles(generate_tribal_layer):
-    """CLI command to generate the map tiles"""
-
-    data_path = settings.APP_ROOT / "data"
-    generate_tiles(data_path, generate_tribal_layer)
-    sys.exit()
-
-
-@cli.command(
-    help="Run etl_score_post to create score csv, tile csv, and downloadable zip",
 )
 @click.option(
     "-s",
@@ -228,9 +209,74 @@ def generate_score_post(data_source: str):
     Returns:
         None
     """
+    log_title(
+        "Generate Score Post ", "Create Score CSV, Tile CSV, Downloadable ZIP"
+    )
+
+    log_info("Cleaning up downloadable folder")
     downloadable_cleanup()
 
+    log_info("Running score post activities")
     score_post(data_source)
+
+    log_goodbye()
+    sys.exit()
+
+
+@cli.command(help="Generate GeoJSON files with scores baked in")
+@click.option(
+    "-s",
+    "--data-source",
+    default="local",
+    required=False,
+    type=str,
+    help=dataset_cli_help,
+)
+def geo_score(data_source: str):
+    """CLI command to combine score with GeoJSON data and generate low and high files
+
+    Args:
+        data_source (str): Source for the census data (optional)
+        Options:
+        - local: fetch census and score data from the local data directory
+        - aws: fetch census and score from AWS S3 J40 data repository
+
+    Returns:
+        None
+    """
+    log_title("Generate GeoJSON", "Combine Score and GeoJSON")
+
+    log_info("Cleaning up geo score folder")
+    geo_score_folder_cleanup()
+
+    log_info("Combining score with GeoJSON")
+    score_geo(data_source=data_source)
+
+    log_goodbye()
+    sys.exit()
+
+
+@cli.command(
+    help="Generate map tiles. Pass -t to generate tribal layer as well.",
+)
+@click.option(
+    "-t",
+    "--generate-tribal-layer",
+    default=False,
+    required=False,
+    is_flag=True,
+    type=bool,
+)
+def generate_map_tiles(generate_tribal_layer):
+    """CLI command to generate the map tiles"""
+    log_title("Generate Map Tiles")
+
+    data_path = settings.APP_ROOT / "data"
+
+    log_info("Generating tiles")
+    generate_tiles(data_path, generate_tribal_layer)
+
+    log_goodbye()
     sys.exit()
 
 
@@ -264,49 +310,74 @@ def data_full_run(check: bool, data_source: str):
     Returns:
         None
     """
+    log_title("Full Run", "Census DL, ETL, Score, Combine, Generate Tiles")
 
     data_path = settings.APP_ROOT / "data"
 
     if check:
         if not check_first_run():
             # check if the data full run has been run before
-            logger.info("*** The data full run was already executed")
+            log_info("The data full run was already executed")
             sys.exit()
 
     else:
         # census directories
-        logger.info("*** Initializing all data folders")
+        log_info("Cleaning up data folders")
         census_reset(data_path)
         data_folder_cleanup()
         score_folder_cleanup()
         temp_folder_cleanup()
 
     if data_source == "local":
-        logger.info("*** Downloading census data")
+        log_info("Downloading census data")
         etl_runner("census")
 
-    logger.info("*** Running all ETLs")
+    log_info("Running all ETLs")
     etl_runner()
 
-    logger.info("*** Generating Score")
+    log_info("Generating score")
     score_generate()
 
-    logger.info("*** Running Post Score scripts")
+    log_info("Running post score")
     downloadable_cleanup()
     score_post(data_source)
 
-    logger.info("*** Combining Score with Census Geojson")
+    log_info("Combining score with census GeoJSON")
     score_geo(data_source)
 
-    logger.info("*** Generating Map Tiles")
+    log_info("Generating map tiles")
     generate_tiles(data_path, True)
 
+    log_info("Completing pipeline")
     file = "first_run.txt"
     cmd = f"touch {data_path}/{file}"
     call(cmd, shell=True)
 
-    logger.info("*** Map data ready")
+    log_goodbye()
     sys.exit()
 
 
+def log_title(title: str, subtitle: str = None):
+    """Logs a title in our fancy title format"""
+    logger.info("-" * LOG_LINE_WIDTH)
+    logger.info("")
+    logger.info(f"{title}")
+    if subtitle:
+        logger.info(f"{subtitle}")
+    logger.info("")
+    logger.info("-" * LOG_LINE_WIDTH)
+    logger.info("")
+
+
+def log_info(info: str):
+    """Logs a general informational message"""
+    logger.info(f"- {info}")
+
+
+def log_goodbye():
+    """Logs a goodbye message"""
+    logger.info("- Finished. Bye!")
+
+
 if __name__ == "__main__":
     cli()
@@ -225,8 +225,8 @@ class ExtractTransformLoad:
         # TODO: remove this once all ETL classes are converted to using the new
         # base class parameters and patterns.
         if self.GEO_LEVEL is None:
-            logger.info(
+            logger.warning(
-                "Skipping validation step for this class because it does not "
+                f"Skipping validation step for {self.__class__.__name__} because it does not "
                 "seem to be converted to new ETL class patterns."
             )
             return
@@ -331,7 +331,7 @@ class ExtractTransformLoad:
 
         Uses the directory and the file name from `self._get_output_file_path`.
         """
-        logger.info(f"Saving `{self.NAME}` CSV")
+        logger.debug(f"Saving `{self.NAME}` CSV")
 
         # Create directory if necessary.
         output_file_path = self._get_output_file_path()
@@ -342,7 +342,7 @@ class ExtractTransformLoad:
             output_file_path, index=False, float_format=float_format
         )
 
-        logger.info(f"File written to `{output_file_path}`.")
+        logger.debug(f"File written to `{output_file_path}`.")
 
     # This is a classmethod so it can be used without needing to create an instance of
     # the class. This is a use case in `etl_score`.
@@ -362,7 +362,7 @@ class ExtractTransformLoad:
                 f"No file found at `{output_file_path}`."
             )
 
-        logger.info(
+        logger.debug(
             f"Reading in CSV `{output_file_path}` for ETL of class `{cls}`."
         )
         output_df = pd.read_csv(
@@ -42,6 +42,9 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
 
 def _run_one_dataset(dataset: dict) -> None:
     """Runs one etl process."""
+
+    logger.info(f"Running ETL for {dataset['name']}")
+
     etl_module = importlib.import_module(
         f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
     )
@@ -49,21 +52,26 @@ def _run_one_dataset(dataset: dict) -> None:
     etl_instance = etl_class()
 
     # run extract
+    logger.debug(f"Extracting {dataset['name']}")
     etl_instance.extract()
 
     # run transform
+    logger.debug(f"Transforming {dataset['name']}")
     etl_instance.transform()
 
     # run load
+    logger.debug(f"Loading {dataset['name']}")
     etl_instance.load()
 
     # run validate
+    logger.debug(f"Validating {dataset['name']}")
     etl_instance.validate()
 
     # cleanup
+    logger.debug(f"Cleaning up {dataset['name']}")
     etl_instance.cleanup()
 
-    logger.info(f"Finished `etl-run` for dataset `{dataset['name']}`.")
+    logger.info(f"Finished ETL for dataset {dataset['name']}")
 
 
 def etl_runner(dataset_to_run: str = None) -> None:
@@ -94,7 +102,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
     ]
 
     if concurrent_datasets:
-        logger.info("Running concurrent jobs")
+        logger.info("Running concurrent ETL jobs")
         with concurrent.futures.ThreadPoolExecutor() as executor:
             futures = {
                 executor.submit(_run_one_dataset, dataset=dataset)
@@ -106,10 +114,10 @@ def etl_runner(dataset_to_run: str = None) -> None:
                 # Otherwise, the exceptions are silently ignored.
                 fut.result()
 
-    # Note: these high-memory datasets also usually require the Census geojson to be
-    # generated, and one of them requires the Tribal geojson to be generated.
+    # Note: these high-memory datasets also usually require the Census GeoJSON to be
+    # generated, and one of them requires the Tribal GeoJSON to be generated.
     if high_memory_datasets:
-        logger.info("Running high-memory jobs")
+        logger.info("Running high-memory ETL jobs")
         for dataset in high_memory_datasets:
            _run_one_dataset(dataset=dataset)
@@ -56,8 +56,6 @@ class ScoreETL(ExtractTransformLoad):
         self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
 
     def extract(self) -> None:
-        logger.info("Loading data sets from disk.")
-
         # EJSCreen csv Load
         ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
         self.ejscreen_df = pd.read_csv(
@@ -200,7 +198,7 @@ class ScoreETL(ExtractTransformLoad):
         )
 
     def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
-        logger.info("Joining Census Tract dataframes")
+        logger.debug("Joining Census Tract dataframes")
 
         def merge_function(
             left: pd.DataFrame, right: pd.DataFrame
@@ -317,7 +315,7 @@ class ScoreETL(ExtractTransformLoad):
             ~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
             np.nan,
         )
-        logger.info(
+        logger.debug(
             f"Creating special case column for percentiles from {input_column_name}"
         )
         df[
@@ -335,7 +333,7 @@ class ScoreETL(ExtractTransformLoad):
 
     # TODO Move a lot of this to the ETL part of the pipeline
    def _prepare_initial_df(self) -> pd.DataFrame:
-        logger.info("Preparing initial dataframe")
+        logger.debug("Preparing initial dataframe")
 
         # Join all the data sources that use census tracts
         census_tract_dfs = [
@@ -377,7 +375,7 @@ class ScoreETL(ExtractTransformLoad):
         assert (
             census_tract_df.shape[0] <= pre_join_len
         ), "Join against national tract list ADDED rows"
-        logger.info(
+        logger.debug(
             "Dropped %s tracts not in the 2010 tract data",
             pre_join_len
             - census_tract_df[field_names.GEOID_TRACT_FIELD].nunique(),
@@ -560,7 +558,7 @@ class ScoreETL(ExtractTransformLoad):
         for col in boolean_columns:
             tmp = df_copy[col].copy()
             df_copy[col] = np.where(tmp.notna(), tmp.astype(bool), None)
-            logger.info(f"{col} contains {df_copy[col].isna().sum()} nulls.")
+            logger.debug(f"{col} contains {df_copy[col].isna().sum()} nulls.")
 
         # Convert all columns to numeric and do math
         # Note that we have a few special conditions here and we handle them explicitly.
@@ -591,7 +589,7 @@ class ScoreETL(ExtractTransformLoad):
                 .astype(bool)
                 .fillna(False)
             ][field_names.GEOID_TRACT_FIELD].to_list()
-            logger.info(
+            logger.debug(
                 f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
             )
         elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
@@ -599,7 +597,7 @@ class ScoreETL(ExtractTransformLoad):
                 # 72 is the FIPS code for Puerto Rico
                 df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
             ][field_names.GEOID_TRACT_FIELD].to_list()
-            logger.info(
+            logger.debug(
                 f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
             )
 
@@ -615,7 +613,7 @@ class ScoreETL(ExtractTransformLoad):
                 df_copy[field_names.TOTAL_POP_FIELD].fillna(0)
                 <= low_population
             ][field_names.GEOID_TRACT_FIELD].to_list()
-            logger.info(
+            logger.debug(
                 f"Dropping {len(drop_tracts)} tracts from DOT traffic burden"
             )
 
@@ -666,7 +664,7 @@ class ScoreETL(ExtractTransformLoad):
         )
 
     def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame:
-        logger.info("Backfilling island demographic data")
+        logger.debug("Backfilling island demographic data")
         island_index = self._get_island_areas(df)
         for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS:
             actual_field_name = backfill_field_name.replace(
@@ -684,8 +682,6 @@ class ScoreETL(ExtractTransformLoad):
         return df
 
     def transform(self) -> None:
-        logger.info("Transforming Score Data")
-
         # prepare the df with the right CBG/tract IDs, column names/types, and percentiles
         self.df = self._prepare_initial_df()
 
@@ -696,9 +692,6 @@ class ScoreETL(ExtractTransformLoad):
         self.df = self._backfill_island_demographics(self.df)
 
     def load(self) -> None:
-        logger.info(
-            f"Saving Score CSV to {constants.DATA_SCORE_CSV_FULL_FILE_PATH}."
-        )
         constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
 
         self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)
@@ -118,7 +118,7 @@ class GeoScoreETL(ExtractTransformLoad):
         fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
 
         # TODO update this join
-        logger.info("Merging and compressing score CSV with USA GeoJSON")
+        logger.info("Merging and compressing score csv with USA GeoJSON")
         self.geojson_score_usa_high = self.score_usa_df.set_index(
             self.GEOID_FIELD_NAME
         ).merge(
@@ -143,7 +143,7 @@ class GeoScoreETL(ExtractTransformLoad):
             columns={self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO}
         )
 
-        logger.info("Converting geojson into geodf with tracts")
+        logger.info("Converting GeoJSON into GeoDataFrame with tracts")
         usa_tracts = gpd.GeoDataFrame(
             usa_tracts,
             columns=[
@@ -154,15 +154,15 @@ class GeoScoreETL(ExtractTransformLoad):
             crs="EPSG:4326",
         )
 
-        logger.info("Creating buckets from tracts")
+        logger.debug("Creating buckets from tracts")
         usa_bucketed, keep_high_zoom_df = self._create_buckets_from_tracts(
             usa_tracts, self.NUMBER_OF_BUCKETS
         )
 
-        logger.info("Aggregating buckets")
+        logger.debug("Aggregating buckets")
         usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean")
 
-        logger.info("Breaking up polygons")
+        logger.debug("Breaking up polygons")
         compressed = self._breakup_multipolygons(
             usa_aggregated, self.NUMBER_OF_BUCKETS
         )
@@ -220,7 +220,7 @@ class GeoScoreETL(ExtractTransformLoad):
                 len(state_tracts.index) / self.NUMBER_OF_BUCKETS
             )
 
-            logger.info(
+            logger.debug(
                 f"The number of buckets has increased to {self.NUMBER_OF_BUCKETS}"
             )
             for i in range(len(state_tracts.index)):
@@ -62,7 +62,7 @@ class PostScoreETL(ExtractTransformLoad):
     # End YAML definition constants
 
     def _extract_counties(self, county_path: Path) -> pd.DataFrame:
-        logger.info("Reading Counties CSV")
+        logger.debug("Reading Counties CSV")
         return pd.read_csv(
             county_path,
             sep="\t",
@@ -75,7 +75,7 @@ class PostScoreETL(ExtractTransformLoad):
         )
 
     def _extract_states(self, state_path: Path) -> pd.DataFrame:
-        logger.info("Reading States CSV")
+        logger.debug("Reading States CSV")
         return pd.read_csv(
             state_path,
             dtype={"fips": "string", "state_abbreviation": "string"},
@@ -83,7 +83,7 @@ class PostScoreETL(ExtractTransformLoad):
         )
 
     def _extract_score(self, score_path: Path) -> pd.DataFrame:
-        logger.info("Reading Score CSV")
+        logger.debug("Reading Score CSV")
         df = pd.read_csv(
             score_path,
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
@@ -98,8 +98,6 @@ class PostScoreETL(ExtractTransformLoad):
         return df
 
     def extract(self) -> None:
-        logger.info("Starting Extraction")
-
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
@@ -170,7 +168,7 @@ class PostScoreETL(ExtractTransformLoad):
         score_df: pd.DataFrame,
     ) -> pd.DataFrame:
 
-        logger.info("Merging county info with score info")
+        logger.debug("Merging county info with score info")
         score_county_merged = score_df.merge(
             # We drop state abbreviation so we don't get it twice
             counties_df[["GEOID", "County Name"]],
@@ -178,7 +176,7 @@ class PostScoreETL(ExtractTransformLoad):
             how="left",
         )
 
-        logger.info("Merging state info with county-score info")
+        logger.debug("Merging state info with county-score info")
         # Here, we need to join on a separate key, since there's no
         # entry for the island areas in the counties df (there are no
         # counties!) Thus, unless we join state separately from county,
@@ -207,7 +205,7 @@ class PostScoreETL(ExtractTransformLoad):
         score_county_state_merged_df: pd.DataFrame,
     ) -> pd.DataFrame:
 
-        logger.info("Rounding Decimals")
+        logger.debug("Rounding Decimals")
         # grab all the keys from tiles score columns
         tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
 
@@ -218,7 +216,7 @@ class PostScoreETL(ExtractTransformLoad):
 
         # We may not want some states/territories on the map, so this will drop all
         # rows with those FIPS codes (first two digits of the census tract)
-        logger.info(
+        logger.debug(
             f"Dropping specified FIPS codes from tile data: {constants.DROP_FIPS_CODES}"
         )
         tracts_to_drop = []
@@ -241,7 +239,7 @@ class PostScoreETL(ExtractTransformLoad):
             score_tiles[float_cols] * scale_factor
         ).apply(np.floor) / scale_factor
 
-        logger.info("Adding fields for island areas and Puerto Rico")
+        logger.debug("Adding fields for island areas and Puerto Rico")
         # The below operation constructs variables for the front end.
         # Since the Island Areas, Puerto Rico, and the nation all have a different
         # set of available data, each has its own user experience.
@@ -381,8 +379,6 @@ class PostScoreETL(ExtractTransformLoad):
         return final_df
 
     def transform(self) -> None:
-        logger.info("Transforming data sources for Score + County CSVs")
-
         transformed_counties = self._transform_counties(self.input_counties_df)
         transformed_states = self._transform_states(self.input_states_df)
         transformed_score = self._transform_score(self.input_score_df)
@@ -403,7 +399,7 @@ class PostScoreETL(ExtractTransformLoad):
     def _load_score_csv_full(
         self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
     ) -> None:
-        logger.info("Saving Full Score CSV with County Information")
+        logger.debug("Saving Full Score CSV with County Information")
         score_csv_path.parent.mkdir(parents=True, exist_ok=True)
         score_county_state_merged.to_csv(
             score_csv_path,
@@ -476,7 +472,7 @@ class PostScoreETL(ExtractTransformLoad):
     def _load_tile_csv(
         self, score_tiles_df: pd.DataFrame, tile_score_path: Path
     ) -> None:
-        logger.info("Saving Tile Score CSV")
+        logger.debug("Saving Tile Score CSV")
         tile_score_path.parent.mkdir(parents=True, exist_ok=True)
         score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")
 
@@ -498,13 +494,13 @@ class PostScoreETL(ExtractTransformLoad):
             constants.SCORE_VERSIONING_DATA_DOCUMENTATION_ZIP_FILE_PATH
         )
 
-        logger.info("Writing downloadable excel")
+        logger.debug("Writing downloadable excel")
         excel_config = self._load_excel_from_df(
             excel_df=self.output_score_county_state_merged_df,
             excel_path=excel_path,
         )
 
-        logger.info("Writing downloadable csv")
+        logger.debug("Writing downloadable csv")
         # open yaml config
         downloadable_csv_config = load_yaml_dict_from_file(
             self.CONTENT_CONFIG / "csv.yml", CSVConfig
@@ -516,7 +512,7 @@ class PostScoreETL(ExtractTransformLoad):
         )
         downloadable_df.to_csv(csv_path, index=False)
 
-        logger.info("Creating codebook for download zip")
+        logger.debug("Creating codebook for download zip")
 
         # consolidate all excel fields from the config yml. The codebook
         # code takes in a list of fields, but the excel config file
@@ -562,17 +558,17 @@ class PostScoreETL(ExtractTransformLoad):
         codebook_df.to_csv(codebook_path, index=False)
 
         # zip assets
-        logger.info("Compressing csv files")
+        logger.debug("Compressing csv files")
         files_to_compress = [csv_path, codebook_path, readme_path]
         zip_files(csv_zip_path, files_to_compress)
 
-        logger.info("Compressing xls files")
+        logger.debug("Compressing xls files")
         files_to_compress = [excel_path, codebook_path, readme_path]
         zip_files(xls_zip_path, files_to_compress)
 
         # Per #1557
         # zip file that contains the .xls, .csv, .pdf, tech support document, checksum file
-        logger.info("Compressing data and documentation files")
+        logger.debug("Compressing data and documentation files")
         files_to_compress = [
             excel_path,
             csv_path,
@@ -47,14 +47,14 @@ def check_score_data_source(
 
     # download from s3 if census_data_source is aws
     if score_data_source == "aws":
-        logger.info("Fetching Score Tile data from AWS S3")
+        logger.debug("Fetching Score Tile data from AWS S3")
         download_file_from_url(
             file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
         )
     else:
         # check if score data is found locally
         if not os.path.isfile(TILE_SCORE_CSV):
-            logger.info(
+            logger.warning(
                 "No local score tiles data found. Please use '-s aws` to fetch from AWS"
             )
             sys.exit()
@@ -409,7 +409,7 @@ def compare_to_list_of_expected_state_fips_codes(
             f"{sorted(list(actual_state_fips_codes_set - expected_states_set))}\n"
         )
     else:
-        logger.info(
+        logger.debug(
             "Data matches expected state and territory representation"
             f"{dataset_name_phrase}."
         )
@@ -33,15 +33,12 @@ class CalEnviroScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame
 
     def extract(self) -> None:
-        logger.info("Downloading CalEnviroScreen Data")
         super().extract(
             self.CALENVIROSCREEN_FTP_URL,
             self.get_tmp_path(),
         )
 
     def transform(self) -> None:
-        logger.info("Transforming CalEnviroScreen Data")
-
         # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
         # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
         # Load comparison index (CalEnviroScreen 4)
@@ -70,7 +67,6 @@ class CalEnviroScreenETL(ExtractTransformLoad):
         )
 
     def load(self) -> None:
-        logger.info("Saving CalEnviroScreen CSV")
         # write nationwide csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
@@ -81,7 +81,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         return df
 
     def extract(self) -> None:
-        logger.info("Starting data download.")
 
         all_usa_raw_df = self._download_and_prep_data(
             file_url=self.USA_FILE_URL,
@@ -102,13 +101,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
             additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
         )
 
-        logger.info("Downloading data for Maine")
+        logger.debug("Downloading data for Maine")
         maine_raw_df = self._download_and_prep_data(
             file_url=self.MAINE_FILE_URL,
             download_file_name=self.get_tmp_path() / "maine.csv",
         )
 
-        logger.info("Downloading data for Wisconsin")
+        logger.debug("Downloading data for Wisconsin")
         wisconsin_raw_df = self._download_and_prep_data(
             file_url=self.WISCONSIN_FILE_URL,
             download_file_name=self.get_tmp_path() / "wisconsin.csv",
@@ -138,7 +137,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         self.raw_df = combined_df
 
     def transform(self) -> None:
-        logger.info("Starting CDC life expectancy transform.")
 
         self.output_df = self.raw_df.rename(
             columns={
@@ -148,7 +146,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         )
 
     def load(self) -> None:
-        logger.info("Saving CDC Life Expectancy CSV")
 
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.output_df[self.COLUMNS_TO_KEEP].to_csv(
@@ -44,7 +44,6 @@ class CDCPlacesETL(ExtractTransformLoad):
         self.df: pd.DataFrame
 
     def extract(self) -> None:
-        logger.info("Starting to download 520MB CDC Places file.")
         file_path = download_file_from_url(
             file_url=self.CDC_PLACES_URL,
             download_file_name=self.get_tmp_path() / "census_tract.csv",
@@ -57,8 +56,6 @@ class CDCPlacesETL(ExtractTransformLoad):
         )
 
     def transform(self) -> None:
-        logger.info("Starting CDC Places transform")
-
         # Rename GEOID field
         self.df.rename(
             columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},
@@ -48,7 +48,6 @@ class CDCSVIIndex(ExtractTransformLoad):
         self.df: pd.DataFrame
 
     def extract(self) -> None:
-        logger.info("Downloading 43 MB CDC SVI INDEX")
         self.df = pd.read_csv(
             filepath_or_buffer=self.CDC_SVI_INDEX_URL,
             dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
@@ -56,7 +55,6 @@ class CDCSVIIndex(ExtractTransformLoad):
         )
 
     def transform(self) -> None:
-        logger.info("Starting CDC SVI INDEX transform")
         # Note: In this dataset all US census tracts are ranked against one another.
         # Puerto Rico is not included in this dataset
         self.df.rename(
@@ -109,8 +107,6 @@ class CDCSVIIndex(ExtractTransformLoad):
         )
 
     def load(self) -> None:
-        logger.info("Saving CDC SVI Index Data")
-
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
 
         self.df[self.COLUMNS_TO_KEEP].to_csv(
@@ -70,14 +70,9 @@ class CensusETL(ExtractTransformLoad):
             None
         """
         shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
-        logger.info(f"Checking if {fips_code} shp file exists")
 
         # check if file exists
         if not shp_file_path.is_file():
-            logger.info(
-                f"{fips_code} shp file does not exist. Downloading and extracting shape file"
-            )
-
             tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
             unzip_file_from_url(
                 tract_state_url,
@@ -86,8 +81,11 @@ class CensusETL(ExtractTransformLoad):
             )
 
     def extract(self) -> None:
-        logger.info("Downloading Census Data")
+        logger.debug("Extracting census data")
-        for fips_code in self.STATE_FIPS_CODES:
+        for index, fips_code in enumerate(self.STATE_FIPS_CODES):
+            logger.debug(
+                f"Extracting shape for FIPS {fips_code} – {index+1} of {len(self.STATE_FIPS_CODES)}"
+            )
             self._extract_shp(fips_code)
 
     def _transform_to_geojson(self, fips_code: str) -> None:
@@ -100,11 +98,8 @@ class CensusETL(ExtractTransformLoad):
         geojson_file_path = self._path_for_fips_file(
             fips_code, GeoFileType.GEOJSON
         )
-        logger.info(f"Checking if {fips_code} geoJSON file exists ")
         if not geojson_file_path.is_file():
-            logger.info(
-                f"GeoJSON file {fips_code} does not exist. Converting shp to geoJSON"
-            )
             cmd = [
                 "ogr2ogr",
                 "-f",
@@ -120,9 +115,11 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             None
         """
+        logger.debug("Transforming tracts")
+
         for file in self.GEOJSON_BASE_PATH.iterdir():
             if file.suffix == ".json":
-                logger.info(f"Ingesting geoid10 for file {file}")
+                logger.debug(f"Adding GEOID10 for file {file.name}")
                 with open(self.GEOJSON_BASE_PATH / file, encoding="utf-8") as f:
                     geojson = json.load(f)
                     for feature in geojson["features"]:
@@ -142,13 +139,19 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Transforming Census Data")
+        logger.debug("Transforming census data")
-        for fips_code in self.STATE_FIPS_CODES:
+        logger.debug("Transforming SHP files to GeoJSON")
+        for index, fips_code in enumerate(self.STATE_FIPS_CODES):
+            logger.debug(
+                f"Transforming FIPS {fips_code} to GeoJSON – {index+1} of {len(self.STATE_FIPS_CODES)}"
+            )
             self._transform_to_geojson(fips_code)
 
         self._generate_tract_table()
 
     def _load_into_state_csvs(self, fips_code: str) -> None:
-        """Load state CSVS into individual CSV files
+        """Load state CSVs into individual CSV files
 
         Args:
             fips_code (str): the FIPS code for the region of interest
@@ -182,10 +185,9 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Writing national us.csv file")
+        logger.debug("Loading national US.csv")
 
         if not self.NATIONAL_TRACT_CSV_PATH.is_file():
-            logger.info(f"Creating {self.NATIONAL_TRACT_CSV_PATH}")
             with open(
                 self.NATIONAL_TRACT_CSV_PATH,
                 mode="w",
@@ -211,22 +213,21 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Generating national geojson file")
+        logger.debug("Loading National GeoJson")
 
         usa_df = gpd.GeoDataFrame()
 
         for file_name in self.GEOJSON_BASE_PATH.rglob("*.json"):
-            logger.info(f"Ingesting {file_name}")
+            logger.debug(f"Adding national GeoJSON file {file_name.name}")
             state_gdf = gpd.read_file(file_name)
             usa_df = usa_df.append(state_gdf)
 
         usa_df = usa_df.to_crs(
             "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
         )
-        logger.info("Writing national geojson file")
-        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
 
-        logger.info("Census tract downloading complete")
+        logger.debug("Saving national GeoJSON file")
+        usa_df.to_file(self.NATIONAL_TRACT_JSON_PATH, driver="GeoJSON")
 
     def load(self) -> None:
         """Create state CSVs, National CSV, and National GeoJSON
@@ -234,8 +235,13 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Saving Census CSV")
+        logger.debug("Loading census data")
 
+        logger.debug("Loading individual state csv files")
         for fips_code in self.TRACT_PER_STATE:
             self._load_into_state_csvs(fips_code)
 
         self._load_national_csv()
         self._load_national_geojson()
 
+        logger.debug("Census data complete")
@@ -39,7 +39,6 @@ def get_state_fips_codes(data_path: Path) -> list:
     """Returns a list with state data"""
     fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
 
-    logger.info("Downloading fips from S3 repository")
     unzip_file_from_url(
         settings.AWS_JUSTICE40_DATASOURCES_URL + "/fips_states_2010.zip",
         data_path / "tmp",
@@ -97,7 +96,6 @@ def check_census_data_source(
 
     # download from s3 if census_data_source is aws
     if census_data_source == "aws":
-        logger.info("Fetching Census data from AWS S3")
         unzip_file_from_url(
             CENSUS_DATA_S3_URL,
             DATA_PATH / "tmp",
@@ -106,14 +104,13 @@
     else:
         # check if census data is found locally
         if not os.path.isfile(census_data_path / "geojson" / "us.json"):
-            logger.info(
+            logger.error(
                 "No local census data found. Please use '-s aws` to fetch from AWS"
             )
             sys.exit()
 
 
 def zip_census_data():
-    logger.info("Compressing census files to data/tmp folder")
 
     CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
     TMP_PATH = settings.APP_ROOT / "data" / "tmp"
@ -363,18 +363,16 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
)
|
)
|
||||||
|
|
||||||
def transform(self) -> None:
|
def transform(self) -> None:
|
||||||
logger.info("Starting Census ACS Transform")
|
|
||||||
|
|
||||||
df = self.df
|
df = self.df
|
||||||
|
|
||||||
# Here we join the geometry of the US to the dataframe so that we can impute
|
# Here we join the geometry of the US to the dataframe so that we can impute
|
||||||
# The income of neighbors. first this looks locally; if there's no local
|
# The income of neighbors. first this looks locally; if there's no local
|
||||||
# geojson file for all of the US, this will read it off of S3
|
# geojson file for all of the US, this will read it off of S3
|
||||||
logger.info("Reading in geojson for the country")
|
logger.debug("Reading in geojson for the country")
|
||||||
if not os.path.exists(
|
if not os.path.exists(
|
||||||
self.DATA_PATH / "census" / "geojson" / "us.json"
|
self.DATA_PATH / "census" / "geojson" / "us.json"
|
||||||
):
|
):
|
||||||
logger.info("Fetching Census data from AWS S3")
|
logger.debug("Fetching Census data from AWS S3")
|
||||||
unzip_file_from_url(
|
unzip_file_from_url(
|
||||||
CENSUS_DATA_S3_URL,
|
CENSUS_DATA_S3_URL,
|
||||||
self.DATA_PATH / "tmp",
|
self.DATA_PATH / "tmp",
|
||||||
|
@ -406,7 +404,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
|
||||||
]:
|
]:
|
||||||
missing_value_count = sum(df[field] == -666666666)
|
missing_value_count = sum(df[field] == -666666666)
|
||||||
logger.info(
|
logger.debug(
|
||||||
f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
|
f"There are {missing_value_count} ({int(100*missing_value_count/df[field].count())}%) values of "
|
||||||
+ f"`{field}` being marked as null values."
|
+ f"`{field}` being marked as null values."
|
||||||
)
|
)
|
||||||
|
@ -591,7 +589,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
|
|
||||||
# we impute income for both income measures
|
# we impute income for both income measures
|
||||||
## TODO: Convert to pydantic for clarity
|
## TODO: Convert to pydantic for clarity
|
||||||
logger.info("Imputing income information")
|
logger.debug("Imputing income information")
|
||||||
ImputeVariables = namedtuple(
|
ImputeVariables = namedtuple(
|
||||||
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
|
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
|
||||||
)
|
)
|
||||||
|
@ -612,7 +610,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
|
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info("Calculating with imputed values")
|
logger.debug("Calculating with imputed values")
|
||||||
|
|
||||||
df[
|
df[
|
||||||
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
|
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
|
||||||
|
@ -644,7 +642,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
== 0
|
== 0
|
||||||
), "Error: not all values were filled..."
|
), "Error: not all values were filled..."
|
||||||
|
|
||||||
logger.info("Renaming columns...")
|
logger.debug("Renaming columns...")
|
||||||
df = df.rename(
|
df = df.rename(
|
||||||
columns={
|
columns={
|
||||||
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME: field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
|
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME: field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
|
||||||
|
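The ACS transform above counts the -666666666 sentinel the Census API uses for suppressed values and reports the share at debug level. A self-contained sketch of that same bookkeeping on a toy DataFrame, useful for checking the percentage arithmetic in isolation; the column name and logger name here are invented for the example.

    import logging

    import pandas as pd

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("acs_missing_values")

    # Value the ACS API uses for suppressed or unavailable estimates.
    ACS_SENTINEL = -666666666

    df = pd.DataFrame(
        {"Median household income": [52000, ACS_SENTINEL, 61000, ACS_SENTINEL, 48000]}
    )

    for field in ["Median household income"]:
        missing_value_count = int((df[field] == ACS_SENTINEL).sum())
        share = 100 * missing_value_count / df[field].count()
        logger.debug(
            "There are %s (%d%%) values of `%s` marked as null values.",
            missing_value_count,
            share,
            field,
        )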
@@ -88,7 +88,7 @@ def _prepare_dataframe_for_imputation(
     ][geoid_field].unique()

     # Check that imputation is a valid choice for this set of fields
-    logger.info(f"Imputing values for {len(tract_list)} unique tracts.")
+    logger.debug(f"Imputing values for {len(tract_list)} unique tracts.")
     assert len(tract_list) > 0, "Error: No missing values to impute"

     return tract_list, geo_df

@@ -156,7 +156,7 @@ def calculate_income_measures(
         mask_to_use
     ][impute_var_pair.raw_field_name].mean()

-    logger.info("Casting geodataframe as a typical dataframe")
+    logger.debug("Casting geodataframe as a typical dataframe")
     # get rid of the geometry column and cast as a typical df
     df = pd.DataFrame(
         geo_df[[col for col in geo_df.columns if col != "geometry"]]

@@ -30,14 +30,14 @@ def retrieve_census_acs_data(
     dfs = []
     for fips in get_state_fips_codes(data_path_for_fips_codes):
         if fips in CENSUS_ACS_FIPS_CODES_TO_SKIP:
-            logger.info(
+            logger.debug(
                 f"Skipping download for state/territory with FIPS code {fips}"
             )
         else:
             census_api_key = ""
             if os.environ.get("CENSUS_API_KEY"):
                 census_api_key = "with API key"
-            logger.info(
+            logger.debug(
                 f"Downloading data for state/territory with FIPS code {fips} {census_api_key}"
             )

@@ -55,7 +55,7 @@ def retrieve_census_acs_data(

         except ValueError as e:
             logger.error(
-                f"Could not download data for state/territory with FIPS code {fips}"
+                f"Could not download data for state/territory with FIPS code {fips} because {e}"
             )
             raise e
@@ -100,7 +100,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Starting Census 2010 ACS Transform")
         # Define the variables to retrieve
         variables = (
             self.UNEMPLOYED_FIELDS

@@ -118,8 +117,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Starting Census 2010 ACS Transform")
-
         df = self.df

         # Calculate percent unemployment.

@@ -184,8 +181,6 @@ class CensusACS2010ETL(ExtractTransformLoad):
         self.df = output_df

     def load(self) -> None:
-        logger.info("Saving Census 2010 ACS Data")
-
         # mkdir census
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

@@ -224,7 +224,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         return state_median_incomes_df

     def extract(self) -> None:
-        logger.info("Starting four separate downloads.")
         # Load and clean GEOCORR data
         # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
         # The specific query used is the following, which takes a couple of minutes to run:

@@ -239,7 +238,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         # and with the "target geographies" selected being:
         # - Core based statistical area (CBSA)
         # - CBSA Type (Metro or Micro)
-        logger.info("Starting download of 1.5MB Geocorr information.")
+        logger.debug("Starting download of 1.5MB Geocorr information.")

         unzip_file_from_url(
             file_url=settings.AWS_JUSTICE40_DATASOURCES_URL

@@ -265,7 +264,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             low_memory=False,
         )

-        logger.info("Pulling PR tract list down.")
+        logger.debug("Pulling PR tract list down.")
         # This step is necessary because PR is not in geocorr at the level that gets joined
         pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
         download_file_from_url(

@@ -282,7 +281,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.pr_tracts["State Abbreviation"] = "PR"

         # Download MSA median incomes
-        logger.info("Starting download of MSA median incomes.")
+        logger.debug("Starting download of MSA median incomes.")
         download = requests.get(
             self.MSA_MEDIAN_INCOME_URL,
             verify=None,

@@ -291,7 +290,7 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.msa_median_incomes = json.loads(download.content)

         # Download state median incomes
-        logger.info("Starting download of state median incomes.")
+        logger.debug("Starting download of state median incomes.")
         download_state = requests.get(
             self.STATE_MEDIAN_INCOME_URL,
             verify=None,

@@ -301,8 +300,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         ## NOTE we already have PR's MI here

     def transform(self) -> None:
-        logger.info("Starting transforms.")
-
         # Run transforms:
         geocorr_df = self._transform_geocorr()
         msa_median_incomes_df = self._transform_msa_median_incomes()

@@ -352,8 +349,6 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.output_df = merged_with_state_income_df

     def load(self) -> None:
-        logger.info("Saving Census ACS Median Income CSV")
-
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.output_df[self.COLUMNS_TO_KEEP].to_csv(
             path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
@@ -352,7 +352,7 @@ class CensusDecennialETL(ExtractTransformLoad):
         dfs = []
         dfs_vi = []
         for island in self.ISLAND_TERRITORIES:
-            logger.info(
+            logger.debug(
                 f"Downloading data for state/territory {island['state_abbreviation']}"
             )
             for county in island["county_fips"]:

@@ -369,7 +369,13 @@ class CensusDecennialETL(ExtractTransformLoad):
                     timeout=settings.REQUESTS_DEFAULT_TIMOUT,
                 )

-                df = json.loads(download.content)
+                try:
+                    df = json.loads(download.content)
+                except ValueError as e:
+                    logger.error(
+                        f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+                    )

                 # First row is the header
                 df = pd.DataFrame(df[1:], columns=df[0])

@@ -393,8 +399,6 @@ class CensusDecennialETL(ExtractTransformLoad):
         self.df_vi = pd.concat(dfs_vi)

     def transform(self) -> None:
-        logger.info("Starting Census Decennial Transform")
-
         # Rename All Fields
         self.df.rename(columns=self.FIELD_NAME_XWALK, inplace=True)
         self.df_vi.rename(columns=self.FIELD_NAME_XWALK, inplace=True)

@@ -489,13 +493,11 @@ class CensusDecennialETL(ExtractTransformLoad):
         # Reporting Missing Values
         for col in self.df_all.columns:
             missing_value_count = self.df_all[col].isnull().sum()
-            logger.info(
+            logger.debug(
                 f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
             )

     def load(self) -> None:
-        logger.info("Saving Census Decennial Data")
-
         # mkdir census
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
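The try/except added above logs a parse failure together with the offending payload instead of failing silently. A standalone sketch of that guard pattern, with a hypothetical fetch helper and an explicit re-raise so the caller still sees the failure; the function name and URL handling are illustrative only, not the project's exact code.

    import json
    import logging

    import requests

    logger = logging.getLogger("decennial_download")


    def parse_census_response(url: str) -> list:
        """Fetch a Census API endpoint and parse it, logging the raw payload on failure."""
        download = requests.get(url, timeout=30)
        try:
            return json.loads(download.content)
        except ValueError as e:
            # Log enough context to debug a bad response, then propagate the error.
            logger.error(
                "Could not parse response from %s because %s. Content is %r.",
                url,
                e,
                download.content[:200],
            )
            raise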
@@ -65,14 +65,12 @@ class ChildOpportunityIndex(ExtractTransformLoad):
         self.output_df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Starting 51MB data download.")
         super().extract(
             source_url=self.SOURCE_URL,
             extract_path=self.get_tmp_path(),
         )

     def transform(self) -> None:
-        logger.info("Starting transforms.")
         raw_df = pd.read_csv(
             filepath_or_buffer=self.get_tmp_path() / "raw.csv",
             # The following need to remain as strings for all of their digits, not get

@@ -30,7 +30,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
         self.output_df: pd.DataFrame

     def transform(self) -> None:
-        logger.info("Starting DOE Energy Burden transforms.")
         raw_df: pd.DataFrame = pd.read_csv(
             filepath_or_buffer=self.get_tmp_path()
             / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",

@@ -41,7 +40,7 @@ class DOEEnergyBurden(ExtractTransformLoad):
             low_memory=False,
         )

-        logger.info("Renaming columns and ensuring output format is correct")
+        logger.debug("Renaming columns and ensuring output format is correct")
         output_df = raw_df.rename(
             columns={
                 self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,

@@ -53,7 +53,6 @@ class TravelCompositeETL(ExtractTransformLoad):
         - Renames the Census Tract column to match the other datasets
         - Converts to CSV
         """
-        logger.info("Transforming DOT Travel Disadvantage Data")

         # read in the unzipped shapefile from data source
         # reformat it to be standard df, remove unassigned rows, and

@@ -60,7 +60,6 @@ class AbandonedMineETL(ExtractTransformLoad):
         self.output_df: pd.DataFrame

     def transform(self) -> None:
-        logger.info("Starting eAMLIS transforms.")
         df = pd.read_csv(
             self.get_tmp_path() / "eAMLIS export of all data.tsv",
             sep="\t",

@@ -44,7 +44,6 @@ class EJSCREENETL(ExtractTransformLoad):
         ]

     def extract(self) -> None:
-        logger.info("Downloading EJScreen Data")
         super().extract(
             self.EJSCREEN_FTP_URL,
             self.get_tmp_path(),

@@ -52,7 +51,6 @@ class EJSCREENETL(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Transforming EJScreen Data")
         self.df = pd.read_csv(
             self.EJSCREEN_CSV,
             dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
@@ -39,7 +39,7 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):

     def extract(self) -> None:
         if self.ejscreen_areas_of_concern_data_exists():
-            logger.info("Loading EJSCREEN Areas of Concern Data Locally")
+            logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
             self.df = pd.read_csv(
                 filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
                 dtype={

@@ -48,24 +48,24 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
                 low_memory=False,
             )
         else:
-            logger.info(
+            logger.warning(
                 "EJSCREEN areas of concern data does not exist locally. Not loading the data."
             )

     def transform(self) -> None:
-        logger.info("Transforming EJSCREEN Areas of Concern Data")
+        logger.debug("Transforming EJSCREEN Areas of Concern Data")

         # TO DO: As a one off we did all the processing in a separate Notebook
         # Can add here later for a future PR

     def load(self) -> None:
         if self.ejscreen_areas_of_concern_data_exists():
-            logger.info("Saving EJSCREEN Areas of Concern Data")
+            logger.debug("Saving EJSCREEN Areas of Concern Data")
             # write nationwide csv
             self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
             self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False)

         else:
-            logger.info(
+            logger.warning(
                 "EJSCREEN areas of concern data does not exist locally. Not saving the data."
            )
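This class is the clearest example of the level split the commit introduces: routine progress goes to debug, while a missing optional input is surfaced as a warning rather than buried in info noise. A small generic sketch of that decision applied to an optional input file; the function name and path handling are hypothetical.

    import logging
    from pathlib import Path
    from typing import Optional

    import pandas as pd

    logger = logging.getLogger("optional_input")


    def load_optional_csv(path: Path) -> Optional[pd.DataFrame]:
        """Load an optional dataset, logging at debug when present and warning when absent."""
        if path.is_file():
            logger.debug("Loading optional data from %s", path)
            return pd.read_csv(path)
        # The dataset is optional, so a missing file is a warning rather than an error.
        logger.warning("%s does not exist locally. Not loading the data.", path)
        return None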
@@ -49,8 +49,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Starting data download.")
-
         unzip_file_from_url(
             file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
             download_path=self.get_tmp_path(),

@@ -70,8 +68,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Starting transforms.")
-
         self.df = self.df.rename(
             columns={
                 self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

@@ -105,8 +101,6 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
         )

     def load(self) -> None:
-        logger.info("Saving CSV")
-
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(
             path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False

@@ -65,8 +65,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Starting 2.5 MB data download.")
-
         # the column headers from the above dataset are actually a census tract's data at this point
         # We will use this data structure later to specify the column names
         input_columns = [

@@ -98,8 +96,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Starting transforms.")
-
         score_columns = [x for x in self.df.columns if "SCORE" in x]

         # coerce dataframe type to perform correct next steps

@@ -157,8 +153,6 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
         )

     def load(self) -> None:
-        logger.info("Saving CSV")
-
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(
             path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
@@ -48,7 +48,6 @@ class FloodRiskETL(ExtractTransformLoad):
         - Renames the Census Tract column to match the other datasets
         - Calculates share of properties at risk, left-clipping number of properties at 250
         """
-        logger.info("Transforming National Risk Index Data")

         # read in the unzipped csv data source then rename the
         # Census Tract column for merging

@@ -48,7 +48,6 @@ class WildfireRiskETL(ExtractTransformLoad):
         - Renames the Census Tract column to match the other datasets
         - Calculates share of properties at risk, left-clipping number of properties at 250
         """
-        logger.info("Transforming National Risk Index Data")
         # read in the unzipped csv data source then rename the
         # Census Tract column for merging
         df_fsf_fire: pd.DataFrame = pd.read_csv(

@@ -16,7 +16,7 @@ logger = get_module_logger(__name__)
 def get_tract_geojson(
     _tract_data_path: Optional[Path] = None,
 ) -> gpd.GeoDataFrame:
-    logger.info("Loading tract geometry data from census ETL")
+    logger.debug("Loading tract geometry data from census ETL")
     GEOJSON_PATH = _tract_data_path
     if GEOJSON_PATH is None:
         GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH

@@ -40,7 +40,7 @@ def get_tract_geojson(
 def get_tribal_geojson(
     _tribal_data_path: Optional[Path] = None,
 ) -> gpd.GeoDataFrame:
-    logger.info("Loading Tribal geometry data from Tribal ETL")
+    logger.debug("Loading Tribal geometry data from Tribal ETL")
     GEOJSON_PATH = _tribal_data_path
     if GEOJSON_PATH is None:
         GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH
@@ -34,9 +34,6 @@ class GeoCorrETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info(
-            "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file."
-        )
         unzip_file_from_url(
             file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/geocorr_urban_rural.csv.zip",

@@ -53,7 +50,6 @@ class GeoCorrETL(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Starting GeoCorr Urban Rural Map transform")
         # Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr

         self.output_df = self.df.rename(

@@ -43,7 +43,6 @@ class HistoricRedliningETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def transform(self) -> None:
-        logger.info("Transforming Historic Redlining Data")
         # this is obviously temporary
         historic_redlining_data = pd.read_excel(
             self.HISTORIC_REDLINING_FILE_PATH

@@ -55,7 +54,7 @@ class HistoricRedliningETL(ExtractTransformLoad):
             columns={"HRS2010": self.REDLINING_SCALAR}
         )

-        logger.info(f"{historic_redlining_data.columns}")
+        logger.debug(f"{historic_redlining_data.columns}")

         # Calculate lots of different score thresholds for convenience
         for threshold in [3.25, 3.5, 3.75]:
@@ -23,7 +23,7 @@ class HousingTransportationETL(ExtractTransformLoad):
         dfs = []
         zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
         for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.info(
+            logger.debug(
                 f"Downloading housing data for state/territory with FIPS code {fips}"
             )

@@ -50,8 +50,6 @@ class HousingTransportationETL(ExtractTransformLoad):
         self.df = pd.concat(dfs)

     def transform(self) -> None:
-        logger.info("Transforming Housing and Transportation Data")
-
         # Rename and reformat tract ID
         self.df.rename(
             columns={"tract": self.GEOID_TRACT_FIELD_NAME}, inplace=True

@@ -61,7 +59,5 @@ class HousingTransportationETL(ExtractTransformLoad):
         ].str.replace('"', "")

     def load(self) -> None:
-        logger.info("Saving Housing and Transportation Data")
-
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

@@ -56,7 +56,6 @@ class HudHousingETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Extracting 1.09 GB HUD Housing Data")
         super().extract(
             self.HOUSING_FTP_URL,
             self.HOUSING_ZIP_FILE_DIR,

@@ -80,8 +79,6 @@ class HudHousingETL(ExtractTransformLoad):
         return tmp_df

     def transform(self) -> None:
-        logger.info("Transforming HUD Housing Data")
-
         table_8 = self._read_chas_table("Table8.csv")
         table_3 = self._read_chas_table("Table3.csv")
@@ -36,7 +36,6 @@ class HudRecapETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Downloading HUD Recap Data")
         download = requests.get(
             self.HUD_RECAP_CSV_URL,
             verify=None,

@@ -48,8 +47,6 @@ class HudRecapETL(ExtractTransformLoad):
         csv_file.close()

     def transform(self) -> None:
-        logger.info("Transforming HUD Recap Data")
-
         # Load comparison index (CalEnviroScreen 4)
         self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})

@@ -75,7 +72,6 @@ class HudRecapETL(ExtractTransformLoad):
         self.df.sort_values(by=self.GEOID_TRACT_FIELD_NAME, inplace=True)

     def load(self) -> None:
-        logger.info("Saving HUD Recap CSV")
         # write nationwide csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)

@@ -39,7 +39,6 @@ class MappingForEJETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Downloading Mapping for EJ Data")
         super().extract(
             self.MAPPING_FOR_EJ_VA_URL,
             self.get_tmp_path(),

@@ -50,8 +49,6 @@ class MappingForEJETL(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Transforming Mapping for EJ Data")
-
         # Join (here, it's just concatenating) the two dataframes from
         # CO and VA
         self.df = pd.concat(

@@ -86,7 +83,6 @@ class MappingForEJETL(ExtractTransformLoad):
         )

     def load(self) -> None:
-        logger.info("Saving Mapping for EJ CSV")
         # write selected states csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(

@@ -94,4 +90,4 @@ class MappingForEJETL(ExtractTransformLoad):
         )

     def validate(self) -> None:
-        logger.info("Validating Mapping For EJ Data")
+        logger.debug("Skipping validation for MappingForEJETL")
@@ -75,14 +75,12 @@ class MappingInequalityETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Downloading Mapping Inequality Data")
         download_file_from_url(
             file_url=self.MAPPING_INEQUALITY_CSV_URL,
             download_file_name=self.MAPPING_INEQUALITY_CSV,
         )

     def transform(self) -> None:
-        logger.info("Transforming Mapping Inequality Data")
         df: pd.DataFrame = pd.read_csv(
             self.MAPPING_INEQUALITY_CSV,
             dtype={self.TRACT_INPUT_FIELD: "string"},

@@ -207,7 +205,6 @@ class MappingInequalityETL(ExtractTransformLoad):
         self.df = grouped_df

     def load(self) -> None:
-        logger.info("Saving Mapping Inequality CSV")
         # write nationwide csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(

@@ -33,15 +33,13 @@ class MarylandEJScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Downloading 207MB Maryland EJSCREEN Data")
+        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
         super().extract(
             self.MARYLAND_EJSCREEN_URL,
             self.get_tmp_path(),
         )

     def transform(self) -> None:
-        logger.info("Transforming Maryland EJSCREEN Data")
-
         list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))

         # Ignore counties becauses this is not the level of measurement

@@ -105,7 +103,6 @@ class MarylandEJScreenETL(ExtractTransformLoad):
         )

     def load(self) -> None:
-        logger.info("Saving Maryland EJSCREEN CSV")
         # write maryland tracts to csv
         self.OUTPUT_CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(
@@ -33,7 +33,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Downloading Michigan EJSCREEN Data")
         self.df = pd.read_csv(
             filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
             dtype={"GEO_ID": "string"},

@@ -41,8 +40,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Transforming Michigan EJSCREEN Data")
-
         self.df.rename(
             columns={
                 "GEO_ID": self.GEOID_TRACT_FIELD_NAME,

@@ -60,7 +57,6 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
         )

     def load(self) -> None:
-        logger.info("Saving Michigan Environmental Screening Tool to CSV")
         # write nationwide csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(

@@ -69,7 +69,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         """Unzips NRI dataset from the FEMA data source and writes the files
         to the temporary data folder for use in the transform() method
         """
-        logger.info("Downloading 405MB National Risk Index Data")

         super().extract(
             source_url=self.SOURCE_URL,

@@ -84,7 +83,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         - Applies the NRI score for each Census Tract to the Census Block
           Groups inside of that Tract
         """
-        logger.info("Transforming National Risk Index Data")

         # read in the unzipped csv from NRI data source then rename the
         # Census Tract column for merging

@@ -53,7 +53,6 @@ class NatureDeprivedETL(ExtractTransformLoad):

         - Renames columns as needed
         """
-        logger.info("Transforming NLCD Data")

         df_ncld: pd.DataFrame = pd.read_csv(
             self.INPUT_CSV,
@@ -76,8 +76,6 @@ class PersistentPovertyETL(ExtractTransformLoad):
         return df

     def extract(self) -> None:
-        logger.info("Starting to download 86MB persistent poverty file.")
-
         unzipped_file_path = self.get_tmp_path()

         unzip_file_from_url(

@@ -124,7 +122,6 @@ class PersistentPovertyETL(ExtractTransformLoad):
         self.df = self._join_input_dfs(temporary_input_dfs)

     def transform(self) -> None:
-        logger.info("Starting persistent poverty transform")
         transformed_df = self.df

         # Note: the fields are defined as following.

@@ -77,7 +77,6 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         ]

     def extract(self) -> None:
-        logger.info("Downloading Tree Equity Score Data")
         for state in self.states:
             super().extract(
                 f"{self.TES_URL}{state}.zip.zip",

@@ -85,7 +84,6 @@ class TreeEquityScoreETL(ExtractTransformLoad):
             )

     def transform(self) -> None:
-        logger.info("Transforming Tree Equity Score Data")
         tes_state_dfs = []
         for state in self.states:
             tes_state_dfs.append(

@@ -103,7 +101,6 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         )

     def load(self) -> None:
-        logger.info("Saving Tree Equity Score CSV")
         # write nationwide csv
         self.CSV_PATH.mkdir(parents=True, exist_ok=True)
         self.df = self.df[
@@ -28,7 +28,6 @@ class TribalETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Downloading Tribal Data")

         bia_shapefile_zip_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL

@@ -77,7 +76,7 @@ class TribalETL(ExtractTransformLoad):
         bia_national_lar_df = gpd.read_file(path)

         # DELETE
-        logger.info(f"Columns: {bia_national_lar_df.columns}\n")
+        logger.debug(f"Columns: {bia_national_lar_df.columns}\n")

         bia_national_lar_df.drop(
             ["GISAcres"],

@@ -186,8 +185,6 @@ class TribalETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Transforming Tribal Data")
-
         # Set the filepaths:
         bia_national_lar_shapefile = (
             self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"

@@ -220,7 +217,7 @@ class TribalETL(ExtractTransformLoad):
         Returns:
             None
         """
-        logger.info("Saving Tribal GeoJson and CSV")
+        logger.debug("Saving Tribal GeoJson and CSV")
         usa_tribal_df = gpd.GeoDataFrame(
             pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True)
         )

@@ -228,7 +225,7 @@ class TribalETL(ExtractTransformLoad):
             "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
         )

-        logger.info("Writing national geojson file")
+        logger.debug("Writing national geojson file")
         usa_tribal_df.to_file(
             self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
         )
@@ -94,8 +94,6 @@ class TribalOverlapETL(ExtractTransformLoad):
         self.tribal_gdf = get_tribal_geojson()

     def transform(self) -> None:
-        logger.info("Starting tribal overlap transforms.")
-
         # First, calculate whether tracts include any areas from the Tribal areas,
         # for both the points in AK and the polygons in the continental US (CONUS).
         tribal_overlap_with_tracts = add_tracts_for_geometries(

@@ -56,8 +56,6 @@ class USArmyFUDS(ExtractTransformLoad):
         self.output_df: pd.DataFrame

     def extract(self) -> None:
-        logger.info("Starting FUDS data download.")
-
         download_file_from_url(
             file_url=self.FILE_URL,
             download_file_name=self.DOWNLOAD_FILE_NAME,

@@ -65,11 +63,10 @@ class USArmyFUDS(ExtractTransformLoad):
         )

     def transform(self) -> None:
-        logger.info("Starting FUDS transform.")
         # before we try to do any transformation, get the tract data
         # so it's loaded and the census ETL is out of scope

-        logger.info("Loading FUDS data as GeoDataFrame for transform")
+        logger.debug("Loading FUDS data as GeoDataFrame for transform")
         raw_df = gpd.read_file(
             filename=self.DOWNLOAD_FILE_NAME,
             low_memory=False,
@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreA(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score A")
+        logger.debug("Adding Score A")
         self.df[field_names.SCORE_A] = self.df[
             [
                 field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,

@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreB(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score B")
+        logger.debug("Adding Score B")
         self.df[field_names.SCORE_B] = (
             self.df[
                 field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX

@@ -72,7 +72,7 @@ class ScoreC(Score):

     # "CalEnviroScreen for the US" score
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score C")
+        logger.debug("Adding Score C")
         # Average all the percentile values in each bucket into a single score for each of the four buckets.
         for bucket in self.BUCKETS:
             self.df[bucket.name] = self.df[bucket.fields].mean(axis=1)

@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreD(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Scores D and E")
+        logger.debug("Adding Scores D and E")
         fields_to_use_in_score = [
             field_names.UNEMPLOYMENT_FIELD,
             field_names.LINGUISTIC_ISO_FIELD,

@@ -10,7 +10,7 @@ class ScoreF(Score):
     # TODO Make variables and constants clearer (meaning and type)

     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score F")
+        logger.debug("Adding Score F")
         ami_and_high_school_field = "Low AMI, Low HS graduation"
         meets_socio_field = "Meets socioeconomic criteria"
         meets_burden_field = "Meets burden criteria"

@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreG(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score G")
+        logger.debug("Adding Score G")

         high_school_cutoff_threshold = 0.05

@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreH(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score H")
+        logger.debug("Adding Score H")

         high_school_cutoff_threshold = 0.06

@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreI(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score I")
+        logger.debug("Adding Score I")

         high_school_cutoff_threshold = 0.05

@@ -8,7 +8,7 @@ logger = get_module_logger(__name__)

 class ScoreK(Score):
     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score K")
+        logger.debug("Adding Score K")

         high_school_cutoff_threshold = 0.06
@@ -52,7 +52,7 @@ class ScoreL(Score):
             [column_from_island_areas, column_from_decennial_census]
         ].mean(axis=1, skipna=True)

-        logger.info(
+        logger.debug(
             f"Combined field `{combined_column_name}` has "
             f"{df[combined_column_name].isnull().sum()} "
             f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) "

@@ -64,7 +64,7 @@ class ScoreL(Score):
             a=df[combined_column_name], q=threshold_cutoff_for_island_areas
         )

-        logger.info(
+        logger.debug(
             f"For combined field `{combined_column_name}`, "
             f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
             f"raw value of {raw_threshold:.3f}."

@@ -627,7 +627,7 @@ class ScoreL(Score):
             .sum()
         )

-        logger.info(
+        logger.debug(
             f"For workforce criteria in island areas, "
             f"{workforce_combined_criteria_for_island_areas.sum()} ("
             f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "

@@ -642,7 +642,7 @@ class ScoreL(Score):
         )

     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score L")
+        logger.debug("Adding Score L")

         self.df[field_names.THRESHOLD_COUNT] = 0
         self.df[field_names.FPL_200_SERIES] = self._create_low_income_threshold(

@@ -768,7 +768,7 @@ class ScoreM(Score):
             .sum()
         )

-        logger.info(
+        logger.debug(
             f"For workforce criteria in island areas, "
             f"{workforce_combined_criteria_for_island_areas.sum()} ("
             f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "

@@ -812,7 +812,7 @@ class ScoreM(Score):
         )

     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score M")
+        logger.debug("Adding Score M")

         self.df[field_names.THRESHOLD_COUNT] = 0

@@ -889,7 +889,7 @@ class ScoreNarwhal(Score):
             .sum()
         )

-        logger.info(
+        logger.debug(
             f"For workforce criteria in island areas, "
             f"{workforce_combined_criteria_for_island_areas.sum()} ("
             f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "

@@ -947,7 +947,7 @@ class ScoreNarwhal(Score):

         We calculate "donut holes" after the initial score generation
         """
-        logger.info("Marking donut hole tracts")
+        logger.debug("Marking donut hole tracts")

         # This is the boolean we pass to the front end for the donut-hole-specific
         # low income criterion

@@ -1025,7 +1025,7 @@ class ScoreNarwhal(Score):
         )

     def add_columns(self) -> pd.DataFrame:
-        logger.info("Adding Score Narhwal")
+        logger.debug("Adding Score Narhwal")
         self.df[field_names.THRESHOLD_COUNT] = 0

         self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = (
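The three score classes above all report the same kind of figure at debug level: how many tracts meet a boolean criteria column and what share of the non-null rows that represents. A hedged, standalone version of that bookkeeping; the helper name and sample data are invented for the example, not taken from the score code.

    import logging

    import pandas as pd

    logger = logging.getLogger("score_stats")


    def log_criteria_share(criteria: pd.Series, label: str) -> None:
        """Log how many rows meet a boolean criteria column and the share of non-null rows."""
        flagged = int(criteria.fillna(False).sum())
        non_null = int(criteria.notnull().sum())
        percent = 100 * flagged / non_null if non_null else 0.0
        logger.debug(
            "For %s, %d tracts (%.2f%% of tracts that have non-null data) are flagged.",
            label,
            flagged,
            percent,
        )


    log_criteria_share(
        pd.Series([True, False, None, True]), "workforce criteria in island areas"
    )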
|
|
@@ -23,7 +23,7 @@ def toy_score_df(scope="module"):


 def _helper_test_dropping_tracts(toy_score_df, drop_tracts):
-    logger.info(drop_tracts)
+    logger.debug(drop_tracts)
     test_frame = toy_score_df[
         ~toy_score_df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)
     ]

@@ -41,12 +41,10 @@ class ExampleETL(ExtractTransformLoad):
             / "input.zip"
         )

-        logger.info(f"Extracting {zip_file_path}")
         with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
             zip_ref.extractall(self.get_tmp_path())

     def transform(self):
-        logger.info(f"Loading file from {self.get_tmp_path() / 'input.csv'}.")
         df: pd.DataFrame = pd.read_csv(
             self.get_tmp_path() / "input.csv",
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
@@ -202,7 +202,7 @@ class TestETL:

         expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv"

-        logger.info(f"Expected: {expected_file_path}")
+        logger.debug(f"Expected: {expected_file_path}")

         assert actual_file_path == expected_file_path


@@ -545,7 +545,7 @@ class TestETL:
         # Delete output file.
         output_file_path = etl._get_output_file_path()
         if os.path.exists(output_file_path):
-            logger.info("Deleting output file created by other tests.")
+            logger.debug("Deleting output file created by other tests.")
             os.remove(output_file_path)

         # Run more steps to generate test data.
@@ -39,7 +39,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
     os.mkdir(low_tile_path)

     # generate high mbtiles file
-    logger.info("Generating USA High mbtiles file")
+    logger.debug("Generating USA High mbtiles file")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --layer=blocks "
     cmd += "--no-feature-limit --no-tile-size-limit "

@@ -48,7 +48,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
     call(cmd, shell=True)

     # generate high mvts
-    logger.info("Generating USA High mvt folders and files")
+    logger.debug("Generating USA High mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
     cmd += "--no-feature-limit --no-tile-size-limit "

@@ -57,7 +57,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
     call(cmd, shell=True)

     # generate low mbtiles file
-    logger.info("Generating USA Low mbtiles file")
+    logger.debug("Generating USA Low mbtiles file")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --layer=blocks "
     cmd += f"--output={low_tile_path}/usa_low.mbtiles "

@@ -65,7 +65,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
     call(cmd, shell=True)

     # generate low mvts
-    logger.info("Generating USA Low mvt folders and files")
+    logger.debug("Generating USA Low mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression "
     cmd += "--drop-densest-as-needed "

@@ -86,7 +86,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
     remove_all_from_dir(tribal_tiles_path)

     # generate mbtiles file
-    logger.info("Generating Tribal mbtiles file")
+    logger.debug("Generating Tribal mbtiles file")
     cmd = "tippecanoe "
     cmd += "--layer=blocks "
     cmd += "--base-zoom=3 "

@@ -96,7 +96,7 @@ def generate_tiles(data_path: Path, generate_tribal_layer: bool) -> None:
     call(cmd, shell=True)

     # generate mvts
-    logger.info("Generating Tribal mvt folders and files")
+    logger.debug("Generating Tribal mvt folders and files")
     cmd = "tippecanoe "
     cmd += "--layer=blocks "
     cmd += "--base-zoom=3 "
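For context on the tile-generation hunks above: each tippecanoe invocation is built up as one command string (note the trailing space on every fragment) and then passed to the shell. A rough sketch under assumed zoom constants and paths; the real values and input/output locations live elsewhere in generate_tiles and are not shown in this diff:

from subprocess import call

USA_HIGH_MIN_ZOOM = 5                               # assumed value
USA_HIGH_MAX_ZOOM = 11                              # assumed value
high_tile_path = "data/score/tiles/high"            # assumed output directory
score_geojson = "data/score/geojson/usa-high.json"  # assumed input file

cmd = "tippecanoe "
cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --layer=blocks "
cmd += "--no-feature-limit --no-tile-size-limit "
cmd += f"--output={high_tile_path}/usa_high.mbtiles "  # assumed output name
cmd += score_geojson
call(cmd, shell=True)  # same invocation style as the surrounding code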
@@ -1,5 +1,4 @@
 import datetime
-import json
 import logging
 import os
 import shutil

@@ -17,7 +16,9 @@ from data_pipeline.config import settings
 from data_pipeline.content.schemas.download_schemas import CodebookConfig
 from data_pipeline.content.schemas.download_schemas import CSVConfig
 from data_pipeline.content.schemas.download_schemas import ExcelConfig
-from data_pipeline.etl.score.constants import SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH
+from data_pipeline.etl.score.constants import (
+    SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH,
+)
 from marshmallow import ValidationError
 from marshmallow_dataclass import class_schema

@@ -43,11 +44,12 @@ def get_module_logger(module_name: str) -> logging.Logger:
     logger = logging.getLogger(module_name)
     handler = logging.StreamHandler()
     formatter = logging.Formatter(
-        "%(asctime)s [%(name)-12s] %(levelname)-8s %(message)s"
+        "%(asctime)s [%(name)40.40s] %(levelname)-8s %(message)s"
     )
     handler.setFormatter(formatter)
     logger.addHandler(handler)
-    logger.setLevel(logging.DEBUG)
+    logger.setLevel(logging.INFO)
+    logger.propagate = False  # don't send log messages to the parent logger (to avoid duplicate log messages)
     return logger

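The hunk above is the heart of this change: loggers now default to INFO, stop propagating to the root logger, and use a fixed-width name column. A minimal sketch of how a pipeline module would pick up the new defaults, assuming the helper lives at data_pipeline.utils; the module name and messages are illustrative, not from the pipeline:

from data_pipeline.utils import get_module_logger  # assumed import path

logger = get_module_logger("data_pipeline.etl.sources.example")  # hypothetical module name

logger.debug("Fetching input file")    # suppressed: below the new INFO default
logger.info("Starting ETL run")        # emitted once; propagate=False avoids a duplicate from the root logger
logger.warning("Input file is stale")  # emitted; "%(name)40.40s" pads or truncates the logger name to 40 chars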
@@ -81,7 +83,6 @@ def remove_files_from_dir(
         if not file.endswith(extension):
             continue
         os.remove(files_path / file)
-        logger.info(f"Removing {file}")


 def remove_all_from_dir(files_path: Path) -> None:

@@ -103,9 +104,8 @@ def remove_all_from_dir(files_path: Path) -> None:
                 os.remove(files_path / file)
             else:
                 shutil.rmtree(files_path / file)
-            logger.info(f"Removing {file}")
     else:
-        logger.info(f"The following path does not exist: `{files_path}`.")
+        logger.warning(f"The following path does not exist: `{files_path}`.")


 def remove_all_dirs_from_dir(dir_path: Path) -> None:

@@ -122,7 +122,6 @@ def remove_all_dirs_from_dir(dir_path: Path) -> None:
         file_path = os.path.join(dir_path, filename)
         if os.path.isdir(file_path):
             shutil.rmtree(file_path)
-            logging.info(f"Removing directory {file_path}")


 def download_file_from_url(

@@ -147,7 +146,6 @@ def download_file_from_url(
     if not os.path.isdir(download_file_name.parent):
         os.mkdir(download_file_name.parent)

-    logger.info(f"Downloading {file_url}")
     response = requests.get(
         file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
     )

@@ -193,7 +191,6 @@ def unzip_file_from_url(
         verify=verify,
     )

-    logger.info(f"Extracting {zip_file_path}")
     with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
         zip_ref.extractall(unzipped_file_path)


@@ -206,7 +203,7 @@ def data_folder_cleanup() -> None:

     data_path = settings.APP_ROOT / "data"

-    logger.info("Initializing all dataset directoriees")
+    logger.debug("Initializing all dataset directoriees")
     remove_all_from_dir(data_path / "dataset")


@@ -215,7 +212,7 @@ def score_folder_cleanup() -> None:

     data_path = settings.APP_ROOT / "data"

-    logger.info("Initializing all score data")
+    logger.debug("Initializing all score data")
     remove_all_from_dir(data_path / "score" / "csv")
     remove_all_from_dir(data_path / "score" / "geojson")
     remove_all_from_dir(data_path / "score" / "tiles")

@@ -230,10 +227,12 @@ def geo_score_folder_cleanup() -> None:

     data_path = settings.APP_ROOT / "data"

-    logger.info("Removing zip files")
+    logger.debug("Removing zip files")
     remove_files_from_dir(data_path / "score" / "shapefile", ".zip")

-    shapefile_and_codebook_zipped = SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH
+    shapefile_and_codebook_zipped = (
+        SCORE_VERSIONING_SHAPEFILE_CODEBOOK_FILE_PATH
+    )

     if os.path.isfile(shapefile_and_codebook_zipped):
         os.remove(shapefile_and_codebook_zipped)

@@ -251,7 +250,7 @@ def temp_folder_cleanup() -> None:

     data_path = settings.APP_ROOT / "data"

-    logger.info("Initializing all temp directories")
+    logger.debug("Initializing all temp directories")
     remove_all_from_dir(data_path / "tmp")


@@ -307,8 +306,6 @@ def zip_files(zip_file_path: Path, files_to_compress: List[Path]):
     with zipfile.ZipFile(zip_file_path, "w") as zf:
         for f in files_to_compress:
             zf.write(f, arcname=Path(f).name, compress_type=compression)
-    zip_info = get_zip_info(zip_file_path)
-    logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))


 def zip_directory(

@@ -327,7 +324,6 @@ def zip_directory(
     def zipdir(origin_directory: Path, ziph: zipfile.ZipFile):
         for root, dirs, files in os.walk(origin_directory):
             for file in files:
-                logger.info(f"Compressing file: {file}")
                 ziph.write(
                     os.path.join(root, file),
                     os.path.relpath(

@@ -337,7 +333,6 @@ def zip_directory(
                     compress_type=compression,
                 )

-    logger.info(f"Compressing {Path(origin_zip_directory).name} directory")
     zip_file_name = f"{Path(origin_zip_directory).name}.zip"

     # start archiving

@@ -347,10 +342,6 @@ def zip_directory(
     zipdir(f"{origin_zip_directory}/", zipf)
     zipf.close()

-    logger.info(
-        f"Completed compression of {Path(origin_zip_directory).name} directory"
-    )


 def load_yaml_dict_from_file(
     yaml_file_path: Path,