from subprocess import call
import sys
import click
from data_pipeline.config import settings
from data_pipeline.etl.runner import (
etl_runner,
score_generate,
score_geo,
score_post,
)
from data_pipeline.etl.sources.census.etl_utils import (
reset_data_directories as census_reset,
)
from data_pipeline.tile.generate import generate_tiles
from data_pipeline.utils import (
data_folder_cleanup,
get_module_logger,
score_folder_cleanup,
downloadable_cleanup,
temp_folder_cleanup,
check_first_run,
)

logger = get_module_logger(__name__)


@click.group()
def cli():
"""Defines a click group for the commands below"""
pass
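
# Example invocations (assuming Click's default dash-separated command names
# and that this module is run directly as a script):
#   python application.py census-data-download
#   python application.py data-full-run --check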
@cli.command(help="Clean up all census data folders")
def census_cleanup():
"""CLI command to clean up the census data folder"""
data_path = settings.APP_ROOT / "data"
# census directories
logger.info("Initializing all census data")
census_reset(data_path)
logger.info("Cleaned up all census data files")
sys.exit()
@cli.command(help="Clean up all data folders")
def data_cleanup():
"""CLI command to clean up the all the data folders"""
data_path = settings.APP_ROOT / "data"
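    # Reset the census data directories, then remove generated data, score, and temp folders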
census_reset(data_path)
data_folder_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
logger.info("Cleaned up all data folders")
sys.exit()


@cli.command(
help="Census data download",
)
def census_data_download():
"""CLI command to download all census shape files from the Census FTP and extract the geojson
to generate national and by state Census Block Group CSVs"""
data_path = settings.APP_ROOT / "data"
logger.info("Initializing all census data")
census_reset(data_path)
logger.info("Downloading census data")
etl_runner("census")
logger.info("Completed downloading census data")
sys.exit()


@cli.command(
help="Run all ETL processes or a specific one",
)
@click.option("-d", "--dataset", required=False, type=str)
def etl_run(dataset: str):
"""Run a specific or all ETL processes
Args:
dataset (str): Name of the ETL module to be run (optional)
Returns:
None
"""
etl_runner(dataset)
sys.exit()


@cli.command(
help="Generate Score",
)
def score_run():
"""CLI command to generate the score"""
score_folder_cleanup()
score_generate()
sys.exit()


@cli.command(
help="Run ETL + Score Generation",
)
def score_full_run():
"""CLI command to run ETL and generate the score in one command"""
data_folder_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
etl_runner()
score_generate()
sys.exit()
@cli.command(help="Generate Geojson files with scores baked in")
def geo_score():
"""CLI command to generate the score"""
score_geo()
sys.exit()


@cli.command(
help="Generate map tiles",
)
def generate_map_tiles():
"""CLI command to generate the map tiles"""
data_path = settings.APP_ROOT / "data"
generate_tiles(data_path)
sys.exit()


@cli.command(
help="Run etl_score_post to create score csv, tile csv, and downloadable zip",
)
def generate_score_post():
"""CLI command to generate score, tile, and downloadable files"""
downloadable_cleanup()
score_post()
sys.exit()


@cli.command(
help="Data Full Run (Census download, ETLs, score, combine and tile generation)",
)
@click.option(
"-c",
"--check",
is_flag=True,
help="Check if data run has been run before, and don't run it if so.",
)
def data_full_run(check):
"""CLI command to run ETL, score, JSON combine and generate tiles in one command
Args:
        check (bool): Run the full data run only if the first-run semaphore file is not set (optional)
Returns:
None
"""
data_path = settings.APP_ROOT / "data"
if check and not check_first_run():
# check if the data full run has been run before
logger.info("*** The data full run was already executed")
sys.exit()
# census directories
logger.info("*** Initializing all data folders")
census_reset(data_path)
data_folder_cleanup()
score_folder_cleanup()
temp_folder_cleanup()
logger.info("*** Downloading census data")
etl_runner("census")
logger.info("*** Running all ETLs")
etl_runner()
logger.info("*** Generating Score")
score_generate()
logger.info("*** Combining Score with Census Geojson")
score_geo()
logger.info("*** Generating Map Tiles")
generate_tiles(data_path)
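    # Create the first-run semaphore file so a later run with --check can skip the full run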
file = "first_run.txt"
cmd = f"touch {data_path}/{file}"
call(cmd, shell=True)
logger.info("*** Map data ready")
sys.exit()


if __name__ == "__main__":
cli()