j40-cejst-2/data/data-pipeline/data_pipeline/application.py

284 lines
6.6 KiB
Python
Raw Normal View History

from subprocess import call
import sys
import click
from data_pipeline.config import settings
Add ETL Contract Checks (#619) * Adds dev dependencies to requirements.txt and re-runs black on codebase * Adds test and code for national risk index etl, still in progress * Removes test_data from .gitignore * Adds test data to nation_risk_index tests * Creates tests and ETL class for NRI data * Adds tests for load() and transform() methods of NationalRiskIndexETL * Updates README.md with info about the NRI dataset * Adds to dos * Moves tests and test data into a tests/ dir in national_risk_index * Moves tmp_dir for tests into data/tmp/tests/ * Promotes fixtures to conftest and relocates national_risk_index tests: The relocation of national_risk_index tests is necessary because tests can only use fixtures specified in conftests within the same package * Fixes issue with df.equals() in test_transform() * Files reformatted by black * Commit changes to other files after re-running black * Fixes unused import that caused lint checks to fail * Moves tests/ directory to app root for data_pipeline * Adds new methods to ExtractTransformLoad base class: - __init__() Initializes class attributes - _get_census_fips_codes() Loads a dataframe with the fips codes for census block group and tract - validate_init() Checks that the class was initialized correctly - validate_output() Checks that the output was loaded correctly * Adds test for ExtractTransformLoad.__init__() and base.py * Fixes failing flake8 test * Changes geo_col to geoid_col and changes is_dataset to is_census in yaml * Adds test for validate_output() * Adds remaining tests * Removes is_dataset from init method * Makes CENSUS_CSV a class attribute instead of a class global: This ensures that CENSUS_CSV is only set when the ETL class is for a non-census dataset and removes the need to overwrite the value in mock_etl fixture * Re-formats files with black and fixes broken tox tests
2021-10-13 15:54:15 -04:00
from data_pipeline.etl.runner import (
etl_runner,
score_generate,
score_geo,
score_post,
)
from data_pipeline.etl.sources.census.etl_utils import (
reset_data_directories as census_reset,
zip_census_data,
)
from data_pipeline.tile.generate import generate_tiles
from data_pipeline.utils import (
data_folder_cleanup,
get_module_logger,
score_folder_cleanup,
downloadable_cleanup,
temp_folder_cleanup,
check_first_run,
)
# Module-wide logger shared by every CLI command below.
logger = get_module_logger(__name__)

# Help text reused by the options that choose where data is fetched from.
dataset_cli_help = (
    "Grab the data from either 'local' for local access "
    "or 'aws' to retrieve from Justice40 S3 repository"
)
# Root click group; each command below attaches itself via ``@cli.command``.
# The docstring is left as-is because click shows it as the group's help text.
@click.group()
def cli():
    """Defines a click group for the commands below"""
@cli.command(help="Clean up all census data folders")
def census_cleanup():
    """Reset the census data directories under the application data folder.

    Logs before and after the reset, then terminates the process via
    ``sys.exit()``.
    """
    data_dir = settings.APP_ROOT / "data"

    logger.info("Initializing all census data")
    census_reset(data_dir)
    logger.info("Cleaned up all census data files")

    sys.exit()
@cli.command(help="Clean up all data folders")
def data_cleanup():
    """Reset the census directories and clean the data, score and temp folders.

    The cleanup helpers run in a fixed order (census, data, score, temp);
    the process then terminates via ``sys.exit()``.
    """
    data_dir = settings.APP_ROOT / "data"
    census_reset(data_dir)

    # Same order as always: data, then score, then temp.
    data_folder_cleanup()
    score_folder_cleanup()
    temp_folder_cleanup()

    logger.info("Cleaned up all data folders")
    sys.exit()
@cli.command(help="Census data download")
@click.option(
    "-zc",
    "--zip-compress",
    is_flag=True,
    help="Upload to AWS S3 a zipped archive of the census data.",
)
def census_data_download(zip_compress):
    """Download the census shape files and build the Census Block Group CSVs.

    The census data directories are reset first, then the "census" ETL is
    run. When ``zip_compress`` is set, ``zip_census_data()`` is called
    afterwards.

    Args:
        zip_compress (bool): Value of the ``-zc/--zip-compress`` flag.

    Returns:
        None
    """
    logger.info("Initializing all census data")
    data_dir = settings.APP_ROOT / "data"
    census_reset(data_dir)

    logger.info("Downloading census data")
    etl_runner("census")

    if zip_compress:
        zip_census_data()

    logger.info("Completed downloading census data")
    sys.exit()
@cli.command(
    help="Run all ETL processes or a specific one",
)
@click.option(
    "-d",
    "--dataset",
    required=False,
    type=str,
    # This option names an ETL module; the shared ``dataset_cli_help`` text
    # describes the local/aws data source and would mislead users here.
    help="Name of the ETL module to run; omit to run all ETL processes.",
)
def etl_run(dataset: str):
    """Run a specific or all ETL processes

    Args:
        dataset (str): Name of the ETL module to be run (optional). When the
            option is not supplied click passes ``None``, which is forwarded
            unchanged to ``etl_runner``.

    Returns:
        None
    """
    etl_runner(dataset)
    sys.exit()
@cli.command(help="Generate Score")
def score_run():
    """Clean the score folder, generate the score, then exit the process."""
    # Clean the score folder first, then regenerate.
    score_folder_cleanup()
    score_generate()

    sys.exit()
@cli.command(help="Run ETL + Score Generation")
def score_full_run():
    """Run every ETL and then generate the score in a single command.

    The data, score and temp folders are cleaned (in that order) before the
    ETL runner is invoked without a dataset argument.
    """
    # Clean the working folders.
    data_folder_cleanup()
    score_folder_cleanup()
    temp_folder_cleanup()

    # Run the ETLs, then score.
    etl_runner()
    score_generate()

    sys.exit()
@cli.command(help="Generate Geojson files with scores baked in")
@click.option(
    "-s",
    "--data-source",
    default="local",
    required=False,
    type=str,
    help=dataset_cli_help,
)
def geo_score(data_source: str):
    """Combine the score with GeoJSON data and generate low and high files.

    Args:
        data_source (str): Source for the census data (optional, defaults
            to "local"). Options:
            - local: fetch census and score data from the local data directory
            - aws: fetch census and score from AWS S3 J40 data repository

    Returns:
        None
    """
    score_geo(data_source=data_source)
    sys.exit()
2021-08-03 18:23:57 -04:00
@cli.command(help="Generate map tiles")
def generate_map_tiles():
    """Generate the map tiles from the application data folder, then exit."""
    data_dir = settings.APP_ROOT / "data"
    generate_tiles(data_dir)

    sys.exit()
@cli.command(
    help="Run etl_score_post to create score csv, tile csv, and downloadable zip",
)
@click.option(
    "-s",
    "--data-source",
    default="local",
    required=False,
    type=str,
    help=dataset_cli_help,
)
def generate_score_post(data_source: str):
    """Generate the score, tile and downloadable files.

    ``downloadable_cleanup()`` runs before the post-score step.

    Args:
        data_source (str): Source for the census data (optional, defaults
            to "local"). Options:
            - local: fetch census and score data from the local data directory
            - aws: fetch census and score from AWS S3 J40 data repository

    Returns:
        None
    """
    # Clean up downloadables before the post-score step.
    downloadable_cleanup()
    score_post(data_source)

    sys.exit()
@cli.command(
    help="Data Full Run (Census download, ETLs, score, combine and tile generation)",
)
@click.option(
    "-c",
    "--check",
    is_flag=True,
    help="Check if data run has been run before, and don't run it if so.",
)
@click.option(
    "-s",
    "--data-source",
    default="local",
    required=False,
    type=str,
    help=dataset_cli_help,
)
def data_full_run(check: bool, data_source: str):
    """CLI command to run ETL, score, JSON combine and generate tiles in one command

    Args:
        check (bool): Run the full data run only if the first run semaphore
            file is not set (optional)
        data_source (str): Source for the census data (optional)
            Options:
            - local: fetch census and score data from the local data directory
            - aws: fetch census and score from AWS S3 J40 data repository

    Returns:
        None
    """
    data_path = settings.APP_ROOT / "data"

    if check and not check_first_run():
        # check if the data full run has been run before
        logger.info("*** The data full run was already executed")
        sys.exit()

    # census directories
    logger.info("*** Initializing all data folders")
    census_reset(data_path)
    data_folder_cleanup()
    score_folder_cleanup()
    temp_folder_cleanup()

    # Census data is only downloaded when working from the local source.
    if data_source == "local":
        logger.info("*** Downloading census data")
        etl_runner("census")

    logger.info("*** Running all ETLs")
    etl_runner()

    logger.info("*** Generating Score")
    score_generate()

    logger.info("*** Running Post Score scripts")
    downloadable_cleanup()
    score_post(data_source)

    logger.info("*** Combining Score with Census Geojson")
    score_geo(data_source)

    logger.info("*** Generating Map Tiles")
    generate_tiles(data_path)

    # Create the first-run marker file directly instead of shelling out to
    # ``touch``: a shell-built command breaks on paths containing spaces or
    # shell metacharacters and does not exist on every platform.
    # NOTE(review): assumes ``settings.APP_ROOT`` is a ``pathlib.Path`` (it is
    # already combined with ``/`` above) - confirm against the config module.
    marker_name = "first_run.txt"
    try:
        (data_path / marker_name).touch()
    except OSError as err:
        # The shell call never checked its exit status, so a failure here
        # stays non-fatal; it is now reported instead of silently dropped.
        logger.error(f"*** Could not create {marker_name}: {err}")

    logger.info("*** Map data ready")
    sys.exit()
2021-08-03 18:23:57 -04:00
# Script entry point: hand control to the click group when run directly.
if __name__ == "__main__":
    cli()