ETL Classes for Data Sets (#260)

* first commit

* checkpoint

* checkpoint

* first extract module 🎉

* completed census acs etl class

* completed ejscreen etl

* completed etl

* score generation ready

* improving census load and separation

* score generation working 🎉

* completed etls

* new score generation

* PR reviews

* run specific etl; starting docstrings

* docstrings work

* more docstrings

* completed docstrings

* adding pyenv version

* more reasonable poetry req for python

* PR comments
This commit is contained in:
Jorge Escobar 2021-07-12 15:50:44 -04:00 committed by GitHub
commit 842312f69f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 2628 additions and 2872 deletions

View file

@ -1,26 +1,31 @@
from config import settings
import click
from pathlib import Path
import sys
from config import settings
from etl.sources.census.etl_utils import reset_data_directories as census_reset
from utils import remove_files_from_dir, remove_all_from_dir, get_module_logger
from utils import (
get_module_logger,
data_folder_cleanup,
score_folder_cleanup,
temp_folder_cleanup,
)
from etl.sources.census.etl import download_census_csvs
from etl.runner import etl_runner, score_generate
# Anchor the application root at the current working directory when this
# module is imported; commands in this file build their data paths from
# settings.APP_ROOT.
settings.APP_ROOT = Path.cwd()
# Module-level logger, named after this module, used by the CLI commands.
logger = get_module_logger(__name__)
@click.group()
def cli():
    """Root click group; the commands in this module register on it."""
@cli.command(
help="Clean up all data folders",
help="Clean up all census data folders",
)
def data_cleanup():
def census_cleanup():
"""CLI command to clean up the census data folder"""
data_path = settings.APP_ROOT / "data"
@ -28,32 +33,59 @@ def data_cleanup():
logger.info(f"Initializing all census data")
census_reset(data_path)
# dataset directory
logger.info(f"Initializing all dataset directoriees")
remove_all_from_dir(data_path / "dataset")
logger.info("Cleaned up all census data files")
# score directory
logger.info(f"Initializing all score data")
remove_files_from_dir(data_path / "score" / "csv", ".csv")
remove_files_from_dir(data_path / "score" / "geojson", ".json")
# cleanup tmp dir
logger.info(f"Initializing all temp directoriees")
remove_all_from_dir(data_path / "tmp")
@cli.command(
    help="Clean up all data folders",
)
def data_cleanup():
    """CLI command to clean up all the data folders.

    Calls the data, score and temp folder cleanup helpers, in that order,
    and logs one completion message after they have all returned.
    """
    # No "Cleaned up ..." message is logged before this point: announcing
    # completion ahead of the actual cleanup would be misleading.
    data_folder_cleanup()
    score_folder_cleanup()
    temp_folder_cleanup()

    logger.info("Cleaned up all data folders")
@cli.command(
    help="Census data download",
)
def census_data_download():
    """CLI command to download the census data.

    Downloads all census shape files from the Census FTP and extracts the
    geojson to generate national and by-state Census Block Group CSVs. The
    work itself is delegated to ``download_census_csvs``, which is handed the
    ``data`` directory located under ``settings.APP_ROOT``.

    Raises:
        SystemExit: Always, after the download completes, to end the process.
    """
    logger.info("Downloading census data")

    data_path = settings.APP_ROOT / "data"
    download_census_csvs(data_path)

    logger.info("Completed downloading census data")

    # Use sys.exit() instead of the bare exit() builtin: exit() is installed
    # by the site module for interactive sessions and is not available when
    # site is skipped (e.g. ``python -S`` or some frozen builds).
    sys.exit()
@cli.command(help="Run all ETL processes or a specific one")
@click.option("-d", "--dataset", type=str, required=False)
def etl_run(dataset: str):
    """CLI command that forwards the requested dataset to the ETL runner.

    Args:
        dataset (str): Name of the ETL module to be run. The option is
            optional; its value is passed to ``etl_runner`` unchanged.

    Returns:
        None
    """
    etl_runner(dataset)
@cli.command(help="Generate Score")
def score_run():
    """CLI command that generates the score by calling ``score_generate``."""
    score_generate()
if __name__ == "__main__":