Combine + Tilefy (#806)

* init

* score-post

* added score csv s3 download; remove poetry cmds from readme

* working census tile fetch

* PR review

* GitHub Actions work
commit 1b17af84c8
Jorge Escobar 2021-11-01 18:05:05 -04:00 committed by GitHub
13 changed files with 560 additions and 371 deletions


@@ -87,17 +87,20 @@ def score_generate() -> None:
     score_post()


-def score_post() -> None:
+def score_post(data_source: str = "local") -> None:
     """Posts the score files to the local directory

     Args:
-        None
+        data_source (str): Source for the census data (optional)
+                           Options:
+                           - local (default): fetch census data from the local data directory
+                           - aws: fetch census from AWS S3 J40 data repository

     Returns:
         None
     """
     # Post Score Processing
-    score_post = PostScoreETL()
+    score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
@@ -108,10 +111,10 @@ def score_geo(data_source: str = "local") -> None:
     """Generates the geojson files with score data baked in

     Args:
-        census_data_source (str): Source for the census data (optional)
-                                  Options:
-                                  - local (default): fetch census data from the local data directory
-                                  - aws: fetch census from AWS S3 J40 data repository
+        data_source (str): Source for the census data (optional)
+                           Options:
+                           - local (default): fetch census data from the local data directory
+                           - aws: fetch census from AWS S3 J40 data repository

     Returns:
         None
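For context (not part of the diff): the new data_source parameter threads from these entry points down to the ETL classes. A minimal sketch of calling them directly from Python, where the runner module path is an assumption:

    from data_pipeline.etl.runner import score_post, score_geo  # module path assumed

    score_post()                   # default "local": expects score CSVs in the local data directory
    score_post(data_source="aws")  # fetches the tile score CSV from the J40 S3 bucket first
    score_geo(data_source="aws")   # same switch for the geojson generation step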


@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import (
     check_census_data_source,
 )
+from data_pipeline.etl.score.etl_utils import check_score_data_source
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -17,6 +18,7 @@ class GeoScoreETL(ExtractTransformLoad):
     """

     def __init__(self, data_source: str = None):
+        self.DATA_SOURCE = data_source
         self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
         self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
         self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
@@ -46,6 +48,12 @@ class GeoScoreETL(ExtractTransformLoad):
             census_data_source=self.DATA_SOURCE,
         )

+        # check score data
+        check_score_data_source(
+            score_csv_data_path=self.SCORE_CSV_PATH,
+            score_data_source=self.DATA_SOURCE,
+        )
+
         logger.info("Reading US GeoJSON (~6 minutes)")
         self.geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
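With both checks in place, GeoScoreETL.extract() validates its census and score inputs before the slow GeoJSON read. A minimal usage sketch, not part of this diff; the module path is assumed:

    from data_pipeline.etl.score.etl_score_geo import GeoScoreETL  # module path assumed

    etl = GeoScoreETL(data_source="aws")  # "aws" pulls missing census/score inputs from S3
    etl.extract()    # runs both data-source checks, then reads the US GeoJSON
    etl.transform()
    etl.load()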


@@ -3,6 +3,9 @@ import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger, zip_files
+from data_pipeline.etl.sources.census.etl_utils import (
+    check_census_data_source,
+)
 from . import constants

 logger = get_module_logger(__name__)
@@ -14,7 +17,8 @@ class PostScoreETL(ExtractTransformLoad):
     datasets.
     """

-    def __init__(self):
+    def __init__(self, data_source: str = None):
+        self.DATA_SOURCE = data_source
         self.input_counties_df: pd.DataFrame
         self.input_states_df: pd.DataFrame
         self.input_score_df: pd.DataFrame
@@ -66,6 +70,13 @@ class PostScoreETL(ExtractTransformLoad):
     def extract(self) -> None:
         logger.info("Starting Extraction")

+        # check census data
+        check_census_data_source(
+            census_data_path=self.DATA_PATH / "census",
+            census_data_source=self.DATA_SOURCE,
+        )
+
         super().extract(
             constants.CENSUS_COUNTIES_ZIP_URL,
             constants.TMP_PATH,
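PostScoreETL follows the same pattern: the constructor stores the source, and extract() verifies the census data before downloading the counties file. A usage sketch along the same lines (module path assumed):

    from data_pipeline.etl.score.etl_score_post import PostScoreETL  # module path assumed

    post = PostScoreETL(data_source="local")  # exits gracefully if local census data is missing
    post.extract()
    post.transform()
    post.load()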


@@ -0,0 +1,50 @@
+import os
+import sys
+from pathlib import Path
+
+from data_pipeline.config import settings
+from data_pipeline.utils import (
+    download_file_from_url,
+    get_module_logger,
+)
+
+logger = get_module_logger(__name__)
+
+
+def check_score_data_source(
+    score_csv_data_path: Path,
+    score_data_source: str,
+) -> None:
+    """Checks if score data is present, and exits gracefully if it doesn't exist. It will download it from S3
+       if score_data_source is set to "aws"
+
+    Args:
+        score_csv_data_path (Path): Path for local score CSV data
+        score_data_source (str): Source for the score data
+                                 Options:
+                                 - local: fetch score data from the local data directory
+                                 - aws: fetch score data from the AWS S3 J40 data repository
+
+    Returns:
+        None
+    """
+    TILE_SCORE_CSV_S3_URL = (
+        settings.AWS_JUSTICE40_DATAPIPELINE_URL
+        + "/data/score/csv/tiles/usa.csv"
+    )
+    TILE_SCORE_CSV = score_csv_data_path / "tiles" / "usa.csv"
+
+    # download from S3 if score_data_source is "aws"
+    if score_data_source == "aws":
+        logger.info("Fetching Score Tile data from AWS S3")
+        download_file_from_url(
+            file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
+        )
+    else:
+        # check if score data is found locally
+        if not os.path.isfile(TILE_SCORE_CSV):
+            logger.info(
+                "No local score tiles data found. Please use '-d aws' to fetch from AWS"
+            )
+            sys.exit()
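The helper is normally invoked from the ETL classes above, but it can also be exercised directly. A hypothetical stand-alone call; the concrete path below is illustrative only:

    from pathlib import Path

    from data_pipeline.etl.score.etl_utils import check_score_data_source

    # With "aws" the tile CSV is downloaded to <score_csv_data_path>/tiles/usa.csv;
    # with "local" the function exits if that file is absent.
    check_score_data_source(
        score_csv_data_path=Path("data_pipeline") / "data" / "score" / "csv",  # example path
        score_data_source="aws",
    )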


@@ -107,13 +107,13 @@ def check_census_data_source(
        # check if census data is found locally
        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
            logger.info(
-                "No local census data found. Please use '-cds aws` to fetch from AWS"
+                "No local census data found. Please use '-d aws' to fetch from AWS"
            )
            sys.exit()


 def zip_census_data():
-    logger.info("Compressing and uploading census files to AWS S3")
+    logger.info("Compressing census files to data/tmp folder")

     CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
     TMP_PATH = settings.APP_ROOT / "data" / "tmp"
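The reworded log message reflects that zip_census_data now only compresses the census files into data/tmp, leaving the S3 upload to a separate step. The repo's zip_files helper is not shown in this diff; a rough standard-library equivalent of the compression step might look like this (a sketch, not the project's actual implementation):

    import shutil
    from pathlib import Path

    def compress_census_data(census_path: Path, tmp_path: Path) -> None:
        # Writes <tmp_path>/census.zip containing everything under census_path.
        tmp_path.mkdir(parents=True, exist_ok=True)
        shutil.make_archive(str(tmp_path / "census"), "zip", root_dir=str(census_path))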