Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-07-27 19:31:16 -07:00)
Combine + Tilefy (#806)
* init
* score-post
* added score csv s3 download; remove poetry cmds from readme
* working census tile fetch
* PR review
* Github Actions Work
This commit is contained in:
parent 7b87e0ec99
commit 1b17af84c8

13 changed files with 560 additions and 371 deletions

@@ -87,17 +87,20 @@ def score_generate() -> None:
     score_post()
 
 
-def score_post() -> None:
+def score_post(data_source: str = "local") -> None:
     """Posts the score files to the local directory
 
     Args:
-        None
+        data_source (str): Source for the census data (optional)
+            Options:
+            - local (default): fetch census data from the local data directory
+            - aws: fetch census from AWS S3 J40 data repository
 
     Returns:
         None
     """
     # Post Score Processing
-    score_post = PostScoreETL()
+    score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
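With this hunk, score_post gains a data_source switch, so the post-score step can run against either local files or the S3 copy. A minimal sketch of the two call paths, using only the signatures shown above (how the CLI's -d flag is wired through to data_source is not part of this hunk):

    # Default: post-process score files already present in the local data directory.
    score_post()

    # Pull inputs from the AWS S3 J40 data repository instead; equivalent to
    # driving the ETL object directly, which is all score_post() does:
    etl = PostScoreETL(data_source="aws")
    etl.extract()
    etl.transform()
    etl.load()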

@@ -108,10 +111,10 @@ def score_geo(data_source: str = "local") -> None:
     """Generates the geojson files with score data baked in
 
     Args:
-        census_data_source (str): Source for the census data (optional)
-            Options:
-            - local (default): fetch census data from the local data directory
-            - aws: fetch census from AWS S3 J40 data repository
+        data_source (str): Source for the census data (optional)
+            Options:
+            - local (default): fetch census data from the local data directory
+            - aws: fetch census from AWS S3 J40 data repository
 
     Returns:
         None

@@ -6,6 +6,7 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import (
     check_census_data_source,
 )
+from data_pipeline.etl.score.etl_utils import check_score_data_source
 from data_pipeline.utils import get_module_logger
 
 logger = get_module_logger(__name__)

@@ -17,6 +18,7 @@ class GeoScoreETL(ExtractTransformLoad):
     """
 
     def __init__(self, data_source: str = None):
+        self.DATA_SOURCE = data_source
         self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
         self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
         self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"

@@ -46,6 +48,12 @@ class GeoScoreETL(ExtractTransformLoad):
             census_data_source=self.DATA_SOURCE,
         )
 
+        # check score data
+        check_score_data_source(
+            score_csv_data_path=self.SCORE_CSV_PATH,
+            score_data_source=self.DATA_SOURCE,
+        )
+
         logger.info("Reading US GeoJSON (~6 minutes)")
         self.geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
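Taken together, these three hunks make GeoScoreETL validate both of its inputs up front: the census GeoJSON (the existing check) and the newly required score tiles CSV. Both checks key off the same DATA_SOURCE, so one constructor argument decides whether missing inputs are fetched from S3 or abort the run. A sketch, assuming the constructor and extract() shown above:

    geo = GeoScoreETL(data_source="aws")
    geo.extract()
    # -> check_census_data_source(...) downloads the census GeoJSON from S3
    # -> check_score_data_source(...) downloads score/csv/tiles/usa.csv from S3
    # With the default data_source=None, either check logs its "-d aws" hint
    # and calls sys.exit() when its file is missing, before the ~6 minute
    # GeoJSON read begins.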

@@ -3,6 +3,9 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger, zip_files
 
+from data_pipeline.etl.sources.census.etl_utils import (
+    check_census_data_source,
+)
 from . import constants
 
 logger = get_module_logger(__name__)

@@ -14,7 +17,8 @@ class PostScoreETL(ExtractTransformLoad):
     datasets.
     """
 
-    def __init__(self):
+    def __init__(self, data_source: str = None):
+        self.DATA_SOURCE = data_source
         self.input_counties_df: pd.DataFrame
         self.input_states_df: pd.DataFrame
         self.input_score_df: pd.DataFrame

@@ -66,6 +70,13 @@ class PostScoreETL(ExtractTransformLoad):
 
     def extract(self) -> None:
         logger.info("Starting Extraction")
+
+        # check census data
+        check_census_data_source(
+            census_data_path=self.DATA_PATH / "census",
+            census_data_source=self.DATA_SOURCE,
+        )
+
         super().extract(
             constants.CENSUS_COUNTIES_ZIP_URL,
             constants.TMP_PATH,
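The same pattern lands in PostScoreETL: the constructor records the data source, and extract() now guards on census data before the base-class download of the counties ZIP. A sketch of what the guard buys (paths as used in the diff):

    etl = PostScoreETL(data_source="local")
    etl.extract()
    # check_census_data_source(census_data_path=DATA_PATH / "census",
    #                          census_data_source="local")
    # exits early if data/census has not been populated locally, instead of
    # failing deep inside transform(); with data_source="aws" it downloads.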

data/data-pipeline/data_pipeline/etl/score/etl_utils.py (new file, 50 lines)
@@ -0,0 +1,50 @@
+import os
+import sys
+from pathlib import Path
+
+from data_pipeline.config import settings
+from data_pipeline.utils import (
+    download_file_from_url,
+    get_module_logger,
+)
+
+logger = get_module_logger(__name__)
+
+
+def check_score_data_source(
+    score_csv_data_path: Path,
+    score_data_source: str,
+) -> None:
+    """Checks if score data is present and exits gracefully if it is missing.
+    It will download the data from S3 if score_data_source is set to "aws".
+
+    Args:
+        score_csv_data_path (Path): Path for local score CSV data
+        score_data_source (str): Source for the score data
+            Options:
+            - local: fetch score data from the local data directory
+            - aws: fetch score data from the AWS S3 J40 data repository
+
+    Returns:
+        None
+
+    """
+    TILE_SCORE_CSV_S3_URL = (
+        settings.AWS_JUSTICE40_DATAPIPELINE_URL
+        + "/data/score/csv/tiles/usa.csv"
+    )
+    TILE_SCORE_CSV = score_csv_data_path / "tiles" / "usa.csv"
+
+    # download from S3 if score_data_source is "aws"
+    if score_data_source == "aws":
+        logger.info("Fetching Score Tile data from AWS S3")
+        download_file_from_url(
+            file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
+        )
+    else:
+        # check if score data is found locally
+        if not os.path.isfile(TILE_SCORE_CSV):
+            logger.info(
+                "No local score tiles data found. Please use '-d aws' to fetch from AWS"
+            )
+            sys.exit()
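The new helper mirrors the existing check_census_data_source: download when the source is "aws", otherwise verify the local file and exit with a hint. Called on its own it would look like this (the score_csv_data_path value here is an assumed layout; the caller above passes self.SCORE_CSV_PATH):

    from pathlib import Path

    from data_pipeline.etl.score.etl_utils import check_score_data_source

    # "aws": fetches <AWS_JUSTICE40_DATAPIPELINE_URL>/data/score/csv/tiles/usa.csv
    # into <score_csv_data_path>/tiles/usa.csv; "local": sys.exit() if absent.
    check_score_data_source(
        score_csv_data_path=Path("data") / "score" / "csv",  # assumed layout
        score_data_source="aws",
    )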

@@ -107,13 +107,13 @@ def check_census_data_source(
         # check if census data is found locally
         if not os.path.isfile(census_data_path / "geojson" / "us.json"):
             logger.info(
-                "No local census data found. Please use '-cds aws` to fetch from AWS"
+                "No local census data found. Please use '-d aws' to fetch from AWS"
             )
             sys.exit()
 
 
 def zip_census_data():
-    logger.info("Compressing and uploading census files to AWS S3")
+    logger.info("Compressing census files to data/tmp folder")
 
     CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
     TMP_PATH = settings.APP_ROOT / "data" / "tmp"
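This final hunk aligns the census checker's hint with the new score checker ('-d' rather than the stale '-cds') and makes zip_census_data's log line match what it now does: compress into data/tmp, with the S3 upload evidently moved elsewhere (the commit message mentions Github Actions work). For completeness, the census-side guard now reads (path is an assumed layout):

    from pathlib import Path

    from data_pipeline.etl.sources.census.etl_utils import check_census_data_source

    # With "local" and no data/census/geojson/us.json on disk, this logs
    # "No local census data found. Please use '-d aws' to fetch from AWS"
    # and calls sys.exit(); the score checker above behaves identically.
    check_census_data_source(
        census_data_path=Path("data") / "census",  # assumed layout
        census_data_source="local",
    )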