Mirror of https://github.com/DOI-DO/j40-cejst-2.git
Synced 2025-07-28 03:31:17 -07:00
Data sources from S3 (#769)
* Started 535
* Data sources from S3
* lint
* remove breakpoints
* PR comments
* lint
* census data completed
* lint
* renaming data source
parent d1273b63c5
commit 3b04356fb3
10 changed files with 317 additions and 67 deletions
@@ -104,18 +104,21 @@ def score_post() -> None:
     score_post.cleanup()
 
 
-def score_geo() -> None:
+def score_geo(data_source: str = "local") -> None:
     """Generates the geojson files with score data baked in
 
     Args:
-        None
+        census_data_source (str): Source for the census data (optional)
+            Options:
+            - local (default): fetch census data from the local data directory
+            - aws: fetch census from AWS S3 J40 data repository
 
     Returns:
         None
     """
 
     # Score Geo
-    score_geo = GeoScoreETL()
+    score_geo = GeoScoreETL(data_source=data_source)
     score_geo.extract()
     score_geo.transform()
     score_geo.load()
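For context, a minimal sketch of how the new data_source parameter might be exposed on the command line. The -cds flag name is inferred from the log message added later in this diff; the actual CLI wiring (assumed here to be a Click command importing score_geo from data_pipeline.application) is not part of this excerpt:

import click

from data_pipeline.application import score_geo  # assumed import path


@click.command()
@click.option(
    "-cds",
    "--census-data-source",
    "data_source",
    default="local",
    type=click.Choice(["local", "aws"]),
    help="Source for the census data (local or aws).",
)
def geo_score(data_source: str) -> None:
    # Forwards the chosen source into the entry point changed above.
    score_geo(data_source=data_source)


if __name__ == "__main__":
    geo_score()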
@@ -315,6 +315,7 @@ class ScoreETL(ExtractTransformLoad):
 
     def extract(self) -> None:
         logger.info("Loading data sets from disk.")
 
         # EJSCreen csv Load
         ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2019" / "usa.csv"
         self.ejscreen_df = pd.read_csv(
@@ -1,9 +1,11 @@
 import math
 
 import pandas as pd
 import geopandas as gpd
 
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.sources.census.etl_utils import (
+    check_census_data_source,
+)
 from data_pipeline.utils import get_module_logger
 
 logger = get_module_logger(__name__)
@@ -14,7 +16,7 @@ class GeoScoreETL(ExtractTransformLoad):
     A class used to generate per state and national GeoJson files with the score baked in
     """
 
-    def __init__(self):
+    def __init__(self, data_source: str = None):
         self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
         self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
         self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
@@ -22,6 +24,7 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
 
+        self.DATA_SOURCE = data_source
         self.CENSUS_USA_GEOJSON = (
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )
@@ -37,6 +40,12 @@ class GeoScoreETL(ExtractTransformLoad):
         self.geojson_score_usa_low: gpd.GeoDataFrame
 
     def extract(self) -> None:
+        # check census data
+        check_census_data_source(
+            census_data_path=self.DATA_PATH / "census",
+            census_data_source=self.DATA_SOURCE,
+        )
+
         logger.info("Reading US GeoJSON (~6 minutes)")
         self.geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
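Taken together, these changes let the geo ETL be driven from either source. A short usage sketch (the module path is assumed; behavior follows the diff above):

from data_pipeline.etl.score.etl_score_geo import GeoScoreETL  # assumed path

# With data_source="aws", extract() first fetches census data from the
# J40 S3 bucket; with "local" it requires data/census to already exist.
score_geo = GeoScoreETL(data_source="aws")
score_geo.extract()
score_geo.transform()
score_geo.load()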
@@ -1,22 +1,10 @@
-import json
-import zipfile
-from pathlib import Path
-
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.utils import get_module_logger, get_zip_info
+from data_pipeline.utils import get_module_logger, zip_files
 
 from . import constants
 
-## zlib is not available on all systems
-try:
-    import zlib  # noqa # pylint: disable=unused-import
-
-    compression = zipfile.ZIP_DEFLATED
-except (ImportError, AttributeError):
-    compression = zipfile.ZIP_STORED
-
 
 logger = get_module_logger(__name__)
 
 
@@ -268,11 +256,7 @@ class PostScoreETL(ExtractTransformLoad):
 
         logger.info("Compressing files")
         files_to_compress = [csv_path, excel_path, pdf_path]
-        with zipfile.ZipFile(zip_path, "w") as zf:
-            for f in files_to_compress:
-                zf.write(f, arcname=Path(f).name, compress_type=compression)
-        zip_info = get_zip_info(zip_path)
-        logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))
+        zip_files(zip_path, files_to_compress)
 
     def load(self) -> None:
         self._load_score_csv(
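The inline zipping above was folded into a shared helper. A sketch of what zip_files presumably encapsulates, reconstructed from the removed lines (the real implementation lives in data_pipeline.utils and is not shown in this diff; the signature is inferred from the call site):

import zipfile
from pathlib import Path
from typing import List

# zlib is not available on all systems, hence the fallback
try:
    import zlib  # noqa # pylint: disable=unused-import

    COMPRESSION = zipfile.ZIP_DEFLATED
except (ImportError, AttributeError):
    COMPRESSION = zipfile.ZIP_STORED


def zip_files(zip_file_path: Path, files_to_compress: List[Path]) -> None:
    """Write each file into the archive under its bare file name."""
    with zipfile.ZipFile(zip_file_path, "w") as zf:
        for f in files_to_compress:
            zf.write(f, arcname=Path(f).name, compress_type=COMPRESSION)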
@@ -1,5 +1,6 @@
 import csv
+import os
 import sys
 from pathlib import Path
 
 import pandas as pd
@@ -9,12 +10,14 @@ from data_pipeline.utils import (
     remove_all_dirs_from_dir,
     remove_files_from_dir,
     unzip_file_from_url,
+    zip_directory,
 )
 
 logger = get_module_logger(__name__)
 
 
 def reset_data_directories(data_path: Path) -> None:
     """Empties all census folders"""
     census_data_path = data_path / "census"
 
     # csv
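zip_directory is newly imported here for zip_census_data, added further down. Its implementation is not part of this diff; a rough sketch of what such a helper typically does (the archive name and signature are assumptions, the name census.zip matching the S3 object referenced below):

import zipfile
from pathlib import Path


def zip_directory(directory_path: Path, zip_dir: Path) -> None:
    # Store every file with a path relative to the zipped root.
    with zipfile.ZipFile(zip_dir / "census.zip", "w", zipfile.ZIP_DEFLATED) as zf:
        for path in sorted(directory_path.rglob("*")):
            if path.is_file():
                zf.write(path, arcname=path.relative_to(directory_path))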
@@ -31,6 +34,7 @@ def reset_data_directories(data_path: Path) -> None:
 
 
 def get_state_fips_codes(data_path: Path) -> list:
     """Returns a list with state data"""
     fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
 
     # check if file exists
@@ -69,3 +73,50 @@ def get_state_information(data_path: Path) -> pd.DataFrame:
     df["fips"] = df["fips"].astype(str).apply(lambda x: x.zfill(2))
 
     return df
+
+
+def check_census_data_source(
+    census_data_path: Path, census_data_source: str
+) -> None:
+    """Checks if census data is present, and exits gracefully if it doesn't exist.
+    It will download it from S3 if census_data_source is set to "aws".
+
+    Args:
+        census_data_path (str): Path for Census data
+        census_data_source (str): Source for the census data
+            Options:
+            - local: fetch census data from the local data directory
+            - aws: fetch census from AWS S3 J40 data repository
+
+    Returns:
+        None
+    """
+    CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip"
+    DATA_PATH = settings.APP_ROOT / "data"
+
+    # download from s3 if census_data_source is aws
+    if census_data_source == "aws":
+        logger.info("Fetching Census data from AWS S3")
+        unzip_file_from_url(
+            CENSUS_DATA_S3_URL,
+            DATA_PATH / "tmp",
+            DATA_PATH,
+        )
+    else:
+        # check if census data is found locally
+        if not os.path.isfile(census_data_path / "geojson" / "us.json"):
+            logger.info(
+                "No local census data found. Please use '-cds aws' to fetch from AWS"
+            )
+            sys.exit()
+
+
+def zip_census_data():
+    logger.info("Compressing and uploading census files to AWS S3")
+
+    CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
+    TMP_PATH = settings.APP_ROOT / "data" / "tmp"
+
+    # zip folder
+    zip_directory(CENSUS_DATA_PATH, TMP_PATH)
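A usage sketch for the new helper (the import paths are assumptions based on the file being edited; behavior follows the added code above):

from data_pipeline.config import settings  # assumed source of settings
from data_pipeline.etl.sources.census.etl_utils import check_census_data_source

# "aws" downloads and unpacks census.zip from S3; "local" exits the
# process if data/census/geojson/us.json is not already present.
check_census_data_source(
    census_data_path=settings.APP_ROOT / "data" / "census",
    census_data_source="aws",
)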