Data sources from S3 (#769)

* Started 535

* Data sources from S3

* lint

* remove breakpoints

* PR comments

* lint

* census data completed

* lint

* renaming data source
Jorge Escobar 2021-10-13 16:00:33 -04:00 committed by GitHub
commit 3b04356fb3
10 changed files with 317 additions and 67 deletions


@@ -315,6 +315,7 @@ class ScoreETL(ExtractTransformLoad):
def extract(self) -> None:
logger.info("Loading data sets from disk.")
# EJSCreen csv Load
ejscreen_csv = self.DATA_PATH / "dataset" / "ejscreen_2019" / "usa.csv"
self.ejscreen_df = pd.read_csv(


@@ -1,9 +1,11 @@
import math
import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import (
check_census_data_source,
)
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@@ -14,7 +16,7 @@ class GeoScoreETL(ExtractTransformLoad):
A class used to generate per state and national GeoJson files with the score baked in
"""
def __init__(self):
def __init__(self, data_source: str = None):
self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
@@ -22,6 +24,7 @@ class GeoScoreETL(ExtractTransformLoad):
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
self.DATA_SOURCE = data_source
self.CENSUS_USA_GEOJSON = (
self.DATA_PATH / "census" / "geojson" / "us.json"
)
@@ -37,6 +40,12 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
census_data_source=self.DATA_SOURCE,
)
logger.info("Reading US GeoJSON (~6 minutes)")
self.geojson_usa_df = gpd.read_file(
self.CENSUS_USA_GEOJSON,

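The new data_source argument threads through to a census-data check before the GeoJSON read. The helper's body is not part of this diff; the sketch below is only an assumption of what check_census_data_source might do, guessing that the "aws" source pulls prepared census files from S3 while the default merely verifies a local copy (the us.json path mirrors CENSUS_USA_GEOJSON above; the download step and error message are invented for illustration).

# Hypothetical sketch of check_census_data_source
# (data_pipeline/etl/sources/census/etl_utils.py). Only the call signature
# comes from this diff; everything else is assumed.
import logging
from pathlib import Path

logger = logging.getLogger(__name__)  # the repo uses get_module_logger instead


def check_census_data_source(
    census_data_path: Path, census_data_source: str = None
) -> None:
    """Make sure census GeoJSON is present locally before the score ETL reads it."""
    census_geojson = census_data_path / "geojson" / "us.json"

    if census_data_source == "aws":
        # Assumed behavior: fetch the prepared census GeoJSON from the hosted
        # S3 bucket instead of requiring a prior local census ETL run.
        logger.info("Fetching census data from S3")
        # download_census_from_s3(census_data_path)  # hypothetical helper
    elif not census_geojson.is_file():
        raise FileNotFoundError(
            "Census GeoJSON not found; run the census ETL first or pass "
            "data_source='aws'."
        )

Under that assumption, a caller could then run something like GeoScoreETL(data_source="aws").extract() to skip the local census build; that usage is likewise an assumption, not shown in this commit.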

@@ -1,22 +1,10 @@
import json
import zipfile
from pathlib import Path
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, get_zip_info
from data_pipeline.utils import get_module_logger, zip_files
from . import constants
## zlib is not available on all systems
try:
import zlib # noqa # pylint: disable=unused-import
compression = zipfile.ZIP_DEFLATED
except (ImportError, AttributeError):
compression = zipfile.ZIP_STORED
logger = get_module_logger(__name__)
@@ -268,11 +256,7 @@ class PostScoreETL(ExtractTransformLoad):
logger.info("Compressing files")
files_to_compress = [csv_path, excel_path, pdf_path]
with zipfile.ZipFile(zip_path, "w") as zf:
for f in files_to_compress:
zf.write(f, arcname=Path(f).name, compress_type=compression)
zip_info = get_zip_info(zip_path)
logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))
zip_files(zip_path, files_to_compress)
def load(self) -> None:
self._load_score_csv(
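The compression refactor replaces the inline zlib/zipfile handling with a shared zip_files utility imported from data_pipeline.utils. Its implementation is not part of this diff; the sketch below simply folds the removed logic into one helper under that assumption (the logger setup and the exact fields logged are guesses, not the repo's code).

# Hypothetical zip_files helper, reconstructed from the code this diff removes;
# the actual implementation in data_pipeline/utils.py may differ.
import json
import logging
import zipfile
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)  # the repo uses get_module_logger instead

# zlib is not available on all systems, so fall back to a stored (uncompressed) archive.
try:
    import zlib  # noqa # pylint: disable=unused-import

    compression = zipfile.ZIP_DEFLATED
except (ImportError, AttributeError):
    compression = zipfile.ZIP_STORED


def zip_files(zip_file_path: Path, files_to_compress: List[Path]) -> None:
    """Write the given files into a single archive and log what was written."""
    with zipfile.ZipFile(zip_file_path, "w") as zf:
        for f in files_to_compress:
            zf.write(f, arcname=Path(f).name, compress_type=compression)
        # Log per-member info (name, raw size, compressed size) for debugging.
        zip_info = [
            {
                "name": info.filename,
                "size": info.file_size,
                "compressed": info.compress_size,
            }
            for info in zf.infolist()
        ]
    logger.info(json.dumps(zip_info, indent=4, sort_keys=True, default=str))

With a helper along these lines, call sites like the one in this hunk shrink to the single zip_files(zip_path, files_to_compress) line shown above.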