Data sources from S3 (#769)

* Started 535

* Data sources from S3

* lint

* renove breakpoints

* PR comments

* lint

* census data completed

* lint

* renaming data source
This commit is contained in:
Jorge Escobar 2021-10-13 16:00:33 -04:00 committed by GitHub
commit 3b04356fb3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 317 additions and 67 deletions

View file

@ -1,5 +1,6 @@
import csv
import os
import sys
from pathlib import Path
import pandas as pd
@ -9,12 +10,14 @@ from data_pipeline.utils import (
remove_all_dirs_from_dir,
remove_files_from_dir,
unzip_file_from_url,
zip_directory,
)
logger = get_module_logger(__name__)
def reset_data_directories(data_path: Path) -> None:
"""Empties all census folders"""
census_data_path = data_path / "census"
# csv
@ -31,6 +34,7 @@ def reset_data_directories(data_path: Path) -> None:
def get_state_fips_codes(data_path: Path) -> list:
"""Returns a list with state data"""
fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"
# check if file exists
@ -69,3 +73,50 @@ def get_state_information(data_path: Path) -> pd.DataFrame:
df["fips"] = df["fips"].astype(str).apply(lambda x: x.zfill(2))
return df
def check_census_data_source(
census_data_path: Path, census_data_source: str
) -> None:
"""Checks if census data is present, and exits gracefully if it doesn't exist. It will download it from S3
if census_data_source is set to "aws"
Args:
census_data_path (str): Path for Census data
census_data_source (str): Source for the census data
Options:
- local: fetch census data from the local data directory
- aws: fetch census from AWS S3 J40 data repository
Returns:
None
"""
CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip"
DATA_PATH = settings.APP_ROOT / "data"
# download from s3 if census_data_source is aws
if census_data_source == "aws":
logger.info("Fetching Census data from AWS S3")
unzip_file_from_url(
CENSUS_DATA_S3_URL,
DATA_PATH / "tmp",
DATA_PATH,
)
else:
# check if census data is found locally
if not os.path.isfile(census_data_path / "geojson" / "us.json"):
logger.info(
"No local census data found. Please use '-cds aws` to fetch from AWS"
)
sys.exit()
def zip_census_data():
logger.info("Compressing and uploading census files to AWS S3")
CENSUS_DATA_PATH = settings.APP_ROOT / "data" / "census"
TMP_PATH = settings.APP_ROOT / "data" / "tmp"
# zip folder
zip_directory(CENSUS_DATA_PATH, TMP_PATH)