j40-cejst-2/data/data-pipeline/data_pipeline/utils.py


import datetime
import logging
import os
import shutil
import sys
import zipfile
from pathlib import Path

import requests
import urllib3

from data_pipeline.config import settings


def get_module_logger(module_name: str) -> logging.Logger:
    """Instantiates a logger object that writes log records to the console.

    Args:
        module_name (str): Name of the module emitting the logs

    Returns:
        logger (logging.Logger): A logger object
    """
    logger = logging.getLogger(module_name)
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s [%(name)-12s] %(levelname)-8s %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger


logger = get_module_logger(__name__)
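
# Illustrative usage (a sketch, not part of this module's API; the module name
# in the sample output is hypothetical). Any module in the pipeline can request
# its own named logger the same way this module does above:
#
#     from data_pipeline.utils import get_module_logger
#
#     logger = get_module_logger(__name__)
#     logger.info("Extract complete")
#     # -> 2021-10-13 15:54:15,123 [data_pipeline.etl] INFO     Extract complete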


def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
    """Removes files from a specific directory. If an extension is given, only
    files with that extension are deleted; otherwise every file except
    __init__.py (which conserves the directory structure) is deleted.

    Args:
        files_path (pathlib.Path): Name of the directory where the files will be deleted
        extension (str): Extension of the file pattern to delete, e.g. "json" (optional)

    Returns:
        None
    """
    for file in os.listdir(files_path):
        if extension:
            if not file.endswith(extension):
                continue
        else:
            # don't remove __init__ files as they conserve dir structure
            if file == "__init__.py":
                continue
        os.remove(files_path / file)
        logger.info(f"Removing {file}")
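
# Illustrative usage (the "csv" directory below is hypothetical): delete only
# the CSV files in a directory, leaving everything else in place.
#
#     remove_files_from_dir(settings.APP_ROOT / "data" / "csv", "csv")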


def remove_all_from_dir(files_path: Path) -> None:
    """Removes all files and directories from a specific directory, except __init__.py files

    Args:
        files_path (pathlib.Path): Name of the directory where the files and directories will be deleted

    Returns:
        None
    """
    for file in os.listdir(files_path):
        # don't remove __init__ files as they conserve dir structure
        if file == "__init__.py":
            continue
        if os.path.isfile(files_path / file):
            os.remove(files_path / file)
        else:
            shutil.rmtree(files_path / file)
        logger.info(f"Removing {file}")


def remove_all_dirs_from_dir(dir_path: Path) -> None:
    """Removes all directories from a specific directory

    Args:
        dir_path (pathlib.Path): Name of the directory where the directories will be deleted

    Returns:
        None
    """
    for filename in os.listdir(dir_path):
        file_path = os.path.join(dir_path, filename)
        if os.path.isdir(file_path):
            shutil.rmtree(file_path)
            logger.info(f"Removing directory {file_path}")


def download_file_from_url(
    file_url: str,
    download_file_name: Path,
    verify: bool = True,
) -> Path:
    """Downloads a file from a remote URL location and returns the file location.

    Args:
        file_url (str): URL where the file is located
        download_file_name (pathlib.Path): file path where the file will be downloaded
        verify (bool): A flag to check if the certificate is valid. If truthy, an
                       invalid certificate will throw an error (optional, defaults to True)

    Returns:
        download_file_name (pathlib.Path): file path of the downloaded file
    """
    # disable https warnings
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    if not os.path.isdir(download_file_name.parent):
        os.mkdir(download_file_name.parent)

    logger.info(f"Downloading {file_url}")
    response = requests.get(file_url, verify=verify)
    if response.status_code == 200:
        file_contents = response.content
    else:
        sys.exit(
            f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
        )

    # Write the contents to disk.
    with open(download_file_name, "wb") as file:
        file.write(file_contents)
    return download_file_name
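
# Illustrative usage (the URL and file name here are hypothetical):
#
#     csv_path = download_file_from_url(
#         file_url="https://example.com/data.csv",
#         download_file_name=settings.APP_ROOT / "data" / "tmp" / "data.csv",
#     )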


def unzip_file_from_url(
    file_url: str,
    download_path: Path,
    unzipped_file_path: Path,
    verify: bool = True,
) -> None:
    """Downloads a zip file from a remote URL location and unzips it into a
    specific directory, removing the temporary file afterwards

    Args:
        file_url (str): URL where the zip file is located
        download_path (pathlib.Path): directory where the temporary file will be downloaded (called downloaded.zip)
        unzipped_file_path (pathlib.Path): directory where the contents will be extracted
        verify (bool): A flag to check if the certificate is valid. If truthy, an
                       invalid certificate will throw an error (optional, defaults to True)

    Returns:
        None
    """
    zip_file_path = download_file_from_url(
        file_url=file_url,
        download_file_name=download_path / "downloaded.zip",
        verify=verify,
    )
    logger.info(f"Extracting {zip_file_path}")
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(unzipped_file_path)

    # cleanup temporary file
    os.remove(zip_file_path)
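
# Illustrative usage (the URL is hypothetical; the tmp and dataset paths
# mirror the cleanup helpers below):
#
#     unzip_file_from_url(
#         file_url="https://example.com/shapes.zip",
#         download_path=settings.APP_ROOT / "data" / "tmp",
#         unzipped_file_path=settings.APP_ROOT / "data" / "dataset",
#     )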


def data_folder_cleanup() -> None:
    """Remove all files and directories from the local data/dataset path"""

    data_path = settings.APP_ROOT / "data"

    logger.info("Initializing all dataset directories")
    remove_all_from_dir(data_path / "dataset")


def score_folder_cleanup() -> None:
    """Remove all files and directories from the local data/score path"""

    data_path = settings.APP_ROOT / "data"

    logger.info("Initializing all score data")
    remove_all_from_dir(data_path / "score" / "csv")
    remove_all_from_dir(data_path / "score" / "geojson")
    remove_all_from_dir(data_path / "score" / "tiles")
    downloadable_cleanup()


def downloadable_cleanup() -> None:
    """Remove all files from the downloadable directory in the local data/score path"""

    data_path = settings.APP_ROOT / "data"
    remove_all_from_dir(data_path / "score" / "downloadable")


def temp_folder_cleanup() -> None:
    """Remove all files and directories from the local data/tmp temporary path"""

    data_path = settings.APP_ROOT / "data"

    logger.info("Initializing all temp directories")
    remove_all_from_dir(data_path / "tmp")


def check_first_run() -> bool:
    """Returns True if the local first-run flag file has not been created yet,
    and False once it exists"""
    data_path = settings.APP_ROOT / "data"
    file = "first_run.txt"

    if not os.path.isfile(data_path / file):
        return True

    return False
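
# A minimal sketch of the matching "mark as run" step. This helper is an
# assumption, not part of this module: a caller would create first_run.txt
# after completing first-run setup so that check_first_run() returns False
# on subsequent runs.
#
#     def mark_first_run_complete() -> None:
#         (settings.APP_ROOT / "data" / "first_run.txt").touch()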


def get_excel_column_name(index: int) -> str:
    """Map a zero-based numeric index to the corresponding Excel column name.
    E.g., index 95 maps to column "CR".

    Args:
        index (int): the zero-based index of the column

    Returns:
        str: the excel column name
    """
    # Excel columns follow a bijective base-26 numbering (A..Z, AA..AZ, ...),
    # so convert the one-based column number one letter at a time.
    column_number = index + 1
    column_name = ""
    while column_number > 0:
        column_number, remainder = divmod(column_number - 1, 26)
        column_name = chr(ord("A") + remainder) + column_name
    return column_name
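
# A few spot checks of the mapping (these follow directly from the bijective
# base-26 scheme above):
#
#     get_excel_column_name(0)    # -> "A"
#     get_excel_column_name(25)   # -> "Z"
#     get_excel_column_name(26)   # -> "AA"
#     get_excel_column_name(95)   # -> "CR"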


def get_zip_info(archive_path: Path) -> list:
    """Returns information about a provided archive

    Args:
        archive_path (pathlib.Path): Path of the archive to be inspected

    Returns:
        a list of information about every file in the zipfile
    """
    info_list = []
    with zipfile.ZipFile(archive_path) as zf:
        for info in zf.infolist():
            info_dict = {}

            info_dict["Filename"] = info.filename
            info_dict["Comment"] = info.comment.decode("utf8")
            info_dict["Modified"] = datetime.datetime(
                *info.date_time
            ).isoformat()
            info_dict["System"] = f"{info.create_system} (0 = Windows, 3 = Unix)"
            info_dict["ZIP version"] = info.create_version
            info_dict["Compressed"] = f"{info.compress_size} bytes"
            info_dict["Uncompressed"] = f"{info.file_size} bytes"
            info_list.append(info_dict)

    return info_list
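
# Illustrative usage (the archive path is hypothetical): list the contents of
# a freshly downloaded archive before extracting it.
#
#     for entry in get_zip_info(settings.APP_ROOT / "data" / "tmp" / "downloaded.zip"):
#         logger.info(f"{entry['Filename']} ({entry['Uncompressed']})")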