User Story 2152 – Clean up logging (#2155)

Update logging messages and improve message consistency

This update changes the level of many log messages. Rather than logging everything at the info level, it differentiates between debug, info, warning, and error messages. It also changes the default log level to info, which removes much of the noise previously in the logs.

It also removes many extraneous log messages and adds additional decorators at the beginning of each pipeline run.
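For illustration, here is a minimal sketch of the level scheme this describes. The names configure_logging and run_etl, the banner format, and the example messages are assumptions for this sketch, not the repository's actual helpers; the point is only that with info as the default level, debug detail stays out of normal pipeline output while warnings remain visible.

# Minimal sketch (illustrative only; configure_logging and run_etl are
# hypothetical names, not functions from this repository).
import logging

logger = logging.getLogger(__name__)

def configure_logging(level: int = logging.INFO) -> None:
    # Info is the default level, so logger.debug(...) calls stay quiet
    # unless verbose output is explicitly requested.
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
    )

def run_etl() -> None:
    # Decorative banner marking the start of a pipeline run.
    logger.info("=" * 64)
    logger.info("Starting pipeline run")
    logger.info("=" * 64)
    logger.debug("Joining Census Tract dataframes")   # internal detail
    logger.warning("No local score tiles data found")  # actionable problem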
Travis Newby 2023-02-08 13:08:55 -06:00 committed by GitHub
commit 03a6d3c660
63 changed files with 307 additions and 339 deletions


@@ -56,8 +56,6 @@ class ScoreETL(ExtractTransformLoad):
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
def extract(self) -> None:
logger.info("Loading data sets from disk.")
# EJSCreen csv Load
ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
self.ejscreen_df = pd.read_csv(
@@ -200,7 +198,7 @@ class ScoreETL(ExtractTransformLoad):
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Tract dataframes")
logger.debug("Joining Census Tract dataframes")
def merge_function(
left: pd.DataFrame, right: pd.DataFrame
@@ -317,7 +315,7 @@ class ScoreETL(ExtractTransformLoad):
~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
np.nan,
)
-logger.info(
+logger.debug(
f"Creating special case column for percentiles from {input_column_name}"
)
df[
@@ -335,7 +333,7 @@ class ScoreETL(ExtractTransformLoad):
# TODO Move a lot of this to the ETL part of the pipeline
def _prepare_initial_df(self) -> pd.DataFrame:
logger.info("Preparing initial dataframe")
logger.debug("Preparing initial dataframe")
# Join all the data sources that use census tracts
census_tract_dfs = [
@@ -377,7 +375,7 @@ class ScoreETL(ExtractTransformLoad):
assert (
census_tract_df.shape[0] <= pre_join_len
), "Join against national tract list ADDED rows"
-logger.info(
+logger.debug(
"Dropped %s tracts not in the 2010 tract data",
pre_join_len
- census_tract_df[field_names.GEOID_TRACT_FIELD].nunique(),
@@ -560,7 +558,7 @@ class ScoreETL(ExtractTransformLoad):
for col in boolean_columns:
tmp = df_copy[col].copy()
df_copy[col] = np.where(tmp.notna(), tmp.astype(bool), None)
-logger.info(f"{col} contains {df_copy[col].isna().sum()} nulls.")
+logger.debug(f"{col} contains {df_copy[col].isna().sum()} nulls.")
# Convert all columns to numeric and do math
# Note that we have a few special conditions here and we handle them explicitly.
@@ -591,7 +589,7 @@ class ScoreETL(ExtractTransformLoad):
.astype(bool)
.fillna(False)
][field_names.GEOID_TRACT_FIELD].to_list()
-logger.info(
+logger.debug(
f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
)
elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
@@ -599,7 +597,7 @@ class ScoreETL(ExtractTransformLoad):
# 72 is the FIPS code for Puerto Rico
df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
][field_names.GEOID_TRACT_FIELD].to_list()
-logger.info(
+logger.debug(
f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
)
@@ -615,7 +613,7 @@ class ScoreETL(ExtractTransformLoad):
df_copy[field_names.TOTAL_POP_FIELD].fillna(0)
<= low_population
][field_names.GEOID_TRACT_FIELD].to_list()
-logger.info(
+logger.debug(
f"Dropping {len(drop_tracts)} tracts from DOT traffic burden"
)
@@ -666,7 +664,7 @@ class ScoreETL(ExtractTransformLoad):
)
def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Backfilling island demographic data")
logger.debug("Backfilling island demographic data")
island_index = self._get_island_areas(df)
for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS:
actual_field_name = backfill_field_name.replace(
@@ -684,8 +682,6 @@ class ScoreETL(ExtractTransformLoad):
return df
def transform(self) -> None:
logger.info("Transforming Score Data")
# prepare the df with the right CBG/tract IDs, column names/types, and percentiles
self.df = self._prepare_initial_df()
@@ -696,9 +692,6 @@ class ScoreETL(ExtractTransformLoad):
self.df = self._backfill_island_demographics(self.df)
def load(self) -> None:
-logger.info(
-f"Saving Score CSV to {constants.DATA_SCORE_CSV_FULL_FILE_PATH}."
-)
constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)


@@ -118,7 +118,7 @@ class GeoScoreETL(ExtractTransformLoad):
fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
# TODO update this join
logger.info("Merging and compressing score CSV with USA GeoJSON")
logger.info("Merging and compressing score csv with USA GeoJSON")
self.geojson_score_usa_high = self.score_usa_df.set_index(
self.GEOID_FIELD_NAME
).merge(
@@ -143,7 +143,7 @@ class GeoScoreETL(ExtractTransformLoad):
columns={self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO}
)
logger.info("Converting geojson into geodf with tracts")
logger.info("Converting GeoJSON into GeoDataFrame with tracts")
usa_tracts = gpd.GeoDataFrame(
usa_tracts,
columns=[
@@ -154,15 +154,15 @@ class GeoScoreETL(ExtractTransformLoad):
crs="EPSG:4326",
)
logger.info("Creating buckets from tracts")
logger.debug("Creating buckets from tracts")
usa_bucketed, keep_high_zoom_df = self._create_buckets_from_tracts(
usa_tracts, self.NUMBER_OF_BUCKETS
)
logger.info("Aggregating buckets")
logger.debug("Aggregating buckets")
usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean")
logger.info("Breaking up polygons")
logger.debug("Breaking up polygons")
compressed = self._breakup_multipolygons(
usa_aggregated, self.NUMBER_OF_BUCKETS
)
@@ -220,7 +220,7 @@ class GeoScoreETL(ExtractTransformLoad):
len(state_tracts.index) / self.NUMBER_OF_BUCKETS
)
-logger.info(
+logger.debug(
f"The number of buckets has increased to {self.NUMBER_OF_BUCKETS}"
)
for i in range(len(state_tracts.index)):


@@ -62,7 +62,7 @@ class PostScoreETL(ExtractTransformLoad):
# End YAML definition constants
def _extract_counties(self, county_path: Path) -> pd.DataFrame:
logger.info("Reading Counties CSV")
logger.debug("Reading Counties CSV")
return pd.read_csv(
county_path,
sep="\t",
@@ -75,7 +75,7 @@ class PostScoreETL(ExtractTransformLoad):
)
def _extract_states(self, state_path: Path) -> pd.DataFrame:
logger.info("Reading States CSV")
logger.debug("Reading States CSV")
return pd.read_csv(
state_path,
dtype={"fips": "string", "state_abbreviation": "string"},
@@ -83,7 +83,7 @@ class PostScoreETL(ExtractTransformLoad):
)
def _extract_score(self, score_path: Path) -> pd.DataFrame:
logger.info("Reading Score CSV")
logger.debug("Reading Score CSV")
df = pd.read_csv(
score_path,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
@@ -98,8 +98,6 @@ class PostScoreETL(ExtractTransformLoad):
return df
def extract(self) -> None:
logger.info("Starting Extraction")
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
@@ -170,7 +168,7 @@ class PostScoreETL(ExtractTransformLoad):
score_df: pd.DataFrame,
) -> pd.DataFrame:
logger.info("Merging county info with score info")
logger.debug("Merging county info with score info")
score_county_merged = score_df.merge(
# We drop state abbreviation so we don't get it twice
counties_df[["GEOID", "County Name"]],
@@ -178,7 +176,7 @@ class PostScoreETL(ExtractTransformLoad):
how="left",
)
logger.info("Merging state info with county-score info")
logger.debug("Merging state info with county-score info")
# Here, we need to join on a separate key, since there's no
# entry for the island areas in the counties df (there are no
# counties!) Thus, unless we join state separately from county,
@@ -207,7 +205,7 @@ class PostScoreETL(ExtractTransformLoad):
score_county_state_merged_df: pd.DataFrame,
) -> pd.DataFrame:
logger.info("Rounding Decimals")
logger.debug("Rounding Decimals")
# grab all the keys from tiles score columns
tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
@@ -218,7 +216,7 @@ class PostScoreETL(ExtractTransformLoad):
# We may not want some states/territories on the map, so this will drop all
# rows with those FIPS codes (first two digits of the census tract)
-logger.info(
+logger.debug(
f"Dropping specified FIPS codes from tile data: {constants.DROP_FIPS_CODES}"
)
tracts_to_drop = []
@@ -236,12 +234,12 @@ class PostScoreETL(ExtractTransformLoad):
for col, col_dtype in score_tiles.dtypes.items()
if col_dtype == np.dtype("float64")
]
-scale_factor = 10**constants.TILES_ROUND_NUM_DECIMALS
+scale_factor = 10 ** constants.TILES_ROUND_NUM_DECIMALS
score_tiles[float_cols] = (
score_tiles[float_cols] * scale_factor
).apply(np.floor) / scale_factor
logger.info("Adding fields for island areas and Puerto Rico")
logger.debug("Adding fields for island areas and Puerto Rico")
# The below operation constructs variables for the front end.
# Since the Island Areas, Puerto Rico, and the nation all have a different
# set of available data, each has its own user experience.
@@ -381,8 +379,6 @@ class PostScoreETL(ExtractTransformLoad):
return final_df
def transform(self) -> None:
logger.info("Transforming data sources for Score + County CSVs")
transformed_counties = self._transform_counties(self.input_counties_df)
transformed_states = self._transform_states(self.input_states_df)
transformed_score = self._transform_score(self.input_score_df)
@@ -403,7 +399,7 @@ class PostScoreETL(ExtractTransformLoad):
def _load_score_csv_full(
self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
) -> None:
logger.info("Saving Full Score CSV with County Information")
logger.debug("Saving Full Score CSV with County Information")
score_csv_path.parent.mkdir(parents=True, exist_ok=True)
score_county_state_merged.to_csv(
score_csv_path,
@@ -476,7 +472,7 @@ class PostScoreETL(ExtractTransformLoad):
def _load_tile_csv(
self, score_tiles_df: pd.DataFrame, tile_score_path: Path
) -> None:
logger.info("Saving Tile Score CSV")
logger.debug("Saving Tile Score CSV")
tile_score_path.parent.mkdir(parents=True, exist_ok=True)
score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")
@@ -498,13 +494,13 @@ class PostScoreETL(ExtractTransformLoad):
constants.SCORE_VERSIONING_DATA_DOCUMENTATION_ZIP_FILE_PATH
)
logger.info("Writing downloadable excel")
logger.debug("Writing downloadable excel")
excel_config = self._load_excel_from_df(
excel_df=self.output_score_county_state_merged_df,
excel_path=excel_path,
)
logger.info("Writing downloadable csv")
logger.debug("Writing downloadable csv")
# open yaml config
downloadable_csv_config = load_yaml_dict_from_file(
self.CONTENT_CONFIG / "csv.yml", CSVConfig
@@ -516,7 +512,7 @@ class PostScoreETL(ExtractTransformLoad):
)
downloadable_df.to_csv(csv_path, index=False)
logger.info("Creating codebook for download zip")
logger.debug("Creating codebook for download zip")
# consolidate all excel fields from the config yml. The codebook
# code takes in a list of fields, but the excel config file
@@ -562,17 +558,17 @@ class PostScoreETL(ExtractTransformLoad):
codebook_df.to_csv(codebook_path, index=False)
# zip assets
logger.info("Compressing csv files")
logger.debug("Compressing csv files")
files_to_compress = [csv_path, codebook_path, readme_path]
zip_files(csv_zip_path, files_to_compress)
logger.info("Compressing xls files")
logger.debug("Compressing xls files")
files_to_compress = [excel_path, codebook_path, readme_path]
zip_files(xls_zip_path, files_to_compress)
# Per #1557
# zip file that contains the .xls, .csv, .pdf, tech support document, checksum file
logger.info("Compressing data and documentation files")
logger.debug("Compressing data and documentation files")
files_to_compress = [
excel_path,
csv_path,


@@ -47,14 +47,14 @@ def check_score_data_source(
# download from s3 if census_data_source is aws
if score_data_source == "aws":
logger.info("Fetching Score Tile data from AWS S3")
logger.debug("Fetching Score Tile data from AWS S3")
download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
)
else:
# check if score data is found locally
if not os.path.isfile(TILE_SCORE_CSV):
-logger.info(
+logger.warning(
"No local score tiles data found. Please use '-s aws` to fetch from AWS"
)
sys.exit()
@@ -96,7 +96,7 @@ def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
if series.isin(unacceptable_values).any():
series.replace(mapping, regex=False, inplace=True)
-multiplication_factor = 10**number_of_decimals
+multiplication_factor = 10 ** number_of_decimals
# In order to safely cast NaNs
# First coerce series to float type: series.astype(float)
@@ -409,7 +409,7 @@ def compare_to_list_of_expected_state_fips_codes(
f"{sorted(list(actual_state_fips_codes_set - expected_states_set))}\n"
)
else:
-logger.info(
+logger.debug(
"Data matches expected state and territory representation"
f"{dataset_name_phrase}."
)