User Story 2152 – Clean up logging (#2155)

Update logging messages and improve message consistency

This update changes the level of many log messages. Rather than logging everything at the info level, it differentiates between debug, info, warning, and error messages. It also changes the default log level to info, which removes much of the noise previously in the logs.

It also removes many extraneous log messages and adds additional decorators at the beginning of each pipeline run.
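For illustration, here is a minimal sketch of the level scheme this describes. The names configure_logging and run_etl, the banner format, and the example messages are assumptions for this sketch, not the repository's actual helpers; the point is only that with info as the default level, debug detail stays out of normal pipeline output while warnings remain visible.

# Minimal sketch (illustrative only; configure_logging and run_etl are
# hypothetical names, not functions from this repository).
import logging

logger = logging.getLogger(__name__)

def configure_logging(level: int = logging.INFO) -> None:
    # Info is the default level, so logger.debug(...) calls stay quiet
    # unless verbose output is explicitly requested.
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
    )

def run_etl() -> None:
    # Decorative banner marking the start of a pipeline run.
    logger.info("=" * 64)
    logger.info("Starting pipeline run")
    logger.info("=" * 64)
    logger.debug("Joining Census Tract dataframes")   # internal detail
    logger.warning("No local score tiles data found")  # actionable problem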
Travis Newby 2023-02-08 13:08:55 -06:00 committed by GitHub
commit 03a6d3c660
63 changed files with 307 additions and 339 deletions


@@ -56,8 +56,6 @@ class ScoreETL(ExtractTransformLoad):
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
def extract(self) -> None:
logger.info("Loading data sets from disk.")
# EJSCreen csv Load
ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
self.ejscreen_df = pd.read_csv(
@@ -200,7 +198,7 @@ class ScoreETL(ExtractTransformLoad):
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Tract dataframes")
logger.debug("Joining Census Tract dataframes")
def merge_function(
left: pd.DataFrame, right: pd.DataFrame
@@ -317,7 +315,7 @@ class ScoreETL(ExtractTransformLoad):
~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
np.nan,
)
-logger.info(
+logger.debug(
f"Creating special case column for percentiles from {input_column_name}"
)
df[
@@ -335,7 +333,7 @@ class ScoreETL(ExtractTransformLoad):
# TODO Move a lot of this to the ETL part of the pipeline
def _prepare_initial_df(self) -> pd.DataFrame:
logger.info("Preparing initial dataframe")
logger.debug("Preparing initial dataframe")
# Join all the data sources that use census tracts
census_tract_dfs = [
@@ -377,7 +375,7 @@ class ScoreETL(ExtractTransformLoad):
assert (
census_tract_df.shape[0] <= pre_join_len
), "Join against national tract list ADDED rows"
-logger.info(
+logger.debug(
"Dropped %s tracts not in the 2010 tract data",
pre_join_len
- census_tract_df[field_names.GEOID_TRACT_FIELD].nunique(),
@@ -560,7 +558,7 @@ class ScoreETL(ExtractTransformLoad):
for col in boolean_columns:
tmp = df_copy[col].copy()
df_copy[col] = np.where(tmp.notna(), tmp.astype(bool), None)
-logger.info(f"{col} contains {df_copy[col].isna().sum()} nulls.")
+logger.debug(f"{col} contains {df_copy[col].isna().sum()} nulls.")
# Convert all columns to numeric and do math
# Note that we have a few special conditions here and we handle them explicitly.
@@ -591,7 +589,7 @@ class ScoreETL(ExtractTransformLoad):
.astype(bool)
.fillna(False)
][field_names.GEOID_TRACT_FIELD].to_list()
-logger.info(
+logger.debug(
f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
)
elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
@@ -599,7 +597,7 @@ class ScoreETL(ExtractTransformLoad):
# 72 is the FIPS code for Puerto Rico
df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
][field_names.GEOID_TRACT_FIELD].to_list()
-logger.info(
+logger.debug(
f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
)
@@ -615,7 +613,7 @@ class ScoreETL(ExtractTransformLoad):
df_copy[field_names.TOTAL_POP_FIELD].fillna(0)
<= low_population
][field_names.GEOID_TRACT_FIELD].to_list()
-logger.info(
+logger.debug(
f"Dropping {len(drop_tracts)} tracts from DOT traffic burden"
)
@@ -666,7 +664,7 @@ class ScoreETL(ExtractTransformLoad):
)
def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Backfilling island demographic data")
logger.debug("Backfilling island demographic data")
island_index = self._get_island_areas(df)
for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS:
actual_field_name = backfill_field_name.replace(
@@ -684,8 +682,6 @@ class ScoreETL(ExtractTransformLoad):
return df
def transform(self) -> None:
logger.info("Transforming Score Data")
# prepare the df with the right CBG/tract IDs, column names/types, and percentiles
self.df = self._prepare_initial_df()
@@ -696,9 +692,6 @@ class ScoreETL(ExtractTransformLoad):
self.df = self._backfill_island_demographics(self.df)
def load(self) -> None:
-logger.info(
-f"Saving Score CSV to {constants.DATA_SCORE_CSV_FULL_FILE_PATH}."
-)
constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
self.df.to_csv(constants.DATA_SCORE_CSV_FULL_FILE_PATH, index=False)


@@ -118,7 +118,7 @@ class GeoScoreETL(ExtractTransformLoad):
fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
# TODO update this join
logger.info("Merging and compressing score CSV with USA GeoJSON")
logger.info("Merging and compressing score csv with USA GeoJSON")
self.geojson_score_usa_high = self.score_usa_df.set_index(
self.GEOID_FIELD_NAME
).merge(
@@ -143,7 +143,7 @@ class GeoScoreETL(ExtractTransformLoad):
columns={self.TARGET_SCORE_SHORT_FIELD: self.TARGET_SCORE_RENAME_TO}
)
logger.info("Converting geojson into geodf with tracts")
logger.info("Converting GeoJSON into GeoDataFrame with tracts")
usa_tracts = gpd.GeoDataFrame(
usa_tracts,
columns=[
@@ -154,15 +154,15 @@ class GeoScoreETL(ExtractTransformLoad):
crs="EPSG:4326",
)
logger.info("Creating buckets from tracts")
logger.debug("Creating buckets from tracts")
usa_bucketed, keep_high_zoom_df = self._create_buckets_from_tracts(
usa_tracts, self.NUMBER_OF_BUCKETS
)
logger.info("Aggregating buckets")
logger.debug("Aggregating buckets")
usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean")
logger.info("Breaking up polygons")
logger.debug("Breaking up polygons")
compressed = self._breakup_multipolygons(
usa_aggregated, self.NUMBER_OF_BUCKETS
)
@@ -220,7 +220,7 @@ class GeoScoreETL(ExtractTransformLoad):
len(state_tracts.index) / self.NUMBER_OF_BUCKETS
)
-logger.info(
+logger.debug(
f"The number of buckets has increased to {self.NUMBER_OF_BUCKETS}"
)
for i in range(len(state_tracts.index)):


@@ -62,7 +62,7 @@ class PostScoreETL(ExtractTransformLoad):
# End YAML definition constants
def _extract_counties(self, county_path: Path) -> pd.DataFrame:
logger.info("Reading Counties CSV")
logger.debug("Reading Counties CSV")
return pd.read_csv(
county_path,
sep="\t",
@@ -75,7 +75,7 @@ class PostScoreETL(ExtractTransformLoad):
)
def _extract_states(self, state_path: Path) -> pd.DataFrame:
logger.info("Reading States CSV")
logger.debug("Reading States CSV")
return pd.read_csv(
state_path,
dtype={"fips": "string", "state_abbreviation": "string"},
@@ -83,7 +83,7 @@ class PostScoreETL(ExtractTransformLoad):
)
def _extract_score(self, score_path: Path) -> pd.DataFrame:
logger.info("Reading Score CSV")
logger.debug("Reading Score CSV")
df = pd.read_csv(
score_path,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
@@ -98,8 +98,6 @@ class PostScoreETL(ExtractTransformLoad):
return df
def extract(self) -> None:
logger.info("Starting Extraction")
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
@@ -170,7 +168,7 @@ class PostScoreETL(ExtractTransformLoad):
score_df: pd.DataFrame,
) -> pd.DataFrame:
logger.info("Merging county info with score info")
logger.debug("Merging county info with score info")
score_county_merged = score_df.merge(
# We drop state abbreviation so we don't get it twice
counties_df[["GEOID", "County Name"]],
@@ -178,7 +176,7 @@ class PostScoreETL(ExtractTransformLoad):
how="left",
)
logger.info("Merging state info with county-score info")
logger.debug("Merging state info with county-score info")
# Here, we need to join on a separate key, since there's no
# entry for the island areas in the counties df (there are no
# counties!) Thus, unless we join state separately from county,
@@ -207,7 +205,7 @@ class PostScoreETL(ExtractTransformLoad):
score_county_state_merged_df: pd.DataFrame,
) -> pd.DataFrame:
logger.info("Rounding Decimals")
logger.debug("Rounding Decimals")
# grab all the keys from tiles score columns
tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys())
@@ -218,7 +216,7 @@ class PostScoreETL(ExtractTransformLoad):
# We may not want some states/territories on the map, so this will drop all
# rows with those FIPS codes (first two digits of the census tract)
-logger.info(
+logger.debug(
f"Dropping specified FIPS codes from tile data: {constants.DROP_FIPS_CODES}"
)
tracts_to_drop = []
@@ -236,12 +234,12 @@ class PostScoreETL(ExtractTransformLoad):
for col, col_dtype in score_tiles.dtypes.items()
if col_dtype == np.dtype("float64")
]
-scale_factor = 10**constants.TILES_ROUND_NUM_DECIMALS
+scale_factor = 10 ** constants.TILES_ROUND_NUM_DECIMALS
score_tiles[float_cols] = (
score_tiles[float_cols] * scale_factor
).apply(np.floor) / scale_factor
logger.info("Adding fields for island areas and Puerto Rico")
logger.debug("Adding fields for island areas and Puerto Rico")
# The below operation constructs variables for the front end.
# Since the Island Areas, Puerto Rico, and the nation all have a different
# set of available data, each has its own user experience.
@@ -381,8 +379,6 @@ class PostScoreETL(ExtractTransformLoad):
return final_df
def transform(self) -> None:
logger.info("Transforming data sources for Score + County CSVs")
transformed_counties = self._transform_counties(self.input_counties_df)
transformed_states = self._transform_states(self.input_states_df)
transformed_score = self._transform_score(self.input_score_df)
@@ -403,7 +399,7 @@ class PostScoreETL(ExtractTransformLoad):
def _load_score_csv_full(
self, score_county_state_merged: pd.DataFrame, score_csv_path: Path
) -> None:
logger.info("Saving Full Score CSV with County Information")
logger.debug("Saving Full Score CSV with County Information")
score_csv_path.parent.mkdir(parents=True, exist_ok=True)
score_county_state_merged.to_csv(
score_csv_path,
@@ -476,7 +472,7 @@ class PostScoreETL(ExtractTransformLoad):
def _load_tile_csv(
self, score_tiles_df: pd.DataFrame, tile_score_path: Path
) -> None:
logger.info("Saving Tile Score CSV")
logger.debug("Saving Tile Score CSV")
tile_score_path.parent.mkdir(parents=True, exist_ok=True)
score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")
@@ -498,13 +494,13 @@ class PostScoreETL(ExtractTransformLoad):
constants.SCORE_VERSIONING_DATA_DOCUMENTATION_ZIP_FILE_PATH
)
logger.info("Writing downloadable excel")
logger.debug("Writing downloadable excel")
excel_config = self._load_excel_from_df(
excel_df=self.output_score_county_state_merged_df,
excel_path=excel_path,
)
logger.info("Writing downloadable csv")
logger.debug("Writing downloadable csv")
# open yaml config
downloadable_csv_config = load_yaml_dict_from_file(
self.CONTENT_CONFIG / "csv.yml", CSVConfig
@@ -516,7 +512,7 @@ class PostScoreETL(ExtractTransformLoad):
)
downloadable_df.to_csv(csv_path, index=False)
logger.info("Creating codebook for download zip")
logger.debug("Creating codebook for download zip")
# consolidate all excel fields from the config yml. The codebook
# code takes in a list of fields, but the excel config file
@@ -562,17 +558,17 @@ class PostScoreETL(ExtractTransformLoad):
codebook_df.to_csv(codebook_path, index=False)
# zip assets
logger.info("Compressing csv files")
logger.debug("Compressing csv files")
files_to_compress = [csv_path, codebook_path, readme_path]
zip_files(csv_zip_path, files_to_compress)
logger.info("Compressing xls files")
logger.debug("Compressing xls files")
files_to_compress = [excel_path, codebook_path, readme_path]
zip_files(xls_zip_path, files_to_compress)
# Per #1557
# zip file that contains the .xls, .csv, .pdf, tech support document, checksum file
logger.info("Compressing data and documentation files")
logger.debug("Compressing data and documentation files")
files_to_compress = [
excel_path,
csv_path,


@@ -47,14 +47,14 @@ def check_score_data_source(
# download from s3 if census_data_source is aws
if score_data_source == "aws":
logger.info("Fetching Score Tile data from AWS S3")
logger.debug("Fetching Score Tile data from AWS S3")
download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
)
else:
# check if score data is found locally
if not os.path.isfile(TILE_SCORE_CSV):
-logger.info(
+logger.warning(
"No local score tiles data found. Please use '-s aws` to fetch from AWS"
)
sys.exit()
@@ -96,7 +96,7 @@ def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
if series.isin(unacceptable_values).any():
series.replace(mapping, regex=False, inplace=True)
-multiplication_factor = 10**number_of_decimals
+multiplication_factor = 10 ** number_of_decimals
# In order to safely cast NaNs
# First coerce series to float type: series.astype(float)
@@ -409,7 +409,7 @@ def compare_to_list_of_expected_state_fips_codes(
f"{sorted(list(actual_state_fips_codes_set - expected_states_set))}\n"
)
else:
-logger.info(
+logger.debug(
"Data matches expected state and territory representation"
f"{dataset_name_phrase}."
)