Excel formatting and tract id ordering (#1172)

* excel formatting and tract id ordering * lint * lint try $2 * lint 3 * addressed comments * typo
2025-02-22 09:41:26 -08:00 · 2022-02-04 18:35:45 -05:00 · 2022-02-04 18:35:45 -05:00 · f5fe8d90e2
commit f5fe8d90e2
parent 6a00b29f5d
1 changed files with 40 additions and 2 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@ -271,7 +271,10 @@ class PostScoreETL(ExtractTransformLoad):
                    number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
                )

-        return df
+        # sort by tract id
+        df_sorted = df.sort_values(self.GEOID_TRACT_FIELD_NAME)
+
+        return df_sorted

    def transform(self) -> None:
        logger.info("Transforming data sources for Score + County CSVs")
@ -307,6 +310,41 @@ class PostScoreETL(ExtractTransformLoad):
            encoding="utf-8-sig",  # windows compat https://stackoverflow.com/a/43684587
        )

+    def _load_excel_from_df(
+        self, excel_df: pd.DataFrame, excel_path: Path
+    ) -> None:
+        # Define Excel Columns Column Width
+        num_excel_cols_width = 30
+
+        # Create a Pandas Excel writer using XlsxWriter as the engine.
+        with pd.ExcelWriter(  # pylint: disable=abstract-class-instantiated
+            # (https://github.com/PyCQA/pylint/issues/3060)
+            excel_path,
+            engine="xlsxwriter",
+        ) as writer:
+
+            # Convert the dataframe to an XlsxWriter Excel object. We also turn off the
+            # index column at the left of the output dataframe.
+            excel_df.to_excel(writer, sheet_name="Data", index=False)
+
+            # Get the xlsxwriter workbook and worksheet objects.
+            workbook = writer.book
+            worksheet = writer.sheets["Data"]
+
+            # set header format
+            header_format = workbook.add_format(
+                {"bold": True, "text_wrap": True, "valign": "bottom"}
+            )
+
+            # write headers
+            for col_num, value in enumerate(excel_df.columns.array):
+                worksheet.write(0, col_num, value, header_format)
+
+            num_cols = len(excel_df.columns)
+            worksheet.set_column(0, num_cols - 1, num_excel_cols_width)
+
+            writer.save()
+
    def _load_tile_csv(
        self, score_tiles_df: pd.DataFrame, tile_score_path: Path
    ) -> None:
@ -334,7 +372,7 @@ class PostScoreETL(ExtractTransformLoad):
        )

        logger.info("Writing downloadable excel")
-        downloadable_df_copy.to_excel(excel_path, index=False)
+        self._load_excel_from_df(downloadable_df_copy, excel_path)

        logger.info("Writing downloadable csv")
        downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] = (