Consolidate columns.csv data into codebook (#2197)

* Consolidate esri codebook data into main codebook, and write that instead of the esri codebook with the shapefile * Format and fix any issues from linter * Remove final reference to columns.csv * Add more info to logging for geo-score
2025-07-31 20:21:17 -07:00 · 2023-03-13 08:33:30 -05:00 · 2023-03-13 08:33:30 -05:00 · 699d6b3641
commit 699d6b3641
parent ee961b3210
2 changed files with 61 additions and 10 deletions
--- a/data/data-pipeline/data_pipeline/application.py
+++ b/data/data-pipeline/data_pipeline/application.py
@ -268,7 +268,10 @@ def geo_score(data_source: str):
    Returns:
        None
    """
-    log_title("Generate GeoJSON", "Combine Score and GeoJSON")
+    log_title(
+        "Generate GeoJSON",
+        "Combine Score and GeoJSON, Add Shapefile Data to Codebook",
+    )

    log_info("Cleaning up geo score folder")
    geo_score_folder_cleanup()
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@ -33,7 +33,6 @@ class GeoScoreETL(ExtractTransformLoad):

        self.SCORE_SHP_PATH = self.DATA_PATH / "score" / "shapefile"
        self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
-        self.SCORE_SHP_CODE_CSV = self.SCORE_SHP_PATH / "columns.csv"

        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
        self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
@ -300,14 +299,14 @@ class GeoScoreETL(ExtractTransformLoad):
            )
            logger.info("Completed writing usa-low")

-        def create_esri_codebook(codebook):
+        def create_esri_codebook(codebook) -> pd.DataFrame:
            """temporary: helper to make a codebook for esri shapefile only"""

            shapefile_column_field = "shapefile_column"
            internal_column_name_field = "column_name"
            column_description_field = "column_description"

-            logger.info("Creating a codebook that uses the csv names")
+            logger.info("Creating an ESRI codebook with shortened column names")
            codebook = (
                pd.Series(codebook)
                .reset_index()
@ -333,17 +332,64 @@ class GeoScoreETL(ExtractTransformLoad):
                internal_column_name_field
            ].map(column_rename_dict)

-            codebook[
+            codebook = codebook[
                [
                    shapefile_column_field,
                    internal_column_name_field,
                    column_description_field,
                ]
-            ].to_csv(
-                self.SCORE_SHP_CODE_CSV,
-                index=False,
+            ]
+            logger.info("Completed creating ESRI codebook")
+
+            return codebook
+
+        def combine_esri_codebook_with_original_codebook(esri_codebook_df):
+            """Combines the ESRI codebook generated above with the original codebook generated
+            during score-post. Essentially we want to include the shapefile column name in the
+            original codebook."""
+
+            logger.info("Combining ESRI codebook with original codebook")
+
+            # load up the original codebook
+            original_codebook_df = pd.read_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH,
+                low_memory=False,
+            )
+
+            # if we've already combined these files in the past, go ahead and remove the columns so we can do it again
+            original_codebook_df.drop(
+                "shapefile_label", axis=1, errors="ignore", inplace=True
+            )
+
+            # add the esri (shapefile) columns to the original codebook by joining the two dataframes
+            combined_codebook_df = original_codebook_df.merge(
+                esri_codebook_df[["shapefile_column", "column_name"]],
+                how="outer",
+                left_on="Description",
+                right_on="column_name",
+            )
+
+            # if any descriptions are blank, replace them with the column_name description from the esri codebook
+            combined_codebook_df["Description"].mask(
+                combined_codebook_df["Description"].isnull(),
+                combined_codebook_df["column_name"],
+                inplace=True,
+            )
+            combined_codebook_df = combined_codebook_df.drop(
+                "column_name", axis=1
+            )
+
+            # move some stuff around to make it easier to read the output
+            shapefile_col = combined_codebook_df.pop("shapefile_column")
+            combined_codebook_df.insert(2, "shapefile_label", shapefile_col)
+
+            # save the combined codebook
+            combined_codebook_df.to_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH, index=False
+            )
+            logger.info(
+                "Completed combining ESRI codebook with original codebook"
            )
-            logger.info("Completed writing codebook")

        def write_esri_shapefile():
            logger.info("Producing ESRI shapefiles")
@ -374,7 +420,8 @@ class GeoScoreETL(ExtractTransformLoad):
            )
            logger.info("Completed writing shapefile")

-            create_esri_codebook(codebook)
+            esri_codebook_df = create_esri_codebook(codebook)
+            combine_esri_codebook_with_original_codebook(esri_codebook_df)

            arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip"
            arcgis_files = []
@ -382,6 +429,7 @@ class GeoScoreETL(ExtractTransformLoad):
                # don't remove __init__ files as they conserve dir structure
                if file != "__init__.py":
                    arcgis_files.append(self.SCORE_SHP_PATH / file)
+            arcgis_files.append(constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH)
            zip_files(arcgis_zip_file_path, arcgis_files)
            logger.info("Completed zipping shapefiles")