Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 10:04:18 -08:00)
Consolidate columns.csv data into codebook (#2197)
* Consolidate the ESRI codebook data into the main codebook, and write that with the shapefile instead of the separate ESRI codebook
* Format and fix any issues flagged by the linter
* Remove the final reference to columns.csv
* Add more info to logging for geo-score
parent ee961b3210
commit 699d6b3641

2 changed files with 61 additions and 10 deletions
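The consolidation the diff below implements boils down to joining the shortened ESRI (shapefile) field names into the existing downloadable codebook as a new shapefile_label column, so the separate columns.csv can go away. Here is a minimal, self-contained sketch of that join on toy data; the frames, their values, and the column names score_name and csv_label are illustrative only, not the pipeline's real codebook.

import pandas as pd

# Toy stand-ins for the two codebooks (contents are illustrative only).
original_codebook_df = pd.DataFrame(
    {
        "score_name": ["GEOID10", "Total population"],
        "Description": ["Census tract ID", "Total population"],
        "csv_label": ["GEOID10", "TPF"],
    }
)
esri_codebook_df = pd.DataFrame(
    {
        "shapefile_column": ["GEOID10", "TPF", "SF"],
        "column_name": ["Census tract ID", "Total population", "State/Territory"],
    }
)

# Join the shortened shapefile field names onto the main codebook,
# matching on the description text.
combined = original_codebook_df.merge(
    esri_codebook_df[["shapefile_column", "column_name"]],
    how="outer",
    left_on="Description",
    right_on="column_name",
)

# Rows that exist only in the ESRI codebook come through with a blank
# Description; backfill it from column_name, then drop the join key.
combined["Description"] = combined["Description"].mask(
    combined["Description"].isnull(), combined["column_name"]
)
combined = combined.drop("column_name", axis=1)

# Surface the shapefile field name as a "shapefile_label" column near the front.
combined.insert(2, "shapefile_label", combined.pop("shapefile_column"))
print(combined)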
@@ -268,7 +268,10 @@ def geo_score(data_source: str):
     Returns:
         None
     """
-    log_title("Generate GeoJSON", "Combine Score and GeoJSON")
+    log_title(
+        "Generate GeoJSON",
+        "Combine Score and GeoJSON, Add Shapefile Data to Codebook",
+    )
 
     log_info("Cleaning up geo score folder")
     geo_score_folder_cleanup()

@@ -33,7 +33,6 @@ class GeoScoreETL(ExtractTransformLoad):
 
         self.SCORE_SHP_PATH = self.DATA_PATH / "score" / "shapefile"
         self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
-        self.SCORE_SHP_CODE_CSV = self.SCORE_SHP_PATH / "columns.csv"
 
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
@@ -300,14 +299,14 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing usa-low")
 
-        def create_esri_codebook(codebook):
+        def create_esri_codebook(codebook) -> pd.DataFrame:
             """temporary: helper to make a codebook for esri shapefile only"""
 
             shapefile_column_field = "shapefile_column"
             internal_column_name_field = "column_name"
             column_description_field = "column_description"
 
-            logger.info("Creating a codebook that uses the csv names")
+            logger.info("Creating an ESRI codebook with shortened column names")
             codebook = (
                 pd.Series(codebook)
                 .reset_index()
@@ -333,17 +332,64 @@ class GeoScoreETL(ExtractTransformLoad):
                 internal_column_name_field
             ].map(column_rename_dict)
 
-            codebook[
+            codebook = codebook[
                 [
                     shapefile_column_field,
                     internal_column_name_field,
                     column_description_field,
                 ]
-            ].to_csv(
-                self.SCORE_SHP_CODE_CSV,
-                index=False,
+            ]
+            logger.info("Completed creating ESRI codebook")
+
+            return codebook
+
+        def combine_esri_codebook_with_original_codebook(esri_codebook_df):
+            """Combines the ESRI codebook generated above with the original codebook generated
+            during score-post. Essentially we want to include the shapefile column name in the
+            original codebook."""
+
+            logger.info("Combining ESRI codebook with original codebook")
+
+            # load up the original codebook
+            original_codebook_df = pd.read_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH,
+                low_memory=False,
+            )
+
+            # if we've already combined these files in the past, go ahead and remove the columns so we can do it again
+            original_codebook_df.drop(
+                "shapefile_label", axis=1, errors="ignore", inplace=True
+            )
+
+            # add the esri (shapefile) columns to the original codebook by joining the two dataframes
+            combined_codebook_df = original_codebook_df.merge(
+                esri_codebook_df[["shapefile_column", "column_name"]],
+                how="outer",
+                left_on="Description",
+                right_on="column_name",
+            )
+
+            # if any descriptions are blank, replace them with the column_name description from the esri codebook
+            combined_codebook_df["Description"].mask(
+                combined_codebook_df["Description"].isnull(),
+                combined_codebook_df["column_name"],
+                inplace=True,
+            )
+            combined_codebook_df = combined_codebook_df.drop(
+                "column_name", axis=1
+            )
+
+            # move some stuff around to make it easier to read the output
+            shapefile_col = combined_codebook_df.pop("shapefile_column")
+            combined_codebook_df.insert(2, "shapefile_label", shapefile_col)
+
+            # save the combined codebook
+            combined_codebook_df.to_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH, index=False
+            )
+            logger.info(
+                "Completed combining ESRI codebook with original codebook"
             )
-            logger.info("Completed writing codebook")
 
         def write_esri_shapefile():
             logger.info("Producing ESRI shapefiles")
@@ -374,7 +420,8 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing shapefile")
 
-            create_esri_codebook(codebook)
+            esri_codebook_df = create_esri_codebook(codebook)
+            combine_esri_codebook_with_original_codebook(esri_codebook_df)
 
             arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip"
             arcgis_files = []
@@ -382,6 +429,7 @@ class GeoScoreETL(ExtractTransformLoad):
                 # don't remove __init__ files as they conserve dir structure
                 if file != "__init__.py":
                     arcgis_files.append(self.SCORE_SHP_PATH / file)
+            arcgis_files.append(constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH)
             zip_files(arcgis_zip_file_path, arcgis_files)
             logger.info("Completed zipping shapefiles")
 
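The last hunk means the downloadable usa.zip now carries the consolidated codebook CSV (constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH) alongside the shapefile components, instead of a separate columns.csv. zip_files is the pipeline's own helper; the sketch below is only a rough, generic equivalent of that packaging step, and the directory layout and zip_files body here are assumptions for illustration, not the project's implementation.

from pathlib import Path
import zipfile

def zip_files(zip_path: Path, files: list[Path]) -> None:
    # Generic stand-in for the pipeline's zip_files helper: write each file
    # into the archive under its bare filename.
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for f in files:
            zf.write(f, arcname=f.name)

# Hypothetical paths, mirroring the diff: every shapefile component plus the
# consolidated codebook go into one downloadable archive.
score_shp_path = Path("data/score/shapefile")
codebook_csv = Path("data/score/downloadable/codebook.csv")

arcgis_files = [p for p in score_shp_path.iterdir() if p.name != "__init__.py"]
arcgis_files.append(codebook_csv)
zip_files(score_shp_path / "usa.zip", arcgis_files)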