Creating shapefiles for ArcGIS users (#1275)

Added an ESRI shapefile to the files generated when the pipeline is run. The pipeline now produces both the shapefile and a codebook (CSV key) that explains its shortened column names.
2025-02-23 01:54:18 -08:00 · 2022-02-24 10:32:49 -05:00 · 2022-02-24 10:32:49 -05:00 · f0a4e40a79
commit f0a4e40a79
parent 521c61dff3
2 changed files with 40 additions and 5 deletions
--- a/data/data-pipeline/data_pipeline/data/score/shapefile/init.py
+++ b/data/data-pipeline/data_pipeline/data/score/shapefile/init.py
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@ -27,6 +27,10 @@ class GeoScoreETL(ExtractTransformLoad):
        self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
        self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"

+        self.SCORE_SHP_PATH = self.DATA_PATH / "score" / "shapefile"
+        self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
+        self.SCORE_SHP_CODE_CSV = self.SCORE_SHP_PATH / "columns.csv"
+
        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
        self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"

@ -94,6 +98,7 @@ class GeoScoreETL(ExtractTransformLoad):
        fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
        self.geojson_usa_df = self.geojson_usa_df[fields]

+        # TODO update this join
        logger.info("Merging and compressing score CSV with USA GeoJSON")
        self.geojson_score_usa_high = self.score_usa_df.merge(
            self.geojson_usa_df, on=self.GEOID_FIELD_NAME, how="left"
@ -103,8 +108,6 @@ class GeoScoreETL(ExtractTransformLoad):
            self.geojson_score_usa_high, crs="EPSG:4326"
        )

-        logger.info(f"Columns: {self.geojson_score_usa_high.columns}")
-
        usa_simplified = self.geojson_score_usa_high[
            [
                self.GEOID_FIELD_NAME,
@ -148,8 +151,9 @@ class GeoScoreETL(ExtractTransformLoad):
        )

        # round to 2 decimals
-        decimals = pd.Series([2], index=[self.TARGET_SCORE_RENAME_TO])
-        self.geojson_score_usa_low = self.geojson_score_usa_low.round(decimals)
+        self.geojson_score_usa_low = self.geojson_score_usa_low.round(
+            {self.TARGET_SCORE_RENAME_TO: 2}
+        )

    def _aggregate_to_tracts(
        self, block_group_df: gpd.GeoDataFrame
@ -221,10 +225,41 @@ class GeoScoreETL(ExtractTransformLoad):
            )
            logger.info("Completed writing usa-low")

+        def write_esri_shapefile():
+            """Write the high-resolution score frame as an ESRI shapefile.
+
+            ESRI shapefile field names cannot exceed 10 characters, so every
+            column name is shortened to fit. A CSV codebook is written to
+            ``self.SCORE_SHP_CODE_CSV`` with one row per shapefile column:
+            ``column`` is the name used in the shapefile and ``meaning`` is
+            the description looked up from ``constants.TILES_SCORE_COLUMNS``
+            (or the original column name when no description is found).
+            """
+            logger.info("Producing ESRI shapefiles")
+            # Maximum field-name length allowed by the shapefile format.
+            max_len = 10
+            codebook = {}
+            renaming_map = {}
+
+            # Inverted tile map: lets us look up a description from the name
+            # used in the frame (presumably long name -> short tile name in
+            # the original mapping; verify against constants).
+            reversed_tiles = {
+                short: long
+                for long, short in constants.TILES_SCORE_COLUMNS.items()
+            }
+            for column in self.geojson_score_usa_high.columns:
+                new_col = column[:max_len]
+                # Two columns may share their first 10 characters; add a
+                # numeric suffix (still within the limit) so neither the
+                # codebook entry nor the shapefile field is overwritten.
+                suffix = 1
+                while new_col in codebook:
+                    tail = str(suffix)
+                    new_col = column[: max_len - len(tail)] + tail
+                    suffix += 1
+                codebook[new_col] = reversed_tiles.get(column, column)
+                if new_col != column:
+                    renaming_map[column] = new_col
+            # After reset_index(), "index" holds the dict keys (shapefile
+            # column names) and 0 holds the dict values (their descriptions).
+            pd.Series(codebook).reset_index().rename(
+                # kept as strings because no downstream impacts
+                columns={"index": "column", 0: "meaning"}
+            ).to_csv(self.SCORE_SHP_CODE_CSV, index=False)
+            self.geojson_score_usa_high.rename(columns=renaming_map).to_file(
+                self.SCORE_SHP_FILE
+            )
+            logger.info("Completed writing shapefile")
+
+
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(task)
-                for task in [write_high_to_file, write_low_to_file]
+                for task in [
+                    write_high_to_file,
+                    write_low_to_file,
+                    write_esri_shapefile,
+                ]
            }

            for fut in concurrent.futures.as_completed(futures):