Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-23 01:54:18 -08:00
Creating shapefiles for ArcGIS users (#1275)
Added shapefiles to the files generated when the pipeline is run. Produces both a shapefile and a key (codebook) for column names.
commit f0a4e40a79
parent 521c61dff3
2 changed files with 40 additions and 5 deletions
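For ArcGIS or geopandas users of the new output, the sketch below shows one way to load the generated shapefile and restore the long column names from the columns.csv codebook. It is a minimal example and not part of the commit: the on-disk locations are assumptions based on the paths defined in the diff (SCORE_SHP_FILE and SCORE_SHP_CODE_CSV), and per the rename call in the diff the "meaning" CSV column holds the truncated shapefile names while "column" holds the long names.

import geopandas as gpd
import pandas as pd

# Assumed output locations, mirroring SCORE_SHP_FILE and SCORE_SHP_CODE_CSV in the diff.
SHAPEFILE = "data/score/shapefile/usa.shp"
CODEBOOK_CSV = "data/score/shapefile/columns.csv"

# Read the codebook and build a short -> long renaming map.
codebook = pd.read_csv(CODEBOOK_CSV)
short_to_long = dict(zip(codebook["meaning"], codebook["column"]))

# Load the shapefile and rename the truncated ESRI column names back to their long names.
gdf = gpd.read_file(SHAPEFILE).rename(columns=short_to_long)
print(gdf.columns)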
@@ -27,6 +27,10 @@ class GeoScoreETL(ExtractTransformLoad):
         self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
         self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"

+        self.SCORE_SHP_PATH = self.DATA_PATH / "score" / "shapefile"
+        self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
+        self.SCORE_SHP_CODE_CSV = self.SCORE_SHP_PATH / "columns.csv"
+
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"

@@ -94,6 +98,7 @@ class GeoScoreETL(ExtractTransformLoad):
         fields = [self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME]
         self.geojson_usa_df = self.geojson_usa_df[fields]

+        # TODO update this join
         logger.info("Merging and compressing score CSV with USA GeoJSON")
         self.geojson_score_usa_high = self.score_usa_df.merge(
             self.geojson_usa_df, on=self.GEOID_FIELD_NAME, how="left"
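The merge above is a left join keyed on the GEOID field, so every row of the score CSV is kept and any GEOID with no matching geometry ends up with a null geometry; that appears to be what the new TODO comment is flagging. A toy sketch of the behavior, using made-up data and hypothetical column names:

import pandas as pd

# Hypothetical miniature versions of the two frames being merged.
score_df = pd.DataFrame({"GEOID": ["A", "B"], "Score": [0.1, 0.9]})
geo_df = pd.DataFrame({"GEOID": ["A"], "geometry": ["POINT (0 0)"]})

# how="left" keeps both score rows; "B" gets NaN geometry because it has no match.
merged = score_df.merge(geo_df, on="GEOID", how="left")
print(merged)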
@@ -103,8 +108,6 @@ class GeoScoreETL(ExtractTransformLoad):
             self.geojson_score_usa_high, crs="EPSG:4326"
         )

-        logger.info(f"Columns: {self.geojson_score_usa_high.columns}")
-
         usa_simplified = self.geojson_score_usa_high[
             [
                 self.GEOID_FIELD_NAME,
@@ -148,8 +151,9 @@ class GeoScoreETL(ExtractTransformLoad):
         )

         # round to 2 decimals
-        decimals = pd.Series([2], index=[self.TARGET_SCORE_RENAME_TO])
-        self.geojson_score_usa_low = self.geojson_score_usa_low.round(decimals)
+        self.geojson_score_usa_low = self.geojson_score_usa_low.round(
+            {self.TARGET_SCORE_RENAME_TO: 2}
+        )

     def _aggregate_to_tracts(
         self, block_group_df: gpd.GeoDataFrame
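DataFrame.round accepts either a Series or a dict mapping column names to decimal places, so the old and new forms round the same single column; the rewrite simply drops the intermediate pd.Series. A small illustration with a hypothetical column name, not taken from the pipeline:

import pandas as pd

df = pd.DataFrame({"SCORE": [0.12345, 0.6789], "OTHER": [1.23456, 7.89012]})

# Both calls round only the "SCORE" column to 2 decimals and leave "OTHER" untouched.
rounded_via_series = df.round(pd.Series([2], index=["SCORE"]))
rounded_via_dict = df.round({"SCORE": 2})
assert rounded_via_series.equals(rounded_via_dict)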
@@ -221,10 +225,41 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing usa-low")

+        def write_esri_shapefile():
+            logger.info("Producing ESRI shapefiles")
+            # Note that esri shapefiles can't have long column names, so we borrow from the
+            # shorten some tile names (renaming map) and print out a codebook for the user
+            codebook = {}
+            renaming_map = {}
+
+            # allows us to quickly rename / describe columns
+            reversed_tiles = {
+                short: long
+                for long, short in constants.TILES_SCORE_COLUMNS.items()
+            }
+            for column in self.geojson_score_usa_high.columns:
+                # take first 10 characters, max due to ESRI constraints
+                new_col = column[:10]
+                codebook[new_col] = reversed_tiles.get(column, column)
+                if new_col != column:
+                    renaming_map[column] = new_col
+            pd.Series(codebook).reset_index().rename(
+                # kept as strings because no downstream impacts
+                columns={0: "column", "index": "meaning"}
+            ).to_csv(self.SCORE_SHP_CODE_CSV, index=False)
+            self.geojson_score_usa_high.rename(columns=renaming_map).to_file(
+                self.SCORE_SHP_FILE
+            )
+            logger.info("Completed writing shapefile")
+
         with concurrent.futures.ThreadPoolExecutor() as executor:
             futures = {
                 executor.submit(task)
-                for task in [write_high_to_file, write_low_to_file]
+                for task in [
+                    write_high_to_file,
+                    write_low_to_file,
+                    write_esri_shapefile,
+                ]
             }

             for fut in concurrent.futures.as_completed(futures):
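One property of the new write_esri_shapefile step worth noting: truncating every column to its first 10 characters (the ESRI shapefile limit) can collide if two source columns share a 10-character prefix, in which case both map to the same shapefile name and the codebook keeps only the last one. A standalone sketch, not part of the commit, that checks a list of candidate column names for such collisions:

from collections import Counter

def truncation_collisions(columns, width=10):
    """Return truncated names that more than one source column maps to."""
    counts = Counter(col[:width] for col in columns)
    return {short: n for short, n in counts.items() if n > 1}

# Hypothetical column names; the second and third collide after truncation.
example_columns = ["GEOID10", "Total population estimate", "Total population margin"]
print(truncation_collisions(example_columns))  # {'Total popu': 2}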