From ae725f0a3eaf23c1992f4009ae8eccfcd5e6cc52 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com> Date: Fri, 22 Apr 2022 14:09:12 -0400 Subject: [PATCH] arcgis column name fix (#1581) eliminates duplicate column and ensures all column names are unique. --- .../data_pipeline/etl/score/constants.py | 4 +- .../data_pipeline/etl/score/etl_score_geo.py | 43 ++++++++++++++---- .../tests/snapshots/tile_data_expected.pkl | Bin 3903 -> 3891 bytes 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index e68c4868..e29ba6e1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -246,7 +246,6 @@ TILES_SCORE_COLUMNS = { field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD: "IA_LMI_ET", field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD: "IA_UN_ET", field_names.ISLAND_POVERTY_PCTILE_THRESHOLD: "IA_POV_ET", - field_names.FPL_200_SERIES: "FPL200S", field_names.THRESHOLD_COUNT: "TC", field_names.CATEGORY_COUNT: "CC", field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE", @@ -269,7 +268,6 @@ TILES_SCORE_COLUMNS = { field_names.COLLEGE_NON_ATTENDANCE_FIELD: "NCA", # This is logically equivalent to "non-college greater than 80%" field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD: "CA_LT20", - field_names.LOW_INCOME_THRESHOLD: "FPL200S", # Booleans for the front end about the types of thresholds exceeded field_names.CLIMATE_THRESHOLD_EXCEEDED: "M_CLT_EOMI", field_names.ENERGY_THRESHOLD_EXCEEDED: "M_ENY_EOMI", @@ -280,6 +278,8 @@ TILES_SCORE_COLUMNS = { field_names.HEALTH_THRESHOLD_EXCEEDED: "M_HLTH_EOMI", field_names.WORKFORCE_THRESHOLD_EXCEEDED: "M_WKFC_EOMI", # These are the booleans for socioeconomic indicators + ## this measures low income boolean + field_names.FPL_200_SERIES: "FPL200S", ## Low high school and low higher ed for t&wd field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI", ## FPL 200 and low higher ed for all others diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index 9ea18073..a4d3bef2 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -284,13 +284,21 @@ class GeoScoreETL(ExtractTransformLoad): def create_esri_codebook(codebook): """temporary: helper to make a codebook for esri shapefile only""" + + shapefile_column_field = "shapefile_column" + internal_column_name_field = "column_name" + column_description_field = "column_description" + logger.info("Creating a codebook that uses the csv names") codebook = ( pd.Series(codebook) .reset_index() .rename( # kept as strings because no downstream impacts - columns={0: "column_name", "index": "shapefile_column"} + columns={ + 0: internal_column_name_field, + "index": shapefile_column_field, + } ) ) @@ -304,10 +312,21 @@ class GeoScoreETL(ExtractTransformLoad): object_value="label", ) - codebook["column_description"] = codebook["column_name"].map( - column_rename_dict + codebook[column_description_field] = codebook[ + internal_column_name_field + ].map(column_rename_dict) + + codebook[ + [ + shapefile_column_field, + internal_column_name_field, + column_description_field, + ] + ].to_csv( + self.SCORE_SHP_CODE_CSV, + index=False, ) - codebook.to_csv(self.SCORE_SHP_CODE_CSV, index=False) + logger.info("Completed writing codebook") def write_esri_shapefile(): logger.info("Producing ESRI shapefiles") @@ -321,19 +340,25 @@ class GeoScoreETL(ExtractTransformLoad): short: long for long, short in constants.TILES_SCORE_COLUMNS.items() } - for column in self.geojson_score_usa_high.columns: - # take first 10 characters, max due to ESRI constraints - new_col = column[:10] + + for i, column in enumerate(self.geojson_score_usa_high.columns): + # take first 6 characters and add a number to ensure uniqueness + # this is the max due to esri (index can be 3-digits) + if len(column) > 10: + new_col = column[:6] + f"_{i}" + else: + new_col = column codebook[new_col] = reversed_tiles.get(column, column) if new_col != column: renaming_map[column] = new_col - create_esri_codebook(codebook) - self.geojson_score_usa_high.rename(columns=renaming_map).to_file( self.SCORE_SHP_FILE ) logger.info("Completed writing shapefile") + + create_esri_codebook(codebook) + arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip" arcgis_files = [] for file in os.listdir(self.SCORE_SHP_PATH): diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl index 4798eba69d6bed0e5e783a0ae2e4c4af4256ca5b..c458baf26dfdef6b7b7c36e057c0d6f5fc0bbbf8 100644 GIT binary patch delta 174 zcmdllw^@#*fn}=3MiwC^M%T&WOj?Xxlbx7yWf?$#kr9Z17{~%aFacsr7Gx6xs(uPo zt+82vc{8JAC|ErMLl~3}htdg?@36#Ac4qw|7z`B&fzr8?ZCF(|=d$TDGe%6F%4yG- nGx;uO24mi4H?BualZ|*ACvWA==3sXV@G&wl2;MBmr^pBZh*uy9 delta 206 zcmdliw_lE>fn}=6MiwC^Mz_i0Oj?XRlbx7y%@`TLfRT{_2*D(X0c1j`DJ4^qGGsF3 zy%}1kWGJ>z37Vqe&FJkm`7zJ{gUx)*n;9j8Ato?{LFsTP9WnV9OZ?AcN$Y$42(XK`9^CP0O9Ctv2wV9eia!}W-XgWWB_$H>4Sc(M#{GgHR> O$@6)$H;eGeGXemNcPXF%