diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index e68c4868..e29ba6e1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -246,7 +246,6 @@ TILES_SCORE_COLUMNS = { field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD: "IA_LMI_ET", field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD: "IA_UN_ET", field_names.ISLAND_POVERTY_PCTILE_THRESHOLD: "IA_POV_ET", - field_names.FPL_200_SERIES: "FPL200S", field_names.THRESHOLD_COUNT: "TC", field_names.CATEGORY_COUNT: "CC", field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE", @@ -269,7 +268,6 @@ TILES_SCORE_COLUMNS = { field_names.COLLEGE_NON_ATTENDANCE_FIELD: "NCA", # This is logically equivalent to "non-college greater than 80%" field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD: "CA_LT20", - field_names.LOW_INCOME_THRESHOLD: "FPL200S", # Booleans for the front end about the types of thresholds exceeded field_names.CLIMATE_THRESHOLD_EXCEEDED: "M_CLT_EOMI", field_names.ENERGY_THRESHOLD_EXCEEDED: "M_ENY_EOMI", @@ -280,6 +278,8 @@ TILES_SCORE_COLUMNS = { field_names.HEALTH_THRESHOLD_EXCEEDED: "M_HLTH_EOMI", field_names.WORKFORCE_THRESHOLD_EXCEEDED: "M_WKFC_EOMI", # These are the booleans for socioeconomic indicators + ## this measures low income boolean + field_names.FPL_200_SERIES: "FPL200S", ## Low high school and low higher ed for t&wd field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI", ## FPL 200 and low higher ed for all others diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index 9ea18073..a4d3bef2 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -284,13 +284,21 @@ class GeoScoreETL(ExtractTransformLoad): def create_esri_codebook(codebook): """temporary: helper to make a codebook for esri shapefile only""" + + shapefile_column_field = "shapefile_column" + internal_column_name_field = "column_name" + column_description_field = "column_description" + logger.info("Creating a codebook that uses the csv names") codebook = ( pd.Series(codebook) .reset_index() .rename( # kept as strings because no downstream impacts - columns={0: "column_name", "index": "shapefile_column"} + columns={ + 0: internal_column_name_field, + "index": shapefile_column_field, + } ) ) @@ -304,10 +312,21 @@ class GeoScoreETL(ExtractTransformLoad): object_value="label", ) - codebook["column_description"] = codebook["column_name"].map( - column_rename_dict + codebook[column_description_field] = codebook[ + internal_column_name_field + ].map(column_rename_dict) + + codebook[ + [ + shapefile_column_field, + internal_column_name_field, + column_description_field, + ] + ].to_csv( + self.SCORE_SHP_CODE_CSV, + index=False, ) - codebook.to_csv(self.SCORE_SHP_CODE_CSV, index=False) + logger.info("Completed writing codebook") def write_esri_shapefile(): logger.info("Producing ESRI shapefiles") @@ -321,19 +340,25 @@ class GeoScoreETL(ExtractTransformLoad): short: long for long, short in constants.TILES_SCORE_COLUMNS.items() } - for column in self.geojson_score_usa_high.columns: - # take first 10 characters, max due to ESRI constraints - new_col = column[:10] + + for i, column in enumerate(self.geojson_score_usa_high.columns): + # take first 6 characters and add a number to ensure uniqueness + # this is the max due to esri (index can be 3-digits) + if len(column) > 10: + new_col = column[:6] + f"_{i}" + else: + new_col = column codebook[new_col] = reversed_tiles.get(column, column) if new_col != column: renaming_map[column] = new_col - create_esri_codebook(codebook) - self.geojson_score_usa_high.rename(columns=renaming_map).to_file( self.SCORE_SHP_FILE ) logger.info("Completed writing shapefile") + + create_esri_codebook(codebook) + arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip" arcgis_files = [] for file in os.listdir(self.SCORE_SHP_PATH): diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl index 4798eba6..c458baf2 100644 Binary files a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl and b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl differ