ArcGIS column name fix (#1581)

Eliminates the duplicate "FPL200S" column mapping and ensures all shapefile column names are unique.
This commit is contained in:
Emma Nechamkin 2022-04-22 14:09:12 -04:00 committed by GitHub
parent fbd56e3bd5
commit ae725f0a3e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 11 deletions

View file

@ -246,7 +246,6 @@ TILES_SCORE_COLUMNS = {
field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD: "IA_LMI_ET", field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD: "IA_LMI_ET",
field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD: "IA_UN_ET", field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD: "IA_UN_ET",
field_names.ISLAND_POVERTY_PCTILE_THRESHOLD: "IA_POV_ET", field_names.ISLAND_POVERTY_PCTILE_THRESHOLD: "IA_POV_ET",
field_names.FPL_200_SERIES: "FPL200S",
field_names.THRESHOLD_COUNT: "TC", field_names.THRESHOLD_COUNT: "TC",
field_names.CATEGORY_COUNT: "CC", field_names.CATEGORY_COUNT: "CC",
field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE", field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD: "IAULHSE",
@ -269,7 +268,6 @@ TILES_SCORE_COLUMNS = {
field_names.COLLEGE_NON_ATTENDANCE_FIELD: "NCA", field_names.COLLEGE_NON_ATTENDANCE_FIELD: "NCA",
# This is logically equivalent to "non-college greater than 80%" # This is logically equivalent to "non-college greater than 80%"
field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD: "CA_LT20", field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD: "CA_LT20",
field_names.LOW_INCOME_THRESHOLD: "FPL200S",
# Booleans for the front end about the types of thresholds exceeded # Booleans for the front end about the types of thresholds exceeded
field_names.CLIMATE_THRESHOLD_EXCEEDED: "M_CLT_EOMI", field_names.CLIMATE_THRESHOLD_EXCEEDED: "M_CLT_EOMI",
field_names.ENERGY_THRESHOLD_EXCEEDED: "M_ENY_EOMI", field_names.ENERGY_THRESHOLD_EXCEEDED: "M_ENY_EOMI",
@ -280,6 +278,8 @@ TILES_SCORE_COLUMNS = {
field_names.HEALTH_THRESHOLD_EXCEEDED: "M_HLTH_EOMI", field_names.HEALTH_THRESHOLD_EXCEEDED: "M_HLTH_EOMI",
field_names.WORKFORCE_THRESHOLD_EXCEEDED: "M_WKFC_EOMI", field_names.WORKFORCE_THRESHOLD_EXCEEDED: "M_WKFC_EOMI",
# These are the booleans for socioeconomic indicators # These are the booleans for socioeconomic indicators
## this measures low income boolean
field_names.FPL_200_SERIES: "FPL200S",
## Low high school and low higher ed for t&wd ## Low high school and low higher ed for t&wd
field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI", field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
## FPL 200 and low higher ed for all others ## FPL 200 and low higher ed for all others

View file

@ -284,13 +284,21 @@ class GeoScoreETL(ExtractTransformLoad):
def create_esri_codebook(codebook): def create_esri_codebook(codebook):
"""temporary: helper to make a codebook for esri shapefile only""" """temporary: helper to make a codebook for esri shapefile only"""
shapefile_column_field = "shapefile_column"
internal_column_name_field = "column_name"
column_description_field = "column_description"
logger.info("Creating a codebook that uses the csv names") logger.info("Creating a codebook that uses the csv names")
codebook = ( codebook = (
pd.Series(codebook) pd.Series(codebook)
.reset_index() .reset_index()
.rename( .rename(
# kept as strings because no downstream impacts # kept as strings because no downstream impacts
columns={0: "column_name", "index": "shapefile_column"} columns={
0: internal_column_name_field,
"index": shapefile_column_field,
}
) )
) )
@ -304,10 +312,21 @@ class GeoScoreETL(ExtractTransformLoad):
object_value="label", object_value="label",
) )
codebook["column_description"] = codebook["column_name"].map( codebook[column_description_field] = codebook[
column_rename_dict internal_column_name_field
].map(column_rename_dict)
codebook[
[
shapefile_column_field,
internal_column_name_field,
column_description_field,
]
].to_csv(
self.SCORE_SHP_CODE_CSV,
index=False,
) )
codebook.to_csv(self.SCORE_SHP_CODE_CSV, index=False) logger.info("Completed writing codebook")
def write_esri_shapefile(): def write_esri_shapefile():
logger.info("Producing ESRI shapefiles") logger.info("Producing ESRI shapefiles")
@ -321,19 +340,25 @@ class GeoScoreETL(ExtractTransformLoad):
short: long short: long
for long, short in constants.TILES_SCORE_COLUMNS.items() for long, short in constants.TILES_SCORE_COLUMNS.items()
} }
for column in self.geojson_score_usa_high.columns:
# take first 10 characters, max due to ESRI constraints for i, column in enumerate(self.geojson_score_usa_high.columns):
new_col = column[:10] # take first 6 characters and add a number to ensure uniqueness
# this is the max due to esri (index can be 3-digits)
if len(column) > 10:
new_col = column[:6] + f"_{i}"
else:
new_col = column
codebook[new_col] = reversed_tiles.get(column, column) codebook[new_col] = reversed_tiles.get(column, column)
if new_col != column: if new_col != column:
renaming_map[column] = new_col renaming_map[column] = new_col
create_esri_codebook(codebook)
self.geojson_score_usa_high.rename(columns=renaming_map).to_file( self.geojson_score_usa_high.rename(columns=renaming_map).to_file(
self.SCORE_SHP_FILE self.SCORE_SHP_FILE
) )
logger.info("Completed writing shapefile") logger.info("Completed writing shapefile")
create_esri_codebook(codebook)
arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip" arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip"
arcgis_files = [] arcgis_files = []
for file in os.listdir(self.SCORE_SHP_PATH): for file in os.listdir(self.SCORE_SHP_PATH):