Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 10:04:18 -08:00)
Consolidate columns.csv data into codebook (#2197)
* Consolidate the ESRI codebook data into the main codebook, and write that with the shapefile instead of the separate ESRI codebook
* Format and fix any issues flagged by the linter
* Remove the final reference to columns.csv
* Add more info to logging for geo-score
parent ee961b3210
commit 699d6b3641

2 changed files with 61 additions and 10 deletions
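The consolidation the diff below implements boils down to joining the shortened ESRI (shapefile) field names into the existing downloadable codebook as a new shapefile_label column, so the separate columns.csv can go away. Here is a minimal, self-contained sketch of that join on toy data; the frames, their values, and the column names score_name and csv_label are illustrative only, not the pipeline's real codebook.

import pandas as pd

# Toy stand-ins for the two codebooks (contents are illustrative only).
original_codebook_df = pd.DataFrame(
    {
        "score_name": ["GEOID10", "Total population"],
        "Description": ["Census tract ID", "Total population"],
        "csv_label": ["GEOID10", "TPF"],
    }
)
esri_codebook_df = pd.DataFrame(
    {
        "shapefile_column": ["GEOID10", "TPF", "SF"],
        "column_name": ["Census tract ID", "Total population", "State/Territory"],
    }
)

# Join the shortened shapefile field names onto the main codebook,
# matching on the description text.
combined = original_codebook_df.merge(
    esri_codebook_df[["shapefile_column", "column_name"]],
    how="outer",
    left_on="Description",
    right_on="column_name",
)

# Rows that exist only in the ESRI codebook come through with a blank
# Description; backfill it from column_name, then drop the join key.
combined["Description"] = combined["Description"].mask(
    combined["Description"].isnull(), combined["column_name"]
)
combined = combined.drop("column_name", axis=1)

# Surface the shapefile field name as a "shapefile_label" column near the front.
combined.insert(2, "shapefile_label", combined.pop("shapefile_column"))
print(combined)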
@@ -268,7 +268,10 @@ def geo_score(data_source: str):
     Returns:
         None
     """
-    log_title("Generate GeoJSON", "Combine Score and GeoJSON")
+    log_title(
+        "Generate GeoJSON",
+        "Combine Score and GeoJSON, Add Shapefile Data to Codebook",
+    )
 
     log_info("Cleaning up geo score folder")
     geo_score_folder_cleanup()

@@ -33,7 +33,6 @@ class GeoScoreETL(ExtractTransformLoad):
 
         self.SCORE_SHP_PATH = self.DATA_PATH / "score" / "shapefile"
         self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
-        self.SCORE_SHP_CODE_CSV = self.SCORE_SHP_PATH / "columns.csv"
 
         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
@@ -300,14 +299,14 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing usa-low")
 
-        def create_esri_codebook(codebook):
+        def create_esri_codebook(codebook) -> pd.DataFrame:
             """temporary: helper to make a codebook for esri shapefile only"""
 
             shapefile_column_field = "shapefile_column"
             internal_column_name_field = "column_name"
             column_description_field = "column_description"
 
-            logger.info("Creating a codebook that uses the csv names")
+            logger.info("Creating an ESRI codebook with shortened column names")
             codebook = (
                 pd.Series(codebook)
                 .reset_index()
@@ -333,17 +332,64 @@ class GeoScoreETL(ExtractTransformLoad):
                 internal_column_name_field
             ].map(column_rename_dict)
 
-            codebook[
+            codebook = codebook[
                 [
                     shapefile_column_field,
                     internal_column_name_field,
                     column_description_field,
                 ]
-            ].to_csv(
-                self.SCORE_SHP_CODE_CSV,
-                index=False,
+            ]
+            logger.info("Completed creating ESRI codebook")
+
+            return codebook
+
+        def combine_esri_codebook_with_original_codebook(esri_codebook_df):
+            """Combines the ESRI codebook generated above with the original codebook generated
+            during score-post. Essentially we want to include the shapefile column name in the
+            original codebook."""
+
+            logger.info("Combining ESRI codebook with original codebook")
+
+            # load up the original codebook
+            original_codebook_df = pd.read_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH,
+                low_memory=False,
+            )
+
+            # if we've already combined these files in the past, go ahead and remove the columns so we can do it again
+            original_codebook_df.drop(
+                "shapefile_label", axis=1, errors="ignore", inplace=True
+            )
+
+            # add the esri (shapefile) columns to the original codebook by joining the two dataframes
+            combined_codebook_df = original_codebook_df.merge(
+                esri_codebook_df[["shapefile_column", "column_name"]],
+                how="outer",
+                left_on="Description",
+                right_on="column_name",
+            )
+
+            # if any descriptions are blank, replace them with the column_name description from the esri codebook
+            combined_codebook_df["Description"].mask(
+                combined_codebook_df["Description"].isnull(),
+                combined_codebook_df["column_name"],
+                inplace=True,
+            )
+            combined_codebook_df = combined_codebook_df.drop(
+                "column_name", axis=1
+            )
+
+            # move some stuff around to make it easier to read the output
+            shapefile_col = combined_codebook_df.pop("shapefile_column")
+            combined_codebook_df.insert(2, "shapefile_label", shapefile_col)
+
+            # save the combined codebook
+            combined_codebook_df.to_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH, index=False
+            )
+            logger.info(
+                "Completed combining ESRI codebook with original codebook"
             )
-            logger.info("Completed writing codebook")
 
         def write_esri_shapefile():
             logger.info("Producing ESRI shapefiles")
@@ -374,7 +420,8 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing shapefile")
 
-            create_esri_codebook(codebook)
+            esri_codebook_df = create_esri_codebook(codebook)
+            combine_esri_codebook_with_original_codebook(esri_codebook_df)
 
             arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip"
             arcgis_files = []
@@ -382,6 +429,7 @@ class GeoScoreETL(ExtractTransformLoad):
                 # don't remove __init__ files as they conserve dir structure
                 if file != "__init__.py":
                     arcgis_files.append(self.SCORE_SHP_PATH / file)
+            arcgis_files.append(constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH)
             zip_files(arcgis_zip_file_path, arcgis_files)
             logger.info("Completed zipping shapefiles")
 
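The last hunk means the downloadable usa.zip now carries the consolidated codebook CSV (constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH) alongside the shapefile components, instead of a separate columns.csv. zip_files is the pipeline's own helper; the sketch below is only a rough, generic equivalent of that packaging step, and the directory layout and zip_files body here are assumptions for illustration, not the project's implementation.

from pathlib import Path
import zipfile

def zip_files(zip_path: Path, files: list[Path]) -> None:
    # Generic stand-in for the pipeline's zip_files helper: write each file
    # into the archive under its bare filename.
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for f in files:
            zf.write(f, arcname=f.name)

# Hypothetical paths, mirroring the diff: every shapefile component plus the
# consolidated codebook go into one downloadable archive.
score_shp_path = Path("data/score/shapefile")
codebook_csv = Path("data/score/downloadable/codebook.csv")

arcgis_files = [p for p in score_shp_path.iterdir() if p.name != "__init__.py"]
arcgis_files.append(codebook_csv)
zip_files(score_shp_path / "usa.zip", arcgis_files)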