Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-22 01:31:25 -08:00)
Consolidate columns.csv data into codebook (#2197)
* Consolidate ESRI codebook data into the main codebook, and write that with the shapefile instead of the separate ESRI codebook
* Format and fix any issues from the linter
* Remove the final reference to columns.csv
* Add more info to logging for geo-score
This commit is contained in:
parent ee961b3210
commit 699d6b3641

2 changed files with 61 additions and 10 deletions
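
The practical effect of this consolidation (a sketch, not part of the commit): instead of a separate columns.csv shipped next to the shapefile, the downloadable codebook itself carries a shapefile_label column that maps each truncated ESRI field name to a readable description. A consumer of the ArcGIS zip could then rename shapefile fields back to readable names roughly as below; the codebook file name inside the zip and the header names "shapefile_label" / "Description" are assumptions taken from the diff that follows.

import pandas as pd

# Hypothetical consumer-side usage; file name and header names are assumptions,
# not guarantees of this commit.
codebook = pd.read_csv("codebook.csv")  # the codebook extracted from the ArcGIS usa.zip
rename_map = (
    codebook.dropna(subset=["shapefile_label"])
    .set_index("shapefile_label")["Description"]
    .to_dict()
)
# e.g. gdf = geopandas.read_file("usa.shp").rename(columns=rename_map)
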
@@ -268,7 +268,10 @@ def geo_score(data_source: str):
     Returns:
         None
     """
-    log_title("Generate GeoJSON", "Combine Score and GeoJSON")
+    log_title(
+        "Generate GeoJSON",
+        "Combine Score and GeoJSON, Add Shapefile Data to Codebook",
+    )

     log_info("Cleaning up geo score folder")
     geo_score_folder_cleanup()
@@ -33,7 +33,6 @@ class GeoScoreETL(ExtractTransformLoad):

         self.SCORE_SHP_PATH = self.DATA_PATH / "score" / "shapefile"
         self.SCORE_SHP_FILE = self.SCORE_SHP_PATH / "usa.shp"
-        self.SCORE_SHP_CODE_CSV = self.SCORE_SHP_PATH / "columns.csv"

         self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
         self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
@@ -300,14 +299,14 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing usa-low")

-        def create_esri_codebook(codebook):
+        def create_esri_codebook(codebook) -> pd.DataFrame:
             """temporary: helper to make a codebook for esri shapefile only"""

             shapefile_column_field = "shapefile_column"
             internal_column_name_field = "column_name"
             column_description_field = "column_description"

-            logger.info("Creating a codebook that uses the csv names")
+            logger.info("Creating an ESRI codebook with shortened column names")
             codebook = (
                 pd.Series(codebook)
                 .reset_index()
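
The hunk above builds the ESRI codebook by turning the codebook argument into a DataFrame via pd.Series(...).reset_index() and mapping internal column names to shortened shapefile field names. A toy sketch of that pattern follows; the exact shape of the real codebook mapping, the intermediate column renames, and column_rename_dict are assumptions made only for illustration.

import pandas as pd

# Illustrative stand-ins for the ETL's codebook mapping and column_rename_dict;
# the real values come from score-post and the shapefile writer.
codebook_dict = {
    "total_population": "Total population for the census tract",
    "poverty_rate": "Share of individuals below the poverty line",
}
column_rename_dict = {"total_population": "POP_TOT", "poverty_rate": "POV_RT"}

codebook = (
    pd.Series(codebook_dict)
    .reset_index()
    .rename(columns={"index": "column_name", 0: "column_description"})
)
# Map internal CSV names to their truncated shapefile field names.
codebook["shapefile_column"] = codebook["column_name"].map(column_rename_dict)
print(codebook[["shapefile_column", "column_name", "column_description"]])
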
@@ -333,17 +332,64 @@ class GeoScoreETL(ExtractTransformLoad):
                 internal_column_name_field
             ].map(column_rename_dict)

-            codebook[
+            codebook = codebook[
                 [
                     shapefile_column_field,
                     internal_column_name_field,
                     column_description_field,
                 ]
-            ].to_csv(
-                self.SCORE_SHP_CODE_CSV,
-                index=False,
+            ]
+            logger.info("Completed creating ESRI codebook")
+
+            return codebook
+
+        def combine_esri_codebook_with_original_codebook(esri_codebook_df):
+            """Combines the ESRI codebook generated above with the original codebook generated
+            during score-post. Essentially we want to include the shapefile column name in the
+            original codebook."""
+
+            logger.info("Combining ESRI codebook with original codebook")
+
+            # load up the original codebook
+            original_codebook_df = pd.read_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH,
+                low_memory=False,
+            )
+
+            # if we've already combined these files in the past, go ahead and remove the columns so we can do it again
+            original_codebook_df.drop(
+                "shapefile_label", axis=1, errors="ignore", inplace=True
+            )
+
+            # add the esri (shapefile) columns to the original codebook by joining the two dataframes
+            combined_codebook_df = original_codebook_df.merge(
+                esri_codebook_df[["shapefile_column", "column_name"]],
+                how="outer",
+                left_on="Description",
+                right_on="column_name",
+            )
+
+            # if any descriptions are blank, replace them with the column_name description from the esri codebook
+            combined_codebook_df["Description"].mask(
+                combined_codebook_df["Description"].isnull(),
+                combined_codebook_df["column_name"],
+                inplace=True,
+            )
+            combined_codebook_df = combined_codebook_df.drop(
+                "column_name", axis=1
+            )
+
+            # move some stuff around to make it easier to read the output
+            shapefile_col = combined_codebook_df.pop("shapefile_column")
+            combined_codebook_df.insert(2, "shapefile_label", shapefile_col)
+
+            # save the combined codebook
+            combined_codebook_df.to_csv(
+                constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH, index=False
+            )
+            logger.info(
+                "Completed combining ESRI codebook with original codebook"
             )
-            logger.info("Completed writing codebook")

         def write_esri_shapefile():
             logger.info("Producing ESRI shapefiles")
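
The core of combine_esri_codebook_with_original_codebook is an outer merge on Description/column_name, a mask-style backfill for rows that only exist in the ESRI codebook, and a pop/insert to position the new shapefile_label column. A self-contained sketch of that same pattern on toy frames (row values and the "Notes" column are illustrative, not real codebook content):

import pandas as pd

# Toy stand-ins for the score-post codebook and the ESRI codebook.
original = pd.DataFrame(
    {"Description": ["Total population", "Poverty rate"], "Notes": ["", ""]}
)
esri = pd.DataFrame(
    {
        "shapefile_column": ["POP_TOT", "POV_RT", "AREA_SQMI"],
        "column_name": ["Total population", "Poverty rate", "Tract area in square miles"],
    }
)

combined = original.merge(
    esri[["shapefile_column", "column_name"]],
    how="outer",
    left_on="Description",
    right_on="column_name",
)
# Backfill descriptions for ESRI-only rows (assignment form of the mask() call
# used in the diff), then drop the helper column.
combined["Description"] = combined["Description"].mask(
    combined["Description"].isnull(), combined["column_name"]
)
combined = combined.drop("column_name", axis=1)
# Move the shapefile label next to the description (the real code inserts at
# position 2; this toy frame has fewer columns).
shapefile_col = combined.pop("shapefile_column")
combined.insert(1, "shapefile_label", shapefile_col)
print(combined)
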
@@ -374,7 +420,8 @@ class GeoScoreETL(ExtractTransformLoad):
             )
             logger.info("Completed writing shapefile")

-            create_esri_codebook(codebook)
+            esri_codebook_df = create_esri_codebook(codebook)
+            combine_esri_codebook_with_original_codebook(esri_codebook_df)

             arcgis_zip_file_path = self.SCORE_SHP_PATH / "usa.zip"
             arcgis_files = []
@@ -382,6 +429,7 @@ class GeoScoreETL(ExtractTransformLoad):
                 # don't remove __init__ files as they conserve dir structure
                 if file != "__init__.py":
                     arcgis_files.append(self.SCORE_SHP_PATH / file)
+            arcgis_files.append(constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH)
             zip_files(arcgis_zip_file_path, arcgis_files)
             logger.info("Completed zipping shapefiles")

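
zip_files is an existing pipeline helper whose implementation is not part of this diff; only its call above, zip_files(arcgis_zip_file_path, arcgis_files), is shown. A minimal stand-in with the same (zip path, file list) calling convention, assuming it simply writes each file into the archive under its basename, might look like:

import zipfile
from pathlib import Path
from typing import List


def zip_files(zip_file_path: Path, files_to_compress: List[Path]) -> None:
    """Minimal stand-in for the pipeline's zip_files helper (assumed behavior)."""
    with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for file_path in files_to_compress:
            archive.write(file_path, arcname=file_path.name)
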