mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 14:21:16 -07:00
Creating a data dictionary for the download packet (#1469)
Adding automated codebook creation. Future ticket to refactor.
This commit is contained in:
parent
db6b5de24e
commit
2628afacf9
4 changed files with 330 additions and 4 deletions
|
@ -55,10 +55,44 @@ SCORE_DOWNLOADABLE_CSV_FILE_PATH = (
|
|||
# Excel workbook in the download packet; the file name embeds timestamp_str.
SCORE_DOWNLOADABLE_EXCEL_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / f"communities-{timestamp_str}.xlsx"
)
# Codebook (data dictionary) CSV in the download packet; the file name
# embeds the same timestamp_str as the Excel workbook.
SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / f"codebook-{timestamp_str}.csv"
)
# Zip archive of the download packet; unlike the files above, its name is
# fixed and carries no timestamp.
SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / "Screening_Tool_Data.zip"
)
|
||||
|
||||
# For the codebook

# Name of the column that identifies a score field; the codebook helpers
# use it as the join key across the yaml-derived frames.
CEJST_SCORE_COLUMN_NAME = "score_name"

# Names of the individual codebook columns.
CSV_FORMAT = "csv_format"
CSV_LABEL_FIELD = "csv_label"
EXCEL_LABEL_FIELD = "excel_label"
NOTES_FIELD = "notes"
THRESHOLD_CATEGORY_FIELD = "threshold_category"
CALCULATION_NOTES_FIELD = "calculation_notes"
CSV_FIELD_TYPE_FIELD = "csv_field_type"

# Columns kept in the final codebook, in output order.
CODEBOOK_COLUMNS = [
    CSV_LABEL_FIELD,
    EXCEL_LABEL_FIELD,
    CEJST_SCORE_COLUMN_NAME,
    CSV_FIELD_TYPE_FIELD,
    CALCULATION_NOTES_FIELD,
    THRESHOLD_CATEGORY_FIELD,
    NOTES_FIELD,
]

# Substrings looked for inside a column name (case-sensitive). Note that
# LOW_STRING deliberately keeps its trailing space.
LOSS_RATE_STRING = "loss rate"
LOW_STRING = "Low "
ISLAND_STRING = "island areas"

# Explanatory sentences placed in the calculation-notes column.
PERCENTILE_EXPLANATION = (
    "All percentiles are floored (rounded down to the nearest percentile). "
    "For example, 89.7th percentile is rounded down to 89 for this field."
)
LOW_PERCENTILE_EXPLANATION = (
    "This percentile is reversed, meaning the lowest raw numbers "
    "become the highest percentiles."
)
ISLAND_AREAS_EXPLANATION = (
    "Because not all data is available for the Nation, Puerto Rico, "
    "and the Island Areas, this uses different underlying data for the "
    "island areas."
)

# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import numpy as np
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.score.etl_utils import floor_series
|
||||
from data_pipeline.etl.score.etl_utils import floor_series, create_codebook
|
||||
from data_pipeline.utils import (
|
||||
get_module_logger,
|
||||
zip_files,
|
||||
|
@ -448,7 +448,15 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
def _load_excel_from_df(
|
||||
self, excel_df: pd.DataFrame, excel_path: Path
|
||||
) -> None:
|
||||
) -> dict:
|
||||
"""Creates excel file from score data using configs from yml file and returns
|
||||
contents of the yml file.
|
||||
|
||||
First it reads the yaml dictionary from the excel.yml config and adjusts the
|
||||
format of the excel file.
|
||||
|
||||
Then it produces the excel file from the score data.
|
||||
"""
|
||||
|
||||
# open excel yaml config
|
||||
excel_csv_config = load_yaml_dict_from_file(
|
||||
|
@ -498,6 +506,7 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
worksheet.set_column(0, num_cols - 1, num_excel_cols_width)
|
||||
|
||||
writer.save()
|
||||
return excel_csv_config
|
||||
|
||||
def _load_tile_csv(
|
||||
self, score_tiles_df: pd.DataFrame, tile_score_path: Path
|
||||
|
@ -512,12 +521,13 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
downloadable_info_path.mkdir(parents=True, exist_ok=True)
|
||||
csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
|
||||
excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
|
||||
codebook_path = constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH
|
||||
zip_path = constants.SCORE_DOWNLOADABLE_ZIP_FILE_PATH
|
||||
# TODO: reinstate when PDF is added back
|
||||
# pdf_path = constants.SCORE_DOWNLOADABLE_PDF_FILE_PATH
|
||||
|
||||
logger.info("Writing downloadable excel")
|
||||
self._load_excel_from_df(
|
||||
excel_config = self._load_excel_from_df(
|
||||
excel_df=self.output_score_county_state_merged_df,
|
||||
excel_path=excel_path,
|
||||
)
|
||||
|
@ -534,10 +544,39 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
)
|
||||
downloadable_df.to_csv(csv_path, index=False)
|
||||
|
||||
logger.info("Creating codebook for download zip")
|
||||
|
||||
# consolidate all excel fields from the config yml. The codebook
|
||||
# code takes in a list of fields, but the excel config file
|
||||
# has a slightly different format to allow for sheets within the
|
||||
# workbook. This pulls all fields from all potential sheets into one
|
||||
# list of dictionaries that specify information on each field.
|
||||
excel_fields = []
|
||||
for sheet in excel_config["sheets"]:
|
||||
excel_fields.extend(sheet["fields"])
|
||||
|
||||
# load supplemental codebook yml
|
||||
field_descriptions_for_codebook_config = load_yaml_dict_from_file(
|
||||
self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml"
|
||||
)
|
||||
|
||||
# create codebook
|
||||
codebook_df = create_codebook(
|
||||
downloadable_csv_config=downloadable_csv_config["fields"],
|
||||
excel_config=excel_fields,
|
||||
field_descriptions_for_codebook=field_descriptions_for_codebook_config[
|
||||
"fields"
|
||||
],
|
||||
)
|
||||
|
||||
# load codebook to disk
|
||||
codebook_df.to_csv(codebook_path, index=False)
|
||||
|
||||
logger.info("Compressing files")
|
||||
files_to_compress = [
|
||||
csv_path,
|
||||
excel_path,
|
||||
codebook_path,
|
||||
] # add pdf_path here to include PDF
|
||||
zip_files(zip_path, files_to_compress)
|
||||
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.utils import (
|
||||
download_file_from_url,
|
||||
get_module_logger,
|
||||
)
|
||||
from data_pipeline.score import field_names
|
||||
from . import constants
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
@ -108,3 +110,198 @@ def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
|
|||
)
|
||||
|
||||
return floored_series
|
||||
|
||||
|
||||
def _create_df_from_yaml_contents(
    fields_list_from_yaml: list,
    fields_to_store_in_codebook: list,
) -> pd.DataFrame:
    """Build a codebook dataframe from the field entries of one yaml config.

    The same helper is used for all three configs: csv, excel, and the
    supplemental codebook information yaml.

    The yaml content arrives as a list of dictionaries (one per field) and is
    reshaped into a dictionary of lists, {column_name: [value_1, value_2, ...]},
    which pandas converts directly into a dataframe.

    Column names are dictated by ``fields_to_store_in_codebook``. For example,
    both the csv and excel configs use the yaml key "label", but in the
    codebook one of them should be "csv_label" and the other "excel_label".

    Args:
        fields_list_from_yaml: list of dictionaries, one per field. Every
            entry must contain the key ``constants.CEJST_SCORE_COLUMN_NAME``.
        fields_to_store_in_codebook: list of named tuples exposing
            ``existing_yaml_label`` (the key to read from each yaml entry)
            and ``new_label_in_codebook`` (the resulting column name).

    Returns:
        A dataframe indexed by ``constants.CEJST_SCORE_COLUMN_NAME`` with one
        column per requested codebook label. Keys that a yaml entry does not
        specify are filled with ``np.nan``, which is what allows optional
        fields in the yaml files.

    Raises:
        ValueError: if a requested codebook label collides with the score
            name column, or if a yaml entry lacks the score name key.
    """
    score_column = constants.CEJST_SCORE_COLUMN_NAME

    # Validate the requested labels before touching any data. A label equal
    # to the score column would share (and corrupt) the score-name list, so
    # it is rejected explicitly rather than with an ``assert``, which is
    # stripped when Python runs with -O.
    for field_information in fields_to_store_in_codebook:
        if field_information.new_label_in_codebook == score_column:
            raise ValueError(
                "Error: a codebook label may not reuse the score name "
                f"column, called {score_column}"
            )

    # This becomes the codebook frame for this yaml source: keys become
    # column names and the lists store their values. The score column is
    # always present because it is the same across every component codebook.
    codebook_dictionary = {
        field_information.new_label_in_codebook: []
        for field_information in fields_to_store_in_codebook
    }
    codebook_dictionary[score_column] = []

    # Reshape from a list of dictionaries to a dictionary of lists.
    for single_field_details in fields_list_from_yaml:
        if score_column not in single_field_details:
            raise ValueError(
                "Error: the yaml codebook should crosswalk to the native column "
                f"from the CEJST pipeline, called {score_column}"
            )
        # Every yaml entry carries the same score column name key, so its
        # value is appended unconditionally.
        codebook_dictionary[score_column].append(
            single_field_details[score_column]
        )
        for field_information in fields_to_store_in_codebook:
            # Optional yaml keys: when a key is not specified for this
            # field, append a null so every list keeps the same length.
            codebook_dictionary[
                field_information.new_label_in_codebook
            ].append(
                single_field_details.get(
                    field_information.existing_yaml_label, np.nan
                )
            )

    return pd.DataFrame(codebook_dictionary).set_index(score_column)
|
||||
|
||||
|
||||
def _get_datatype(
    input_column_name: str,
    input_column_type: str,
    percentile_string: str = field_names.PERCENTILE_FIELD_SUFFIX,
    loss_rate_string: str = constants.LOSS_RATE_STRING,
) -> str:
    """Pick the codebook datatype for a column.

    The column name takes precedence over the configured type: a name
    containing ``percentile_string`` is a "percentile"; otherwise a name
    containing ``loss_rate_string`` is a "rate"; otherwise the configured
    ``input_column_type`` is returned unchanged.

    Note: eventually, this will either be programmatically set, or will be
    included in the yaml, depending on the refactor that we do.
    """
    # The percentile check comes first, so a name matching both substrings
    # is reported as a percentile.
    if percentile_string in input_column_name:
        return "percentile"
    if loss_rate_string in input_column_name:
        return "rate"
    return input_column_type
|
||||
|
||||
|
||||
def _get_calculation_notes(column_name: str) -> str:
    """Assemble the calculation notes that apply to a column.

    Each marker substring found in ``column_name`` contributes its
    explanation; the explanations are joined with single spaces in a fixed
    order (percentile, low, island areas). A name matching no marker yields
    an empty string.

    Note: eventually, this will either be programmatically set, or will be
    included in the yaml, depending on the refactor that we do.
    """
    # (substring to look for in the column name, explanation to add)
    marker_explanations = (
        (field_names.PERCENTILE_FIELD_SUFFIX, constants.PERCENTILE_EXPLANATION),
        (constants.LOW_STRING, constants.LOW_PERCENTILE_EXPLANATION),
        (constants.ISLAND_STRING, constants.ISLAND_AREAS_EXPLANATION),
    )
    return " ".join(
        explanation
        for marker, explanation in marker_explanations
        if marker in column_name
    )
|
||||
|
||||
|
||||
def create_codebook(
    downloadable_csv_config: list,
    excel_config: list,
    field_descriptions_for_codebook: list,
) -> pd.DataFrame:
    """Run through all the logic of creating the codebook.

    First it converts the field list of each component yaml into a dataframe.
    Then it merges all of them on the score column name.
    Finally, it applies any transforms to the columns (like getting the
    datatype or adding calculation notes).

    Args:
        downloadable_csv_config: list of field dictionaries from the
            downloadable csv config; "label" and "format" are read from each.
        excel_config: list of field dictionaries from the excel config
            (fields from all sheets); "label" is read from each.
        field_descriptions_for_codebook: list of field dictionaries from the
            supplemental codebook yaml; "notes" and "category" are read.

    Returns:
        A dataframe holding ``constants.CODEBOOK_COLUMNS``, with the score
        name column renamed to "Description".
    """
    CodebookLabelFields = namedtuple(
        "CodebookLabelFields",
        ["new_label_in_codebook", "existing_yaml_label"],
    )
    score_column = constants.CEJST_SCORE_COLUMN_NAME

    # parse data from component yamls
    csv_codes_df = _create_df_from_yaml_contents(
        fields_list_from_yaml=downloadable_csv_config,
        fields_to_store_in_codebook=[
            CodebookLabelFields(
                new_label_in_codebook=constants.CSV_LABEL_FIELD,
                existing_yaml_label="label",
            ),
            CodebookLabelFields(
                new_label_in_codebook=constants.CSV_FORMAT,
                existing_yaml_label="format",
            ),
        ],
    )

    excel_codes_df = _create_df_from_yaml_contents(
        fields_list_from_yaml=excel_config,
        fields_to_store_in_codebook=[
            CodebookLabelFields(
                new_label_in_codebook=constants.EXCEL_LABEL_FIELD,
                existing_yaml_label="label",
            )
        ],
    )

    field_descriptions_for_codebook_df = _create_df_from_yaml_contents(
        fields_list_from_yaml=field_descriptions_for_codebook,
        fields_to_store_in_codebook=[
            CodebookLabelFields(
                new_label_in_codebook=constants.NOTES_FIELD,
                existing_yaml_label="notes",
            ),
            CodebookLabelFields(
                new_label_in_codebook=constants.THRESHOLD_CATEGORY_FIELD,
                existing_yaml_label="category",
            ),
        ],
    )

    # join all sources on the column name; the outer join keeps fields that
    # appear in only some of the configs
    merged_codebook_df = pd.concat(
        [csv_codes_df, excel_codes_df, field_descriptions_for_codebook_df],
        join="outer",
        axis=1,
    ).reset_index()

    # add field type column. A plain comprehension is used instead of a
    # row-wise DataFrame.apply so the result is always one value per row,
    # including when the merged frame has no rows.
    merged_codebook_df[constants.CSV_FIELD_TYPE_FIELD] = [
        _get_datatype(
            input_column_name=column_name,
            input_column_type=column_type,
        )
        for column_name, column_type in zip(
            merged_codebook_df[score_column],
            merged_codebook_df[constants.CSV_FORMAT],
        )
    ]

    # get calculation notes column
    merged_codebook_df[constants.CALCULATION_NOTES_FIELD] = [
        _get_calculation_notes(column_name)
        for column_name in merged_codebook_df[score_column]
    ]

    # This is temporary. Right now, our variable names are all
    # plain English. After the refactor, we will have new names
    # that are programmatic, and the CEJST_SCORE_COLUMN will
    # be dropped in favor of the explanation.
    return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
        columns={score_column: "Description"}
    )
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue