Creating a data dictionary for the download packet (#1469)

Adding automated codebook creation. Future ticket to refactor.
2025-07-28 14:21:16 -07:00 · 2022-03-30 11:01:43 -04:00 · 2022-03-30 11:01:43 -04:00 · 2628afacf9
commit 2628afacf9
parent db6b5de24e
4 changed files with 330 additions and 4 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -55,10 +55,44 @@ SCORE_DOWNLOADABLE_CSV_FILE_PATH = (
 SCORE_DOWNLOADABLE_EXCEL_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / f"communities-{timestamp_str}.xlsx"
 )
+# Timestamped CSV holding the codebook (data dictionary); the post-score ETL
+# writes it here and adds it to the downloadable zip.
+SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH = (
+    SCORE_DOWNLOADABLE_DIR / f"codebook-{timestamp_str}.csv"
+)
 SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
    SCORE_DOWNLOADABLE_DIR / "Screening_Tool_Data.zip"
 )

+# --- Codebook (data dictionary) settings ---
+
+# Key shared by every yaml source; the component frames are joined on it.
+CEJST_SCORE_COLUMN_NAME = "score_name"
+
+# Intermediate column holding the csv config's "format" value. It feeds the
+# field-type column and is not itself listed in CODEBOOK_COLUMNS.
+CSV_FORMAT = "csv_format"
+
+# Column names used in the codebook.
+CSV_LABEL_FIELD = "csv_label"
+EXCEL_LABEL_FIELD = "excel_label"
+NOTES_FIELD = "notes"
+THRESHOLD_CATEGORY_FIELD = "threshold_category"
+CALCULATION_NOTES_FIELD = "calculation_notes"
+CSV_FIELD_TYPE_FIELD = "csv_field_type"
+
+# Columns kept in the codebook output, in output order.
+CODEBOOK_COLUMNS = [
+    CSV_LABEL_FIELD,
+    EXCEL_LABEL_FIELD,
+    CEJST_SCORE_COLUMN_NAME,
+    CSV_FIELD_TYPE_FIELD,
+    CALCULATION_NOTES_FIELD,
+    THRESHOLD_CATEGORY_FIELD,
+    NOTES_FIELD,
+]
+
+# Substrings looked for in a score column name when deriving its field type
+# and calculation notes. The trailing space in LOW_STRING is intentional.
+LOSS_RATE_STRING = "loss rate"
+LOW_STRING = "Low "
+ISLAND_STRING = "island areas"
+
+# Explanatory sentences placed in the calculation notes column.
+PERCENTILE_EXPLANATION = (
+    "All percentiles are floored (rounded down to the nearest percentile). "
+    "For example, 89.7th percentile is rounded down to 89 for this field."
+)
+LOW_PERCENTILE_EXPLANATION = (
+    "This percentile is reversed, meaning the lowest raw numbers "
+    "become the highest percentiles."
+)
+ISLAND_AREAS_EXPLANATION = (
+    "Because not all data is available for the Nation, Puerto Rico, "
+    "and the Island Areas, this uses different underlying data "
+    "for the island areas."
+)
+
 # Column subsets
 CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]

--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.score.etl_utils import floor_series
+from data_pipeline.etl.score.etl_utils import floor_series, create_codebook
 from data_pipeline.utils import (
    get_module_logger,
    zip_files,
@ -448,7 +448,15 @@ class PostScoreETL(ExtractTransformLoad):

    def _load_excel_from_df(
        self, excel_df: pd.DataFrame, excel_path: Path
-    ) -> None:
+    ) -> dict:
+        """Creates excel file from score data using configs from yml file and returns
+        contents of the yml file.
+
+        First it reads the yaml dictionary from the excel.yml config and adjusts the
+        format of the excel file.
+
+        Then it produces the excel file from the score data.
+        """

        # open excel yaml config
        excel_csv_config = load_yaml_dict_from_file(
@ -498,6 +506,7 @@ class PostScoreETL(ExtractTransformLoad):
                worksheet.set_column(0, num_cols - 1, num_excel_cols_width)

            writer.save()
+        return excel_csv_config

    def _load_tile_csv(
        self, score_tiles_df: pd.DataFrame, tile_score_path: Path
@ -512,12 +521,13 @@ class PostScoreETL(ExtractTransformLoad):
        downloadable_info_path.mkdir(parents=True, exist_ok=True)
        csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
        excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
+        codebook_path = constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH
        zip_path = constants.SCORE_DOWNLOADABLE_ZIP_FILE_PATH
        # TODO: reinstate when PDF is added back
        # pdf_path = constants.SCORE_DOWNLOADABLE_PDF_FILE_PATH

        logger.info("Writing downloadable excel")
-        self._load_excel_from_df(
+        excel_config = self._load_excel_from_df(
            excel_df=self.output_score_county_state_merged_df,
            excel_path=excel_path,
        )
@ -534,10 +544,39 @@ class PostScoreETL(ExtractTransformLoad):
        )
        downloadable_df.to_csv(csv_path, index=False)

+        logger.info("Creating codebook for download zip")
+
+        # consolidate all excel fields from the config yml. The codebook
+        # code takes in a list of fields, but the excel config file
+        # has a slightly different format to allow for sheets within the
+        # workbook. This pulls all fields from all potential sheets into one
+        # list of dictionaries that specify information on each field.
+        excel_fields = []
+        for sheet in excel_config["sheets"]:
+            excel_fields.extend(sheet["fields"])
+
+        # load supplemental codebook yml
+        field_descriptions_for_codebook_config = load_yaml_dict_from_file(
+            self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml"
+        )
+
+        # create codebook
+        codebook_df = create_codebook(
+            downloadable_csv_config=downloadable_csv_config["fields"],
+            excel_config=excel_fields,
+            field_descriptions_for_codebook=field_descriptions_for_codebook_config[
+                "fields"
+            ],
+        )
+
+        # load codebook to disk
+        codebook_df.to_csv(codebook_path, index=False)
+
        logger.info("Compressing files")
        files_to_compress = [
            csv_path,
            excel_path,
+            codebook_path,
        ]  # add pdf_path here to include PDF
        zip_files(zip_path, files_to_compress)

--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@ -1,15 +1,17 @@
 import os
 import sys
 from pathlib import Path
+from collections import namedtuple
 import numpy as np
 import pandas as pd

-
 from data_pipeline.config import settings
 from data_pipeline.utils import (
    download_file_from_url,
    get_module_logger,
 )
+from data_pipeline.score import field_names
+from . import constants

 logger = get_module_logger(__name__)

@ -108,3 +110,198 @@ def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
    )

    return floored_series
+
+
+def _create_df_from_yaml_contents(
+    fields_list_from_yaml: list,
+    fields_to_store_in_codebook: list,
+) -> pd.DataFrame:
+    """Build one component frame of the codebook from parsed yaml fields.
+
+    Shared by all three sources: the csv config, the excel config, and the
+    supplemental codebook descriptions.
+
+    Args:
+        fields_list_from_yaml: one dictionary per field, as parsed from the
+            yaml. Each must contain the CEJST score column name key.
+        fields_to_store_in_codebook: named tuples exposing
+            `existing_yaml_label` (the key to read from each yaml field) and
+            `new_label_in_codebook` (the column name used in the codebook).
+            This is what lets the key "label" become "csv_label" for one
+            source and "excel_label" for another.
+
+    Returns:
+        A dataframe indexed by the CEJST score column name, with one column
+        per entry of fields_to_store_in_codebook. A key that is absent from a
+        yaml field is stored as NaN, which keeps that key optional in the yaml.
+
+    Raises:
+        AssertionError: if a yaml field lacks the CEJST score column name.
+    """
+    score_column = constants.CEJST_SCORE_COLUMN_NAME
+
+    # Pandas turns {column_name: [value_1, value_2, ...]} into a dataframe,
+    # so the list of per-field dictionaries is reshaped into one list per
+    # column. The score column is always added because every source is keyed
+    # on it.
+    values_by_column = {
+        spec.new_label_in_codebook: [] for spec in fields_to_store_in_codebook
+    }
+    values_by_column[score_column] = []
+
+    for yaml_field in fields_list_from_yaml:
+        assert score_column in yaml_field, (
+            "Error: the yaml codebook should crosswalk to the native column "
+            f"from the CEJST pipeline, called {score_column}"
+        )
+        values_by_column[score_column].append(yaml_field[score_column])
+
+        for spec in fields_to_store_in_codebook:
+            column_values = values_by_column[spec.new_label_in_codebook]
+            if spec.existing_yaml_label in yaml_field:
+                column_values.append(yaml_field[spec.existing_yaml_label])
+                continue
+            # Every column list has to grow by exactly one entry per yaml
+            # field, so an unspecified optional key is recorded as a null.
+            # The score column itself is never allowed to be the missing one.
+            assert spec.new_label_in_codebook != score_column
+            column_values.append(np.nan)
+
+    codebook_df = pd.DataFrame(values_by_column)
+    return codebook_df.set_index(score_column)
+
+
+def _get_datatype(
+    input_column_name: str,
+    input_column_type: str,
+    percentile_string: str = field_names.PERCENTILE_FIELD_SUFFIX,
+    loss_rate_string: str = constants.LOSS_RATE_STRING,
+) -> str:
+    """Pick the field type to show in the codebook for one column.
+
+    The column name is inspected for marker substrings: a name containing
+    `percentile_string` is a "percentile", otherwise a name containing
+    `loss_rate_string` is a "rate". Any other column keeps the type it was
+    given (`input_column_type`).
+
+    Note: eventually this will either be set programmatically or be included
+    in the yaml, depending on the planned refactor.
+    """
+    # The percentile check comes first, so it wins when both markers appear.
+    if percentile_string in input_column_name:
+        return "percentile"
+    if loss_rate_string in input_column_name:
+        return "rate"
+    return input_column_type
+
+
+def _get_calculation_notes(column_name: str) -> str:
+    """Assemble the calculation notes for one column from its name.
+
+    Each marker substring found in `column_name` contributes its explanation;
+    the explanations are joined with single spaces. A name with no markers
+    yields an empty string.
+
+    Note: eventually this will either be set programmatically or be included
+    in the yaml, depending on the planned refactor.
+    """
+    # (marker substring, explanation) pairs, in the order the notes appear.
+    explanations_by_marker = (
+        (field_names.PERCENTILE_FIELD_SUFFIX, constants.PERCENTILE_EXPLANATION),
+        (constants.LOW_STRING, constants.LOW_PERCENTILE_EXPLANATION),
+        (constants.ISLAND_STRING, constants.ISLAND_AREAS_EXPLANATION),
+    )
+    return " ".join(
+        explanation
+        for marker, explanation in explanations_by_marker
+        if marker in column_name
+    )
+
+
+def create_codebook(
+    downloadable_csv_config: list,
+    excel_config: list,
+    field_descriptions_for_codebook: list,
+) -> pd.DataFrame:
+    """Assemble the codebook (data dictionary) for the download packet.
+
+    The inputs are already-parsed yaml field lists; nothing is read from disk
+    here. Each list is turned into a frame keyed on the CEJST score column
+    name, the three frames are outer-joined on that key, and two derived
+    columns (field type and calculation notes) are added.
+
+    Args:
+        downloadable_csv_config: per-field dictionaries from the downloadable
+            csv config; supplies the csv label and format.
+        excel_config: per-field dictionaries from the excel config, flattened
+            across sheets by the caller; supplies the excel label.
+        field_descriptions_for_codebook: per-field dictionaries from the
+            supplemental codebook yaml; supplies the notes and the threshold
+            category.
+
+    Returns:
+        A dataframe restricted to constants.CODEBOOK_COLUMNS, with the score
+        name column renamed to "Description".
+
+    Raises:
+        AssertionError: if any field dictionary lacks the CEJST score column
+            name (raised by the yaml parsing helper).
+    """
+    CodebookLabelFields = namedtuple(
+        "CodebookLabelFields",
+        ["new_label_in_codebook", "existing_yaml_label"],
+    )
+
+    # parse data from component yamls
+    csv_codes_df = _create_df_from_yaml_contents(
+        fields_list_from_yaml=downloadable_csv_config,
+        fields_to_store_in_codebook=[
+            CodebookLabelFields(
+                new_label_in_codebook=constants.CSV_LABEL_FIELD,
+                existing_yaml_label="label",
+            ),
+            CodebookLabelFields(
+                new_label_in_codebook=constants.CSV_FORMAT,
+                existing_yaml_label="format",
+            ),
+        ],
+    )
+
+    excel_codes_df = _create_df_from_yaml_contents(
+        fields_list_from_yaml=excel_config,
+        fields_to_store_in_codebook=[
+            CodebookLabelFields(
+                new_label_in_codebook=constants.EXCEL_LABEL_FIELD,
+                existing_yaml_label="label",
+            )
+        ],
+    )
+
+    field_descriptions_for_codebook_df = _create_df_from_yaml_contents(
+        fields_list_from_yaml=field_descriptions_for_codebook,
+        fields_to_store_in_codebook=[
+            CodebookLabelFields(
+                new_label_in_codebook=constants.NOTES_FIELD,
+                existing_yaml_label="notes",
+            ),
+            CodebookLabelFields(
+                new_label_in_codebook=constants.THRESHOLD_CATEGORY_FIELD,
+                existing_yaml_label="category",
+            ),
+        ],
+    )
+
+    # Outer-join all sources on the score column name, so a field that is
+    # described in only some of the yamls still gets a row (with blanks for
+    # the sources that do not mention it).
+    merged_codebook_df = pd.concat(
+        [csv_codes_df, excel_codes_df, field_descriptions_for_codebook_df],
+        join="outer",
+        axis=1,
+    ).reset_index()
+
+    # Add the field type column. Built with a plain comprehension rather than
+    # a row-wise DataFrame.apply so the result is always exactly one value per
+    # row, including when there are no rows at all.
+    merged_codebook_df[constants.CSV_FIELD_TYPE_FIELD] = [
+        _get_datatype(
+            input_column_name=score_column_name,
+            input_column_type=csv_format,
+        )
+        for score_column_name, csv_format in zip(
+            merged_codebook_df[constants.CEJST_SCORE_COLUMN_NAME],
+            merged_codebook_df[constants.CSV_FORMAT],
+        )
+    ]
+
+    # get calculation notes column
+    merged_codebook_df[constants.CALCULATION_NOTES_FIELD] = merged_codebook_df[
+        constants.CEJST_SCORE_COLUMN_NAME
+    ].apply(_get_calculation_notes)
+
+    # This is temporary. Right now, our variable names are all
+    # plain English. After the refactor, we will have new names
+    # that are programmatic, and the CEJST_SCORE_COLUMN will
+    # be dropped in favor of the explanation.
+    return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
+        columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
+    )