Creating a data dictionary for the download packet (#1469)

Adding automated codebook creation. Future ticket to refactor.
This commit is contained in:
Emma Nechamkin 2022-03-30 11:01:43 -04:00 committed by GitHub
parent db6b5de24e
commit 2628afacf9
4 changed files with 330 additions and 4 deletions


@@ -0,0 +1,56 @@
# This is a temporary file. We should make sure this *type* of information is maintained when we refactor.
fields:
- score_name: Total threshold criteria exceeded
notes: The total number of criteria exceeded (each category has one or more criteria). For example, a tract that exceeds the 90th percentile for both linguistic isolation and unemployment, and meets the training and workforce development socioeconomic criteria (low high school attainment and a low percent of higher ed students), would have a 2 in this field.
- score_name: Definition M (communities)
notes: True / False variable for whether a tract is a Disadvantaged Community (DAC)
- score_name: Is low income and has a low percent of higher ed students?
notes: Associated socioeconomic criterion for all thresholds except those included in training and workforce development
- score_name: Greater than or equal to the 90th percentile for expected agriculture loss rate, is low income, and has a low percent of higher ed students?
category: climate change
- score_name: Greater than or equal to the 90th percentile for expected building loss rate, is low income, and has a low percent of higher ed students?
category: climate change
- score_name: Greater than or equal to the 90th percentile for expected population loss rate, is low income, and has a low percent of higher ed students?
category: climate change
- score_name: Greater than or equal to the 90th percentile for energy burden, is low income, and has a low percent of higher ed students?
category: clean energy and energy efficiency
- score_name: Greater than or equal to the 90th percentile for PM2.5 exposure, is low income, and has a low percent of higher ed students?
category: clean energy and energy efficiency
- score_name: Greater than or equal to the 90th percentile for diesel particulate matter, is low income, and has a low percent of higher ed students?
category: clean transit
- score_name: Greater than or equal to the 90th percentile for traffic proximity, is low income, and has a low percent of higher ed students?
category: clean transit
- score_name: Greater than or equal to the 90th percentile for housing burden, is low income, and has a low percent of higher ed students?
category: affordable and sustainable housing
- score_name: Greater than or equal to the 90th percentile for lead paint, the median house value is less than 90th percentile, is low income, and has a low percent of higher ed students?
category: affordable and sustainable housing
- score_name: Greater than or equal to the 90th percentile for proximity to hazardous waste facilities, is low income, and has a low percent of higher ed students?
category: reduction and remediation of legacy pollution
- score_name: Greater than or equal to the 90th percentile for proximity to superfund sites, is low income, and has a low percent of higher ed students?
category: reduction and remediation of legacy pollution
- score_name: Greater than or equal to the 90th percentile for proximity to RMP sites, is low income, and has a low percent of higher ed students?
category: reduction and remediation of legacy pollution
- score_name: Greater than or equal to the 90th percentile for wastewater discharge, is low income, and has a low percent of higher ed students?
category: critical clean water and waste infrastructure
- score_name: Greater than or equal to the 90th percentile for asthma, is low income, and has a low percent of higher ed students?
category: health burdens
- score_name: Greater than or equal to the 90th percentile for diabetes, is low income, and has a low percent of higher ed students?
category: health burdens
- score_name: Greater than or equal to the 90th percentile for heart disease, is low income, and has a low percent of higher ed students?
category: health burdens
- score_name: Greater than or equal to the 90th percentile for low life expectancy, is low income, and has a low percent of higher ed students?
category: health burdens
- score_name: Greater than or equal to the 90th percentile for low median household income as a percent of area median income, has low HS attainment, and has a low percent of higher ed students?
category: training and workforce development
- score_name: Greater than or equal to the 90th percentile for households in linguistic isolation, has low HS attainment, and has a low percent of higher ed students?
category: training and workforce development
- score_name: Greater than or equal to the 90th percentile for unemployment, has low HS attainment, and has a low percent of higher ed students?
category: training and workforce development
- score_name: Greater than or equal to the 90th percentile for households at or below 100% federal poverty level, has low HS attainment, and has a low percent of higher ed students?
category: training and workforce development
- score_name: Greater than or equal to the 90th percentile for unemployment and has low HS education in 2009 (island areas)?
category: training and workforce development
- score_name: Greater than or equal to the 90th percentile for households at or below 100% federal poverty level and has low HS education in 2009 (island areas)?
category: training and workforce development
- score_name: Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?
category: training and workforce development
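Each entry above follows a small contract: `score_name` is required (it is the join key across all config yamls), while `category` and `notes` are optional. A minimal sketch of that contract, using a hypothetical two-entry list that mirrors the parsed yaml structure (not the pipeline's actual loading code):

```python
# Hypothetical fields list mirroring the yaml above; in the pipeline this
# would come from parsing field_descriptions_for_codebook.yml.
fields = [
    {
        "score_name": "Definition M (communities)",
        "notes": "True / False variable for whether a tract is a DAC",
    },
    {
        "score_name": "Greater than or equal to the 90th percentile for asthma, ...",
        "category": "health burdens",
    },
]

def validate_fields(fields_list):
    """Check that every yaml entry has the required score_name key."""
    for field in fields_list:
        assert "score_name" in field, "every yaml entry needs a score_name"
    # category and notes are optional, so missing keys are fine
    return [f["score_name"] for f in fields_list]

names = validate_fields(fields)
```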


@@ -55,10 +55,44 @@ SCORE_DOWNLOADABLE_CSV_FILE_PATH = (
SCORE_DOWNLOADABLE_EXCEL_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR / f"communities-{timestamp_str}.xlsx"
)
SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR / f"codebook-{timestamp_str}.csv"
)
SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
SCORE_DOWNLOADABLE_DIR / "Screening_Tool_Data.zip"
)
# For the codebook
CEJST_SCORE_COLUMN_NAME = "score_name"
CSV_FORMAT = "csv_format"
CSV_LABEL_FIELD = "csv_label"
EXCEL_LABEL_FIELD = "excel_label"
NOTES_FIELD = "notes"
THRESHOLD_CATEGORY_FIELD = "threshold_category"
CALCULATION_NOTES_FIELD = "calculation_notes"
CSV_FIELD_TYPE_FIELD = "csv_field_type"
CODEBOOK_COLUMNS = [
CSV_LABEL_FIELD,
EXCEL_LABEL_FIELD,
CEJST_SCORE_COLUMN_NAME,
CSV_FIELD_TYPE_FIELD,
CALCULATION_NOTES_FIELD,
THRESHOLD_CATEGORY_FIELD,
NOTES_FIELD,
]
LOSS_RATE_STRING = "loss rate"
LOW_STRING = "Low "
ISLAND_STRING = "island areas"
PERCENTILE_EXPLANATION = (
"All percentiles are floored (rounded down to the nearest percentile). "
+ "For example, 89.7th percentile is rounded down to 89 for this field."
)
LOW_PERCENTILE_EXPLANATION = "This percentile is reversed, meaning the lowest raw numbers become the highest percentiles."
ISLAND_AREAS_EXPLANATION = (
"Because not all data is available for the Nation, Puerto Rico, "
+ "and the Island Areas, this uses different underlying data for the island areas."
)
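The flooring behavior described by PERCENTILE_EXPLANATION (implemented elsewhere in the pipeline by floor_series) can be illustrated with plain math.floor; this is a simplified sketch, not the pipeline's actual implementation:

```python
import math

def floor_percentiles(values):
    """Round each percentile down to the nearest whole percentile,
    as described in PERCENTILE_EXPLANATION (e.g. 89.7 -> 89)."""
    return [math.floor(v) for v in values]

floored = floor_percentiles([89.7, 90.0, 95.2])
```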
# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]


@@ -5,7 +5,7 @@ import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.etl_utils import floor_series
from data_pipeline.etl.score.etl_utils import floor_series, create_codebook
from data_pipeline.utils import (
get_module_logger,
zip_files,
@@ -448,7 +448,15 @@ class PostScoreETL(ExtractTransformLoad):
def _load_excel_from_df(
self, excel_df: pd.DataFrame, excel_path: Path
) -> None:
) -> dict:
"""Creates excel file from score data using configs from yml file and returns
contents of the yml file.
First it reads the yaml dictionary from the excel.yml config and adjusts the
format of the excel file.
Then it produces the excel file from the score data.
"""
# open excel yaml config
excel_csv_config = load_yaml_dict_from_file(
@@ -498,6 +506,7 @@ class PostScoreETL(ExtractTransformLoad):
worksheet.set_column(0, num_cols - 1, num_excel_cols_width)
writer.save()
return excel_csv_config
def _load_tile_csv(
self, score_tiles_df: pd.DataFrame, tile_score_path: Path
@@ -512,12 +521,13 @@ class PostScoreETL(ExtractTransformLoad):
downloadable_info_path.mkdir(parents=True, exist_ok=True)
csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
codebook_path = constants.SCORE_DOWNLOADABLE_CODEBOOK_FILE_PATH
zip_path = constants.SCORE_DOWNLOADABLE_ZIP_FILE_PATH
# TODO: reinstate when PDF is added back
# pdf_path = constants.SCORE_DOWNLOADABLE_PDF_FILE_PATH
logger.info("Writing downloadable excel")
self._load_excel_from_df(
excel_config = self._load_excel_from_df(
excel_df=self.output_score_county_state_merged_df,
excel_path=excel_path,
)
@@ -534,10 +544,39 @@
)
downloadable_df.to_csv(csv_path, index=False)
logger.info("Creating codebook for download zip")
# consolidate all excel fields from the config yml. The codebook
# code takes in a list of fields, but the excel config file
# has a slightly different format to allow for sheets within the
# workbook. This pulls all fields from all potential sheets into one
# list of dictionaries that specify information on each field.
excel_fields = []
for sheet in excel_config["sheets"]:
excel_fields.extend(sheet["fields"])
# load supplemental codebook yml
field_descriptions_for_codebook_config = load_yaml_dict_from_file(
self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml"
)
# create codebook
codebook_df = create_codebook(
downloadable_csv_config=downloadable_csv_config["fields"],
excel_config=excel_fields,
field_descriptions_for_codebook=field_descriptions_for_codebook_config[
"fields"
],
)
# load codebook to disk
codebook_df.to_csv(codebook_path, index=False)
logger.info("Compressing files")
files_to_compress = [
csv_path,
excel_path,
codebook_path,
] # add pdf_path here to include PDF
zip_files(zip_path, files_to_compress)
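The sheet-flattening step above can be sketched in isolation; the nested structure below is a hypothetical stand-in for the parsed contents of excel.yml:

```python
# Hypothetical parsed excel config: a workbook with two sheets,
# each carrying its own list of field definitions.
excel_config = {
    "sheets": [
        {"label": "Data", "fields": [{"label": "a"}, {"label": "b"}]},
        {"label": "Notes", "fields": [{"label": "c"}]},
    ]
}

# Pull all fields from all sheets into one flat list of dictionaries,
# matching what create_codebook expects for its excel_config argument.
excel_fields = []
for sheet in excel_config["sheets"]:
    excel_fields.extend(sheet["fields"])
```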


@@ -1,15 +1,17 @@
import os
import sys
from pathlib import Path
from collections import namedtuple
import numpy as np
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.utils import (
download_file_from_url,
get_module_logger,
)
from data_pipeline.score import field_names
from . import constants
logger = get_module_logger(__name__)
@@ -108,3 +110,198 @@ def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
)
return floored_series
def _create_df_from_yaml_contents(
fields_list_from_yaml: list,
fields_to_store_in_codebook: list,
) -> pd.DataFrame:
"""Helper function to create a dataframe from yaml fields to get used for
all three configs: csv, excel, and supplemental codebook information yaml
This function does:
1. Creates a dictionary to be converted to a dataframe. Pandas easily converts
dictionaries of the form {column_name: [value_1, value_2, value_3]} to
dataframes, where column_name is the name of the column and the list of values
is (by numerical index) the values for the series.
Column names here are dictated by the fields_to_store_in_codebook list, a named
tuple that includes the name of the field in the yaml and the name the field will
take in the codebook. For example, both the csv and excel configs use the name "label",
but in the codebook, we want one of these fields to be "csv_label" and the other
to be "excel_label".
2. Cycles through the fields specified in the yaml fields list. Each field includes
some additional details, and so the function appends that information to the dictionary
lists, as described above. If the field is missing, appends a null value so that the row's
value is blank. This is an artifact of constructing a dataframe from a dictionary of lists.
3. Returns a dataframe indexed by the column name used in CEJST data (i.e., the
score name field that is consistent across all yamls and in our own usa.csv).
"""
# this becomes the codebook frame for each yaml source. In particular,
# the keys become column names, and the lists store their values. We hard-set
# one column name to be the CEJST_SCORE_COLUMN_NAME because this should be
# the same across the board for every component codebook.
codebook_dictionary = {
field.new_label_in_codebook: [] for field in fields_to_store_in_codebook
}
codebook_dictionary[constants.CEJST_SCORE_COLUMN_NAME] = []
# we reshape the data from a list of dictionaries to a dictionary of lists
# so that we can cast it as a dataframe
for single_field_details in fields_list_from_yaml:
assert constants.CEJST_SCORE_COLUMN_NAME in single_field_details, (
"Error: the yaml codebook should crosswalk to the native column "
+ f"from the CEJST pipeline, called {constants.CEJST_SCORE_COLUMN_NAME}"
)
# Since every single YAML file should have a score column name
# that is the same, this appends each to the list in the dictionary.
# When pandas converts a dictionary of form {column_name: [val_1, val_2, val_3]},
# the dataframe has a column named "column_name" with sequential val_1, 2, and 3.
codebook_dictionary[constants.CEJST_SCORE_COLUMN_NAME].append(
single_field_details[constants.CEJST_SCORE_COLUMN_NAME]
)
for field_information in fields_to_store_in_codebook:
try:
codebook_dictionary[
field_information.new_label_in_codebook
].append(
single_field_details[field_information.existing_yaml_label]
)
# a key error occurs when the field is not specified for the
# column in the yaml file; when this happens, a null value should be
# appended to the list in the dictionary, since the dataframe will
# use the keys as column names and lists as values.
# this allows us to have optional fields in the yaml file.
except KeyError:
assert (
field_information.new_label_in_codebook
!= constants.CEJST_SCORE_COLUMN_NAME
)
codebook_dictionary[
field_information.new_label_in_codebook
].append(np.nan)
return pd.DataFrame(codebook_dictionary).set_index(
constants.CEJST_SCORE_COLUMN_NAME
)
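The reshaping that _create_df_from_yaml_contents performs, from a list of dictionaries to a dictionary of lists, with np.nan filling optional keys, can be seen in a minimal form. The field names below are hypothetical stand-ins, and `dict.get` replaces the try/except KeyError pattern used above:

```python
import numpy as np
import pandas as pd

# Hypothetical yaml-derived entries: "category" is optional.
fields = [
    {"score_name": "A", "notes": "first", "category": "health burdens"},
    {"score_name": "B", "notes": "second"},  # no category key
]

# Reshape from a list of dictionaries to a dictionary of lists so that
# pandas builds one column per key; missing keys become np.nan.
columns = {"score_name": [], "notes": [], "category": []}
for entry in fields:
    for key in columns:
        columns[key].append(entry.get(key, np.nan))

df = pd.DataFrame(columns).set_index("score_name")
```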
def _get_datatype(
input_column_name: str,
input_column_type: str,
percentile_string: str = field_names.PERCENTILE_FIELD_SUFFIX,
loss_rate_string: str = constants.LOSS_RATE_STRING,
) -> str:
"""Helper to convert datatype
Note: eventually, this will either be programmatically set, or will be included in the yaml, depending on
the refactor that we do
"""
return_column_type = input_column_type
if percentile_string in input_column_name:
return_column_type = "percentile"
elif loss_rate_string in input_column_name:
return_column_type = "rate"
return return_column_type
def _get_calculation_notes(column_name: str) -> str:
"""Produces calculation notes
Note: eventually, this will either be programmatically set, or will be included in the yaml, depending on
the refactor that we do
"""
calculation_notes = []
if field_names.PERCENTILE_FIELD_SUFFIX in column_name:
calculation_notes += [constants.PERCENTILE_EXPLANATION]
if constants.LOW_STRING in column_name:
calculation_notes += [constants.LOW_PERCENTILE_EXPLANATION]
if constants.ISLAND_STRING in column_name:
calculation_notes += [constants.ISLAND_AREAS_EXPLANATION]
return " ".join(calculation_notes)
def create_codebook(
downloadable_csv_config: dict,
excel_config: dict,
field_descriptions_for_codebook: dict,
) -> pd.DataFrame:
"""Runs through all logic of creating the codebook.
First it reads in each component yaml file for the codebook.
Then it merges all of them.
Finally, it applies any transforms to the columns (like getting the
datatype or adding calculation_notes).
"""
CodebookLabelFields = namedtuple(
"CodebookLabelFields",
["new_label_in_codebook", "existing_yaml_label"],
)
# parse data from component yamls
csv_codes_df = _create_df_from_yaml_contents(
fields_list_from_yaml=downloadable_csv_config,
fields_to_store_in_codebook=[
CodebookLabelFields(
new_label_in_codebook=constants.CSV_LABEL_FIELD,
existing_yaml_label="label",
),
CodebookLabelFields(
new_label_in_codebook=constants.CSV_FORMAT,
existing_yaml_label="format",
),
],
)
excel_codes_df = _create_df_from_yaml_contents(
fields_list_from_yaml=excel_config,
fields_to_store_in_codebook=[
CodebookLabelFields(
new_label_in_codebook=constants.EXCEL_LABEL_FIELD,
existing_yaml_label="label",
)
],
)
field_descriptions_for_codebook_df = _create_df_from_yaml_contents(
fields_list_from_yaml=field_descriptions_for_codebook,
fields_to_store_in_codebook=[
CodebookLabelFields(
new_label_in_codebook=constants.NOTES_FIELD,
existing_yaml_label="notes",
),
CodebookLabelFields(
new_label_in_codebook=constants.THRESHOLD_CATEGORY_FIELD,
existing_yaml_label="category",
),
],
)
# join all sources on the column name
merged_codebook_df = pd.concat(
[csv_codes_df, excel_codes_df, field_descriptions_for_codebook_df],
join="outer",
axis=1,
).reset_index()
# add field type column
merged_codebook_df[
constants.CSV_FIELD_TYPE_FIELD
] = merged_codebook_df.apply(
lambda x: _get_datatype(
input_column_name=x[constants.CEJST_SCORE_COLUMN_NAME],
input_column_type=x[constants.CSV_FORMAT],
),
axis=1,
)
# get calculation notes column
merged_codebook_df[constants.CALCULATION_NOTES_FIELD] = merged_codebook_df[
constants.CEJST_SCORE_COLUMN_NAME
].apply(_get_calculation_notes)
# This is temporary. Right now, our variable names are all
# plain English. After the refactor, we will have new names
# that are programmatic, and the CEJST_SCORE_COLUMN will
# be dropped in favor of the explanation.
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
)
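The index-aligned outer join that merges the three component codebooks can be reproduced on toy frames (hypothetical columns, only two sources instead of three):

```python
import pandas as pd

# Two toy component codebooks indexed by the shared score name; the outer
# join keeps rows that appear in either source, leaving gaps as NaN.
csv_codes = pd.DataFrame(
    {"score_name": ["A", "B"], "csv_label": ["a1", "b1"]}
).set_index("score_name")
notes = pd.DataFrame(
    {"score_name": ["B", "C"], "notes": ["note b", "note c"]}
).set_index("score_name")

# axis=1 concat aligns on the index union, mirroring the pipeline's merge.
merged = pd.concat([csv_codes, notes], join="outer", axis=1).reset_index()
```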