From 7415a15bc02411e2e333ef15a2dfd9ab2d08a1c9 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Mon, 27 Jun 2022 11:19:11 -0400 Subject: [PATCH] docstrings --- data/data-pipeline/data_pipeline/etl/base.py | 5 +++ .../etl/score/schemas/datasets.py | 40 ++++++++++++++----- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 532bc8fb..b1aed335 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -33,6 +33,9 @@ class ExtractTransformLoad: Attributes: DATA_PATH (pathlib.Path): Local path where all data will be stored TMP_PATH (pathlib.Path): Local path where temporary data will be stored + + TODO: Fill missing attrs here + GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier """ @@ -97,6 +100,8 @@ class ExtractTransformLoad: @classmethod def yaml_config_load(cls) -> dict: + """Generate config dictionary and set instance variables from YAML dataset.""" + # check if the class instance has score YAML definitions datasets_config = load_yaml_dict_from_file( cls.DATASET_CONFIG / "datasets.yml", diff --git a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py index b78c1445..0d1e6f84 100644 --- a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py +++ b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py @@ -16,24 +16,46 @@ class FieldType(Enum): class DatasetsConfig: @dataclass class Dataset: + """A class that defines a dataset and its load variables. + + Attributes: + long_name (str): A human readable title for the dataset. + short_name (str): Used to compose the short variable names for tiles/arcgis. All short variable names will be prepended + with the short name of the dataset it comes from, i.e. 
`nri__ex_loss`. + module_name (str): A string that matches both the Python module name for the dataset and the `NAME` property on the ETL class. + description (str): A human readable description of the dataset. + load_fields (List[LoadField]): A list of LoadField entries that will drive the score ETL and side effects (tiles, downloadables). + """ + @dataclass class LoadField: + """A class to define the fields to be saved on the dataset's output. + + These fields will then be imported by the score generation ETL. + + Attributes: + short_name (str): Used in conjunction with the dataset's `short_name` for files where short names are needed. + df_field_name (str): Name for the field in the ETL class. + long_name (str): Column name for the dataset's output CSV. + field_type (FieldType): An enum that dictates what type of field this is. This will be used on the `etl_score_post` + for the data manipulation. + The `by_value` metadata prop will load the field type's Enum value instead of the index, i.e. "string" and not STRING + include_in_tiles (bool): Include this field on the tile export. + include_in_csv (bool): Include this field on the CSV export. + include_in_excel (bool): Include this field on the Excel export. + """ + short_name: str df_field_name: str long_name: str - field_type: FieldType = field( - metadata={"by_value": True} - ) # this will load the field type's Enum value - # instead of the index, i.e. "string" and not - # STRING - tile_include: bool - csv_download: bool - excel_download: bool + field_type: FieldType = field(metadata={"by_value": True}) + include_in_tiles: bool + include_in_csv: bool + include_in_excel: bool long_name: str short_name: str module_name: str - last_updated_year: int description: str input_geoid_tract_field_name: str load_fields: List[LoadField]