fixing up validation

lucasmbrown-usds 2022-09-21 16:07:54 -04:00
parent 7ceab512c1
commit a3ad7e0a5a
2 changed files with 19 additions and 8 deletions


@@ -119,6 +119,12 @@ class ExtractTransformLoad:
     # the YAML files?
     LOAD_YAML_CONFIG: bool = False
 
+    # Some data sets will have multiple rows of data per tract. For those data sets,
+    # set this variable to `True` to skip two validation steps.
+    # However, note that datasets with multiple rows per tract *cannot* be used
+    # in the score process.
+    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT: bool = False
+
     # We use output_df as the final dataframe to use to write to the CSV
     # It is used on the "load" base class method
     output_df: pd.DataFrame = None
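
For context, this is how a dataset class would opt into the new flag. A minimal hypothetical sketch, not code from this commit; it assumes the base class is importable from `data_pipeline.etl.base` as elsewhere in this repo.

```python
# Hypothetical subclass sketch (class name is illustrative).
from data_pipeline.etl.base import ExtractTransformLoad


class ManyRowsPerTractETL(ExtractTransformLoad):
    # Opt out of the row-count and duplicate-geography checks, at the cost
    # of this dataset being unusable in the score process.
    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True
```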
@@ -276,7 +282,10 @@ class ExtractTransformLoad:
                 f"Must have `{geo_field}` in columns if "
                 f"specifying geo level as `{geo_level} "
             )
-        if self.output_df.shape[0] > expected_rows:
+        if (
+            self.output_df.shape[0] > expected_rows
+            and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
+        ):
             raise ValueError(
                 f"Too many rows: `{self.output_df.shape[0]}` rows in "
                 f"output exceeds expectation of `{expected_rows}` "
@@ -302,7 +311,10 @@ class ExtractTransformLoad:
                 self.output_df[geo_field].shape[0]
                 - self.output_df[geo_field].nunique()
             )
-            if duplicate_geo_field_values > 0:
+            if (
+                duplicate_geo_field_values > 0
+                and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
+            ):
                 raise ValueError(
                     f"Duplicate values: There are {duplicate_geo_field_values} "
                     f"duplicate values in "


@@ -24,12 +24,17 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
     # Metadata for the baseclass
     NAME = "geocorr_alternatives"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+    PUERTO_RICO_EXPECTED_IN_DATA = False
 
     INPUT_GEOCORR_TRACT_FIELD = "tract"
     INPUT_GEOCORR_COUNTY_FIELD = "county"
     INPUT_GEOCORR_ZIP_FIELD = "zcta5"
     INPUT_GEOCORR_ALLOCATION_FIELD = "afact"
 
+    # Skip some validation checks, because there will be multiple rows per tract in this
+    # geocorr dataset.
+    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True
+
     # GeoCorr downloads have a field definition in the second row of the CSV.
     # This parameter skips the second row for pandas `read_csv`.
     GEOCORR_SKIP_ROWS: typing.List[int] = [1]
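
`GEOCORR_SKIP_ROWS` feeds pandas' `skiprows` parameter. A minimal illustration with made-up CSV contents (the field-definition text in the second row is invented; only the column names come from the class above):

```python
import io

import pandas as pd

# Row 0 is the header; row 1 mimics GeoCorr's human-readable field
# definitions and is skipped, matching GEOCORR_SKIP_ROWS = [1].
csv_text = (
    "tract,county,zcta5,afact\n"
    "Census tract,County code,ZIP census tabulation area,Allocation factor\n"
    "020100,01001,36067,0.5\n"
)
df = pd.read_csv(io.StringIO(csv_text), skiprows=[1])
print(df)  # one data row; the definition row is excluded
```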
@@ -98,10 +103,4 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
             ".", "", regex=False
         )
-        logger.info(zip_codes_to_tracts_df.head())
 
         self.output_df = zip_codes_to_tracts_df
-
-    # TODO: DELETE
-    def validate(self) -> None:
-        pass
-
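
With the stubbed-out `validate()` override removed, the base class validation runs again, governed by the new flag. The flag is needed because a ZIP-to-tract crosswalk carries several rows per tract by design; a toy illustration (all values invented, and `afact` semantics hedged to "allocation share"):

```python
import pandas as pd

# Toy crosswalk: two ZIPs feed the same tract, so the tract ID repeats.
crosswalk = pd.DataFrame(
    {
        "tract": ["01001020100", "01001020100"],
        "zcta5": ["36067", "36068"],
        "afact": [0.6, 0.4],  # allocation shares, illustrative only
    }
)
# The base class's duplicate-geography check would flag this dataset,
# hence VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True.
assert crosswalk["tract"].nunique() < len(crosswalk)
```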