diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 7026c36f..99ac679a 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -119,6 +119,12 @@ class ExtractTransformLoad: # the YAML files? LOAD_YAML_CONFIG: bool = False + # Some data sets will have multiple rows of data per tract. For those data sets, + # set this variable to `True` to skip two validation steps. + # However, note that datasets with multiple rows per tract *cannot* be used + # in the score process. + VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT: bool = False + # We use output_df as the final dataframe to use to write to the CSV # It is used on the "load" base class method output_df: pd.DataFrame = None @@ -276,7 +282,10 @@ class ExtractTransformLoad: f"Must have `{geo_field}` in columns if " f"specifying geo level as `{geo_level} " ) - if self.output_df.shape[0] > expected_rows: + if ( + self.output_df.shape[0] > expected_rows + and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT + ): raise ValueError( f"Too many rows: `{self.output_df.shape[0]}` rows in " f"output exceeds expectation of `{expected_rows}` " @@ -302,7 +311,10 @@ class ExtractTransformLoad: self.output_df[geo_field].shape[0] - self.output_df[geo_field].nunique() ) - if duplicate_geo_field_values > 0: + if ( + duplicate_geo_field_values > 0 + and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT + ): raise ValueError( f"Duplicate values: There are {duplicate_geo_field_values} " f"duplicate values in " diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py index c1e2d0cb..30e5b6eb 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py @@ -24,12 +24,17 @@ class GeoCorrAlternativesETL(ExtractTransformLoad): # Metadata for the baseclass NAME = "geocorr_alternatives" GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + PUERTO_RICO_EXPECTED_IN_DATA = False INPUT_GEOCORR_TRACT_FIELD = "tract" INPUT_GEOCORR_COUNTY_FIELD = "county" INPUT_GEOCORR_ZIP_FIELD = "zcta5" INPUT_GEOCORR_ALLOCATION_FIELD = "afact" + # Skip some validation checks, because there will be multiple rows per tract in this + # geocorr dataset. + VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True + # GeoCorr downloads have a field definition in the second row of the CSV. # This parameter skips the second row for pandas `read_csv`. GEOCORR_SKIP_ROWS: typing.List[int] = [1] @@ -98,10 +103,4 @@ class GeoCorrAlternativesETL(ExtractTransformLoad): ".", "", regex=False ) - logger.info(zip_codes_to_tracts_df.head()) - self.output_df = zip_codes_to_tracts_df - - # TODO: DELETE - def validate(self) -> None: - pass