mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-23 10:04:18 -08:00)

Commit a3ad7e0a5a (parent 7ceab512c1): fixing up validation

2 changed files with 19 additions and 8 deletions
@@ -119,6 +119,12 @@ class ExtractTransformLoad:
     # the YAML files?
     LOAD_YAML_CONFIG: bool = False
 
+    # Some data sets will have multiple rows of data per tract. For those data sets,
+    # set this variable to `True` to skip two validation steps.
+    # However, note that datasets with multiple rows per tract *cannot* be used
+    # in the score process.
+    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT: bool = False
+
     # We use output_df as the final dataframe to use to write to the CSV
     # It is used on the "load" base class method
     output_df: pd.DataFrame = None
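
In practice, a dataset ETL that legitimately carries several rows per census tract would opt in to the new flag on its class definition. The sketch below is illustrative only: the subclass name and NAME value are made up, and the import path is assumed from the repository's layout rather than shown in this diff.

    from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel  # assumed import path

    class MultiRowPerTractETL(ExtractTransformLoad):  # hypothetical dataset class
        NAME = "multi_row_per_tract_example"  # hypothetical
        GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

        # Opt out of the duplicate-geography and row-count checks added above.
        # Per the base-class comment, such a dataset cannot feed the score process.
        VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True
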
@@ -276,7 +282,10 @@ class ExtractTransformLoad:
                     f"Must have `{geo_field}` in columns if "
                     f"specifying geo level as `{geo_level} "
                 )
-            if self.output_df.shape[0] > expected_rows:
+            if (
+                self.output_df.shape[0] > expected_rows
+                and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
+            ):
                 raise ValueError(
                     f"Too many rows: `{self.output_df.shape[0]}` rows in "
                     f"output exceeds expectation of `{expected_rows}` "
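
The guard itself is just a row-count comparison gated by the new flag. A toy, self-contained version of the same condition follows (the field name, tract IDs, and expected_rows value are made up for illustration):

    import pandas as pd

    # Three output rows against an expectation of two.
    output_df = pd.DataFrame(
        {"GEOID10_TRACT": ["01001020100", "01001020200", "01001020200"]}
    )
    expected_rows = 2
    skip_checks = True  # stands in for VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT

    # With skip_checks = False this would raise; with True the check is bypassed.
    if output_df.shape[0] > expected_rows and not skip_checks:
        raise ValueError(
            f"Too many rows: `{output_df.shape[0]}` rows in "
            f"output exceeds expectation of `{expected_rows}` "
        )
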
@@ -302,7 +311,10 @@ class ExtractTransformLoad:
                 self.output_df[geo_field].shape[0]
                 - self.output_df[geo_field].nunique()
             )
-            if duplicate_geo_field_values > 0:
+            if (
+                duplicate_geo_field_values > 0
+                and not self.VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT
+            ):
                 raise ValueError(
                     f"Duplicate values: There are {duplicate_geo_field_values} "
                     f"duplicate values in "
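
The duplicate count in this hunk is total rows minus distinct values in the geography column, so anything above zero means at least one repeated geography. A small standalone example of that arithmetic (toy data, made-up tract IDs):

    import pandas as pd

    # Four rows, three distinct tract IDs -> one duplicate.
    df = pd.DataFrame(
        {"GEOID10_TRACT": ["01001020100", "01001020200", "01001020200", "01001020300"]}
    )
    duplicate_geo_field_values = (
        df["GEOID10_TRACT"].shape[0] - df["GEOID10_TRACT"].nunique()
    )
    print(duplicate_geo_field_values)  # 1
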
@@ -24,12 +24,17 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
     # Metadata for the baseclass
     NAME = "geocorr_alternatives"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+    PUERTO_RICO_EXPECTED_IN_DATA = False
 
     INPUT_GEOCORR_TRACT_FIELD = "tract"
     INPUT_GEOCORR_COUNTY_FIELD = "county"
     INPUT_GEOCORR_ZIP_FIELD = "zcta5"
     INPUT_GEOCORR_ALLOCATION_FIELD = "afact"
 
+    # Skip some validation checks, because there will be multiple rows per tract in this
+    # geocorr dataset.
+    VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True
+
     # GeoCorr downloads have a field definition in the second row of the CSV.
     # This parameter skips the second row for pandas `read_csv`.
     GEOCORR_SKIP_ROWS: typing.List[int] = [1]
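
GEOCORR_SKIP_ROWS exists because GeoCorr exports repeat a row of human-readable field labels immediately under the CSV header. A minimal sketch of how such a file might be read (the path and dtype mapping are placeholders, not values from this commit):

    import pandas as pd

    zip_codes_to_tracts_df = pd.read_csv(
        "geocorr_zcta_to_tract.csv",  # hypothetical local path
        skiprows=[1],  # drop the second-row field definitions, keep the header
        dtype={"tract": str, "county": str, "zcta5": str},  # preserve leading zeros
    )
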
@@ -98,10 +103,4 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
             ".", "", regex=False
         )
 
-        logger.info(zip_codes_to_tracts_df.head())
-
         self.output_df = zip_codes_to_tracts_df
-
-    # TODO: DELETE
-    def validate(self) -> None:
-        pass
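
With the debug logging and the stub validate() override gone, GeoCorrAlternativesETL falls back to the base-class validate(), where the new skip flag takes effect. A rough sketch of the resulting call order, assuming the base class exposes the usual extract/transform/validate/load steps (the runner below is illustrative, not the project's actual runner):

    etl = GeoCorrAlternativesETL()
    etl.extract()
    etl.transform()
    # Inherited from ExtractTransformLoad; the duplicate-geography and row-count
    # checks are bypassed because the class sets
    # VALIDATE_SHOULD_SKIP_DUPLICATE_GEOGRAPHIES_AND_GEOGRAPHY_COUNT = True.
    etl.validate()
    etl.load()
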