diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py
index 68c451a2..2ee3aa52 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr_alternatives/etl.py
@@ -49,6 +49,7 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
     ZIP_CODE_INPUT_FIELD = "ZCTA5CE20"
     AREA_JOINED_FIELD = "area_joined"
     AREA_ZIP_FIELD = "area_zip"
+    TRACT_AREA = "area_tract"
 
     def __init__(self):
         self.COLUMNS_TO_KEEP = [
@@ -118,4 +119,15 @@ class GeoCorrAlternativesETL(ExtractTransformLoad):
             joined_gdf[self.AREA_JOINED_FIELD] / joined_gdf[self.AREA_ZIP_FIELD]
         )
 
+        # "Relevant" tract size, i.e., the sum of all overlapping area between
+        # the tract and zips; "sum" is a string as pandas deprecates builtins
+        joined_gdf[self.TRACT_AREA] = joined_gdf.groupby(
+            field_names.GEOID_TRACT_FIELD
+        )[self.AREA_JOINED_FIELD].transform("sum")
+
+        # Share of that tract total in each row's zip (sums to 1 per tract)
+        joined_gdf[field_names.PERCENT_OF_TRACT_IN_ZIP] = (
+            joined_gdf[self.AREA_JOINED_FIELD] / joined_gdf[self.TRACT_AREA]
+        )
+
         self.output_df = joined_gdf
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index c19bf5cb..a3add9ef 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -363,7 +363,11 @@ PERCENT_OF_TRIBAL_AREA_IN_TRACT = (
 
 # GeoCorr alternatives variables
 ZIP_CODE = "Zip code tabulation area (ZCTA)"
-PERCENT_OF_ZIP_CODE_IN_TRACT = "Percent of zip code tabulation area (ZCTA) in tract"
+PERCENT_OF_ZIP_CODE_IN_TRACT = (
+    "Percent of zip code tabulation area (ZCTA) in tract"
+)
+PERCENT_OF_TRACT_IN_ZIP = "Percent of tract in zip code tabulation area (ZCTA)"
+
 
 #####
 # Names for individual factors being exceeded