j40-cejst-2/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py

import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
from data_pipeline.etl.sources.geo_utils import get_tribal_geojson
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)


class TribalOverlapETL(ExtractTransformLoad):
    """Calculates the overlap between Census tracts and Tribal boundaries."""

    # Metadata for the baseclass
    NAME = "tribal_overlap"
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

    PUERTO_RICO_EXPECTED_IN_DATA = False
    ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True
    EXPECTED_MISSING_STATES = [
        # 15 is Hawaii, which has Hawaiian Home Lands, but they are not included in
        # this dataset.
        "15",
        # The following states do not have any federally recognized Tribes in this
        # dataset.
        "10",
        "11",
        "13",
        "17",
        "18",
        "21",
        "24",
        "33",
        "34",
        "39",
        "50",
        "51",
        "54",
    ]

    # A Tribal area that requires some special processing.
    ANNETTE_ISLAND_TRIBAL_NAME = "Annette Island LAR"

    CRS_INTEGER = 3857

    # Define these for easy code completion
    def __init__(self):
        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
            field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT_DISPLAY_STRING,
        ]

        self.output_df: pd.DataFrame
        self.census_tract_gdf: gpd.GeoDataFrame
        self.tribal_gdf: gpd.GeoDataFrame

    @staticmethod
    def _create_string_from_list(series: pd.Series) -> str:
        """Helper method that creates a sorted string list (for tribal names)."""
        str_list = series.tolist()
        str_list = sorted(str_list)
        return ", ".join(str_list)

    @staticmethod
    def _adjust_percentage_to_string(percentage_float: float) -> str:
        """Helper method that converts numeric floats to strings based on what-to-show rules.

        What are these rules?
        0. If None, return none
        1. If the percentage is below 1%, produce 'less than 1%'
        2. If the percentage is above 99.95%, produce '100%'
        3. If the percentage is X.00 when rounded to two sig digits, display the integer of the percent
        4. If the percentage has unique significant digits, report two digits
        """
        # Rule 0
        if not percentage_float:
            # I believe we need to do this because JS will do weird things with a mix-type column?
            return "No tribal areas"
        # Rule 1
        if percentage_float < 0.01:
            return "less than 1%"
        # Rule 2
        if percentage_float > 0.9995:
            return "100%"

        rounded_percentage_str = str(round(percentage_float, 4) * 100)
        first_digits, last_digits = rounded_percentage_str.split(".")

        # Rule 3 (this is a shorthand because round(4) will truncate repeated 0s)
        if last_digits[-1] == "0":
            return first_digits + "%"

        # Rule 4
        if last_digits != "00":
            return rounded_percentage_str + "%"

        # There is something missing!
        raise Exception("Yikes! The string conversion here failed!")

    def extract(self) -> None:
        self.census_tract_gdf = get_tract_geojson()
        self.tribal_gdf = get_tribal_geojson()

    def transform(self) -> None:
        logger.info("Starting tribal overlap transforms.")

        # First, calculate whether tracts include any areas from the Tribal areas,
        # for both the points in AK and the polygons in the continental US (CONUS).
        tribal_overlap_with_tracts = add_tracts_for_geometries(
            df=self.tribal_gdf, tract_data=self.census_tract_gdf
        )

        tribal_overlap_with_tracts = tribal_overlap_with_tracts.groupby(
            [self.GEOID_TRACT_FIELD_NAME]
        ).agg(
            {
                field_names.TRIBAL_ID: "count",
                field_names.TRIBAL_LAND_AREA_NAME: self._create_string_from_list,
            }
        )

        tribal_overlap_with_tracts = tribal_overlap_with_tracts.reset_index()

        tribal_overlap_with_tracts = tribal_overlap_with_tracts.rename(
            columns={
                field_names.TRIBAL_ID: field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
                field_names.TRIBAL_LAND_AREA_NAME: field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
            }
        )

        # Second, calculate percentage overlap.
        # Drop the points from the Tribal data (because these cannot be joined to a
        # (Multi)Polygon tract data frame)
        tribal_gdf_without_points = self.tribal_gdf[
            self.tribal_gdf.geom_type.isin(["Polygon", "MultiPolygon"])
        ]

        # Switch from geographic to projected CRSes
        # because logically that's right
        self.census_tract_gdf = self.census_tract_gdf.to_crs(
            crs=self.CRS_INTEGER
        )
        tribal_gdf_without_points = tribal_gdf_without_points.to_crs(
            crs=self.CRS_INTEGER
        )

        # Create a measure for the entire census tract area
        self.census_tract_gdf["area_tract"] = self.census_tract_gdf.area

        # Performing overlay funcion
        # We have a mix of polygons and multipolygons, and we just want the overlaps
        # without caring a ton about the specific types, so we ignore geom type.
        # Realistically, this changes almost nothing in the calculation; True and False
        # are the same within 9 digits of precision
        gdf_joined = gpd.overlay(
            self.census_tract_gdf,
            tribal_gdf_without_points,
            how="intersection",
            keep_geom_type=False,
        )

        # Calculating the areas of the newly-created overlapping geometries
        gdf_joined["area_joined"] = gdf_joined.area

        # Calculating the areas of the newly-created geometries in relation
        # to the original tract geometries
        gdf_joined[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT] = (
            gdf_joined["area_joined"] / gdf_joined["area_tract"]
        )

        # Aggregate the results
        percentage_results = gdf_joined.groupby(
            [self.GEOID_TRACT_FIELD_NAME]
        ).agg({field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "sum"})

        percentage_results = percentage_results.reset_index()

        # Merge the two results.
        merged_output_df = tribal_overlap_with_tracts.merge(
            right=percentage_results,
            how="outer",
            on=self.GEOID_TRACT_FIELD_NAME,
        )

        # Finally, fix one unique error.
        # There is one unique Tribal area (self.ANNETTE_ISLAND_TRIBAL_NAME) that is a polygon in
        # Alaska. All other Tribal areas in Alaska are points.
        # For tracts that *only* contain that Tribal area, leave percentage as is.
        # For tracts that include that Tribal area AND Alaska Native villages,
        # null the percentage, because we cannot calculate the percent of the tract
        # this is within Tribal areas.

        # Create state FIPS codes.
        merged_output_df_state_fips_code = merged_output_df[
            self.GEOID_TRACT_FIELD_NAME
        ].str[0:2]

        # Start by testing for Annette Island exception, to make sure data is as
        # expected
        alaskan_non_annette_matches = (
            # Data from Alaska
            (merged_output_df_state_fips_code == "02")
            # Where the Tribal areas do *not* include Annette
            & (
                ~merged_output_df[
                    field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT
                ].str.contains(self.ANNETTE_ISLAND_TRIBAL_NAME)
            )
            # But somehow percentage is greater than zero.
            & (
                merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT]
                > 0
            )
        )

        # There should be none of these matches.
        if sum(alaskan_non_annette_matches) > 0:
            raise ValueError(
                "Data has changed. More than one Alaskan Tribal Area has polygon "
                "boundaries. You'll need to refactor this ETL. \n"
                f"Data:\n{merged_output_df[alaskan_non_annette_matches]}"
            )

        # Now, fix the exception that is already known.
        merged_output_df[
            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
        ] = np.where(
            # For tracts inside Alaska
            (merged_output_df_state_fips_code == "02")
            # That are not only represented by Annette Island
            & (
                merged_output_df[field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT]
                != self.ANNETTE_ISLAND_TRIBAL_NAME
            ),
            # Set the value to `None` for tracts with more than just Annette.
            None,
            # Otherwise, set the value to what it was.
            merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT],
        )

        # The very final thing we want to do is produce a string for the front end to show
        # We do this here so that all of the logic is included
        merged_output_df[
            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT_DISPLAY_STRING
        ] = merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT].apply(
            self._adjust_percentage_to_string
        )

        self.output_df = merged_output_df
Issue 1900: Tribal overlap with Census tracts (#1903) * working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want to overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-09-20 14:53:12 -04:00			`import geopandas as gpd`
			`import numpy as np`
			`import pandas as pd`
Issue 105: Configure and run `black` and other pre-commit hooks (clean branch) (#1962) * Configure and run `black` and other pre-commit hooks Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-10-04 18:08:47 -04:00			`from data_pipeline.etl.base import ExtractTransformLoad`
			`from data_pipeline.etl.base import ValidGeoLevel`
			`from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries`
			`from data_pipeline.etl.sources.geo_utils import get_tract_geojson`
			`from data_pipeline.etl.sources.geo_utils import get_tribal_geojson`
Issue 1900: Tribal overlap with Census tracts (#1903) * working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want to overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-09-20 14:53:12 -04:00			`from data_pipeline.score import field_names`
			`from data_pipeline.utils import get_module_logger`

			`logger = get_module_logger(__name__)`


			`class TribalOverlapETL(ExtractTransformLoad):`
			`"""Calculates the overlap between Census tracts and Tribal boundaries."""`

			`# Metadata for the baseclass`
			`NAME = "tribal_overlap"`
			`GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT`

			`PUERTO_RICO_EXPECTED_IN_DATA = False`
			`ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True`
			`EXPECTED_MISSING_STATES = [`
			`# 15 is Hawaii, which has Hawaiian Home Lands, but they are not included in`
			`# this dataset.`
			`"15",`
			`# The following states do not have any federally recognized Tribes in this`
			`# dataset.`
			`"10",`
			`"11",`
			`"13",`
			`"17",`
			`"18",`
			`"21",`
			`"24",`
			`"33",`
			`"34",`
			`"39",`
			`"50",`
			`"51",`
			`"54",`
			`]`

			`# A Tribal area that requires some special processing.`
			`ANNETTE_ISLAND_TRIBAL_NAME = "Annette Island LAR"`

			`CRS_INTEGER = 3857`

			`# Define these for easy code completion`
			`def __init__(self):`
			`self.COLUMNS_TO_KEEP = [`
			`self.GEOID_TRACT_FIELD_NAME,`
			`field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,`
			`field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,`
			`field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,`
produce a string for the front end to ingest (#1963) * wip * i believe this works -- let's see the pipeline * updated fixtures 2022-10-04 10:05:00 -04:00			`field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT_DISPLAY_STRING,`
Issue 1900: Tribal overlap with Census tracts (#1903) * working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want to overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-09-20 14:53:12 -04:00			`]`

			`self.output_df: pd.DataFrame`
			`self.census_tract_gdf: gpd.GeoDataFrame`
			`self.tribal_gdf: gpd.GeoDataFrame`

			`@staticmethod`
			`def _create_string_from_list(series: pd.Series) -> str:`
			`"""Helper method that creates a sorted string list (for tribal names)."""`
			`str_list = series.tolist()`
			`str_list = sorted(str_list)`
			`return ", ".join(str_list)`

produce a string for the front end to ingest (#1963) * wip * i believe this works -- let's see the pipeline * updated fixtures 2022-10-04 10:05:00 -04:00			`@staticmethod`
			`def _adjust_percentage_to_string(percentage_float: float) -> str:`
			`"""Helper method that converts numeric floats to strings based on what-to-show rules.`

			`What are these rules?`
			`0. If None, return none`
			`1. If the percentage is below 1%, produce 'less than 1%'`
			`2. If the percentage is above 99.95%, produce '100%'`
			`3. If the percentage is X.00 when rounded to two sig digits, display the integer of the percent`
			`4. If the percentage has unique significant digits, report two digits`
			`"""`
			`# Rule 0`
			`if not percentage_float:`
			`# I believe we need to do this because JS will do weird things with a mix-type column?`
			`return "No tribal areas"`
			`# Rule 1`
			`if percentage_float < 0.01:`
			`return "less than 1%"`
			`# Rule 2`
			`if percentage_float > 0.9995:`
			`return "100%"`

			`rounded_percentage_str = str(round(percentage_float, 4) * 100)`
			`first_digits, last_digits = rounded_percentage_str.split(".")`

			`# Rule 3 (this is a shorthand because round(4) will truncate repeated 0s)`
			`if last_digits[-1] == "0":`
			`return first_digits + "%"`

			`# Rule 4`
			`if last_digits != "00":`
			`return rounded_percentage_str + "%"`

			`# There is something missing!`
			`raise Exception("Yikes! The string conversion here failed!")`

Issue 1900: Tribal overlap with Census tracts (#1903) * working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want to overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-09-20 14:53:12 -04:00			`def extract(self) -> None:`
			`self.census_tract_gdf = get_tract_geojson()`
			`self.tribal_gdf = get_tribal_geojson()`

			`def transform(self) -> None:`
			`logger.info("Starting tribal overlap transforms.")`

			`# First, calculate whether tracts include any areas from the Tribal areas,`
			`# for both the points in AK and the polygons in the continental US (CONUS).`
			`tribal_overlap_with_tracts = add_tracts_for_geometries(`
			`df=self.tribal_gdf, tract_data=self.census_tract_gdf`
			`)`

			`tribal_overlap_with_tracts = tribal_overlap_with_tracts.groupby(`
			`[self.GEOID_TRACT_FIELD_NAME]`
			`).agg(`
			`{`
			`field_names.TRIBAL_ID: "count",`
			`field_names.TRIBAL_LAND_AREA_NAME: self._create_string_from_list,`
			`}`
			`)`

			`tribal_overlap_with_tracts = tribal_overlap_with_tracts.reset_index()`

			`tribal_overlap_with_tracts = tribal_overlap_with_tracts.rename(`
			`columns={`
			`field_names.TRIBAL_ID: field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,`
			`field_names.TRIBAL_LAND_AREA_NAME: field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,`
			`}`
			`)`

			`# Second, calculate percentage overlap.`
			`# Drop the points from the Tribal data (because these cannot be joined to a`
			`# (Multi)Polygon tract data frame)`
			`tribal_gdf_without_points = self.tribal_gdf[`
			`self.tribal_gdf.geom_type.isin(["Polygon", "MultiPolygon"])`
			`]`

			`# Switch from geographic to projected CRSes`
			`# because logically that's right`
Issue 1910: Do not impute income for 0 population tracts (#1918) * should be working, has unnecessary loggers * removing loggers and cleaning up * updating ejscreen tests * adding tests and responding to PR feedback * fixing broken smoke test * delete smoketest docs 2022-09-26 11:00:21 -04:00			`self.census_tract_gdf = self.census_tract_gdf.to_crs(`
			`crs=self.CRS_INTEGER`
			`)`
			`tribal_gdf_without_points = tribal_gdf_without_points.to_crs(`
			`crs=self.CRS_INTEGER`
			`)`
Issue 1900: Tribal overlap with Census tracts (#1903) * working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want to overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-09-20 14:53:12 -04:00
			`# Create a measure for the entire census tract area`
			`self.census_tract_gdf["area_tract"] = self.census_tract_gdf.area`

			`# Performing overlay funcion`
			`# We have a mix of polygons and multipolygons, and we just want the overlaps`
			`# without caring a ton about the specific types, so we ignore geom type.`
			`# Realistically, this changes almost nothing in the calculation; True and False`
			`# are the same within 9 digits of precision`
			`gdf_joined = gpd.overlay(`
			`self.census_tract_gdf,`
			`tribal_gdf_without_points,`
			`how="intersection",`
			`keep_geom_type=False,`
			`)`

			`# Calculating the areas of the newly-created overlapping geometries`
			`gdf_joined["area_joined"] = gdf_joined.area`

			`# Calculating the areas of the newly-created geometries in relation`
			`# to the original tract geometries`
			`gdf_joined[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT] = (`
			`gdf_joined["area_joined"] / gdf_joined["area_tract"]`
			`)`

			`# Aggregate the results`
			`percentage_results = gdf_joined.groupby(`
			`[self.GEOID_TRACT_FIELD_NAME]`
			`).agg({field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "sum"})`

			`percentage_results = percentage_results.reset_index()`

			`# Merge the two results.`
			`merged_output_df = tribal_overlap_with_tracts.merge(`
			`right=percentage_results,`
			`how="outer",`
			`on=self.GEOID_TRACT_FIELD_NAME,`
			`)`

			`# Finally, fix one unique error.`
			`# There is one unique Tribal area (self.ANNETTE_ISLAND_TRIBAL_NAME) that is a polygon in`
			`# Alaska. All other Tribal areas in Alaska are points.`
			`# For tracts that only contain that Tribal area, leave percentage as is.`
			`# For tracts that include that Tribal area AND Alaska Native villages,`
			`# null the percentage, because we cannot calculate the percent of the tract`
			`# this is within Tribal areas.`

			`# Create state FIPS codes.`
			`merged_output_df_state_fips_code = merged_output_df[`
			`self.GEOID_TRACT_FIELD_NAME`
			`].str[0:2]`

			`# Start by testing for Annette Island exception, to make sure data is as`
			`# expected`
			`alaskan_non_annette_matches = (`
			`# Data from Alaska`
			`(merged_output_df_state_fips_code == "02")`
			`# Where the Tribal areas do not include Annette`
			`& (`
			`~merged_output_df[`
			`field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT`
			`].str.contains(self.ANNETTE_ISLAND_TRIBAL_NAME)`
			`)`
			`# But somehow percentage is greater than zero.`
			`& (`
			`merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT]`
			`> 0`
			`)`
			`)`

			`# There should be none of these matches.`
			`if sum(alaskan_non_annette_matches) > 0:`
			`raise ValueError(`
			`"Data has changed. More than one Alaskan Tribal Area has polygon "`
			`"boundaries. You'll need to refactor this ETL. \n"`
			`f"Data:\n{merged_output_df[alaskan_non_annette_matches]}"`
			`)`

			`# Now, fix the exception that is already known.`
			`merged_output_df[`
			`field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT`
			`] = np.where(`
			`# For tracts inside Alaska`
			`(merged_output_df_state_fips_code == "02")`
			`# That are not only represented by Annette Island`
			`& (`
			`merged_output_df[field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT]`
			`!= self.ANNETTE_ISLAND_TRIBAL_NAME`
			`),`
			# Set the value to `None` for tracts with more than just Annette.
			`None,`
			`# Otherwise, set the value to what it was.`
			`merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT],`
			`)`

produce a string for the front end to ingest (#1963) * wip * i believe this works -- let's see the pipeline * updated fixtures 2022-10-04 10:05:00 -04:00			`# The very final thing we want to do is produce a string for the front end to show`
			`# We do this here so that all of the logic is included`
			`merged_output_df[`
			`field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT_DISPLAY_STRING`
			`] = merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT].apply(`
			`self._adjust_percentage_to_string`
			`)`

Issue 1900: Tribal overlap with Census tracts (#1903) * working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want to overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov> 2022-09-20 14:53:12 -04:00			`self.output_df = merged_output_df`