mirror of https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-25 07:20:18 -07:00
Issue 1900: Tribal overlap with Census tracts (#1903)
* working notebook
* updating notebook
* wip
* fixing broken tests
* adding tribal overlap files
* WIP
* WIP
* WIP, calculated count and names
* working
* partial cleanup
* partial cleanup
* updating field names
* fixing bug
* removing pyogrio
* removing unused imports
* updating test fixtures to be more realistic
* cleaning up notebook
* fixing black
* fixing flake8 errors
* adding tox instructions
* updating etl_score
* suppressing warning
* Use projected CRSes, ignore geom types (#1900)

  I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want the overlaps, regardless of how the actual geometries are encoded between the frames, so in this case we can ignore the geom types and feel okay about it.

  I also moved to projected CRSes, since we are actually trying to do area calculations, so we should. Again, the change is small in magnitude but logically more sound.

* Readd CDC dataset config (#1900)
* adding comments to fips code
* delete unnecessary loggers

Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov>
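To make the geometry note in the message concrete, here is a minimal geopandas sketch of the pattern it describes: project both frames to a projected CRS before doing any area math, then intersect with keep_geom_type=False so the Polygon/MultiPolygon mix is tolerated. The toy polygons and the percent column name below are illustrative assumptions, not the pipeline's real tract or Tribal data.

import geopandas as gpd
from shapely.geometry import Polygon

# Hypothetical stand-ins for the tract and Tribal frames (EPSG:4326 inputs).
tracts = gpd.GeoDataFrame(
    {"GEOID10_TRACT": ["01001020100"]},
    geometry=[Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])],
    crs="EPSG:4326",
).to_crs(3857)  # projected CRS, so .area is meaningful
tribal = gpd.GeoDataFrame(
    {"landAreaName": ["Example LAR"]},
    geometry=[Polygon([(0.5, 0), (0.5, 1), (2, 1), (2, 0)])],
    crs="EPSG:4326",
).to_crs(3857)

# Area of each tract in the projected CRS.
tracts["area_tract"] = tracts.area

# keep_geom_type=False: we only care about the overlap itself, not whether
# the intersection comes back as a Polygon or a MultiPolygon.
joined = gpd.overlay(tracts, tribal, how="intersection", keep_geom_type=False)
joined["percent_of_tract_in_tribal_area"] = joined.area / joined["area_tract"]
print(joined[["GEOID10_TRACT", "landAreaName", "percent_of_tract_in_tribal_area"]])

Run on these toy shapes, the printed percentage is 0.5: half of the unit square overlaps the second polygon, and the ratio survives the projection because Web Mercator maps longitude linearly.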
This commit is contained in:
parent
876655d2b2
commit
aca226165c
19 changed files with 1921 additions and 36 deletions
@@ -186,6 +186,12 @@ DATASET_LIST = [
         "class_name": "AbandonedMineETL",
         "is_memory_intensive": True,
     },
+    {
+        "name": "tribal_overlap",
+        "module_dir": "tribal_overlap",
+        "class_name": "TribalOverlapETL",
+        "is_memory_intensive": True,
+    },
 ]

 CENSUS_INFO = {
@@ -106,6 +106,8 @@ def etl_runner(dataset_to_run: str = None) -> None:
             # Otherwise, the exceptions are silently ignored.
             fut.result()

+    # Note: these high-memory datasets also usually require the Census geojson to be
+    # generated, and one of them requires the Tribal geojson to be generated.
     if high_memory_datasets:
         logger.info("Running high-memory jobs")
         for dataset in high_memory_datasets:
@@ -290,6 +290,32 @@ datasets:
         include_in_tiles: true
         include_in_downloadable_files: true
         create_percentile: true
+  - long_name: "Overlap between Census tract boundaries and Tribal area boundaries."
+    short_name: "tribal_overlap"
+    module_name: "tribal_overlap"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "tribal_count"
+        df_field_name: "COUNT_OF_TRIBAL_AREAS_IN_TRACT"
+        long_name: "Number of Tribal areas within Census tract"
+        field_type: int64
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "tribal_percent"
+        df_field_name: "PERCENT_OF_TRIBAL_AREA_IN_TRACT"
+        long_name: "Percent of the Census tract that is within Tribal areas"
+        field_type: float
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: false
+        number_of_decimals_in_output: 6
+      - short_name: "tribal_names"
+        df_field_name: "NAMES_OF_TRIBAL_AREAS_IN_TRACT"
+        long_name: "Names of Tribal areas within Census tract"
+        field_type: string
+        include_in_tiles: true
+        include_in_downloadable_files: true
   - long_name: "CDC Life Expeectancy"
     short_name: "cdc_life_expectancy"
     module_name: "cdc_life_expectancy"
@@ -302,5 +328,4 @@ datasets:
         include_in_tiles: false
         include_in_downloadable_files: true
         create_percentile: false
-        create_reverse_percentile: true
-
+        create_reverse_percentile: true
@@ -15,6 +15,7 @@ from data_pipeline.etl.sources.fsf_flood_risk.etl import (
     FloodRiskETL,
 )
 from data_pipeline.etl.sources.eamlis.etl import AbandonedMineETL
+from data_pipeline.etl.sources.tribal_overlap.etl import TribalOverlapETL
 from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
 from data_pipeline.etl.sources.nlcd_nature_deprived.etl import NatureDeprivedETL
 from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL
@@ -52,6 +53,7 @@ class ScoreETL(ExtractTransformLoad):
         self.nature_deprived_df: pd.DataFrame
         self.eamlis_df: pd.DataFrame
         self.fuds_df: pd.DataFrame
+        self.tribal_overlap_df: pd.DataFrame

     def extract(self) -> None:
         logger.info("Loading data sets from disk.")
@@ -148,6 +150,9 @@ class ScoreETL(ExtractTransformLoad):
         # Load FUDS dataset
         self.fuds_df = USArmyFUDS.get_data_frame()

+        # Load Tribal overlap dataset
+        self.tribal_overlap_df = TribalOverlapETL.get_data_frame()
+
         # Load GeoCorr Urban Rural Map
         geocorr_urban_rural_csv = (
             constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@@ -359,6 +364,7 @@ class ScoreETL(ExtractTransformLoad):
             self.nature_deprived_df,
             self.eamlis_df,
             self.fuds_df,
+            self.tribal_overlap_df
         ]

         # Sanity check each data frame before merging.
@@ -469,12 +475,15 @@ class ScoreETL(ExtractTransformLoad):
             field_names.PERCENT_AGE_UNDER_10,
             field_names.PERCENT_AGE_10_TO_64,
             field_names.PERCENT_AGE_OVER_64,
+            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
+            field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
         ]

         non_numeric_columns = [
             self.GEOID_TRACT_FIELD_NAME,
             field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
+            field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
         ]

         boolean_columns = [
@@ -229,3 +229,25 @@ def test_compare_to_list_of_expected_state_fips_codes():
         continental_us_expected=False,
         alaska_and_hawaii_expected=False,
     )
+
+    # Missing Hawaii but not Alaska
+    fips_codes_test_5 = [x for x in fips_codes_test_1 if x not in ["15"]]
+
+    # Should raise error because both Hawaii and Alaska are expected
+    with pytest.raises(ValueError) as exception_info:
+        compare_to_list_of_expected_state_fips_codes(
+            actual_state_fips_codes=fips_codes_test_5,
+            alaska_and_hawaii_expected=True,
+        )
+    partial_expected_error_message = (
+        "FIPS state codes expected that are not present in the data:\n"
+        "['15']\n"
+    )
+    assert partial_expected_error_message in str(exception_info.value)
+
+    # Should work as expected
+    compare_to_list_of_expected_state_fips_codes(
+        actual_state_fips_codes=fips_codes_test_5,
+        alaska_and_hawaii_expected=True,
+        additional_fips_codes_not_expected=["15"],
+    )
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Optional
 from functools import lru_cache
 import geopandas as gpd
+from data_pipeline.etl.sources.tribal.etl import TribalETL
 from data_pipeline.utils import get_module_logger
 from .census.etl import CensusETL

@@ -18,21 +19,44 @@ def get_tract_geojson(
     GEOJSON_PATH = _tract_data_path
     if GEOJSON_PATH is None:
         GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH
-        if not GEOJSON_PATH.exists():
-            logger.debug("Census data has not been computed, running")
-            census_etl = CensusETL()
-            census_etl.extract()
-            census_etl.transform()
-            census_etl.load()
-        else:
-            logger.debug("Loading existing tract geojson")
-    tract_data = gpd.read_file(GEOJSON_PATH, include_fields=["GEOID10"])
-    tract_data.rename(columns={"GEOID10": "GEOID10_TRACT"}, inplace=True)
+    if not GEOJSON_PATH.exists():
+        logger.debug("Census data has not been computed, running")
+        census_etl = CensusETL()
+        census_etl.extract()
+        census_etl.transform()
+        census_etl.load()
+    tract_data = gpd.read_file(
+        GEOJSON_PATH,
+        include_fields=["GEOID10"],
+    )
+    tract_data = tract_data.rename(
+        columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
+    )
     return tract_data


+@lru_cache()
+def get_tribal_geojson(
+    _tribal_data_path: Optional[Path] = None,
+) -> gpd.GeoDataFrame:
+    logger.info("Loading Tribal geometry data from Tribal ETL")
+    GEOJSON_PATH = _tribal_data_path
+    if GEOJSON_PATH is None:
+        GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH
+    if not GEOJSON_PATH.exists():
+        logger.debug("Tribal data has not been computed, running")
+        tribal_etl = TribalETL()
+        tribal_etl.extract()
+        tribal_etl.transform()
+        tribal_etl.load()
+    tribal_data = gpd.read_file(
+        GEOJSON_PATH,
+    )
+    return tribal_data
+
+
 def add_tracts_for_geometries(
-    df: gpd.GeoDataFrame, _tract_data_path: Optional[Path] = None
+    df: gpd.GeoDataFrame, tract_data: Optional[gpd.GeoDataFrame] = None
 ) -> gpd.GeoDataFrame:
     """Adds tract-geoids to dataframe df that contains spatial geometries

@@ -40,8 +64,8 @@ def add_tracts_for_geometries(

     Args:
         df (GeoDataFrame): a geopandas GeoDataFrame with a point geometry column
-        _tract_data_path (Path): an override to directly pass a GEOJSON file of
-            tracts->Geometries, to simplify testing.
+        tract_data (GeoDataFrame): optional override to directly pass a
+            geodataframe of the tract boundaries. Also helps simplify testing.

     Returns:
         GeoDataFrame: the above dataframe, with an additional GEOID10_TRACT column that
@@ -49,7 +73,12 @@ def add_tracts_for_geometries(
             spatial analysis
     """
     logger.debug("Appending tract data to dataframe")
-    tract_data = get_tract_geojson(_tract_data_path)
+
+    if tract_data is None:
+        tract_data = get_tract_geojson()
+    else:
+        logger.debug("Using existing tract data.")
+
     assert (
         tract_data.crs == df.crs
     ), f"Dataframe must be projected to {tract_data.crs}"
@@ -3,6 +3,7 @@ import geopandas as gpd
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger, unzip_file_from_url

 logger = get_module_logger(__name__)
@@ -59,7 +60,10 @@ class TribalETL(ExtractTransformLoad):
         )

         bia_national_lar_df.rename(
-            columns={"LARID": "tribalId", "LARName": "landAreaName"},
+            columns={
+                "LARID": field_names.TRIBAL_ID,
+                "LARName": field_names.TRIBAL_LAND_AREA_NAME,
+            },
             inplace=True,
         )

@@ -87,7 +91,10 @@ class TribalETL(ExtractTransformLoad):
         )

         bia_aian_supplemental_df.rename(
-            columns={"OBJECTID": "tribalId", "Land_Area_": "landAreaName"},
+            columns={
+                "OBJECTID": field_names.TRIBAL_ID,
+                "Land_Area_": field_names.TRIBAL_LAND_AREA_NAME,
+            },
             inplace=True,
         )

@@ -113,7 +120,10 @@ class TribalETL(ExtractTransformLoad):
         )

         bia_tsa_df.rename(
-            columns={"TSAID": "tribalId", "LARName": "landAreaName"},
+            columns={
+                "TSAID": field_names.TRIBAL_ID,
+                "LARName": field_names.TRIBAL_LAND_AREA_NAME,
+            },
             inplace=True,
         )

@@ -136,8 +146,8 @@ class TribalETL(ExtractTransformLoad):

         alaska_native_villages_df.rename(
             columns={
-                "GlobalID": "tribalId",
-                "TRIBALOFFICENAME": "landAreaName",
+                "GlobalID": field_names.TRIBAL_ID,
+                "TRIBALOFFICENAME": field_names.TRIBAL_LAND_AREA_NAME,
             },
             inplace=True,
         )
@@ -0,0 +1,208 @@
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
+from data_pipeline.etl.sources.geo_utils import (
+    add_tracts_for_geometries,
+    get_tribal_geojson,
+    get_tract_geojson,
+)
+from data_pipeline.score import field_names
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class TribalOverlapETL(ExtractTransformLoad):
+    """Calculates the overlap between Census tracts and Tribal boundaries."""
+
+    # Metadata for the baseclass
+    NAME = "tribal_overlap"
+    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+
+    PUERTO_RICO_EXPECTED_IN_DATA = False
+    ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True
+    EXPECTED_MISSING_STATES = [
+        # 15 is Hawaii, which has Hawaiian Home Lands, but they are not included in
+        # this dataset.
+        "15",
+        # The following states do not have any federally recognized Tribes in this
+        # dataset.
+        "10",
+        "11",
+        "13",
+        "17",
+        "18",
+        "21",
+        "24",
+        "33",
+        "34",
+        "39",
+        "50",
+        "51",
+        "54",
+    ]
+
+    # A Tribal area that requires some special processing.
+    ANNETTE_ISLAND_TRIBAL_NAME = "Annette Island LAR"
+
+    CRS_INTEGER = 3857
+
+    # Define these for easy code completion
+    def __init__(self):
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
+            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
+            field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
+        ]
+
+        self.output_df: pd.DataFrame
+        self.census_tract_gdf: gpd.GeoDataFrame
+        self.tribal_gdf: gpd.GeoDataFrame
+
+    @staticmethod
+    def _create_string_from_list(series: pd.Series) -> str:
+        """Helper method that creates a sorted string list (for tribal names)."""
+        str_list = series.tolist()
+        str_list = sorted(str_list)
+        return ", ".join(str_list)
+
+    def extract(self) -> None:
+        self.census_tract_gdf = get_tract_geojson()
+        self.tribal_gdf = get_tribal_geojson()
+
+    def transform(self) -> None:
+        logger.info("Starting tribal overlap transforms.")
+
+        # First, calculate whether tracts include any areas from the Tribal areas,
+        # for both the points in AK and the polygons in the continental US (CONUS).
+        tribal_overlap_with_tracts = add_tracts_for_geometries(
+            df=self.tribal_gdf, tract_data=self.census_tract_gdf
+        )
+
+        tribal_overlap_with_tracts = tribal_overlap_with_tracts.groupby(
+            [self.GEOID_TRACT_FIELD_NAME]
+        ).agg(
+            {
+                field_names.TRIBAL_ID: "count",
+                field_names.TRIBAL_LAND_AREA_NAME: self._create_string_from_list,
+            }
+        )
+
+        tribal_overlap_with_tracts = tribal_overlap_with_tracts.reset_index()
+
+        tribal_overlap_with_tracts = tribal_overlap_with_tracts.rename(
+            columns={
+                field_names.TRIBAL_ID: field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
+                field_names.TRIBAL_LAND_AREA_NAME: field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
+            }
+        )
+
+        # Second, calculate percentage overlap.
+        # Drop the points from the Tribal data (because these cannot be joined to a
+        # (Multi)Polygon tract data frame)
+        tribal_gdf_without_points = self.tribal_gdf[
+            self.tribal_gdf.geom_type.isin(["Polygon", "MultiPolygon"])
+        ]
+
+        # Switch from geographic to projected CRSes
+        # because logically that's right
+        self.census_tract_gdf = self.census_tract_gdf.to_crs(crs=self.CRS_INTEGER)
+        tribal_gdf_without_points = tribal_gdf_without_points.to_crs(crs=self.CRS_INTEGER)
+
+        # Create a measure for the entire census tract area
+        self.census_tract_gdf["area_tract"] = self.census_tract_gdf.area
+
+        # Performing overlay funcion
+        # We have a mix of polygons and multipolygons, and we just want the overlaps
+        # without caring a ton about the specific types, so we ignore geom type.
+        # Realistically, this changes almost nothing in the calculation; True and False
+        # are the same within 9 digits of precision
+        gdf_joined = gpd.overlay(
+            self.census_tract_gdf,
+            tribal_gdf_without_points,
+            how="intersection",
+            keep_geom_type=False,
+        )
+
+        # Calculating the areas of the newly-created overlapping geometries
+        gdf_joined["area_joined"] = gdf_joined.area
+
+        # Calculating the areas of the newly-created geometries in relation
+        # to the original tract geometries
+        gdf_joined[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT] = (
+            gdf_joined["area_joined"] / gdf_joined["area_tract"]
+        )
+
+        # Aggregate the results
+        percentage_results = gdf_joined.groupby(
+            [self.GEOID_TRACT_FIELD_NAME]
+        ).agg({field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "sum"})
+
+        percentage_results = percentage_results.reset_index()
+
+        # Merge the two results.
+        merged_output_df = tribal_overlap_with_tracts.merge(
+            right=percentage_results,
+            how="outer",
+            on=self.GEOID_TRACT_FIELD_NAME,
+        )
+
+        # Finally, fix one unique error.
+        # There is one unique Tribal area (self.ANNETTE_ISLAND_TRIBAL_NAME) that is a polygon in
+        # Alaska. All other Tribal areas in Alaska are points.
+        # For tracts that *only* contain that Tribal area, leave percentage as is.
+        # For tracts that include that Tribal area AND Alaska Native villages,
+        # null the percentage, because we cannot calculate the percent of the tract
+        # this is within Tribal areas.
+
+        # Create state FIPS codes.
+        merged_output_df_state_fips_code = merged_output_df[
+            self.GEOID_TRACT_FIELD_NAME
+        ].str[0:2]
+
+        # Start by testing for Annette Island exception, to make sure data is as
+        # expected
+        alaskan_non_annette_matches = (
+            # Data from Alaska
+            (merged_output_df_state_fips_code == "02")
+            # Where the Tribal areas do *not* include Annette
+            & (
+                ~merged_output_df[
+                    field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT
+                ].str.contains(self.ANNETTE_ISLAND_TRIBAL_NAME)
+            )
+            # But somehow percentage is greater than zero.
+            & (
+                merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT]
+                > 0
+            )
+        )
+
+        # There should be none of these matches.
+        if sum(alaskan_non_annette_matches) > 0:
+            raise ValueError(
+                "Data has changed. More than one Alaskan Tribal Area has polygon "
+                "boundaries. You'll need to refactor this ETL. \n"
+                f"Data:\n{merged_output_df[alaskan_non_annette_matches]}"
+            )
+
+        # Now, fix the exception that is already known.
+        merged_output_df[
+            field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
+        ] = np.where(
+            # For tracts inside Alaska
+            (merged_output_df_state_fips_code == "02")
+            # That are not only represented by Annette Island
+            & (
+                merged_output_df[field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT]
+                != self.ANNETTE_ISLAND_TRIBAL_NAME
+            ),
+            # Set the value to `None` for tracts with more than just Annette.
+            None,
+            # Otherwise, set the value to what it was.
+            merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT],
+        )
+
+        self.output_df = merged_output_df
@@ -2435,7 +2435,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
@@ -0,0 +1,128 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e0b801f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import geopandas as gpd\n",
+    "import pyogrio\n",
+    "from data_pipeline.etl.sources.census.etl import CensusETL\n",
+    "\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c4cbab25",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Time taken to execute the function using pyogrio is 63.07696199417114\n"
+     ]
+    }
+   ],
+   "source": [
+    "begin = time.time()\n",
+    "census_tract_gdf = gpd.read_file(\n",
+    "    CensusETL.NATIONAL_TRACT_JSON_PATH,\n",
+    "    # Use `pyogrio` because it's vectorized and faster.\n",
+    "    engine=\"pyogrio\",\n",
+    ")\n",
+    "end = time.time()\n",
+    " \n",
+    "print(\"Time taken to execute the function using pyogrio is\", end-begin)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "372ab939",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Time taken to execute the function using include fields is 67.33577013015747\n"
+     ]
+    }
+   ],
+   "source": [
+    "begin2 = time.time()\n",
+    "census_tract_gdf = gpd.read_file(\n",
+    "    CensusETL.NATIONAL_TRACT_JSON_PATH,\n",
+    "    engine=\"fiona\",\n",
+    "    include_fields=[\"GEOID10\"]\n",
+    ")\n",
+    "end2 = time.time()\n",
+    " \n",
+    "print(\"Time taken to execute the function using include fields is\", end2-begin2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "32fb7d4b",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+      "\u001b[0;32m/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_21074/2531126572.py\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mbegin2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m census_tract_gdf = gpd.read_file(\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mCensusETL\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNATIONAL_TRACT_JSON_PATH\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"fiona\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0minclude_fields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"GEOID10\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/geopandas/io/file.py\u001b[0m in \u001b[0;36m_read_file\u001b[0;34m(filename, bbox, mask, rows, engine, **kwargs)\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"fiona\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 253\u001b[0;31m return _read_file_fiona(\n\u001b[0m\u001b[1;32m 254\u001b[0m \u001b[0mpath_or_bytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfrom_bytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbbox\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbbox\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 255\u001b[0m )\n",
+      "\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/geopandas/io/file.py\u001b[0m in \u001b[0;36m_read_file_fiona\u001b[0;34m(path_or_bytes, from_bytes, bbox, mask, rows, **kwargs)\u001b[0m\n\u001b[1;32m 338\u001b[0m )\n\u001b[1;32m 339\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m df = GeoDataFrame.from_features(\n\u001b[0m\u001b[1;32m 341\u001b[0m \u001b[0mf_filt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcrs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"geometry\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m )\n",
+      "\u001b[0;32m~/.virtualenvs/scoring2/lib/python3.9/site-packages/geopandas/geodataframe.py\u001b[0m in \u001b[0;36mfrom_features\u001b[0;34m(cls, features, crs, columns)\u001b[0m\n\u001b[1;32m 641\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 643\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mfeature\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfeatures_lst\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 644\u001b[0m \u001b[0;31m# load geometry\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeature\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"__geo_interface__\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32mfiona/ogrext.pyx\u001b[0m in \u001b[0;36mfiona.ogrext.Iterator.__next__\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;32mfiona/ogrext.pyx\u001b[0m in \u001b[0;36mfiona.ogrext.Iterator._next\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;32m~/.pyenv/versions/3.9.6/lib/python3.9/logging/__init__.py\u001b[0m in \u001b[0;36mdebug\u001b[0;34m(self, msg, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1422\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmanager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_clear_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1423\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1424\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1425\u001b[0m \"\"\"\n\u001b[1;32m 1426\u001b[0m \u001b[0mLog\u001b[0m \u001b[0;34m'msg % args'\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mseverity\u001b[0m \u001b[0;34m'DEBUG'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "begin2 = time.time()\n",
+    "census_tract_gdf = gpd.read_file(\n",
+    "    CensusETL.NATIONAL_TRACT_JSON_PATH,\n",
+    "    engine=\"fiona\",\n",
+    "    include_fields=[\"GEOID10\"],\n",
+    "    rows=slice(0, 76322, 100),\n",
+    ")\n",
+    "end2 = time.time()\n",
+    "\n",
+    "print(\"Time taken to execute the function using slice is\", end2 - begin2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
File diff suppressed because it is too large
@@ -349,6 +349,17 @@ ELIGIBLE_FUDS_BINARY_FIELD_NAME = (
 )
 ELIGIBLE_FUDS_FILLED_IN_FIELD_NAME = "Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?"

+# Tribal variables
+TRIBAL_ID = "tribalId"
+TRIBAL_LAND_AREA_NAME = "landAreaName"
+
+# Tribal overlap variables
+COUNT_OF_TRIBAL_AREAS_IN_TRACT = "Number of Tribal areas within Census tract"
+NAMES_OF_TRIBAL_AREAS_IN_TRACT = "Names of Tribal areas within Census tract"
+PERCENT_OF_TRIBAL_AREA_IN_TRACT = (
+    "Percent of the Census tract that is within Tribal areas"
+)
+
 #####
 # Names for individual factors being exceeded

@@ -12,7 +12,7 @@ logger = get_module_logger(__name__)


 class ScoreNarwhal(Score):
-    """Very similar to Score M, at present."""
+    """Score N, aka Narwhal."""

     LOW_INCOME_THRESHOLD: float = 0.65
     MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
File diff suppressed because one or more lines are too long
@@ -16,9 +16,11 @@ from data_pipeline.score import field_names

 @contextmanager
 def patch_calculate_tract_adjacency_scores():
-    tract_data = Path(__file__).parent / "data" / "us.geojson"
+    # Use fixtures for tract data.
+    tract_data_path = Path(__file__).parent / "data" / "us.geojson"
+
     get_tract_geojson_mock = partial(
-        get_tract_geojson, _tract_data_path=tract_data
+        get_tract_geojson, _tract_data_path=tract_data_path
     )
     with mock.patch(
         "data_pipeline.score.utils.get_tract_geojson",
@@ -23,6 +23,10 @@ def test_add_tracts_for_geometries():
         ),
         crs="epsg:4326",
     )
-    tract_data = Path(__file__).parent / "data" / "us.geojson"
-    enriched_df = add_tracts_for_geometries(df, _tract_data_path=tract_data)
+
+    # Use fixtures for tract data.
+    tract_data_path = Path(__file__).parent / "data" / "us.geojson"
+    tract_data = gpd.read_file(tract_data_path)
+
+    enriched_df = add_tracts_for_geometries(df, tract_data=tract_data)
     assert (df["expected_geoid"] == enriched_df["GEOID10_TRACT"]).all()