Add FUDS ETL (#1817)

* Add spatial join method (#1871)

Since we'll need to figure out the tracts for a large number of points
in future tickets, add a utility to handle grabbing the tract geometries
and adding tract data to a point dataset.

* Add FUDS, also jupyter lab (#1871)

* Add YAML configs for FUDS (#1871)

* Allow input geoid to be optional (#1871)

* Add FUDS ETL, tests, test-data notebook (#1871)

This adds the ETL class for Formerly Used Defense Sites (FUDS). This is
different from most other ETLs since these FUDS are not provided by
tract, but instead by geographic point, so we need to assign FUDS to
tracts and then do calculations from there.

* Floats -> Ints, as I intended (#1871)

* Floats -> Ints, as I intended (#1871)

* Formatting fixes (#1871)

* Add test false positive GEOIDs (#1871)

* Add gdal binaries (#1871)

* Refactor pandas code to be more idiomatic (#1871)

Per Emma, the more pandas-y way of doing my counts is using np.where to
add the values I need, then groupby and size. It is definitely more
compact, and also I think more correct!

* Update configs per Emma suggestions (#1871)

* Type fixed! (#1871)

* Remove spurious import from vscode (#1871)

* Snapshot update after changing col name (#1871)

* Move up GDAL (#1871)

* Adjust geojson strategy (#1871)

* Try running census separately first (#1871)

* Fix import order (#1871)

* Cleanup cache strategy (#1871)

* Download census data from S3 instead of re-calculating (#1871)

* Clarify pandas code per Emma (#1871)
This commit is contained in:
Matt Bowen 2022-08-16 13:28:39 -04:00 committed by GitHub
commit d5fbb802e8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 2534 additions and 416 deletions

View file

@ -10,6 +10,7 @@ from data_pipeline.etl.runner import (
score_post,
)
from data_pipeline.etl.sources.census.etl_utils import (
check_census_data_source,
reset_data_directories as census_reset,
zip_census_data,
)
@ -96,6 +97,23 @@ def census_data_download(zip_compress):
sys.exit()
@cli.command(help="Retrieve census data from source")
@click.option(
    "-s",
    "--data-source",
    default="local",
    required=False,
    type=str,
    help=dataset_cli_help,
)
def pull_census_data(data_source: str):
    """CLI command that retrieves census data from the requested source.

    The actual work is delegated to ``check_census_data_source``, which is
    handed the census data directory under ``settings.APP_ROOT`` together
    with the selected source.

    Args:
        data_source (str): value of the ``-s`` / ``--data-source`` option
            (``"local"`` when the option is omitted); passed through
            unchanged.
    """
    logger.info("Pulling census data from %s", data_source)

    census_data_dir = settings.APP_ROOT / "data" / "census"
    check_census_data_source(census_data_dir, data_source)

    logger.info("Finished pulling census data")
    # End the CLI process explicitly once the pull is done.
    sys.exit()
@cli.command(
help="Run all ETL processes or a specific one",
)

View file

@ -127,9 +127,10 @@ class ExtractTransformLoad:
sys.exit()
# set some of the basic fields
cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
"input_geoid_tract_field_name"
]
if "input_geoid_tract_field_name" in dataset_config:
cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
"input_geoid_tract_field_name"
]
# get the columns to write on the CSV
# and set the constants

View file

@ -130,6 +130,11 @@ DATASET_LIST = [
"module_dir": "census_acs_2010",
"class_name": "CensusACS2010ETL",
},
{
"name": "us_army_fuds",
"module_dir": "us_army_fuds",
"class_name": "USArmyFUDS",
},
]
CENSUS_INFO = {

View file

@ -117,6 +117,34 @@ datasets:
field_type: float
include_in_downloadable_files: true
include_in_tiles: true
- long_name: "Formerly Used Defense Sites"
short_name: "FUDS"
module_name: "us_army_fuds"
load_fields:
- short_name: "fuds_count"
df_field_name: "ELIGIBLE_FUDS_COUNT_FIELD_NAME"
long_name: "Count of eligible Formerly Used Defense Site (FUDS) properties centroids"
description_short:
"The number of FUDS marked as Eligible and Has Project in the tract."
field_type: int64
include_in_tiles: false
include_in_downloadable_files: false
- short_name: "not_fuds_ct"
df_field_name: "INELIGIBLE_FUDS_COUNT_FIELD_NAME"
long_name: "Count of ineligible Formerly Used Defense Site (FUDS) properties centroids"
description_short:
"The number of FUDS marked as Ineligible or Project in the tract."
field_type: int64
include_in_tiles: false
include_in_downloadable_files: false
- short_name: "has_fuds"
df_field_name: "ELIGIBLE_FUDS_BINARY_FIELD_NAME"
long_name: "Is there at least one Formerly Used Defense Site (FUDS) in the tract?"
description_short:
"Whether the tract has a FUDS"
field_type: bool
include_in_tiles: false
include_in_downloadable_files: false
- long_name: "Example ETL"
short_name: "Example"
module_name: "example_dataset"
@ -128,4 +156,3 @@ datasets:
field_type: float
include_in_tiles: true
include_in_downloadable_files: true

View file

@ -77,7 +77,7 @@ class DatasetsConfig:
long_name: str
short_name: str
module_name: str
input_geoid_tract_field_name: str
load_fields: List[LoadField]
input_geoid_tract_field_name: Optional[str] = None
datasets: List[Dataset]

View file

@ -20,19 +20,20 @@ class GeoFileType(Enum):
class CensusETL(ExtractTransformLoad):
SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
NATIONAL_TRACT_CSV_PATH = CSV_BASE_PATH / "us.csv"
NATIONAL_TRACT_JSON_PATH = GEOJSON_BASE_PATH / "us.json"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def __init__(self):
self.SHP_BASE_PATH = self.DATA_PATH / "census" / "shp"
self.GEOJSON_BASE_PATH = self.DATA_PATH / "census" / "geojson"
self.CSV_BASE_PATH = self.DATA_PATH / "census" / "csv"
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
self.GEOJSON_PATH = self.DATA_PATH / "census" / "geojson"
self.TRACT_PER_STATE: dict = {} # in-memory dict per state
self.TRACT_NATIONAL: list = [] # in-memory global list
self.NATIONAL_TRACT_CSV_PATH = self.CSV_BASE_PATH / "us.csv"
self.NATIONAL_TRACT_JSON_PATH = self.GEOJSON_BASE_PATH / "us.json"
self.GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def _path_for_fips_file(
self, fips_code: str, file_type: GeoFileType

View file

@ -0,0 +1,62 @@
"""Utilities for turning geographies into tracts, using census data"""
from pathlib import Path
from typing import Optional
from functools import lru_cache
import geopandas as gpd
from data_pipeline.utils import get_module_logger
from .census.etl import CensusETL
logger = get_module_logger(__name__)
@lru_cache()
def get_tract_geojson(
    _tract_data_path: Optional[Path] = None,
) -> gpd.GeoDataFrame:
    """Load census tract geometries as a GeoDataFrame.

    The result is memoized by ``lru_cache`` (keyed on ``_tract_data_path``),
    so repeated calls with the same argument return the *same* object.
    Callers should treat the returned frame as read-only.

    Args:
        _tract_data_path (Path): optional override pointing directly at a
            GeoJSON file of tract geometries, to simplify testing. When
            omitted, ``CensusETL.NATIONAL_TRACT_JSON_PATH`` is used.

    Returns:
        GeoDataFrame: the tract data read from the file, with its
            ``GEOID10`` column renamed to ``GEOID10_TRACT``.
    """
    logger.info("Loading tract geometry data from census ETL")
    geojson_path = _tract_data_path
    if geojson_path is None:
        geojson_path = CensusETL.NATIONAL_TRACT_JSON_PATH

    if geojson_path.exists():
        logger.debug("Loading existing tract geojson")
    elif _tract_data_path is None:
        # Only the default national file is regenerated by running the census
        # ETL (presumably its load step writes it -- verify). A missing
        # override path is left for ``gpd.read_file`` below to report, rather
        # than triggering an expensive ETL run that cannot create that file.
        logger.debug("Census data has not been computed, running")
        census_etl = CensusETL()
        census_etl.extract()
        census_etl.transform()
        census_etl.load()

    tract_data = gpd.read_file(geojson_path, include_fields=["GEOID10"])
    tract_data = tract_data.rename(columns={"GEOID10": "GEOID10_TRACT"})
    return tract_data
def add_tracts_for_geometries(
    df: gpd.GeoDataFrame, _tract_data_path: Optional[Path] = None
) -> gpd.GeoDataFrame:
    """Tag each geometry in ``df`` with the census tract it intersects.

    Tract geometries come from ``get_tract_geojson``. The join is an inner
    spatial join, so rows of ``df`` that intersect no tract are absent from
    the result.

    Args:
        df (GeoDataFrame): a geopandas GeoDataFrame with a point geometry
            column; its CRS must equal the CRS of the tract data.
        _tract_data_path (Path): an override to directly pass a GEOJSON file
            of tracts->Geometries, to simplify testing.

    Returns:
        GeoDataFrame: the rows of ``df`` that matched a tract, with an
            additional GEOID10_TRACT column mapping each one to its census
            tract.
    """
    logger.debug("Appending tract data to dataframe")
    tract_data = get_tract_geojson(_tract_data_path)

    # A spatial join across different projections would compare coordinates
    # that are not in the same units, so insist on matching CRSs up front.
    assert (
        tract_data.crs == df.crs
    ), f"Dataframe must be projected to {tract_data.crs}"

    # Only the tract id and its shape are needed on the right-hand side.
    tract_shapes = tract_data[["GEOID10_TRACT", "geometry"]]

    # NOTE(review): newer geopandas releases spell this keyword
    # ``predicate=`` -- confirm against the pinned version before changing.
    return gpd.sjoin(df, tract_shapes, how="inner", op="intersects")

View file

@ -0,0 +1,98 @@
from pathlib import Path
import geopandas as gpd
import pandas as pd
import numpy as np
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__)
class USArmyFUDS(ExtractTransformLoad):
    """ETL for the US Army Formerly Used Defense Sites (FUDS) dataset.

    The source GeoJSON describes sites by geometry rather than by census
    tract, so ``transform`` first assigns every site to a tract and then
    computes per-tract counts.
    """

    NAME: str = "us_army_fuds"

    # Output column names. Only declared here; presumably filled in by the
    # base class from the dataset YAML config -- verify against
    # ExtractTransformLoad.
    ELIGIBLE_FUDS_COUNT_FIELD_NAME: str
    INELIGIBLE_FUDS_COUNT_FIELD_NAME: str
    ELIGIBLE_FUDS_BINARY_FIELD_NAME: str

    GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT

    def __init__(self):
        """Set the download URL, the file locations and the output columns."""
        self.FILE_URL: str = (
            "https://opendata.arcgis.com/api/v3/datasets/"
            "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
            "data?format=geojson&spatialRefId=4326&where=1%3D1"
        )

        self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"

        # Constants for output
        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            self.ELIGIBLE_FUDS_COUNT_FIELD_NAME,
            self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
            self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
        ]

        # Where ``extract`` stores the raw GeoJSON and ``transform`` reads it.
        self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"

        self.raw_df: gpd.GeoDataFrame
        self.output_df: pd.DataFrame

    def extract(self) -> None:
        """Download the FUDS GeoJSON to ``self.DOWNLOAD_FILE_NAME``."""
        logger.info("Starting FUDS data download.")

        download_file_from_url(
            file_url=self.FILE_URL,
            download_file_name=self.DOWNLOAD_FILE_NAME,
            verify=True,
        )

    def transform(self) -> None:
        """Build ``self.output_df`` with one row per tract containing a FUDS.

        For every tract that has at least one site assigned to it, the
        output holds the number of sites that are "Eligible" and have
        projects, the number of all other sites, and a boolean flag that is
        true when the first count is positive.
        """
        logger.info("Starting FUDS transform.")

        logger.info("Loading FUDs data as GeoDataFrame for transform")
        fuds_sites = gpd.read_file(
            filename=self.DOWNLOAD_FILE_NAME,
            low_memory=False,
        )

        # Note that df_with_tracts will not have exactly the same length as
        # the raw data, because some bases lack coordinates or have
        # coordinates in Mexico or in the ocean. To list them:
        # raw_df[~raw_df.OBJECTID.isin(df_with_tracts.OBJECTID)][
        #     ['OBJECTID', 'CLOSESTCITY', 'COUNTY', 'ELIGIBILITY',
        #      'STATE', 'LATITUDE', "LONGITUDE"]]
        logger.debug("Adding tracts to FUDS data")
        df_with_tracts = add_tracts_for_geometries(fuds_sites)

        tract_column = self.GEOID_TRACT_FIELD_NAME

        # A plain boolean series is enough here; no np.where needed.
        is_eligible = df_with_tracts["ELIGIBILITY"] == "Eligible"
        has_projects = df_with_tracts["HASPROJECTS"] == "Yes"
        df_with_tracts["tmp_fuds"] = is_eligible & has_projects

        # Summing the booleans per tract counts the eligible sites; every
        # tract with any site gets a row (possibly with a zero).
        eligible_per_tract = df_with_tracts.groupby(tract_column)[
            "tmp_fuds"
        ].sum()
        # Tracts with no ineligible sites are missing from this series and
        # therefore become NaN once aligned; they are zero-filled below.
        ineligible_per_tract = (
            df_with_tracts[~df_with_tracts["tmp_fuds"]]
            .groupby(tract_column)
            .size()
        )

        counts = pd.DataFrame()
        counts[self.ELIGIBLE_FUDS_COUNT_FIELD_NAME] = eligible_per_tract
        counts[self.INELIGIBLE_FUDS_COUNT_FIELD_NAME] = ineligible_per_tract
        counts = counts.fillna(0)
        counts = counts.astype("int64")
        counts = counts.sort_index().reset_index()

        counts[self.ELIGIBLE_FUDS_BINARY_FIELD_NAME] = np.where(
            counts[self.ELIGIBLE_FUDS_COUNT_FIELD_NAME] > 0.0,
            True,
            False,
        )
        self.output_df = counts

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,10 @@
{
"type": "FeatureCollection",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "STATEFP10": "06", "COUNTYFP10": "037", "TRACTCE10": "207400", "GEOID10_TRACT": "06037207400", "NAME10": "2074", "NAMELSAD10": "Census Tract 2074", "MTFCC10": "G5020", "FUNCSTAT10": "S", "ALAND10": 862884, "AWATER10": 6531, "INTPTLAT10": "+34.0561941", "INTPTLON10": "-118.2466502" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -118.25165, 34.057561 ], [ -118.251856, 34.057693 ], [ -118.251973, 34.057769 ], [ -118.253069, 34.058478 ], [ -118.253333, 34.058635 ], [ -118.253175, 34.058788 ], [ -118.252985, 34.058967 ], [ -118.252934, 34.059012 ], [ -118.252592, 34.059315 ], [ -118.252391, 34.059485 ], [ -118.252131, 34.059695 ], [ -118.251474, 34.060224 ], [ -118.251082, 34.060543 ], [ -118.250554, 34.060988 ], [ -118.249996, 34.061475 ], [ -118.248871, 34.06247 ], [ -118.248822, 34.062513 ], [ -118.248754, 34.062434 ], [ -118.247476, 34.060942 ], [ -118.247368, 34.060818 ], [ -118.247013, 34.06041 ], [ -118.24698, 34.060373 ], [ -118.246769, 34.060147 ], [ -118.246548, 34.059926 ], [ -118.246318, 34.059712 ], [ -118.246079, 34.059505 ], [ -118.245633, 34.059146 ], [ -118.245532, 34.059066 ], [ -118.245262, 34.058851 ], [ -118.244952, 34.058609 ], [ -118.244638, 34.05837 ], [ -118.244425, 34.058215 ], [ -118.244007, 34.057917 ], [ -118.243393, 34.057507 ], [ -118.243099, 34.057319 ], [ -118.24245, 34.056913 ], [ -118.241377, 34.056241 ], [ -118.241204, 34.056133 ], [ -118.240288, 34.055562 ], [ -118.239443, 34.055035 ], [ -118.238512, 34.054454 ], [ -118.238227, 34.054289 ], [ -118.238023, 34.054178 ], [ -118.237887, 34.054108 ], [ -118.2379, 34.054002 ], [ -118.237936, 34.053725 ], [ -118.237945, 34.053651 ], [ -118.237976, 34.052819 ], [ -118.238039, 34.05107 ], [ -118.239698, 34.052451 ], [ -118.239867, 34.051906 ], [ -118.240115, 34.0514 ], [ -118.240172, 34.051284 ], [ -118.240271, 34.051083 ], [ -118.240856, 34.050405 ], [ -118.242151, 34.051344 ], [ -118.242382, 34.051511 ], [ -118.24334, 34.050273 ], [ 
-118.244519, 34.051003 ], [ -118.245067, 34.051354 ], [ -118.245606, 34.051703 ], [ -118.246677, 34.052395 ], [ -118.247754, 34.053091 ], [ -118.248466, 34.053552 ], [ -118.248818, 34.05378 ], [ -118.249888, 34.054472 ], [ -118.25095, 34.055158 ], [ -118.251081, 34.055241 ], [ -118.250895, 34.055373 ], [ -118.250712, 34.05553 ], [ -118.250052, 34.056232 ], [ -118.249838, 34.056391 ], [ -118.25165, 34.057561 ] ] ] } },
{ "type": "Feature", "properties": { "STATEFP10": "13", "COUNTYFP10": "121", "TRACTCE10": "011900", "GEOID10_TRACT": "13121011900", "NAME10": "119", "NAMELSAD10": "Census Tract 119", "MTFCC10": "G5020", "FUNCSTAT10": "S", "ALAND10": 1530847, "AWATER10": 0, "INTPTLAT10": "+33.7539369", "INTPTLON10": "-084.3826910" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -84.393243, 33.754604 ], [ -84.393434, 33.754711 ], [ -84.393836, 33.75492 ], [ -84.39376, 33.755141 ], [ -84.394037, 33.756265 ], [ -84.394411, 33.757235 ], [ -84.394982, 33.758491 ], [ -84.394325, 33.758955 ], [ -84.393831, 33.759308 ], [ -84.393459, 33.759573 ], [ -84.393366, 33.759627 ], [ -84.393273, 33.759663 ], [ -84.393187, 33.759685 ], [ -84.392783, 33.75973 ], [ -84.392071, 33.759729 ], [ -84.390564, 33.759722 ], [ -84.389801, 33.759719 ], [ -84.389083, 33.759716 ], [ -84.387584, 33.759709 ], [ -84.387539, 33.759708 ], [ -84.386062, 33.759685 ], [ -84.384198, 33.759666 ], [ -84.38422, 33.758392 ], [ -84.384242, 33.757117 ], [ -84.384268, 33.755571 ], [ -84.384283, 33.75473 ], [ -84.384287, 33.754521 ], [ -84.384305, 33.754462 ], [ -84.382272, 33.754439 ], [ -84.381907, 33.754434 ], [ -84.380277, 33.754417 ], [ -84.3802, 33.754414 ], [ -84.379455, 33.754397 ], [ -84.379157, 33.75439 ], [ -84.378673, 33.754379 ], [ -84.378332, 33.75438 ], [ -84.378297, 33.75437 ], [ -84.378044, 33.754368 ], [ -84.377363, 33.754378 ], [ -84.377298, 33.754379 ], [ -84.377099, 33.754376 ], [ -84.376604, 33.754371 ], [ -84.375544, 33.754355 ], [ -84.374384, 33.754337 ], [ -84.37336, 33.754322 ], [ -84.372422, 33.754309 ], [ -84.37215, 33.754305 ], [ -84.371286, 33.754295 ], [ -84.369769, 33.754278 ], [ -84.368828, 33.754282 ], [ -84.368562, 33.754283 ], [ -84.368027, 33.754285 ], [ -84.367498, 33.754287 ], [ -84.366551, 33.75429 ], [ -84.366444, 33.754291 ], [ -84.365863, 33.754297 ], [ -84.365599, 33.754312 ], [ -84.365617, 33.754242 ], [ -84.365791, 33.753851 ], [ -84.366268, 33.75328 ], [ -84.366323, 33.753215 
], [ -84.3666, 33.752984 ], [ -84.366842, 33.752754 ], [ -84.366935, 33.752666 ], [ -84.36698, 33.752629 ], [ -84.367086, 33.752523 ], [ -84.367248, 33.75237 ], [ -84.368362, 33.752078 ], [ -84.369133, 33.751836 ], [ -84.369871, 33.751612 ], [ -84.370491, 33.751434 ], [ -84.370976, 33.751284 ], [ -84.37217, 33.750916 ], [ -84.373348, 33.750533 ], [ -84.374128, 33.750253 ], [ -84.375093, 33.749926 ], [ -84.376294, 33.749564 ], [ -84.376636, 33.749461 ], [ -84.376945, 33.749372 ], [ -84.37768, 33.749186 ], [ -84.378404, 33.74904 ], [ -84.378835, 33.748964 ], [ -84.379047, 33.748935 ], [ -84.379541, 33.748892 ], [ -84.379663, 33.748881 ], [ -84.380133, 33.748853 ], [ -84.380525, 33.748853 ], [ -84.380758, 33.748868 ], [ -84.381016, 33.748884 ], [ -84.381506, 33.748923 ], [ -84.382132, 33.748903 ], [ -84.38251, 33.748886 ], [ -84.382727, 33.748877 ], [ -84.383153, 33.748907 ], [ -84.383313, 33.748923 ], [ -84.383493, 33.748941 ], [ -84.383746, 33.749 ], [ -84.383896, 33.749035 ], [ -84.384064, 33.749089 ], [ -84.384277, 33.749158 ], [ -84.384328, 33.74918 ], [ -84.384564, 33.749282 ], [ -84.38487, 33.749449 ], [ -84.385214, 33.749686 ], [ -84.385654, 33.749989 ], [ -84.386389, 33.750471 ], [ -84.387563, 33.75124 ], [ -84.387886, 33.751452 ], [ -84.388865, 33.752093 ], [ -84.389895, 33.752768 ], [ -84.390844, 33.753391 ], [ -84.39132, 33.753703 ], [ -84.391525, 33.753837 ], [ -84.392156, 33.754065 ], [ -84.392373, 33.754172 ], [ -84.392834, 33.754399 ], [ -84.39318, 33.754569 ], [ -84.393243, 33.754604 ] ] ] } },
{ "type": "Feature", "properties": { "STATEFP10": "25", "COUNTYFP10": "025", "TRACTCE10": "030300", "GEOID10_TRACT": "25025030300", "NAME10": "303", "NAMELSAD10": "Census Tract 303", "MTFCC10": "G5020", "FUNCSTAT10": "S", "ALAND10": 691377, "AWATER10": 234496, "INTPTLAT10": "+42.3600562", "INTPTLON10": "-071.0532861" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -71.045566, 42.359733 ], [ -71.049073, 42.354939 ], [ -71.049333, 42.354585 ], [ -71.049396, 42.354498 ], [ -71.049595, 42.354497 ], [ -71.050434, 42.354846 ], [ -71.050471, 42.354898 ], [ -71.050892, 42.35506 ], [ -71.05106, 42.355131 ], [ -71.050981, 42.355309 ], [ -71.050889, 42.355475 ], [ -71.050856, 42.355555 ], [ -71.050762, 42.356011 ], [ -71.050749, 42.356124 ], [ -71.050816, 42.35664 ], [ -71.051009, 42.356937 ], [ -71.051198, 42.357241 ], [ -71.05137, 42.357474 ], [ -71.051411, 42.357539 ], [ -71.051508, 42.357692 ], [ -71.051613, 42.357921 ], [ -71.051784, 42.358295 ], [ -71.051941, 42.358637 ], [ -71.051976, 42.358699 ], [ -71.052005, 42.358693 ], [ -71.052065, 42.358682 ], [ -71.052158, 42.358666 ], [ -71.052294, 42.358646 ], [ -71.052749, 42.358576 ], [ -71.053192, 42.358496 ], [ -71.053248, 42.358478 ], [ -71.053321, 42.358455 ], [ -71.053518, 42.358356 ], [ -71.053765, 42.358183 ], [ -71.053961, 42.358012 ], [ -71.054265, 42.357737 ], [ -71.05437, 42.357662 ], [ -71.054524, 42.357551 ], [ -71.054848, 42.35735 ], [ -71.05502, 42.357245 ], [ -71.05519, 42.357143 ], [ -71.055539, 42.356971 ], [ -71.055759, 42.356913 ], [ -71.056292, 42.356874 ], [ -71.05659, 42.356852 ], [ -71.057191, 42.356822 ], [ -71.05771, 42.356777 ], [ -71.057993, 42.356789 ], [ -71.058235, 42.356832 ], [ -71.058737, 42.356988 ], [ -71.058561, 42.357161 ], [ -71.05829, 42.35741 ], [ -71.058759, 42.357577 ], [ -71.059299, 42.357766 ], [ -71.059613, 42.357863 ], [ -71.060354, 42.358092 ], [ -71.061259, 42.358283 ], [ -71.06151, 42.358336 ], [ -71.061714, 42.358318 ], [ -71.061977, 42.358246 ], [ -71.062375, 
42.358095 ], [ -71.062642, 42.357977 ], [ -71.062727, 42.358311 ], [ -71.062817, 42.358665 ], [ -71.062823, 42.358714 ], [ -71.062846, 42.358889 ], [ -71.062862, 42.359204 ], [ -71.062875, 42.359483 ], [ -71.062864, 42.36009 ], [ -71.062911, 42.361229 ], [ -71.062762, 42.361642 ], [ -71.062626, 42.361842 ], [ -71.062499, 42.362001 ], [ -71.062354, 42.362143 ], [ -71.062268, 42.362205 ], [ -71.062195, 42.362258 ], [ -71.061856, 42.36243 ], [ -71.061669, 42.362493 ], [ -71.061223, 42.362633 ], [ -71.060878, 42.362731 ], [ -71.060042, 42.362967 ], [ -71.059606, 42.36307 ], [ -71.059491, 42.363104 ], [ -71.058769, 42.363318 ], [ -71.058559, 42.363381 ], [ -71.0584, 42.363412 ], [ -71.058216, 42.363431 ], [ -71.058037, 42.363481 ], [ -71.057979, 42.363511 ], [ -71.057882, 42.363546 ], [ -71.057776, 42.363542 ], [ -71.057709, 42.363543 ], [ -71.05757, 42.36342 ], [ -71.057332, 42.36318 ], [ -71.057051, 42.362987 ], [ -71.056227, 42.362386 ], [ -71.056176, 42.362357 ], [ -71.05525, 42.36183 ], [ -71.055228, 42.361869 ], [ -71.055183, 42.361919 ], [ -71.055187, 42.361941 ], [ -71.055159, 42.361989 ], [ -71.055123, 42.362045 ], [ -71.055026, 42.362149 ], [ -71.05489, 42.362265 ], [ -71.054661, 42.36238 ], [ -71.054626, 42.362404 ], [ -71.054581, 42.362434 ], [ -71.054494, 42.362511 ], [ -71.054407, 42.362634 ], [ -71.054311, 42.362802 ], [ -71.054296, 42.36283 ], [ -71.05419, 42.362973 ], [ -71.054061, 42.363108 ], [ -71.053826, 42.363303 ], [ -71.053709, 42.363367 ], [ -71.053585, 42.363405 ], [ -71.053549, 42.363416 ], [ -71.053199, 42.363474 ], [ -71.053043, 42.363495 ], [ -71.052769, 42.36353 ], [ -71.05246, 42.363586 ], [ -71.05224, 42.363626 ], [ -71.052061, 42.36371 ], [ -71.051895, 42.363501 ], [ -71.051661, 42.363192 ], [ -71.051647, 42.36311 ], [ -71.051414, 42.363386 ], [ -71.05135, 42.36347 ], [ -71.051195, 42.36372 ], [ -71.051115, 42.363979 ], [ -71.051088, 42.364065 ], [ -71.05109, 42.364175 ], [ -71.0496, 42.364044 ], [ -71.049409, 42.364045 ], [ -71.046389, 
42.363935 ], [ -71.045985, 42.362294 ], [ -71.045918, 42.361164 ], [ -71.0455, 42.359825 ], [ -71.045566, 42.359733 ] ] ] } },
{ "type": "Feature", "properties": { "STATEFP10": "28", "COUNTYFP10": "047", "TRACTCE10": "003800", "GEOID10_TRACT": "28047003800", "NAME10": "38", "NAMELSAD10": "Census Tract 38", "MTFCC10": "G5020", "FUNCSTAT10": "S", "ALAND10": 2304789, "AWATER10": 3104014, "INTPTLAT10": "+30.3577592", "INTPTLON10": "-089.1130708" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -89.101237, 30.347697 ], [ -89.117538, 30.342797 ], [ -89.124278, 30.343971 ], [ -89.124335, 30.353194 ], [ -89.124336, 30.353446 ], [ -89.124338, 30.353697 ], [ -89.124555, 30.354007 ], [ -89.124595, 30.353991 ], [ -89.124991, 30.354701 ], [ -89.125114, 30.354921 ], [ -89.125679, 30.355921 ], [ -89.127359, 30.358407 ], [ -89.127508, 30.358574 ], [ -89.127077, 30.35871 ], [ -89.124073, 30.359753 ], [ -89.12318, 30.360048 ], [ -89.122255, 30.360367 ], [ -89.121353, 30.360674 ], [ -89.120354, 30.36101 ], [ -89.117854, 30.36182 ], [ -89.116359, 30.362304 ], [ -89.11492, 30.362785 ], [ -89.113579, 30.363225 ], [ -89.112509, 30.363583 ], [ -89.11135, 30.363984 ], [ -89.11121, 30.364005 ], [ -89.110283, 30.364326 ], [ -89.109295, 30.364647 ], [ -89.108217, 30.365012 ], [ -89.107137, 30.365376 ], [ -89.105342, 30.365959 ], [ -89.102779, 30.36682 ], [ -89.101505, 30.367176 ], [ -89.100242, 30.367636 ], [ -89.098984, 30.368 ], [ -89.097738, 30.368327 ], [ -89.097572, 30.368365 ], [ -89.096742, 30.368555 ], [ -89.096574, 30.368614 ], [ -89.095317, 30.368959 ], [ -89.095334, 30.371183 ], [ -89.095338, 30.371317 ], [ -89.093988, 30.371319 ], [ -89.09397, 30.371327 ], [ -89.093034, 30.371329 ], [ -89.092869, 30.371322 ], [ -89.09153, 30.371326 ], [ -89.090312, 30.371327 ], [ -89.090136, 30.371327 ], [ -89.088809, 30.371327 ], [ -89.088797, 30.372373 ], [ -89.087557, 30.372377 ], [ -89.087432, 30.372371 ], [ -89.087429, 30.371074 ], [ -89.087429, 30.370979 ], [ -89.087431, 30.36924 ], [ -89.087424, 30.368559 ], [ -89.087394, 30.368228 ], [ -89.087398, 30.3681 ], [ -89.087408, 30.367653 ], [ -89.087405, 
30.367552 ], [ -89.088805, 30.367086 ], [ -89.090137, 30.366643 ], [ -89.090263, 30.366603 ], [ -89.091459, 30.366215 ], [ -89.092643, 30.365831 ], [ -89.092912, 30.365758 ], [ -89.093006, 30.365732 ], [ -89.093168, 30.365712 ], [ -89.094308, 30.36534 ], [ -89.094388, 30.365301 ], [ -89.094683, 30.365183 ], [ -89.094739, 30.365156 ], [ -89.094852, 30.365118 ], [ -89.095644, 30.364853 ], [ -89.096427, 30.364604 ], [ -89.096534, 30.364567 ], [ -89.097512, 30.364275 ], [ -89.097679, 30.364234 ], [ -89.098915, 30.363843 ], [ -89.10016, 30.363411 ], [ -89.100979, 30.363155 ], [ -89.101422, 30.362993 ], [ -89.101423, 30.362631 ], [ -89.101426, 30.36174 ], [ -89.101417, 30.361088 ], [ -89.101237, 30.347697 ] ] ] } }
]
}

View file

@ -0,0 +1,28 @@
from pathlib import Path
from collections import namedtuple
import geopandas as gpd
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
def test_add_tracts_for_geometries():
    """Points with a known tract get that tract's GEOID after the join."""
    columns = ["latitude", "longitude", "expected_geoid"]
    DataPoint = namedtuple("DataPoint", columns)

    # Pulled the tract IDs from the census geocoder
    sample_points = [
        DataPoint(33.75649254612824, -84.39215035031984, "13121011900"),
        DataPoint(34.05289139656212, -118.2402117966315, "06037207400"),
        DataPoint(42.357500146415475, -71.0563146836545, "25025030300"),
        DataPoint(30.368185144529168, -89.0930992763473, "28047003800"),
    ]

    df = gpd.GeoDataFrame.from_records(sample_points, columns=columns)
    point_geometry = gpd.points_from_xy(
        x=df["longitude"],
        y=df["latitude"],
    )
    df = gpd.GeoDataFrame(df, geometry=point_geometry, crs="epsg:4326")

    # Small fixture holding only the tracts the sample points fall into.
    fixture_path = Path(__file__).parent / "data" / "us.geojson"
    enriched_df = add_tracts_for_geometries(df, _tract_data_path=fixture_path)

    geoid_matches = df["expected_geoid"] == enriched_df["GEOID10_TRACT"]
    assert geoid_matches.all()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,16 @@
GEOID10_TRACT,Count of eligible Formerly Used Defense Site (FUDS) properties centroids,Count of ineligible Formerly Used Defense Site (FUDS) properties centroids,Is there at least one Formerly Used Defense Site (FUDS) in the tract?
06027000800,3,14,True
06061021322,1,2,True
06069000802,1,0,True
15001021010,1,2,True
15001021101,0,1,False
15001021402,1,2,True
15001021800,1,2,True
15003010201,2,1,True
15007040603,0,2,False
15007040604,1,2,True
15007040700,1,2,True
15009030100,0,1,False
15009030201,1,2,True
15009030402,1,2,True
15009030800,1,2,True
1 GEOID10_TRACT Count of eligible Formerly Used Defense Site (FUDS) properties centroids Count of ineligible Formerly Used Defense Site (FUDS) properties centroids Is there at least one Formerly Used Defense Site (FUDS) in the tract?
2 06027000800 3 14 True
3 06061021322 1 2 True
4 06069000802 1 0 True
5 15001021010 1 2 True
6 15001021101 0 1 False
7 15001021402 1 2 True
8 15001021800 1 2 True
9 15003010201 2 1 True
10 15007040603 0 2 False
11 15007040604 1 2 True
12 15007040700 1 2 True
13 15009030100 0 1 False
14 15009030201 1 2 True
15 15009030402 1 2 True
16 15009030800 1 2 True

View file

@ -0,0 +1,16 @@
GEOID10_TRACT,Count of eligible Formerly Used Defense Site (FUDS) properties centroids,Count of ineligible Formerly Used Defense Site (FUDS) properties centroids,Is there at least one Formerly Used Defense Site (FUDS) in the tract?
06027000800,3,14,True
06061021322,1,2,True
06069000802,1,0,True
15001021010,1,2,True
15001021101,0,1,False
15001021402,1,2,True
15001021800,1,2,True
15003010201,2,1,True
15007040603,0,2,False
15007040604,1,2,True
15007040700,1,2,True
15009030100,0,1,False
15009030201,1,2,True
15009030402,1,2,True
15009030800,1,2,True
1 GEOID10_TRACT Count of eligible Formerly Used Defense Site (FUDS) properties centroids Count of ineligible Formerly Used Defense Site (FUDS) properties centroids Is there at least one Formerly Used Defense Site (FUDS) in the tract?
2 06027000800 3 14 True
3 06061021322 1 2 True
4 06069000802 1 0 True
5 15001021010 1 2 True
6 15001021101 0 1 False
7 15001021402 1 2 True
8 15001021800 1 2 True
9 15003010201 2 1 True
10 15007040603 0 2 False
11 15007040604 1 2 True
12 15007040700 1 2 True
13 15009030100 0 1 False
14 15009030201 1 2 True
15 15009030402 1 2 True
16 15009030800 1 2 True

View file

@ -0,0 +1,187 @@
# pylint: disable=protected-access
from unittest import mock
import pathlib
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.us_army_fuds.etl import (
USArmyFUDS,
)
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
def _fake_add_tracts_for_geometries(df):
    """Test stand-in for the real spatial join, which is too slow for tests.

    Looks each row's point up by its exact ``(x, y)`` coordinates in a table
    of precomputed results, writes the tract id into a ``GEOID10_TRACT``
    column of ``df`` itself and returns ``df``. A point that is not in the
    table raises ``KeyError``.
    """
    tract_by_coordinates = {
        (-121.39361572299998, 38.87463378900003): "06061021322",
        (-121.40020751999998, 38.897583008000026): "06061021322",
        (-121.40020751999998, 38.75158691400003): "06061021322",
        (-157.84301757799997, 21.53619384800004): "15003010201",
        (-157.85168456999997, 21.553405762000068): "15003010201",
        (-157.90679931599996, 21.554199219000054): "15003010201",
        (-159.52191162099996, 21.976623535000044): "15007040700",
        (-159.52996826199998, 21.93762207000003): "15007040700",
        (-159.52111816399997, 21.922607422000056): "15007040700",
        (-156.14270019499997, 20.840393066000047): "15009030100",
        (-155.85968017599998, 20.26519775400004): "15001021800",
        (-155.73327636699997, 20.166809082000043): "15001021800",
        (-155.89270019499997, 20.23522949200003): "15001021800",
        (-156.26019287099996, 20.899414062000062): "15009030201",
        (-156.22076415999996, 20.91241455100004): "15009030201",
        (-156.20739746099997, 20.890991211000028): "15009030201",
        (-159.46496581999997, 21.90460205100004): "15007040603",
        (-159.46441650399998, 21.905212402000075): "15007040603",
        (-154.82519531299997, 19.49182128900003): "15001021101",
        (-121.06768798799999, 36.61480712900004): "06069000802",
        (-117.391601563, 36.33343505900007): "06027000800",
        (-117.85546874999994, 36.46960449200003): "06027000800",
        (-117.23529052699996, 36.387634277000075): "06027000800",
        (-118.15270996099997, 36.725219727000024): "06027000800",
        (-118.13891601599994, 36.56683349600007): "06027000800",
        (-117.311096191, 36.783386230000076): "06027000800",
        (-118.00030517599998, 36.283813477000024): "06027000800",
        (-116.86248779299996, 36.46124267600004): "06027000800",
        (-117.16418456999997, 36.60681152300003): "06027000800",
        (-117.06939697299998, 36.158386230000076): "06027000800",
        (-117.873596191, 36.487609863000046): "06027000800",
        (-116.82971191399997, 36.283386230000076): "06027000800",
        (-117.21667480499997, 35.95843505900007): "06027000800",
        (-118.04998779299996, 36.59478759800004): "06027000800",
        (-117.03576660199997, 36.27801513700007): "06027000800",
        (-116.10028076199995, 35.83380127000004): "06027000800",
        (-117.86499023399995, 36.14422607400007): "06027000800",
        (-155.10320912843935, 19.497857096442765): "15001021010",
        (-155.91378674587037, 19.516632121497878): "15001021402",
        (-156.3306524489697, 20.825377142028497): "15009030402",
        (-156.5429023670438, 20.917074254751412): "15009030800",
        (-159.48416820625405, 21.907546119100093): "15007040604",
    }

    def _tract_for(point):
        # Exact float match on purpose: the keys are the fixture coordinates.
        return tract_by_coordinates[(point.x, point.y)]

    df["GEOID10_TRACT"] = df.geometry.apply(_tract_for)
    return df
class TestUSArmyFUDSETL(TestETL):
    """Tests the FUDS ETL.

    This uses pytest-snapshot.
    To update individual snapshots: $ poetry run pytest
    data_pipeline/tests/sources/us_army_fuds/test_etl.py::TestUSArmyFUDSETL::<testname>
    --snapshot-update

    Every test that runs ``transform`` replaces the ETL's spatial join with
    ``_fake_add_tracts_for_geometries`` because the real join is too slow
    for tests.
    """

    _ETL_CLASS = USArmyFUDS

    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
    _SAMPLE_DATA_FILE_NAME = "fuds.geojson"
    _SAMPLE_DATA_ZIP_FILE_NAME = "fuds.geojson"
    _EXTRACT_TMP_FOLDER_NAME = "USArmyFUDS"

    @staticmethod
    def _patch_tract_join():
        """Return a patcher swapping the ETL's spatial join for the fake.

        The name is patched where the ETL module looks it up (in the ETL
        module's own namespace), not where it is defined.
        """
        return mock.patch(
            "data_pipeline.etl.sources.us_army_fuds.etl.add_tracts_for_geometries",
            new=_fake_add_tracts_for_geometries,
        )

    def setup_method(self, _method, filename=__file__):
        """Invoke `setup_method` from Parent, but using the current file name.

        This code can be copied identically between all child classes.
        """
        super().setup_method(_method=_method, filename=filename)

    def test_init(self, mock_etl, mock_paths):
        """Tests that the USArmyFUDS instance is initialized correctly.

        Validates the following conditions:
        - the GEOID field names have their expected values
        - NAME and GEO_LEVEL are those of the FUDS dataset
        - COLUMNS_TO_KEEP lists the tract id followed by the three FUDS
          output fields
        """
        # setup
        etl = self._ETL_CLASS()
        # validation
        assert etl.GEOID_FIELD_NAME == "GEOID10"
        assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
        assert etl.NAME == "us_army_fuds"
        assert etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT
        assert etl.COLUMNS_TO_KEEP == [
            etl.GEOID_TRACT_FIELD_NAME,
            etl.ELIGIBLE_FUDS_COUNT_FIELD_NAME,
            etl.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
            etl.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
        ]

    def test_get_output_file_path(self, mock_etl, mock_paths):
        """Tests the right file name is returned."""
        etl = self._ETL_CLASS()
        data_path, tmp_path = mock_paths

        output_file_path = etl._get_output_file_path()
        expected_output_file_path = (
            data_path / "dataset" / self._ETL_CLASS.NAME / "usa.csv"
        )
        assert output_file_path == expected_output_file_path

    def test_fixtures_contain_shared_tract_ids_base(self, mock_etl, mock_paths):
        """Runs the parent test with the spatial join faked."""
        with self._patch_tract_join():
            return super().test_fixtures_contain_shared_tract_ids_base(
                mock_etl, mock_paths
            )

    def test_transform_base(self, snapshot, mock_etl, mock_paths):
        """Runs the parent snapshot test with the spatial join faked."""
        with self._patch_tract_join():
            super().test_transform_base(
                snapshot=snapshot, mock_etl=mock_etl, mock_paths=mock_paths
            )

    def test_transform_sets_output_df_base(self, mock_etl, mock_paths):
        """Runs the parent test with the spatial join faked."""
        with self._patch_tract_join():
            super().test_transform_sets_output_df_base(
                mock_etl=mock_etl, mock_paths=mock_paths
            )

    def test_validate_base(self, mock_etl, mock_paths):
        """Runs the parent test with the spatial join faked."""
        with self._patch_tract_join():
            super().test_validate_base(mock_etl=mock_etl, mock_paths=mock_paths)

    def test_full_etl_base(self, mock_etl, mock_paths):
        """Runs the parent test with the spatial join faked."""
        with self._patch_tract_join():
            return super().test_full_etl_base(mock_etl, mock_paths)

    def test_get_data_frame_base(self, mock_etl, mock_paths):
        """Runs the parent test with the spatial join faked."""
        with self._patch_tract_join():
            return super().test_get_data_frame_base(mock_etl, mock_paths)

    def test_tracts_without_fuds_not_in_results(self, mock_etl, mock_paths):
        """The output has exactly as many rows as there are shared fixture
        tract ids, i.e. no rows are added for tracts without any FUDS."""
        with self._patch_tract_join():
            etl = self._setup_etl_instance_and_run_extract(
                mock_etl=mock_etl, mock_paths=mock_paths
            )
            etl.transform()
            etl.validate()
            etl.load()
            df = etl.get_data_frame()
            assert len(df[etl.GEOID_TRACT_FIELD_NAME]) == len(
                self._FIXTURES_SHARED_TRACT_IDS
            )