# Generate FUDS test data

Creating the fixture data for the Formerly Used Defense Sites (FUDS) is fairly involved. The steps below walk through creating the data and then eyeballing it, so you can check your test results against it. If the FUDS source data is updated and you want to generate new sample data for your tests, run this notebook.

In [1]:
import json
import os
import sys

# Add this project to the path *before* importing from it: the project import
# below can only benefit from the extra sys.path entry if the entry is added
# first.
module_path = os.path.abspath(os.path.join("../.."))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries

In [2]:
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.sources.census.etl import CensusETL
from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS

# Load the source data and census tract data

In [4]:
# get the data
# Instantiate the FUDS ETL and run only its extract step; the log output
# below shows this step downloading the source data as GeoJSON.
etl = USArmyFUDS()
etl.extract()

2022-08-10 17:57:23,542 [data_pipeline.etl.sources.us_army_fuds.etl] INFO     Starting data download.
2022-08-10 17:57:23,542 [data_pipeline.utils] INFO     Downloading https://opendata.arcgis.com/api/v3/datasets/3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/data?format=geojson&spatialRefId=4326&where=1%3D1


In [5]:
# Read the downloaded FUDS file into a GeoDataFrame.
# NOTE(review): `lowmemory` is not a documented geopandas.read_file argument
# (pandas spells its CSV option `low_memory`); presumably it is just forwarded
# to the underlying reader — confirm whether it has any effect or can go.
df = gpd.read_file(etl.DOWNLOAD_FILE_NAME, lowmemory=False)

In [3]:
# Load the census tract geometries from the path exposed by CensusETL.
census_tracts = gpd.read_file(CensusETL.NATIONAL_TRACT_JSON_PATH)

In [7]:
# Inspect the tract frame: columns, dtypes and non-null counts.
census_tracts.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 74134 entries, 0 to 74133
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   STATEFP10   74134 non-null  object  
 1   COUNTYFP10  74134 non-null  object  
 2   TRACTCE10   74134 non-null  object  
 3   GEOID10     74134 non-null  object  
 4   NAME10      74134 non-null  object  
 5   NAMELSAD10  74134 non-null  object  
 6   MTFCC10     74134 non-null  object  
 7   FUNCSTAT10  74134 non-null  object  
 8   ALAND10     74134 non-null  int64   
 9   AWATER10    74134 non-null  int64   
 10  INTPTLAT10  74134 non-null  object  
 11  INTPTLON10  74134 non-null  object  
 12  geometry    74134 non-null  geometry
dtypes: geometry(1), int64(2), object(10)
memory usage: 7.4+ MB


In [7]:
# Index the tracts by GEOID10 so they can be selected by tract id with `.loc`.
# Rebinding behind a guard (instead of `inplace=True`) keeps this cell safe to
# re-run: a second `set_index("GEOID10")` would raise KeyError because the
# column has already been moved into the index.
if census_tracts.index.name != "GEOID10":
    census_tracts = census_tracts.set_index("GEOID10")

# Generate the test data

In [8]:
# Also keep the download as a plain dict, so that individual features can be
# copied verbatim into the fixture.
# GeoJSON is UTF-8 by specification, so pass the encoding explicitly rather
# than relying on the platform-dependent default of open().
with open(etl.DOWNLOAD_FILE_NAME, encoding="utf-8") as geojson:
    raw_fuds_geojson = json.load(geojson)

In [9]:
# Assign tracts to the FUDS geometries; the resulting frame is later merged
# with the example tracts on its GEOID10_TRACT column.
tract_df = add_tracts_for_geometries(df)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# Tract ids used by the example fixture. Everything is read as strings
# (dtype="object") so the GEOIDs are not parsed as numbers.
example_geoids = pd.read_csv(
    "../tests/sources/example/data/extract.csv", dtype="object"
)

In [11]:
# Left-join the example tract ids onto the FUDS-with-tracts frame. The
# `_merge` indicator column then distinguishes example tracts that contain at
# least one FUDS ("both") from those that contain none ("left_only").
example_tract_ids = example_geoids["GEOID10_TRACT"]
merged_exaple_data = pd.merge(
    left=example_tract_ids,
    right=tract_df,
    how="left",
    on="GEOID10_TRACT",
    indicator=True,
)

In [12]:
# Example tracts with no matching FUDS (rows present only on the left side
# of the merge).
merged_exaple_data[merged_exaple_data["_merge"] == "left_only"]

Unnamed: 0,GEOID10_TRACT,OBJECTID,CENTROIDLAT,CENTROIDLONG,CLOSESTCITY,CONGRESSIONALDISTRICT,COUNTY,CURRENTOWNER,DODFUDSPROPERTYIDPK,ELIGIBILITY,...,STATE,STATUS,STATUSCODE,USACEDISTRICT,FISCALYEAR,PROPERTY_HISTORY,USACEDIVISION,geometry,index_right,_merge
21,15001021010,,,,,,,,,,...,,,,,,,,,,left_only
32,15001021402,,,,,,,,,,...,,,,,,,,,,left_only
36,15009030402,,,,,,,,,,...,,,,,,,,,,left_only
37,15009030800,,,,,,,,,,...,,,,,,,,,,left_only
41,15007040604,,,,,,,,,,...,,,,,,,,,,left_only


In [13]:
# Centroids of the example tracts that have no FUDS, skipping tracts whose
# GEOID starts with "06". The centroids are computed in a projected CRS
# (EPSG:3395) and then converted back to the tracts' original CRS. The result
# maps tract id -> centroid point.
original_crs = census_tracts.crs
projected_tracts = census_tracts.to_crs(epsg=3395)
unmatched_rows = merged_exaple_data[merged_exaple_data["_merge"] == "left_only"]
unmatched_geoids = unmatched_rows.query(
    'not GEOID10_TRACT.str.startswith("06")'
).GEOID10_TRACT
unmatched_centroids = projected_tracts.loc[unmatched_geoids].centroid
points = unmatched_centroids.to_crs(original_crs).to_dict()

In [14]:
# OBJECTIDs of the real FUDS that fall inside one of the example tracts.
matched_rows = merged_exaple_data[merged_exaple_data["_merge"] == "both"]
object_ids_to_keep = set(matched_rows.OBJECTID.astype("int"))

# Keep the raw GeoJSON features for exactly those FUDS, in file order.
features = [
    feature
    for feature in raw_fuds_geojson["features"]
    if feature["properties"]["OBJECTID"] in object_ids_to_keep
]

In [15]:
def make_fake_feature(
    state: str,
    has_projects: bool,
    is_eligible: bool,
    latitude: float,
    longitude: float,
):
    """Build a placeholder FUDS GeoJSON feature for a tract that has none.

    Every call bumps the function-level ``_object_id`` counter and uses the
    new value as the feature's OBJECTID. Apart from the values derived from
    the arguments, all properties are fixed placeholders.

    Args:
        state: value stored in the STATE property.
        has_projects: selects the HASPROJECTS ("Yes"/"No") and STATUS values.
        is_eligible: selects the ELIGIBILITY value ("Eligible"/"Ineligible").
        latitude: stored as LATITUDE and as the second point coordinate.
        longitude: stored as LONGITUDE and as the first point coordinate.

    Returns:
        A dict with "type", "properties" and "geometry" keys describing a
        GeoJSON Point feature.
    """
    object_id = make_fake_feature._object_id + 1
    make_fake_feature._object_id = object_id

    if is_eligible:
        eligibility = "Eligible"
    else:
        eligibility = "Ineligible"

    if has_projects:
        projects_flag = "Yes"
        status = "Properties with projects"
    else:
        projects_flag = "No"
        status = "Properties without projects"

    properties = {
        "OBJECTID": object_id,
        "CENTROIDLAT": None,
        "CENTROIDLONG": None,
        "CLOSESTCITY": None,
        "CONGRESSIONALDISTRICT": "15",
        "COUNTY": None,
        "CURRENTOWNER": None,
        "DODFUDSPROPERTYIDPK": " ",
        "ELIGIBILITY": eligibility,
        "EMSMGMTACTIONPLANLINK": "https://fudsportal.usace.army.mil/ems/inventory/map?id=54113",
        "EPAREGION": "06",
        "FEATUREDESCRIPTION": None,
        "FEATURENAME": "NEIL, ET AL, PROPERTIES",
        "FUDSINSTALLATIONID": None,
        "FUDSUNIQUEPROPERTYNUMBER": "K06TX1120",
        "HASPROJECTS": projects_flag,
        "LATITUDE": latitude,
        "LONGITUDE": longitude,
        "MEDIAID": None,
        "METADATAID": None,
        "NOFURTHERACTION": None,
        "PROJECTREQUIRED": "No",
        "SDSID": None,
        "SITEELIGIBILITY": None,
        "STATE": state,
        "STATUS": status,
        "STATUSCODE": "Not on the NPL",
        "USACEDISTRICT": "swf",
        "FISCALYEAR": "2019",
        "PROPERTY_HISTORY": None,
        "USACEDIVISION": "swd",
    }
    # GeoJSON point coordinates are ordered longitude first, then latitude.
    geometry = {"type": "Point", "coordinates": [longitude, latitude]}

    return {"type": "Feature", "properties": properties, "geometry": geometry}


# Counter seed: the first fake feature gets OBJECTID 51.
make_fake_feature._object_id = 50

In [16]:
# For every tract centroid in `points`, fake one FUDS (labelled state "CA")
# per (has_projects, is_eligible) combination and add it to the fixture
# features.
for centroid in points.values():
    features.extend(
        make_fake_feature(
            "CA", has_projects, is_eligible, centroid.y, centroid.x
        )
        for has_projects, is_eligible in [
            (True, True),
            (True, False),
            (False, False),
        ]
    )

In [17]:
# Same top-level GeoJSON document as the download, but with the feature list
# swapped for the reduced (real + fake) one. The raw dict is left untouched.
test_fuds_geojson = {**raw_fuds_geojson, "features": features}

In [18]:
# Write the reduced feature collection to the FUDS test fixture file.
with open("../tests/sources/us_army_fuds/data/fuds.geojson", "w") as outfile:
    json.dump(test_fuds_geojson, outfile)

# Eyeball the data to check the results of the tests

In [19]:
# Re-read the fixture that was just written, so the checks below run against
# the file's actual contents.
test_frame = gpd.read_file("../tests/sources/us_army_fuds/data/fuds.geojson")

In [20]:
# Assign a tract to every fixture feature. Both names start out bound to the
# same frame.
test_frame_with_tracts_full = add_tracts_for_geometries(test_frame)
test_frame_with_tracts = test_frame_with_tracts_full

  exec(code_obj, self.user_global_ns, self.user_ns)


## Pre-compute the (longitude, latitude) → tract mapping for use in a mock in the tests

In [21]:
# Compact view for eyeballing: one row per (tract, FUDS) with only the two
# flags the tests care about.
# Deriving it from the full frame (rather than re-assigning the name from
# itself) keeps this cell re-runnable: once GEOID10_TRACT and OBJECTID have
# become index levels they are no longer columns, so a second `set_index` on
# the reduced frame would raise KeyError.
test_frame_with_tracts = test_frame_with_tracts_full.set_index(
    ["GEOID10_TRACT", "OBJECTID"]
)[["ELIGIBILITY", "HASPROJECTS"]]

In [22]:
# Map each distinct fixture point to its tract, keyed by the point's
# coordinates. The key is (point.x, point.y), i.e. (longitude, latitude), so
# the helper column is named `long_lat` to match the actual tuple order.
tracts = (
    test_frame_with_tracts_full[["GEOID10_TRACT", "geometry"]]
    .drop_duplicates()
    .assign(
        long_lat=lambda frame: frame.geometry.apply(
            lambda point: (point.x, point.y)
        )
    )
)
tracts.set_index("long_lat")["GEOID10_TRACT"].to_dict()

{(-121.39361572299998, 38.87463378900003): '06061021322',
 (-121.40020751999998, 38.897583008000026): '06061021322',
 (-121.40020751999998, 38.75158691400003): '06061021322',
 (-157.84301757799997, 21.53619384800004): '15003010201',
 (-157.85168456999997, 21.553405762000068): '15003010201',
 (-157.90679931599996, 21.554199219000054): '15003010201',
 (-159.52191162099996, 21.976623535000044): '15007040700',
 (-159.52996826199998, 21.93762207000003): '15007040700',
 (-159.52111816399997, 21.922607422000056): '15007040700',
 (-156.14270019499997, 20.840393066000047): '15009030100',
 (-155.85968017599998, 20.26519775400004): '15001021800',
 (-155.73327636699997, 20.166809082000043): '15001021800',
 (-155.89270019499997, 20.23522949200003): '15001021800',
 (-156.26019287099996, 20.899414062000062): '15009030201',
 (-156.22076415999996, 20.91241455100004): '15009030201',
 (-156.20739746099997, 20.890991211000028): '15009030201',
 (-159.46496581999997, 21.90460205100004): '15007040603',
 (-15

## Look at the sample data itself

In [23]:
# Eligibility / project flags per (tract, OBJECTID), in fixture order.
test_frame_with_tracts

Unnamed: 0_level_0,Unnamed: 1_level_0,ELIGIBILITY,HASPROJECTS
GEOID10_TRACT,OBJECTID,Unnamed: 2_level_1,Unnamed: 3_level_1
6061021322,684,Eligible,No
6061021322,1719,Eligible,Yes
6061021322,7428,Eligible,No
15003010201,1538,Eligible,Yes
15003010201,1629,Eligible,No
15003010201,6062,Eligible,Yes
15007040700,2093,Eligible,Yes
15007040700,2123,Eligible,No
15007040700,6015,Eligible,No
15009030100,2217,Eligible,No


In [24]:
# Same view sorted by (tract, OBJECTID), so all FUDS of one tract sit together.
test_frame_with_tracts.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,ELIGIBILITY,HASPROJECTS
GEOID10_TRACT,OBJECTID,Unnamed: 2_level_1,Unnamed: 3_level_1
6027000800,7018,Eligible,No
6027000800,7046,Ineligible,No
6027000800,7565,Ineligible,No
6027000800,7689,Eligible,Yes
6027000800,7691,Eligible,Yes
6027000800,7831,Ineligible,No
6027000800,7866,Eligible,No
6027000800,7977,Eligible,No
6027000800,8235,Ineligible,No
6027000800,8237,Ineligible,No
