In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import sys

# Put the directory two levels up on sys.path (once) so the project
# imports that follow can be resolved from this notebook.
module_path = os.path.abspath("../..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl import CensusETL
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import unzip_file_from_url

In [2]:
%load_ext lab_black

# Grab the data

In [3]:
# Scratch directory for the raw eAMLIS download; create it (and any missing
# parents) up front.
tmp_path = ExtractTransformLoad.DATA_PATH / "tmp" / "abandoned_mine_lands"
tmp_path.mkdir(parents=True, exist_ok=True)

# Where the extracted TSV is expected to land, and where the archive lives.
eamlis_path = tmp_path / "eAMLIS export of all data.tsv"
eamlis_path_in_s3 = (
    settings.AWS_JUSTICE40_DATASOURCES_URL + "/eAMLIS export of all data.tsv.zip"
)

# The same directory is used both for the download and for the extraction.
unzip_file_from_url(
    download_path=tmp_path,
    unzipped_file_path=tmp_path,
    file_url=eamlis_path_in_s3,
)

2022-08-16 11:50:57,573 [data_pipeline.utils] INFO     Downloading https://justice40-data.s3.amazonaws.com/data-sources/eAMLIS export of all data.tsv.zip
2022-08-16 11:50:57,857 [data_pipeline.utils] INFO     Extracting /home/matt/active/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/downloaded-af59fffe-aec2-48b4-a57f-716b8dc7e0a3.zip


In [7]:
# Show the full path of the extracted TSV as a plain string.
str(eamlis_path)

'/home/matt/active/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/eAMLIS export of all data.tsv'

In [4]:
# The export is tab-separated; read_table uses a tab delimiter by default.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
df = pd.read_table(eamlis_path, low_memory=False)

In [5]:
# Turn the longitude/latitude columns into point geometries, then wrap the
# table in a GeoDataFrame tagged as EPSG:4326.
eamlis_points = gpd.points_from_xy(x=df["Longitude"], y=df["Latitude"])
gdf = gpd.GeoDataFrame(df, geometry=eamlis_points, crs="epsg:4326")
gdf.shape

(57149, 42)

In [6]:
# Keep one row per distinct point; when several rows share a geometry the
# last one is retained.
gdf = gdf.drop_duplicates(subset=["geometry"], keep="last")
gdf.shape

(3977, 42)

In [7]:
# Attach census tract information to every point via the project helper.
# The returned frame carries `index_right` and `GEOID10_TRACT` columns.
gdf_tracts = add_tracts_for_geometries(gdf)

2022-08-16 11:51:28,795 [data_pipeline.etl.sources.geo_utils] DEBUG    Appending tract data to dataframe
2022-08-16 11:51:28,796 [data_pipeline.etl.sources.geo_utils] INFO     Loading tract geometry data from census ETL
2022-08-16 11:51:28,796 [data_pipeline.etl.sources.geo_utils] DEBUG    Loading existing tract geojson
  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# Sanity check: list any points that ended up without a tract ID.
gdf_tracts.loc[gdf_tracts["GEOID10_TRACT"].isna()]

Unnamed: 0,AMLIS Key,State/Tribe,County,Congressional District,Quadrangle Name,Watershed,HUC Code,FIPS Code,Latitude,Longitude,...,Funded GPRA Acres,Funded Metric Units,Completed Standard Units,Completed Costs,Completed GPRA Acres,Completed Metric Units,Unnamed: 40,geometry,index_right,GEOID10_TRACT


In [10]:
# Peek at the first five tract-annotated points.
gdf_tracts.iloc[:5]

Unnamed: 0,AMLIS Key,State/Tribe,County,Congressional District,Quadrangle Name,Watershed,HUC Code,FIPS Code,Latitude,Longitude,...,Funded GPRA Acres,Funded Metric Units,Completed Standard Units,Completed Costs,Completed GPRA Acres,Completed Metric Units,Unnamed: 40,geometry,index_right,GEOID10_TRACT
2,AK000001,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE C-8,,,02170,61.6,-149.8,...,0.0,0.0,900.0,33200.0,12.86,274.3,,POINT (-149.80000 61.60000),9900,2170000401
6,AK000003,AK,VALDEZ-CORDOVA,1.0,Valdez C-1,19050003.0,,02-26,61.6,-144.0,...,0.0,0.0,0.34,9200.0,0.03,0.34,,POINT (-144.00000 61.60000),9918,2261000100
100,AK000080,AK,VALDEZ-CORDOVA CENSU,1.0,MCCARTHY C-5,,,02261,61.5,-142.8,...,0.0,0.0,4.0,9924.0,0.4,4.0,,POINT (-142.80000 61.50000),9918,2261000100
113,AK000096,AK,VALDEZ-CORDOVA,1.0,MCCARTHY C-6,,,Alaska,61.6,-142.8,...,0.0,0.0,2.0,29729.0,0.2,2.0,,POINT (-142.80000 61.60000),9918,2261000100
12,AK000006,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE C-6,,,Alaska,61.7,-149.0,...,0.0,0.0,3.0,9225.0,0.3,3.0,,POINT (-149.00000 61.70000),9938,2170000200


In [16]:
# Shape after keeping a single row per tract ID.
gdf_tracts.drop_duplicates(subset=["GEOID10_TRACT"]).shape

(2034, 44)

In [24]:
# Load tract geometries from the path exposed by the Census ETL
# (presumably the national tract GeoJSON — confirm against CensusETL).
census_tracts = gpd.read_file(CensusETL.NATIONAL_TRACT_JSON_PATH)

In [26]:
# Index the tracts by GEOID10 for direct `.loc` lookups, keeping GEOID10
# available as a regular column too.
census_tracts = census_tracts.set_index("GEOID10", drop=False)

In [54]:
# (rows, columns) of the tract-annotated points.
gdf_tracts.shape

(3976, 44)

In [59]:
# Points whose tract ID already appeared on an earlier row, i.e. tracts that
# contain more than one point.
gdf_tracts.loc[gdf_tracts["GEOID10_TRACT"].duplicated()]

Unnamed: 0,AMLIS Key,State/Tribe,County,Congressional District,Quadrangle Name,Watershed,HUC Code,FIPS Code,Latitude,Longitude,...,Funded GPRA Acres,Funded Metric Units,Completed Standard Units,Completed Costs,Completed GPRA Acres,Completed Metric Units,Unnamed: 40,geometry,index_right,GEOID10_TRACT
100,AK000080,AK,VALDEZ-CORDOVA CENSU,1.0,MCCARTHY C-5,,,02261,61.5,-142.8,...,0.0,0.0,4.0,9924.0,0.4,4.0,,POINT (-142.80000 61.50000),9918,02261000100
113,AK000096,AK,VALDEZ-CORDOVA,1.0,MCCARTHY C-6,,,Alaska,61.6,-142.8,...,0.0,0.0,2.0,29729.0,0.2,2.0,,POINT (-142.80000 61.60000),9918,02261000100
30,AK000015,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE D-4,,,02170,61.7,-148.2,...,0.0,0.0,7.0,4100.0,0.7,7.0,,POINT (-148.20000 61.70000),9938,02170000200
45,AK000040,AK,MATANUSKA-SUSITNA,1.0,ANCHORAGE C-6,,,02170,61.7,-148.8,...,0.0,0.0,1.0,20284.0,0.1,1.0,,POINT (-148.80000 61.70000),9938,02170000200
117,AK000099,AK,MATANUSKA-SUSITNA,1.0,,,,02170,61.7,-148.4,...,0.0,0.0,0.0,0.0,0.0,0.0,,POINT (-148.40000 61.70000),9938,02170000200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57095,WY176742,WY,Campbell County,1.0,Little Thunder Reservoir,UPPER CHEYENNE,10120103.0,56005,43.7,-105.4,...,3.0,3.0,8.6,1407322.0,8.6,8.6,,POINT (-105.40000 43.70000),28394,56005000100
56861,WY082926,WY,PLATTE,1.0,Guernsey Reservoir,GLENDO RESERVOIR,10180008.0,56031,42.3,-104.7,...,0.0,0.0,1.0,293122.0,1.0,0.4,,POINT (-104.70000 42.30000),28402,56031959100
56864,WY086744,WY,PLATTE,1.0,HELL GAP,GLENDO RESERVOIR,10180008.0,56031,42.4,-104.7,...,0.0,0.0,1.0,18848.0,0.1,1.0,,POINT (-104.70000 42.40000),28402,56031959100
56930,WY102624,WY,FREMONT,1.0,Lookout Butte,LOWER WIND,10080005.0,56013,43.3,-108.7,...,0.0,0.0,0.0,0.0,0.0,0.0,,POINT (-108.70000 43.30000),28442,56013940201


# Assemble test data

## Get some test tracts

In [12]:
# Tracts used by the example test fixtures. Everything is read as object
# dtype so the IDs stay strings rather than being parsed as numbers.
example_extract_csv = "../tests/sources/example/data/extract.csv"
example_geoids = pd.read_csv(example_extract_csv, dtype="object")

In [17]:
# (rows, columns) of the example tract list.
example_geoids.shape

(15, 2)

In [13]:
# Alias only: no copy is made, so tract_df and gdf_tracts refer to the same
# object.
tract_df = gdf_tracts

In [14]:
# Left-join the example tract IDs onto the real points. The `_merge`
# indicator column marks example tracts that matched no real point as
# "left_only".
merged_exaple_data = pd.merge(
    left=example_geoids["GEOID10_TRACT"],
    right=tract_df,
    how="left",
    on="GEOID10_TRACT",
    indicator=True,
)

In [15]:
# Example tracts with no real eAMLIS point inside them.
merged_exaple_data.loc[merged_exaple_data["_merge"].eq("left_only")]

Unnamed: 0,GEOID10_TRACT,AMLIS Key,State/Tribe,County,Congressional District,Quadrangle Name,Watershed,HUC Code,FIPS Code,Latitude,...,Funded GPRA Acres,Funded Metric Units,Completed Standard Units,Completed Costs,Completed GPRA Acres,Completed Metric Units,Unnamed: 40,geometry,index_right,_merge
0,6027000800,,,,,,,,,,...,,,,,,,,,,left_only
1,6069000802,,,,,,,,,,...,,,,,,,,,,left_only
2,6061021322,,,,,,,,,,...,,,,,,,,,,left_only
3,15001021010,,,,,,,,,,...,,,,,,,,,,left_only
4,15001021101,,,,,,,,,,...,,,,,,,,,,left_only
5,15007040603,,,,,,,,,,...,,,,,,,,,,left_only
6,15007040700,,,,,,,,,,...,,,,,,,,,,left_only
7,15009030100,,,,,,,,,,...,,,,,,,,,,left_only
8,15009030201,,,,,,,,,,...,,,,,,,,,,left_only
9,15001021402,,,,,,,,,,...,,,,,,,,,,left_only


In [22]:
# View the first raw record as a dict (column name -> value).
dict(df.iloc[0])

{'AMLIS Key': 'AK000001',
 'State/Tribe': 'AK',
 'County': 'MATANUSKA-SUSITNA',
 'Congressional District': 1.0,
 'Quadrangle Name': 'ANCHORAGE C-8',
 'Watershed': nan,
 'HUC Code': nan,
 'FIPS Code': '02170',
 'Latitude': 61.6,
 'Longitude': -149.8,
 'Funding Source / Program': 'FRA',
 'Problem Area Name': 'EAST HOUSTON MINE',
 'Problem Area Number': 1,
 'Planning Unit Name': 'HOUSTON',
 'Planning Unit Number': '1',
 'Problem Priority': '1',
 'Problem Type': 'VO',
 'Mining Type': 'S',
 'Ore Types': nan,
 'Date Prepared': '12/3/1986 12:00:00 AM',
 'Date Revised': '4/23/2014 6:40:28 PM',
 'Private Owner %': 0.0,
 'State Owner %': 0.0,
 'Other Federal Owner %': 0.0,
 'Park Service Owner %': 0.0,
 'Forest Service Owner %': 0.0,
 'Indian Owner %': 0.0,
 'BLM Owner %': 0.0,
 'Unfunded Standard Units': 0.0,
 'Unfunded Costs': 0.0,
 'Unfunded GPRA Acres': 0.0,
 'Unfunded Metric Units': 0.0,
 'Funded Standard Units': 0.0,
 'Funded Costs': 0.0,
 'Funded GPRA Acres': 0.0,
 'Funded Metric Units': 

In [21]:
# Every merged column except the join key; the result comes back sorted.
merged_exaple_data.columns.difference(["GEOID10_TRACT"])

Index(['AMLIS Key', 'BLM Owner %', 'Completed Costs', 'Completed GPRA Acres',
       'Completed Metric Units', 'Completed Standard Units',
       'Congressional District', 'County', 'Date Prepared', 'Date Revised',
       'FIPS Code', 'Forest Service Owner %', 'Funded Costs',
       'Funded GPRA Acres', 'Funded Metric Units', 'Funded Standard Units',
       'Funding Source / Program', 'HUC Code', 'Indian Owner %', 'Latitude',
       'Longitude', 'Mining Type', 'Ore Types', 'Other Federal Owner %',
       'Park Service Owner %', 'Planning Unit Name', 'Planning Unit Number',
       'Private Owner %', 'Problem Area Name', 'Problem Area Number',
       'Problem Priority', 'Problem Type', 'Quadrangle Name', 'State Owner %',
       'State/Tribe', 'Unfunded Costs', 'Unfunded GPRA Acres',
       'Unfunded Metric Units', 'Unfunded Standard Units', 'Unnamed: 40',
       'Watershed', '_merge', 'geometry', 'index_right'],
      dtype='object')

In [36]:
def generate_fake_eamlis_row(tract: str, state: str) -> dict:
    """Build one synthetic eAMLIS record placed at a tract's centroid.

    Only the AMLIS key, the state and the coordinates vary between calls;
    every other field is a fixed placeholder value.

    Args:
        tract: Index label of the tract in the notebook-level
            ``census_tracts`` frame.
        state: Value stored in "State/Tribe" and used as the prefix of the
            AMLIS key.

    Returns:
        A dict mapping eAMLIS column names to values, in column order.

    Raises:
        KeyError: If ``tract`` is not present in ``census_tracts``.
    """
    # The running counter lives on the function object. It is advanced
    # before anything else, so the first generated key ends in 000001.
    row_id = generate_fake_eamlis_row._row_id + 1
    generate_fake_eamlis_row._row_id = row_id

    tract_centroid = census_tracts.loc[tract].geometry.centroid

    # Identification, location and descriptive fields.
    row = {
        "AMLIS Key": state + str(row_id).zfill(6),
        "State/Tribe": state,
        "County": "MATANUSKA-SUSITNA",
        "Congressional District": 1.0,
        "Quadrangle Name": "ANCHORAGE C-8",
        "Watershed": np.nan,
        "HUC Code": np.nan,
        "FIPS Code": "02170",
        "Latitude": tract_centroid.y,
        "Longitude": tract_centroid.x,
        "Funding Source / Program": "FRA",
        "Problem Area Name": "EAST HOUSTON MINE",
        "Problem Area Number": 1,
        "Planning Unit Name": "HOUSTON",
        "Planning Unit Number": "1",
        "Problem Priority": "1",
        "Problem Type": "VO",
        "Mining Type": "S",
        "Ore Types": np.nan,
        "Date Prepared": "12/3/1986 12:00:00 AM",
        "Date Revised": "4/23/2014 6:40:28 PM",
    }

    # Ownership shares and unfunded/funded figures are all zero.
    zero_valued_columns = (
        "Private Owner %",
        "State Owner %",
        "Other Federal Owner %",
        "Park Service Owner %",
        "Forest Service Owner %",
        "Indian Owner %",
        "BLM Owner %",
        "Unfunded Standard Units",
        "Unfunded Costs",
        "Unfunded GPRA Acres",
        "Unfunded Metric Units",
        "Funded Standard Units",
        "Funded Costs",
        "Funded GPRA Acres",
        "Funded Metric Units",
    )
    row.update(dict.fromkeys(zero_valued_columns, 0.0))

    # Completed-work figures, plus the trailing unnamed column found in the
    # raw export.
    row.update(
        {
            "Completed Standard Units": 2.0,
            "Completed Costs": 10000.0,
            "Completed GPRA Acres": 0.2,
            "Completed Metric Units": 2.0,
            "Unnamed: 40": np.nan,
        }
    )
    return row


# Start the key counter at zero when the function is (re)defined.
generate_fake_eamlis_row._row_id = 0

In [37]:
# Restart the key counter so that re-running this cell regenerates the same
# AMLIS keys instead of continuing from wherever the previous run stopped.
generate_fake_eamlis_row._row_id = 0

# Example tracts that matched no real eAMLIS point each get one fake row.
missing_tracts = merged_exaple_data.loc[
    merged_exaple_data["_merge"] == "left_only", "GEOID10_TRACT"
]

# Tract IDs starting with "06" are labelled CA; all others are labelled HI.
rows = [
    generate_fake_eamlis_row(tract, "CA" if tract.startswith("06") else "HI")
    for tract in missing_tracts
]

In [42]:
# Write the fake rows out as the zipped TSV fixture for the eAMLIS tests.
#
# The destination is derived from the pipeline's data path rather than from
# one developer's home directory.
# NOTE(review): assumes DATA_PATH is `<package>/data`, so its parent is the
# package root that contains `tests/` — confirm.
fixture_dir = (
    ExtractTransformLoad.DATA_PATH.parent / "tests" / "sources" / "eamlis" / "data"
)
fixture_dir.mkdir(parents=True, exist_ok=True)

# Name the archive member explicitly so it matches the TSV file name that is
# read after extraction, instead of relying on pandas' default member name
# for inferred zip compression.
pd.DataFrame(rows).to_csv(
    fixture_dir / "eAMLIS export of all data.tsv.zip",
    index=False,
    sep="\t",
    compression={
        "method": "zip",
        "archive_name": "eAMLIS export of all data.tsv",
    },
)

## Get the points for the geolocation mock

In [47]:
# For every example tract without a real point, map the centroid's
# (longitude, latitude) pair back to the tract ID.
left_only_tracts = merged_exaple_data.loc[
    merged_exaple_data["_merge"] == "left_only", "GEOID10_TRACT"
]
tract_centroids = {
    tract: census_tracts.loc[tract].geometry.centroid
    for tract in left_only_tracts
}
lookup_table = {
    (point.x, point.y): tract for tract, point in tract_centroids.items()
}

In [51]:
# Echo the generated fake rows for a visual check.
rows

[{'AMLIS Key': 'CA000001',
  'State/Tribe': 'CA',
  'County': 'MATANUSKA-SUSITNA',
  'Congressional District': 1.0,
  'Quadrangle Name': 'ANCHORAGE C-8',
  'Watershed': nan,
  'HUC Code': nan,
  'FIPS Code': '02170',
  'Latitude': 36.25161281807095,
  'Longitude': -117.11772856883819,
  'Funding Source / Program': 'FRA',
  'Problem Area Name': 'EAST HOUSTON MINE',
  'Problem Area Number': 1,
  'Planning Unit Name': 'HOUSTON',
  'Planning Unit Number': '1',
  'Problem Priority': '1',
  'Problem Type': 'VO',
  'Mining Type': 'S',
  'Ore Types': nan,
  'Date Prepared': '12/3/1986 12:00:00 AM',
  'Date Revised': '4/23/2014 6:40:28 PM',
  'Private Owner %': 0.0,
  'State Owner %': 0.0,
  'Other Federal Owner %': 0.0,
  'Park Service Owner %': 0.0,
  'Forest Service Owner %': 0.0,
  'Indian Owner %': 0.0,
  'BLM Owner %': 0.0,
  'Unfunded Standard Units': 0.0,
  'Unfunded Costs': 0.0,
  'Unfunded GPRA Acres': 0.0,
  'Unfunded Metric Units': 0.0,
  'Funded Standard Units': 0.0,
  'Funded Cost

In [48]:
# Echo the (longitude, latitude) -> tract ID mapping
# (presumably to be copied into the geolocation mock — confirm).
lookup_table

{(-117.11772856883819, 36.25161281807095): '06027000800',
 (-121.0070599015156, 36.5498780497345): '06069000802',
 (-121.40564726784282, 38.84602113669345): '06061021322',
 (-155.10321769858746, 19.49784370888389): '15001021010',
 (-154.89548634140738, 19.446650238354696): '15001021101',
 (-159.43665201302525, 21.904412260968197): '15007040603',
 (-159.52362041178708, 21.94208315793464): '15007040700',
 (-156.14177664396527, 20.72796381691298): '15009030100',
 (-156.2497797752935, 20.86486713282688): '15009030201',
 (-155.91378867633992, 19.516629328900667): '15001021402',
 (-155.81110884967674, 20.164406070883054): '15001021800',
 (-156.33064622489087, 20.825369670478306): '15009030402',
 (-156.54289869319305, 20.9170439162332): '15009030800',
 (-157.89225964427064, 21.556464980367483): '15003010201',
 (-159.48416846823164, 21.90754283544759): '15007040604'}