{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "24085186-3472-43d3-8b87-b5191c4f6ca6", "metadata": {}, "outputs": [], "source": [ "import geopandas as gpd\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "import sys\n", "\n", "# Put the directory two levels up (expected to contain the data_pipeline\n", "# package) on sys.path so the project imports below resolve.\n", "module_path = os.path.abspath(\"../..\")\n", "if module_path not in sys.path:\n", "    sys.path.append(module_path)\n", "\n", "from data_pipeline.config import settings\n", "from data_pipeline.etl.base import ExtractTransformLoad\n", "from data_pipeline.etl.sources.census.etl import CensusETL\n", "from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries\n", "from data_pipeline.utils import unzip_file_from_url" ] }, { "cell_type": "markdown", "id": "dbecd665-1c8a-40fe-a7fc-684ecf73f991", "metadata": {}, "source": [ "# Grab the data" ] }, { "cell_type": "code", "execution_count": 3, "id": "f451ea70-917c-45f9-adf9-9306436b955d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-08-16 11:50:57,573 [data_pipeline.utils] INFO Downloading https://justice40-data.s3.amazonaws.com/data-sources/eAMLIS export of all data.tsv.zip\n", "2022-08-16 11:50:57,857 [data_pipeline.utils] INFO Extracting /home/matt/active/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/downloaded-af59fffe-aec2-48b4-a57f-716b8dc7e0a3.zip\n" ] } ], "source": [ "# Scratch directory that receives the downloaded archive and its contents.\n", "tmp_path = ExtractTransformLoad.DATA_PATH / \"tmp\" / \"abandoned_mine_lands\"\n", "# Create directory if it doesn't exist\n", "tmp_path.mkdir(parents=True, exist_ok=True)\n", "\n", "# Name of the TSV this notebook reads after extraction; the S3 object is the\n", "# same name with \".zip\" appended. Defined once so the two cannot drift apart.\n", "eamlis_file_name = \"eAMLIS export of all data.tsv\"\n", "eamlis_path_in_s3 = (\n", "    f\"{settings.AWS_JUSTICE40_DATASOURCES_URL}/{eamlis_file_name}.zip\"\n", ")\n", "\n", "unzip_file_from_url(\n", "    file_url=eamlis_path_in_s3,\n", "    download_path=tmp_path,\n", "    unzipped_file_path=tmp_path,\n", ")\n", "\n", "eamlis_path = tmp_path / eamlis_file_name" ] }, { "cell_type": "code", "execution_count": 7, "id": 
"5cee0a59-4f69-4678-a7ab-877e57d06f1b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/matt/active/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/eAMLIS export of all data.tsv'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "str(eamlis_path)" ] }, { "cell_type": "code", "execution_count": 4, "id": "5b8119d8-315b-4e87-b286-3767158d63ab", "metadata": {}, "outputs": [], "source": [ "# The export is tab-separated. low_memory=False has pandas parse the file in\n", "# one pass so column dtypes are inferred consistently.\n", "df = pd.read_csv(eamlis_path, sep=\"\\t\", low_memory=False)" ] }, { "cell_type": "code", "execution_count": 5, "id": "4fbac263-5868-4fbc-bcb7-168ef479af53", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(57149, 42)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Turn each record into a point built from its Longitude/Latitude columns,\n", "# tagged as EPSG:4326 coordinates.\n", "gdf = gpd.GeoDataFrame(\n", "    df,\n", "    geometry=gpd.points_from_xy(\n", "        x=df[\"Longitude\"],\n", "        y=df[\"Latitude\"],\n", "    ),\n", "    crs=\"epsg:4326\",\n", ")\n", "gdf.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "3b176f62-2d13-4bd2-9211-0ac7c2807146", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3977, 42)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep one record (the last) per distinct point. Reassigning instead of using\n", "# inplace=True avoids mutating the frame an earlier cell already displayed.\n", "# NOTE(review): in the recorded run this reduces 57,149 rows to 3,977 --\n", "# confirm that dropping the other records at a shared point is intended.\n", "gdf = gdf.drop_duplicates(subset=[\"geometry\"], keep=\"last\")\n", "gdf.shape" ] }, { "cell_type": "code", "execution_count": 7, "id": "a0116c52-58f5-48a6-aa9e-c49873ebafa7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-08-16 11:51:28,795 [data_pipeline.etl.sources.geo_utils] DEBUG Appending tract data to dataframe\n", "2022-08-16 11:51:28,796 [data_pipeline.etl.sources.geo_utils] INFO Loading tract geometry data from census ETL\n", "2022-08-16 11:51:28,796 [data_pipeline.etl.sources.geo_utils] DEBUG Loading existing tract geojson\n", "/home/matt/.cache/pypoetry/virtualenvs/justice40-data-pipeline-IwBjhw-4-py3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3553: FutureWarning: The 
`op` parameter is deprecated and will be removed in a future release. Please use the `predicate` parameter instead.\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] } ], "source": [ "# Attach census tract information to each point via the project helper.\n", "gdf_tracts = add_tracts_for_geometries(gdf)" ] }, { "cell_type": "code", "execution_count": 8, "id": "2a18ba4d-274b-4640-a83c-02ae1d02837c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | AMLIS Key | \n", "State/Tribe | \n", "County | \n", "Congressional District | \n", "Quadrangle Name | \n", "Watershed | \n", "HUC Code | \n", "FIPS Code | \n", "Latitude | \n", "Longitude | \n", "... | \n", "Funded GPRA Acres | \n", "Funded Metric Units | \n", "Completed Standard Units | \n", "Completed Costs | \n", "Completed GPRA Acres | \n", "Completed Metric Units | \n", "Unnamed: 40 | \n", "geometry | \n", "index_right | \n", "GEOID10_TRACT | \n", "
---|
0 rows × 44 columns
\n", "\n", " | AMLIS Key | \n", "State/Tribe | \n", "County | \n", "Congressional District | \n", "Quadrangle Name | \n", "Watershed | \n", "HUC Code | \n", "FIPS Code | \n", "Latitude | \n", "Longitude | \n", "... | \n", "Funded GPRA Acres | \n", "Funded Metric Units | \n", "Completed Standard Units | \n", "Completed Costs | \n", "Completed GPRA Acres | \n", "Completed Metric Units | \n", "Unnamed: 40 | \n", "geometry | \n", "index_right | \n", "GEOID10_TRACT | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | \n", "AK000001 | \n", "AK | \n", "MATANUSKA-SUSITNA | \n", "1.0 | \n", "ANCHORAGE C-8 | \n", "NaN | \n", "NaN | \n", "02170 | \n", "61.6 | \n", "-149.8 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "900.00 | \n", "33200.0 | \n", "12.86 | \n", "274.30 | \n", "NaN | \n", "POINT (-149.80000 61.60000) | \n", "9900 | \n", "02170000401 | \n", "
6 | \n", "AK000003 | \n", "AK | \n", "VALDEZ-CORDOVA | \n", "1.0 | \n", "Valdez C-1 | \n", "19050003 | \n", "NaN | \n", "02-26 | \n", "61.6 | \n", "-144.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.34 | \n", "9200.0 | \n", "0.03 | \n", "0.34 | \n", "NaN | \n", "POINT (-144.00000 61.60000) | \n", "9918 | \n", "02261000100 | \n", "
100 | \n", "AK000080 | \n", "AK | \n", "VALDEZ-CORDOVA CENSU | \n", "1.0 | \n", "MCCARTHY C-5 | \n", "NaN | \n", "NaN | \n", "02261 | \n", "61.5 | \n", "-142.8 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "4.00 | \n", "9924.0 | \n", "0.40 | \n", "4.00 | \n", "NaN | \n", "POINT (-142.80000 61.50000) | \n", "9918 | \n", "02261000100 | \n", "
113 | \n", "AK000096 | \n", "AK | \n", "VALDEZ-CORDOVA | \n", "1.0 | \n", "MCCARTHY C-6 | \n", "NaN | \n", "NaN | \n", "Alaska | \n", "61.6 | \n", "-142.8 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "2.00 | \n", "29729.0 | \n", "0.20 | \n", "2.00 | \n", "NaN | \n", "POINT (-142.80000 61.60000) | \n", "9918 | \n", "02261000100 | \n", "
12 | \n", "AK000006 | \n", "AK | \n", "MATANUSKA-SUSITNA | \n", "1.0 | \n", "ANCHORAGE C-6 | \n", "NaN | \n", "NaN | \n", "Alaska | \n", "61.7 | \n", "-149.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "3.00 | \n", "9225.0 | \n", "0.30 | \n", "3.00 | \n", "NaN | \n", "POINT (-149.00000 61.70000) | \n", "9938 | \n", "02170000200 | \n", "
5 rows × 44 columns
\n", "\n", " | AMLIS Key | \n", "State/Tribe | \n", "County | \n", "Congressional District | \n", "Quadrangle Name | \n", "Watershed | \n", "HUC Code | \n", "FIPS Code | \n", "Latitude | \n", "Longitude | \n", "... | \n", "Funded GPRA Acres | \n", "Funded Metric Units | \n", "Completed Standard Units | \n", "Completed Costs | \n", "Completed GPRA Acres | \n", "Completed Metric Units | \n", "Unnamed: 40 | \n", "geometry | \n", "index_right | \n", "GEOID10_TRACT | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
100 | \n", "AK000080 | \n", "AK | \n", "VALDEZ-CORDOVA CENSU | \n", "1.0 | \n", "MCCARTHY C-5 | \n", "NaN | \n", "NaN | \n", "02261 | \n", "61.5 | \n", "-142.8 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "4.0 | \n", "9924.0 | \n", "0.4 | \n", "4.0 | \n", "NaN | \n", "POINT (-142.80000 61.50000) | \n", "9918 | \n", "02261000100 | \n", "
113 | \n", "AK000096 | \n", "AK | \n", "VALDEZ-CORDOVA | \n", "1.0 | \n", "MCCARTHY C-6 | \n", "NaN | \n", "NaN | \n", "Alaska | \n", "61.6 | \n", "-142.8 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "29729.0 | \n", "0.2 | \n", "2.0 | \n", "NaN | \n", "POINT (-142.80000 61.60000) | \n", "9918 | \n", "02261000100 | \n", "
30 | \n", "AK000015 | \n", "AK | \n", "MATANUSKA-SUSITNA | \n", "1.0 | \n", "ANCHORAGE D-4 | \n", "NaN | \n", "NaN | \n", "02170 | \n", "61.7 | \n", "-148.2 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "7.0 | \n", "4100.0 | \n", "0.7 | \n", "7.0 | \n", "NaN | \n", "POINT (-148.20000 61.70000) | \n", "9938 | \n", "02170000200 | \n", "
45 | \n", "AK000040 | \n", "AK | \n", "MATANUSKA-SUSITNA | \n", "1.0 | \n", "ANCHORAGE C-6 | \n", "NaN | \n", "NaN | \n", "02170 | \n", "61.7 | \n", "-148.8 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "20284.0 | \n", "0.1 | \n", "1.0 | \n", "NaN | \n", "POINT (-148.80000 61.70000) | \n", "9938 | \n", "02170000200 | \n", "
117 | \n", "AK000099 | \n", "AK | \n", "MATANUSKA-SUSITNA | \n", "1.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "02170 | \n", "61.7 | \n", "-148.4 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "POINT (-148.40000 61.70000) | \n", "9938 | \n", "02170000200 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
57095 | \n", "WY176742 | \n", "WY | \n", "Campbell County | \n", "1.0 | \n", "Little Thunder Reservoir | \n", "UPPER CHEYENNE | \n", "10120103.0 | \n", "56005 | \n", "43.7 | \n", "-105.4 | \n", "... | \n", "3.0 | \n", "3.0 | \n", "8.6 | \n", "1407322.0 | \n", "8.6 | \n", "8.6 | \n", "NaN | \n", "POINT (-105.40000 43.70000) | \n", "28394 | \n", "56005000100 | \n", "
56861 | \n", "WY082926 | \n", "WY | \n", "PLATTE | \n", "1.0 | \n", "Guernsey Reservoir | \n", "GLENDO RESERVOIR | \n", "10180008.0 | \n", "56031 | \n", "42.3 | \n", "-104.7 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "293122.0 | \n", "1.0 | \n", "0.4 | \n", "NaN | \n", "POINT (-104.70000 42.30000) | \n", "28402 | \n", "56031959100 | \n", "
56864 | \n", "WY086744 | \n", "WY | \n", "PLATTE | \n", "1.0 | \n", "HELL GAP | \n", "GLENDO RESERVOIR | \n", "10180008.0 | \n", "56031 | \n", "42.4 | \n", "-104.7 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "18848.0 | \n", "0.1 | \n", "1.0 | \n", "NaN | \n", "POINT (-104.70000 42.40000) | \n", "28402 | \n", "56031959100 | \n", "
56930 | \n", "WY102624 | \n", "WY | \n", "FREMONT | \n", "1.0 | \n", "Lookout Butte | \n", "LOWER WIND | \n", "10080005.0 | \n", "56013 | \n", "43.3 | \n", "-108.7 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "POINT (-108.70000 43.30000) | \n", "28442 | \n", "56013940201 | \n", "
57021 | \n", "WY132533 | \n", "WY | \n", "FREMONT | \n", "1.0 | \n", "Eagle Point | \n", "LOWER WIND | \n", "10080005.0 | \n", "56013 | \n", "43.4 | \n", "-108.7 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "POINT (-108.70000 43.40000) | \n", "28442 | \n", "56013940201 | \n", "
1942 rows × 44 columns
\n", "\n", " | GEOID10_TRACT | \n", "AMLIS Key | \n", "State/Tribe | \n", "County | \n", "Congressional District | \n", "Quadrangle Name | \n", "Watershed | \n", "HUC Code | \n", "FIPS Code | \n", "Latitude | \n", "... | \n", "Funded GPRA Acres | \n", "Funded Metric Units | \n", "Completed Standard Units | \n", "Completed Costs | \n", "Completed GPRA Acres | \n", "Completed Metric Units | \n", "Unnamed: 40 | \n", "geometry | \n", "index_right | \n", "_merge | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "06027000800 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
1 | \n", "06069000802 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
2 | \n", "06061021322 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
3 | \n", "15001021010 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
4 | \n", "15001021101 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
5 | \n", "15007040603 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
6 | \n", "15007040700 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
7 | \n", "15009030100 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
8 | \n", "15009030201 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
9 | \n", "15001021402 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
10 | \n", "15001021800 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
11 | \n", "15009030402 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
12 | \n", "15009030800 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
13 | \n", "15003010201 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
14 | \n", "15007040604 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "left_only | \n", "
15 rows × 45 columns
\n", "