diff --git a/data/data-pipeline/data_pipeline/ipython/eAMLIS_and_lat_long.ipynb b/data/data-pipeline/data_pipeline/ipython/eAMLIS_and_lat_long.ipynb index f72a8c52..0dfb7cd4 100644 --- a/data/data-pipeline/data_pipeline/ipython/eAMLIS_and_lat_long.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/eAMLIS_and_lat_long.ipynb @@ -10,13 +10,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO: Pandarallel will run on 8 workers.\n", + "INFO: Pandarallel will run on 4 workers.\n", "INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n" ] } ], "source": [ "# These are probably more imports than we need. Copied from comparison tool\n", + "import geopandas\n", "import pandas as pd\n", "import os\n", "import sys\n", @@ -29,7 +30,7 @@ "pandarallel.initialize(\n", " progress_bar=True,\n", " # If nb_workers is not set, it defaults to available cores.\n", - " nb_workers=8,\n", + " # nb_workers=8,\n", ")\n", "\n", "\n", @@ -53,49 +54,168 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "id": "4c907bdb", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-07-11 19:42:16,399 [data_pipeline.etl.sources.census.etl_utils] INFO Downloading fips from S3 repository\n", - "2022-07-11 19:42:16,403 [data_pipeline.utils] INFO Downloading https://justice40-data.s3.amazonaws.com/data-sources/fips_states_2010.zip\n", - "2022-07-11 19:42:16,801 [data_pipeline.utils] INFO Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/downloaded-b6413152-455e-47a5-817a-a96e3a892044.zip\n" + "2022-07-15 22:38:19,438 [data_pipeline.etl.sources.census.etl_utils] INFO Downloading fips from S3 repository\n", + "2022-07-15 22:38:19,443 [data_pipeline.utils] INFO Downloading https://justice40-data.s3.amazonaws.com/data-sources/fips_states_2010.zip\n", + "2022-07-15 22:38:19,867 [data_pipeline.utils] INFO Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/downloaded-8d08eaa9-3c28-469c-8e52-f9ef049d9fce.zip\n" ] } ], "source": [ "# Define some input fields\n", - "LAT_FIELD = \"Latitude\"\n", - "LONG_FIELD = \"Longitude\"\n", - "KEY_FIELD = \"AMLIS Key\"\n", + "EAMLIS_LAT_FIELD = \"Latitude\"\n", + "EAMLIS_LONG_FIELD = \"Longitude\"\n", + "EAMLIS_KEY_FIELD = \"AMLIS Key\"\n", + "\n", + "FUDS_LAT_FIELD = \"LATITUDE\"\n", + "FUDS_LONG_FIELD = \"LONGITUDE\"\n", + "FUDS_COLUMNS_TO_KEEP = [\n", + " FUDS_LAT_FIELD,\n", + " FUDS_LONG_FIELD,\n", + " \"FUDSUNIQUEPROPERTYNUMBER\",\n", + " \"CURRENTOWNER\",\n", + " \"ELIGIBILITY\",\n", + " \"EMSMGMTACTIONPLANLINK\",\n", + " \"FEATUREDESCRIPTION\",\n", + " \"FEATURENAME\",\n", + " \"FUDSINSTALLATIONID\",\n", + " \"HASPROJECTS\",\n", + " \"STATUS\",\n", + " \"PROPERTY_HISTORY\",\n", + "]\n", "\n", "\n", - "# TODO: switch to whole US\n", + "# Geojson input fields\n", "GEOJSON_PATH = CensusETL().GEOJSON_PATH / \"us.json\"\n", - "# GEOJSON_PATH = CensusETL().GEOJSON_PATH / \"02.json\"\n", "GEOJSON_TRACT_ID_FIELD = \"GEOID10\"\n", "\n", - "# Choose output directory:\n", - "OUTPUT_DIR = ExtractTransformLoad.DATA_PATH / \"abandoned_mine_lands\"\n", + "\n", + "# Choose output directories:\n", + "FUDS_OUTPUT_DIR = ExtractTransformLoad.DATA_PATH / \"formerly_used_defense_sites\"\n", "# Create directory if it doesn't exist\n", - "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)" + "FUDS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Choose output directory:\n", + "EAMLIS_OUTPUT_DIR = ExtractTransformLoad.DATA_PATH / \"abandoned_mine_lands\"\n", + "# Create directory if it doesn't exist\n", + "EAMLIS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "d760ddc5", + "execution_count": 10, + "id": "a3796020", "metadata": {}, + "outputs": [], + "source": [ + "# METHOD DEFINITIONS\n", + "def get_census_tract_for_one_coordinate(\n", + " geom_point: shapely.geometry.point.Point,\n", + " census_tract_gdf: geopandas.geodataframe.GeoDataFrame,\n", + ") -> str:\n", + " # (predicate=\"within\") and left join from mines to score[[\"tract\", \"geometry\"]].\n", + "\n", + " # TODO: consider switching this order from `all polygons CONTAIN single point` (e.g., all tracts CONTAIN single mine) to \n", + " # `all points WITHIN single polygon`. However, this requires refactoring the whole method to \n", + " # iterate through each polygon rather than each point. \n", + " \n", + " contains_result = census_tract_gdf.contains(geom_point)\n", + " count_of_census_tract_matches = len(census_tract_gdf[contains_result])\n", + "\n", + " if count_of_census_tract_matches == 0:\n", + " warnings.warn(\n", + " f\"Warning: no tract matches for {geom_point}\",\n", + " DeprecationWarning,\n", + " stacklevel=2,\n", + " )\n", + " census_tract_id = None\n", + "\n", + " elif count_of_census_tract_matches > 1:\n", + " warnings.warn(\n", + " f\"Warning: too many tract matches for {geom_point}\",\n", + " DeprecationWarning,\n", + " stacklevel=2,\n", + " )\n", + " census_tract_id = None\n", + "\n", + " else:\n", + " # With only one tract returned, extract the ID.\n", + " census_tract_id = census_tract_gdf[contains_result][\n", + " GEOJSON_TRACT_ID_FIELD\n", + " ].values[0]\n", + "\n", + " return census_tract_id\n", + "\n", + "\n", + "def get_census_tracts_for_geom_points(\n", + " points_gdf: geopandas.geodataframe.GeoDataFrame,\n", + " census_tract_gdf: geopandas.geodataframe.GeoDataFrame,\n", + ") -> geopandas.geodataframe.GeoDataFrame:\n", + " geometry_column_name = \"geometry\"\n", + " result_gdf = points_gdf.parallel_apply(\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + " geom_point=frame[geometry_column_name], census_tract_gdf=census_tract_gdf\n", + " ),\n", + " axis=1,\n", + " )\n", + " return result_gdf\n", + "\n", + "\n", + "def get_census_tracts_for_dataframe_with_lat_long(\n", + " coordinates_df: pd.DataFrame,\n", + " latitude_column: str,\n", + " longitude_column: str,\n", + " census_tract_gdf: geopandas.geodataframe.GeoDataFrame,\n", + "):\n", + " # First, convert the plain DataFrame into a geopandas data frame with lat/long geometry points.\n", + " coordinates_geopandas_gdf = geopandas.GeoDataFrame(\n", + " coordinates_df,\n", + " geometry=geopandas.points_from_xy(\n", + " x=coordinates_df[longitude_column],\n", + " y=coordinates_df[latitude_column],\n", + " ),\n", + " )\n", + "\n", + " # Find the tract IDs for each point.\n", + " tract_results = get_census_tracts_for_geom_points(\n", + " points_gdf=coordinates_geopandas_gdf, census_tract_gdf=census_tract_gdf\n", + " )\n", + "\n", + " # Join the tract IDs back on the original dataframe\n", + " coordinates_with_tracts_df = coordinates_df\n", + " coordinates_with_tracts_df[\n", + " ExtractTransformLoad.GEOID_TRACT_FIELD_NAME\n", + " ] = tract_results\n", + "\n", + " # Remove unnecessary `geometry` column\n", + " # For unclear reasons, the initial `GeoDataFrame` creates a `geometry` column on the input dataframe that we don't want.\n", + " coordinates_with_tracts_df = coordinates_with_tracts_df.drop(\"geometry\", axis=1)\n", + "\n", + " return coordinates_with_tracts_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d760ddc5", + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Code took 884.5575151443481 seconds.\n", + "Code took 308.0567150115967 seconds.\n", " STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 \\\n", "0 27 139 080202 27139080202 802.02 \n", "1 27 139 080204 27139080204 802.04 \n", @@ -153,10 +273,9 @@ } ], "source": [ - "# Load geojson\n", - "import geopandas\n", - "\n", "t1 = time.time()\n", + "\n", + "# Takes ~4 minutes with all of USA.\n", "census_tract_gdf = geopandas.read_file(\n", " GEOJSON_PATH,\n", " # Use `pyogrio` because it's vectorized and faster.\n", @@ -170,9 +289,824 @@ "print(census_tract_gdf)" ] }, + { + "cell_type": "markdown", + "id": "02ae1a0c", + "metadata": {}, + "source": [ + "# Start work on FUDS" + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, + "id": "f8badb15", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-15 22:52:44,802 [data_pipeline.utils] INFO Downloading https://justice40-data.s3.amazonaws.com/data-sources/fuds_all_fy2019.csv.zip\n", + "2022-07-15 22:53:12,094 [data_pipeline.utils] INFO Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/downloaded-17fdff8f-5857-40b5-8de7-c79944f99095.zip\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LATITUDELONGITUDEFUDSUNIQUEPROPERTYNUMBERCURRENTOWNERELIGIBILITYEMSMGMTACTIONPLANLINKFEATUREDESCRIPTIONFEATURENAMEFUDSINSTALLATIONIDHASPROJECTSSTATUSPROPERTY_HISTORY
030.098611-93.722222K06TX0667LOCAL: CITY INDIVIDUAL OWNERSEligiblehttps://fudsportal.usace.army.mil/ems/inventor...The site was initially acquired in 1946 and us...ORANGE PORT OF NAV SHIP STORTX69799F675300YesProperties with projectsThe site was initially acquired in 1946 and us...
133.809700-95.628304K06TX0305DOD: USACE -AGRICULTURAL, RECREATIONAL, AND FL...Eligiblehttps://fudsportal.usace.army.mil/ems/inventor...Camp Maxey was activated in July 1942. It was ...CAMP MAXEYTX69799F668600YesProperties with projectsCamp Maxey was activated in July 1942. It was ...
435.746111-95.412778K06OK0186LOCAL: CITY CITY MUNICIPAL AIRFIELD\\nEligiblehttps://fudsportal.usace.army.mil/ems/inventor...HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...MUSKOGEE AUX AFOK69799F639800YesProperties with all projects at site closeoutHATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...
636.226944-95.330000K06OK0025PRIV: PRIVATE CURRENTLY USED AS AN INDUSTRIAL ...Eligiblehttps://fudsportal.usace.army.mil/ems/inventor...The DoD began use in the early 1940s when the ...OKLAHOMA ORDNANCE WORKSOK69799F636200YesProperties with all projects at site closeoutThe DoD began use in the early 1940s when the ...
936.023333-102.541667K06TX0268LOCAL: CITY THE SITE IS NOW USED AS A MUNICIPA...Eligiblehttps://fudsportal.usace.army.mil/ems/inventor...In 1942, the DoD acquired 6,235.16 acres for u...DALHART AAFTX69799F665100YesProperties with projectsIn 1942, the DoD acquired 6,235.16 acres for u...
.......................................
1008051.379444179.293889F10AK0858FWS: USFWSTRIBE: NATIVE AMERICAN ANCSA NATIVE ...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNAMCHITKA AF AUXILIARY FIELDAK09799F709900YesProperties with projectsNaN
1010059.266111-135.448889F10AK1016OTHER: Private Landowner has not been identifi...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNHAINES FAIRBANKS PIPELINEAK09799F980700YesProperties with projectsNaN
1010161.200278-149.900278F10AK1023STATE: STATE ALL BUILDINGS TURNED OVER TO THE ...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNCAMP ANCHORAGE ARMYAK09799FA25200YesProperties with all projects at site closeoutNaN
1010260.555556-151.267778F10AK1024NaNEligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNFORT KENAI ARMY POSTAK09799FA25300YesProperties with all projects at site closeoutNaN
1010563.919444-145.303889F10AK1033OTHER: Native AllotmentBLM: BLMPUBLC: AT&T STA...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNCANOL PIPELINEAK09799FA27900YesProperties with projectsNaN
\n", + "

2866 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " LATITUDE LONGITUDE FUDSUNIQUEPROPERTYNUMBER \\\n", + "0 30.098611 -93.722222 K06TX0667 \n", + "1 33.809700 -95.628304 K06TX0305 \n", + "4 35.746111 -95.412778 K06OK0186 \n", + "6 36.226944 -95.330000 K06OK0025 \n", + "9 36.023333 -102.541667 K06TX0268 \n", + "... ... ... ... \n", + "10080 51.379444 179.293889 F10AK0858 \n", + "10100 59.266111 -135.448889 F10AK1016 \n", + "10101 61.200278 -149.900278 F10AK1023 \n", + "10102 60.555556 -151.267778 F10AK1024 \n", + "10105 63.919444 -145.303889 F10AK1033 \n", + "\n", + " CURRENTOWNER ELIGIBILITY \\\n", + "0 LOCAL: CITY INDIVIDUAL OWNERS Eligible \n", + "1 DOD: USACE -AGRICULTURAL, RECREATIONAL, AND FL... Eligible \n", + "4 LOCAL: CITY CITY MUNICIPAL AIRFIELD\\n Eligible \n", + "6 PRIV: PRIVATE CURRENTLY USED AS AN INDUSTRIAL ... Eligible \n", + "9 LOCAL: CITY THE SITE IS NOW USED AS A MUNICIPA... Eligible \n", + "... ... ... \n", + "10080 FWS: USFWSTRIBE: NATIVE AMERICAN ANCSA NATIVE ... Eligible \n", + "10100 OTHER: Private Landowner has not been identifi... Eligible \n", + "10101 STATE: STATE ALL BUILDINGS TURNED OVER TO THE ... Eligible \n", + "10102 NaN Eligible \n", + "10105 OTHER: Native AllotmentBLM: BLMPUBLC: AT&T STA... Eligible \n", + "\n", + " EMSMGMTACTIONPLANLINK \\\n", + "0 https://fudsportal.usace.army.mil/ems/inventor... \n", + "1 https://fudsportal.usace.army.mil/ems/inventor... \n", + "4 https://fudsportal.usace.army.mil/ems/inventor... \n", + "6 https://fudsportal.usace.army.mil/ems/inventor... \n", + "9 https://fudsportal.usace.army.mil/ems/inventor... \n", + "... ... \n", + "10080 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10100 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10101 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10102 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10105 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "\n", + " FEATUREDESCRIPTION \\\n", + "0 The site was initially acquired in 1946 and us... \n", + "1 Camp Maxey was activated in July 1942. It was ... \n", + "4 HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND... \n", + "6 The DoD began use in the early 1940s when the ... \n", + "9 In 1942, the DoD acquired 6,235.16 acres for u... \n", + "... ... \n", + "10080 NaN \n", + "10100 NaN \n", + "10101 NaN \n", + "10102 NaN \n", + "10105 NaN \n", + "\n", + " FEATURENAME FUDSINSTALLATIONID HASPROJECTS \\\n", + "0 ORANGE PORT OF NAV SHIP STOR TX69799F675300 Yes \n", + "1 CAMP MAXEY TX69799F668600 Yes \n", + "4 MUSKOGEE AUX AF OK69799F639800 Yes \n", + "6 OKLAHOMA ORDNANCE WORKS OK69799F636200 Yes \n", + "9 DALHART AAF TX69799F665100 Yes \n", + "... ... ... ... \n", + "10080 AMCHITKA AF AUXILIARY FIELD AK09799F709900 Yes \n", + "10100 HAINES FAIRBANKS PIPELINE AK09799F980700 Yes \n", + "10101 CAMP ANCHORAGE ARMY AK09799FA25200 Yes \n", + "10102 FORT KENAI ARMY POST AK09799FA25300 Yes \n", + "10105 CANOL PIPELINE AK09799FA27900 Yes \n", + "\n", + " STATUS \\\n", + "0 Properties with projects \n", + "1 Properties with projects \n", + "4 Properties with all projects at site closeout \n", + "6 Properties with all projects at site closeout \n", + "9 Properties with projects \n", + "... ... \n", + "10080 Properties with projects \n", + "10100 Properties with projects \n", + "10101 Properties with all projects at site closeout \n", + "10102 Properties with all projects at site closeout \n", + "10105 Properties with projects \n", + "\n", + " PROPERTY_HISTORY \n", + "0 The site was initially acquired in 1946 and us... \n", + "1 Camp Maxey was activated in July 1942. It was ... \n", + "4 HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND... \n", + "6 The DoD began use in the early 1940s when the ... \n", + "9 In 1942, the DoD acquired 6,235.16 acres for u... \n", + "... ... \n", + "10080 NaN \n", + "10100 NaN \n", + "10101 NaN \n", + "10102 NaN \n", + "10105 NaN \n", + "\n", + "[2866 rows x 12 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Data accessed from: \n", + "# \"https://opendata.arcgis.com/api/v3/datasets/3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/data?format=csv&spatialRefId=4326&where=1%3D1\"\n", + "\n", + "# fuds_url = \"https://opendata.arcgis.com/api/v3/datasets/3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/data?format=csv&spatialRefId=4326&where=1%3D1\"\n", + "\n", + "# Create temporary path\n", + "fuds_tmp_path = ExtractTransformLoad.DATA_PATH / \"tmp\" / \"abandoned_mine_lands\"\n", + "# Create directory if it doesn't exist\n", + "fuds_tmp_path.mkdir(parents=True, exist_ok=True)\n", + "\n", + "fuds_path_in_s3 = (\n", + " settings.AWS_JUSTICE40_DATASOURCES_URL + \"/fuds_all_fy2019.csv.zip\"\n", + ")\n", + "\n", + "unzip_file_from_url(\n", + " file_url=fuds_path_in_s3,\n", + " download_path=fuds_tmp_path,\n", + " unzipped_file_path=fuds_tmp_path,\n", + ")\n", + "\n", + "fuds_path = fuds_tmp_path / \"fuds_all_fy2019.csv\"\n", + "\n", + "fuds_source_df = pd.read_csv(\n", + " filepath_or_buffer=fuds_path\n", + ")\n", + "\n", + "# Only keep \"eligible\" sites with projects.\n", + "# TODO: confirm this is an appropriate interpretation of the eligible field.\n", + "fuds_source_df = fuds_source_df[fuds_source_df[\"ELIGIBILITY\"] == \"Eligible\"]\n", + "fuds_source_df = fuds_source_df[fuds_source_df[\"HASPROJECTS\"] == \"Yes\"]\n", + "\n", + "# Drop columns that are not meaningful.\n", + "fuds_source_df = fuds_source_df[FUDS_COLUMNS_TO_KEEP]\n", + "\n", + "fuds_source_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "700e5d7b", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: delete! \n", + "# fuds_source_df_backup = fuds_source_df\n", + "\n", + "# fuds_source_df = fuds_source_df[0:100]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5c47a326", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lucas/.virtualenvs/scoring2/lib/python3.9/site-packages/geopandas/array.py:275: ShapelyDeprecationWarning: The array interface is deprecated and will no longer work in Shapely 2.0. Convert the '.coords' to a numpy array instead.\n", + " return GeometryArray(vectorized.points_from_xy(x, y, z), crs=crs)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "23c8c927612849cba95965eb6dc88f38", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=717), Label(value='0 / 717'))), HB…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT (-157.93194444 21.20222222)\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT EMPTY\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT (-162.082001 5.88724)\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT (-157.93055556 21.24666667)\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT EMPTY\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT (76.454 38.33174)\n", + " lambda frame: get_census_tract_for_one_coordinate(\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_31785/1938123.py:46: DeprecationWarning: Warning: no tract matches for POINT (-170.0581 -14.27345)\n", + " lambda frame: get_census_tract_for_one_coordinate(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Code took 2248.5825169086456 seconds.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LATITUDELONGITUDEFUDSUNIQUEPROPERTYNUMBERCURRENTOWNERELIGIBILITYEMSMGMTACTIONPLANLINKFEATUREDESCRIPTIONFEATURENAMEFUDSINSTALLATIONIDHASPROJECTSSTATUSPROPERTY_HISTORYGEOID10_TRACT
030.098611-93.722222K06TX0667LOCAL: CITY INDIVIDUAL OWNERSEligiblehttps://fudsportal.usace.army.mil/ems/inventor...The site was initially acquired in 1946 and us...ORANGE PORT OF NAV SHIP STORTX69799F675300YesProperties with projectsThe site was initially acquired in 1946 and us...48361020200
133.809700-95.628304K06TX0305DOD: USACE -AGRICULTURAL, RECREATIONAL, AND FL...Eligiblehttps://fudsportal.usace.army.mil/ems/inventor...Camp Maxey was activated in July 1942. It was ...CAMP MAXEYTX69799F668600YesProperties with projectsCamp Maxey was activated in July 1942. It was ...48277000102
435.746111-95.412778K06OK0186LOCAL: CITY CITY MUNICIPAL AIRFIELD\\nEligiblehttps://fudsportal.usace.army.mil/ems/inventor...HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...MUSKOGEE AUX AFOK69799F639800YesProperties with all projects at site closeoutHATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND...40101000100
636.226944-95.330000K06OK0025PRIV: PRIVATE CURRENTLY USED AS AN INDUSTRIAL ...Eligiblehttps://fudsportal.usace.army.mil/ems/inventor...The DoD began use in the early 1940s when the ...OKLAHOMA ORDNANCE WORKSOK69799F636200YesProperties with all projects at site closeoutThe DoD began use in the early 1940s when the ...40097040400
936.023333-102.541667K06TX0268LOCAL: CITY THE SITE IS NOW USED AS A MUNICIPA...Eligiblehttps://fudsportal.usace.army.mil/ems/inventor...In 1942, the DoD acquired 6,235.16 acres for u...DALHART AAFTX69799F665100YesProperties with projectsIn 1942, the DoD acquired 6,235.16 acres for u...48205950200
..........................................
1008051.379444179.293889F10AK0858FWS: USFWSTRIBE: NATIVE AMERICAN ANCSA NATIVE ...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNAMCHITKA AF AUXILIARY FIELDAK09799F709900YesProperties with projectsNaN02016000100
1010059.266111-135.448889F10AK1016OTHER: Private Landowner has not been identifi...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNHAINES FAIRBANKS PIPELINEAK09799F980700YesProperties with projectsNaN02100000100
1010161.200278-149.900278F10AK1023STATE: STATE ALL BUILDINGS TURNED OVER TO THE ...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNCAMP ANCHORAGE ARMYAK09799FA25200YesProperties with all projects at site closeoutNaN02020001400
1010260.555556-151.267778F10AK1024NaNEligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNFORT KENAI ARMY POSTAK09799FA25300YesProperties with all projects at site closeoutNaN02122000600
1010563.919444-145.303889F10AK1033OTHER: Native AllotmentBLM: BLMPUBLC: AT&T STA...Eligiblehttps://fudsportal.usace.army.mil/ems/ems/inve...NaNCANOL PIPELINEAK09799FA27900YesProperties with projectsNaN02240000400
\n", + "

2866 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " LATITUDE LONGITUDE FUDSUNIQUEPROPERTYNUMBER \\\n", + "0 30.098611 -93.722222 K06TX0667 \n", + "1 33.809700 -95.628304 K06TX0305 \n", + "4 35.746111 -95.412778 K06OK0186 \n", + "6 36.226944 -95.330000 K06OK0025 \n", + "9 36.023333 -102.541667 K06TX0268 \n", + "... ... ... ... \n", + "10080 51.379444 179.293889 F10AK0858 \n", + "10100 59.266111 -135.448889 F10AK1016 \n", + "10101 61.200278 -149.900278 F10AK1023 \n", + "10102 60.555556 -151.267778 F10AK1024 \n", + "10105 63.919444 -145.303889 F10AK1033 \n", + "\n", + " CURRENTOWNER ELIGIBILITY \\\n", + "0 LOCAL: CITY INDIVIDUAL OWNERS Eligible \n", + "1 DOD: USACE -AGRICULTURAL, RECREATIONAL, AND FL... Eligible \n", + "4 LOCAL: CITY CITY MUNICIPAL AIRFIELD\\n Eligible \n", + "6 PRIV: PRIVATE CURRENTLY USED AS AN INDUSTRIAL ... Eligible \n", + "9 LOCAL: CITY THE SITE IS NOW USED AS A MUNICIPA... Eligible \n", + "... ... ... \n", + "10080 FWS: USFWSTRIBE: NATIVE AMERICAN ANCSA NATIVE ... Eligible \n", + "10100 OTHER: Private Landowner has not been identifi... Eligible \n", + "10101 STATE: STATE ALL BUILDINGS TURNED OVER TO THE ... Eligible \n", + "10102 NaN Eligible \n", + "10105 OTHER: Native AllotmentBLM: BLMPUBLC: AT&T STA... Eligible \n", + "\n", + " EMSMGMTACTIONPLANLINK \\\n", + "0 https://fudsportal.usace.army.mil/ems/inventor... \n", + "1 https://fudsportal.usace.army.mil/ems/inventor... \n", + "4 https://fudsportal.usace.army.mil/ems/inventor... \n", + "6 https://fudsportal.usace.army.mil/ems/inventor... \n", + "9 https://fudsportal.usace.army.mil/ems/inventor... \n", + "... ... \n", + "10080 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10100 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10101 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10102 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "10105 https://fudsportal.usace.army.mil/ems/ems/inve... \n", + "\n", + " FEATUREDESCRIPTION \\\n", + "0 The site was initially acquired in 1946 and us... \n", + "1 Camp Maxey was activated in July 1942. It was ... \n", + "4 HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND... \n", + "6 The DoD began use in the early 1940s when the ... \n", + "9 In 1942, the DoD acquired 6,235.16 acres for u... \n", + "... ... \n", + "10080 NaN \n", + "10100 NaN \n", + "10101 NaN \n", + "10102 NaN \n", + "10105 NaN \n", + "\n", + " FEATURENAME FUDSINSTALLATIONID HASPROJECTS \\\n", + "0 ORANGE PORT OF NAV SHIP STOR TX69799F675300 Yes \n", + "1 CAMP MAXEY TX69799F668600 Yes \n", + "4 MUSKOGEE AUX AF OK69799F639800 Yes \n", + "6 OKLAHOMA ORDNANCE WORKS OK69799F636200 Yes \n", + "9 DALHART AAF TX69799F665100 Yes \n", + "... ... ... ... \n", + "10080 AMCHITKA AF AUXILIARY FIELD AK09799F709900 Yes \n", + "10100 HAINES FAIRBANKS PIPELINE AK09799F980700 Yes \n", + "10101 CAMP ANCHORAGE ARMY AK09799FA25200 Yes \n", + "10102 FORT KENAI ARMY POST AK09799FA25300 Yes \n", + "10105 CANOL PIPELINE AK09799FA27900 Yes \n", + "\n", + " STATUS \\\n", + "0 Properties with projects \n", + "1 Properties with projects \n", + "4 Properties with all projects at site closeout \n", + "6 Properties with all projects at site closeout \n", + "9 Properties with projects \n", + "... ... \n", + "10080 Properties with projects \n", + "10100 Properties with projects \n", + "10101 Properties with all projects at site closeout \n", + "10102 Properties with all projects at site closeout \n", + "10105 Properties with projects \n", + "\n", + " PROPERTY_HISTORY GEOID10_TRACT \n", + "0 The site was initially acquired in 1946 and us... 48361020200 \n", + "1 Camp Maxey was activated in July 1942. It was ... 48277000102 \n", + "4 HATBOX DOWNTOWN AIRPORT DATES BACK TO 1912 AND... 40101000100 \n", + "6 The DoD began use in the early 1940s when the ... 40097040400 \n", + "9 In 1942, the DoD acquired 6,235.16 acres for u... 48205950200 \n", + "... ... ... \n", + "10080 NaN 02016000100 \n", + "10100 NaN 02100000100 \n", + "10101 NaN 02020001400 \n", + "10102 NaN 02122000600 \n", + "10105 NaN 02240000400 \n", + "\n", + "[2866 rows x 13 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t1 = time.time()\n", + "\n", + "# Takes ~8 minutes with 2,900 rows.\n", + "fuds_source_with_tracts_df = get_census_tracts_for_dataframe_with_lat_long(\n", + " coordinates_df=fuds_source_df,\n", + " longitude_column=FUDS_LONG_FIELD,\n", + " latitude_column=FUDS_LAT_FIELD,\n", + " census_tract_gdf=census_tract_gdf,\n", + ")\n", + "\n", + "t2 = time.time()\n", + "\n", + "print(f\"Code took {str(t2-t1)} seconds.\")\n", + "\n", + "fuds_source_with_tracts_df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "feddd4f4", + "metadata": {}, + "outputs": [], + "source": [ + "fuds_source_with_tracts_df.to_csv(\n", + " FUDS_OUTPUT_DIR / \"formerly_used_defense_sites.csv\", index=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "04c51d58", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2077" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(fuds_source_with_tracts_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].unique())" + ] + }, + { + "cell_type": "markdown", + "id": "c92d7c00", + "metadata": {}, + "source": [ + "# Start work on eAMLIS" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "id": "99df4efd", "metadata": { "scrolled": true @@ -182,7 +1116,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_21791/630537932.py:6: DtypeWarning: Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "2022-07-15 22:26:56,037 [data_pipeline.utils] INFO Downloading https://justice40-data.s3.amazonaws.com/data-sources/eAMLIS export of all data.tsv.zip\n", + "2022-07-15 22:27:13,306 [data_pipeline.utils] INFO Extracting /Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/tmp/abandoned_mine_lands/downloaded-60f877ab-7ca0-4e7c-8422-b89d0442a30f.zip\n", + "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_30741/2097728537.py:18: DtypeWarning: Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.\n", " eamlis_source_df = pd.read_csv(\n" ] }, @@ -395,7 +1331,7 @@ "[5 rows x 41 columns]" ] }, - "execution_count": 4, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -416,7 +1352,7 @@ " unzipped_file_path=tmp_path,\n", ")\n", "\n", - "eamlis_path = tmp_path + \"/eAMLIS export of all data.tsv\"\n", + "eamlis_path = tmp_path / \"eAMLIS export of all data.tsv\"\n", "\n", "eamlis_source_df = pd.read_csv(\n", " filepath_or_buffer=eamlis_path,\n", @@ -428,306 +1364,48 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "3a1fe6e8", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['AMLIS Key', 'State/Tribe', 'County', 'Congressional District',\n", - " 'Quadrangle Name', 'Watershed', 'HUC Code', 'FIPS Code', 'Latitude',\n", - " 'Longitude', 'Funding Source / Program', 'Problem Area Name',\n", - " 'Problem Area Number', 'Planning Unit Name', 'Planning Unit Number',\n", - " 'Problem Priority', 'Problem Type', 'Mining Type', 'Ore Types',\n", - " 'Date Prepared', 'Date Revised', 'Private Owner %', 'State Owner %',\n", - " 'Other Federal Owner %', 'Park Service Owner %',\n", - " 'Forest Service Owner %', 'Indian Owner %', 'BLM Owner %',\n", - " 'Unfunded Standard Units', 'Unfunded Costs', 'Unfunded GPRA Acres',\n", - " 'Unfunded Metric Units', 'Funded Standard Units', 'Funded Costs',\n", - " 'Funded GPRA Acres', 'Funded Metric Units', 'Completed Standard Units',\n", - " 'Completed Costs', 'Completed GPRA Acres', 'Completed Metric Units',\n", - " 'Unnamed: 40'],\n", - " dtype='object')\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AMLIS KeyLatitudeLongitude
2AK00000161.6-149.8
6AK00000361.6-144.0
12AK00000661.7-149.0
25AK00001261.6-148.9
30AK00001561.7-148.2
\n", - "
" - ], - "text/plain": [ - " AMLIS Key Latitude Longitude\n", - "2 AK000001 61.6 -149.8\n", - "6 AK000003 61.6 -144.0\n", - "12 AK000006 61.7 -149.0\n", - "25 AK000012 61.6 -148.9\n", - "30 AK000015 61.7 -148.2" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "mines_df = eamlis_source_df\n", "\n", "print(mines_df.columns)\n", "\n", "# TODO: investigate how to combine multiple rows for the same lat/long.\n", + "# Probably do something like, groupby([lat, long])[value_of_interest].size().\n", + "# TODO: Investigate aggregating over mine severity.\n", "# This just keeps one of the rows arbitrarily. We might need additional columns of information.\n", - "mines_unique_df = mines_df.drop_duplicates(subset=[LAT_FIELD, LONG_FIELD], keep=\"last\")\n", + "mines_unique_df = mines_df.drop_duplicates(\n", + " subset=[EAMLIS_LAT_FIELD, EAMLIS_LONG_FIELD], keep=\"last\"\n", + ")\n", "\n", "# TODO: investigate whether other columns (such as mine problem severity) are needed.\n", - "mines_unique_df = mines_unique_df[[KEY_FIELD, LAT_FIELD, LONG_FIELD]]\n", + "mines_unique_df = mines_unique_df[\n", + " [EAMLIS_KEY_FIELD, EAMLIS_LAT_FIELD, EAMLIS_LONG_FIELD]\n", + "]\n", "\n", "mines_unique_df.head()" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "aa316a32", - "metadata": {}, - "outputs": [], - "source": [ - "# METHOD DEFINITIONS\n", - "def get_census_tract_for_one_coordinate(\n", - " geom_point: shapely.geometry.point.Point,\n", - " census_tract_gdf: geopandas.geodataframe.GeoDataFrame,\n", - ") -> str:\n", - " # GEOJSON_TRACT_ID_FIELD\n", - "\n", - " # geopandas' contain method works row to row.\n", - " # So create a duplicate row for the point across the length of the census tract gdf\n", - " # number_of_census_tracts = len(census_tract_gdf)\n", - " # point_as_gdf = geopandas.GeoDataFrame([[geom_point] * number_of_census_tracts])\n", - "\n", - " # Now run a row-to-row contains\n", - " # print(point_as_gdf)\n", - "\n", - " contains_result = census_tract_gdf.contains(geom_point)\n", - " count_of_census_tract_matches = len(census_tract_gdf[contains_result])\n", - "\n", - " if count_of_census_tract_matches == 0:\n", - " warnings.warn(\n", - " f\"Warning: no tract matches for {geom_point}\",\n", - " DeprecationWarning,\n", - " stacklevel=2,\n", - " )\n", - " census_tract_id = None\n", - "\n", - " elif count_of_census_tract_matches > 1:\n", - " warnings.warn(\n", - " f\"Warning: too many tract matches for {geom_point}\",\n", - " DeprecationWarning,\n", - " stacklevel=2,\n", - " )\n", - " census_tract_id = None\n", - "\n", - " else:\n", - " # With only one tract returned, extract the ID.\n", - " census_tract_id = census_tract_gdf[contains_result][\n", - " GEOJSON_TRACT_ID_FIELD\n", - " ].values[0]\n", - "\n", - " return census_tract_id\n", - "\n", - "\n", - "def get_census_tracts_for_geom_points(\n", - " points_gdf: geopandas.geodataframe.GeoDataFrame,\n", - " census_tract_gdf: geopandas.geodataframe.GeoDataFrame,\n", - ") -> geopandas.geodataframe.GeoDataFrame:\n", - " geometry_column_name = \"geometry\"\n", - " result_gdf = points_gdf.parallel_apply(\n", - " lambda frame: get_census_tract_for_one_coordinate(\n", - " geom_point=frame[geometry_column_name], census_tract_gdf=census_tract_gdf\n", - " ),\n", - " axis=1,\n", - " )\n", - " return result_gdf\n", - "\n", - "\n", - "def get_census_tracts_for_dataframe_with_lat_long(\n", - " coordinates_df: pd.DataFrame,\n", - " latitude_column: str = LAT_FIELD,\n", - " longitude_column: str = LONG_FIELD,\n", - " census_tract_gdf: geopandas.geodataframe.GeoDataFrame = census_tract_gdf,\n", - "):\n", - " # Avoid these side-effects by creating a duplicate.\n", - " coordinates_df_duplicate = coordinates_df\n", - "\n", - " # First, convert the plain DataFrame into a geopandas data frame with lat/long geometry points.\n", - " coordinates_geopandas_gdf = geopandas.GeoDataFrame(\n", - " coordinates_df_duplicate,\n", - " geometry=geopandas.points_from_xy(\n", - " x=coordinates_df_duplicate[longitude_column],\n", - " y=coordinates_df_duplicate[latitude_column],\n", - " ),\n", - " )\n", - "\n", - " # Find the tract IDs for each point.\n", - " tract_results = get_census_tracts_for_geom_points(\n", - " points_gdf=coordinates_geopandas_gdf, census_tract_gdf=census_tract_gdf\n", - " )\n", - "\n", - " # Join the tract IDs back on the original dataframe\n", - " coordinates_with_tracts_df = coordinates_df\n", - " coordinates_with_tracts_df[\n", - " ExtractTransformLoad.GEOID_TRACT_FIELD_NAME\n", - " ] = tract_results\n", - "\n", - " # Remove unnecessary `geometry` column\n", - " # For unclear reasons, the initial `GeoDataFrame` creates a `geometry` column on the input dataframe that we don't want.\n", - " coordinates_with_tracts_df = coordinates_with_tracts_df.drop(\"geometry\", axis=1)\n", - "\n", - " return coordinates_with_tracts_df" - ] - }, - { - "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "a145f162", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " AMLIS Key Latitude Longitude\n", - "2 AK000001 61.6 -149.8\n", - "6 AK000003 61.6 -144.0\n", - "12 AK000006 61.7 -149.0\n", - "25 AK000012 61.6 -148.9\n", - "30 AK000015 61.7 -148.2\n", - "... ... ... ...\n", - "57140 WY216747 42.9 -108.1\n", - "57145 WY242429 41.8 -106.8\n", - "57146 WY242431 42.5 -108.7\n", - "57147 WY242441 42.8 -107.4\n", - "57148 WY242444 42.6 -110.9\n", - "\n", - "[3977 rows x 3 columns]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/lucas/.virtualenvs/scoring2/lib/python3.9/site-packages/geopandas/array.py:275: ShapelyDeprecationWarning: The array interface is deprecated and will no longer work in Shapely 2.0. Convert the '.coords' to a numpy array instead.\n", - " return GeometryArray(vectorized.points_from_xy(x, y, z), crs=crs)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9a3b14332bc649d3a5f49e11373a9409", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=498), Label(value='0 / 498'))), HB…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/lx/xmq8p65j71v9xq2bhsd2j5w40000gp/T/ipykernel_21791/3033776402.py:55: DeprecationWarning: Warning: no tract matches for POINT (-130 55.9)\n", - " lambda frame: get_census_tract_for_one_coordinate(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Code took 1550.5722029209137 seconds.\n", - " AMLIS Key Latitude Longitude GEOID10_TRACT\n", - "2 AK000001 61.6 -149.8 02170000401\n", - "6 AK000003 61.6 -144.0 02261000100\n", - "12 AK000006 61.7 -149.0 02170000200\n", - "25 AK000012 61.6 -148.9 02170001300\n", - "30 AK000015 61.7 -148.2 02170000200\n", - "... ... ... ... ...\n", - "57140 WY216747 42.9 -108.1 56013000300\n", - "57145 WY242429 41.8 -106.8 56007968100\n", - "57146 WY242431 42.5 -108.7 56013000300\n", - "57147 WY242441 42.8 -107.4 56025001800\n", - "57148 WY242444 42.6 -110.9 56023978100\n", - "\n", - "[3977 rows x 4 columns]\n" - ] - } - ], + "outputs": [], "source": [ "t1 = time.time()\n", "\n", "# Takes ~26 minutes with 4,000 rows.\n", "mines_unique_with_tracts_df = get_census_tracts_for_dataframe_with_lat_long(\n", - " coordinates_df=mines_unique_df\n", + " coordinates_df=mines_unique_df,\n", + " longitude_column=EAMLIS_LONG_FIELD,\n", + " latitude_column=EAMLIS_LAT_FIELD,\n", + " census_tract_gdf=census_tract_gdf,\n", ")\n", "\n", "t2 = time.time()\n", @@ -739,31 +1417,22 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "52aefca6", "metadata": {}, "outputs": [], "source": [ - "mines_unique_with_tracts_df.to_csv(OUTPUT_DIR / \"abandoned_mine_lands.csv\", index=False)" + "mines_unique_with_tracts_df.to_csv(\n", + " EAMLIS_OUTPUT_DIR / \"abandoned_mine_lands.csv\", index=False\n", + ")" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "d659ce7d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2035" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(mines_unique_with_tracts_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].unique())" ]