diff --git a/.gitignore b/.gitignore index 85e43381..6d4c3ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,4 @@ cython_debug/ # temporary census data score/data/census score/data/tiles +score/data/tmp diff --git a/score/etl/__init__.oy b/score/etl/__init__.oy new file mode 100644 index 00000000..e69de29b diff --git a/score/etl/datasets/__init__.py b/score/etl/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/etl/datasets/ejscreen_2020.py b/score/etl/datasets/ejscreen_2020.py new file mode 100644 index 00000000..12f433f1 --- /dev/null +++ b/score/etl/datasets/ejscreen_2020.py @@ -0,0 +1 @@ +# https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip diff --git a/score/ipython/ejscreen_etl.ipynb b/score/ipython/ejscreen_etl.ipynb new file mode 100644 index 00000000..3b24556b --- /dev/null +++ b/score/ipython/ejscreen_etl.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "f4d63367", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "data_path = Path.cwd().parent / \"data\" / \"tmp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0e6eb55e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "WindowsPath('C:/opt/justice40-tool/score/data/tmp')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_path" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a1431996", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gaftp.epa.gov'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import requests\n", + "download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n", + "file_contents = download.content\n", + "zip_file_path = data_path / \"downloaded.zip\"\n", + "zip_file = open(zip_file_path, \"wb\")\n", + "zip_file.write(file_contents)\n", + "zip_file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bc5f3466", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(data_path)\n", + "ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "392ccb67", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OBJECTIDIDSTATE_NAMEST_ABBREVREGIONACSTOTPOPD_PM25_2B_PM25_D2P_PM25_D2D_OZONE_2...T_PNPLT_PNPL_D2T_PRMPT_PRMP_D2T_PTSDFT_PTSDF_D2T_PWDIST_PWDIS_D2Shape_LengthShape_Area
0110010201001AlabamaAL4636-492.025529412652.0-1866.38637046...0.071 facilities/km distance (79%ile)40%ile0.085 facilities/km distance (23%ile)53%ile0.59 facilities/km distance (57%ile)38%ileNoneNone13443.1552066.040790e+06
1210010201002AlabamaAL41287-2053.08341364430.0-7787.90260177...0.064 facilities/km distance (76%ile)19%ile0.074 facilities/km distance (17%ile)42%ile0.45 facilities/km distance (52%ile)23%ileNoneNone11917.0895987.834160e+06
2310010202001AlabamaAL48101846.12693767875.07002.78371663...0.069 facilities/km distance (78%ile)85%ile0.078 facilities/km distance (20%ile)67%ile0.65 facilities/km distance (59%ile)77%ileNoneNone7770.9151212.900774e+06
3410010202002AlabamaAL412181392.07530488872.05280.46153188...0.076 facilities/km distance (81%ile)83%ile0.087 facilities/km distance (24%ile)66%ile1 facilities/km distance (69%ile)78%ileNoneNone6506.8047841.793332e+06
4510010203001AlabamaAL42641-769.374640358548.0-2911.8926061...0.074 facilities/km distance (80%ile)32%ile0.08 facilities/km distance (21%ile)51%ile1.2 facilities/km distance (74%ile)24%ileNoneNone11070.3678485.461602e+06
\n", + "

5 rows × 124 columns

\n", + "
" + ], + "text/plain": [ + " OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n", + "0 1 10010201001 Alabama AL 4 636 \n", + "1 2 10010201002 Alabama AL 4 1287 \n", + "2 3 10010202001 Alabama AL 4 810 \n", + "3 4 10010202002 Alabama AL 4 1218 \n", + "4 5 10010203001 Alabama AL 4 2641 \n", + "\n", + " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n", + "0 -492.025529412 6 52.0 -1866.38637046 ... \n", + "1 -2053.08341364 4 30.0 -7787.90260177 ... \n", + "2 1846.12693767 8 75.0 7002.78371663 ... \n", + "3 1392.07530488 8 72.0 5280.46153188 ... \n", + "4 -769.374640358 5 48.0 -2911.8926061 ... \n", + "\n", + " T_PNPL T_PNPL_D2 \\\n", + "0 0.071 facilities/km distance (79%ile) 40%ile \n", + "1 0.064 facilities/km distance (76%ile) 19%ile \n", + "2 0.069 facilities/km distance (78%ile) 85%ile \n", + "3 0.076 facilities/km distance (81%ile) 83%ile \n", + "4 0.074 facilities/km distance (80%ile) 32%ile \n", + "\n", + " T_PRMP T_PRMP_D2 \\\n", + "0 0.085 facilities/km distance (23%ile) 53%ile \n", + "1 0.074 facilities/km distance (17%ile) 42%ile \n", + "2 0.078 facilities/km distance (20%ile) 67%ile \n", + "3 0.087 facilities/km distance (24%ile) 66%ile \n", + "4 0.08 facilities/km distance (21%ile) 51%ile \n", + "\n", + " T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n", + "0 0.59 facilities/km distance (57%ile) 38%ile None None \n", + "1 0.45 facilities/km distance (52%ile) 23%ile None None \n", + "2 0.65 facilities/km distance (59%ile) 77%ile None None \n", + "3 1 facilities/km distance (69%ile) 78%ile None None \n", + "4 1.2 facilities/km distance (74%ile) 24%ile None None \n", + "\n", + " Shape_Length Shape_Area \n", + "0 13443.155206 6.040790e+06 \n", + "1 11917.089598 7.834160e+06 \n", + "2 7770.915121 2.900774e+06 \n", + "3 6506.804784 1.793332e+06 \n", + "4 11070.367848 5.461602e+06 \n", + "\n", + "[5 rows x 124 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "df = pd.read_csv(ejscreen_csv, low_memory=False)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0ce9e22a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n", + "df.head()\n", + "df.count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e051623b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/scripts/download_cbg.py b/score/scripts/download_cbg.py index dbeefeb5..ff58451c 100644 --- a/score/scripts/download_cbg.py +++ b/score/scripts/download_cbg.py @@ -10,7 +10,7 @@ data_path = Path.cwd() / "data" with requests.Session() as s: # the fips_states_2010.csv is generated from data here # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html - fips_csv_path = data_path.joinpath("fips_states_2010.csv") + fips_csv_path = data_path / "fips_states_2010.csv" with open(fips_csv_path) as csv_file: csv_reader = csv.reader(csv_file, delimiter=",") line_count = 0 @@ -27,10 +27,12 @@ with requests.Session() as s: if not os.path.isfile(shp_file_path): print(f"downloading {row[1]}") + # 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/ + # But using 2010 for now cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip" download = s.get(cbg_state_url) file_contents = download.content - zip_file_path = data_path.joinpath("census", "downloaded.zip") + zip_file_path = data_path / "census" / "downloaded.zip" zip_file = open(zip_file_path, "wb") zip_file.write(file_contents) zip_file.close() @@ -38,7 +40,7 @@ with requests.Session() as s: print(f"extracting {row[1]}") with zipfile.ZipFile(zip_file_path, "r") as zip_ref: - shp_dir_path = data_path.joinpath("census", "shp", fips) + shp_dir_path = data_path / "census" / "shp" / fips zip_ref.extractall(shp_dir_path) geojson_dir_path = data_path.joinpath( diff --git a/score/scripts/generate_mbtiles.py b/score/scripts/generate_mbtiles.py index 036d077e..22678438 100644 --- a/score/scripts/generate_mbtiles.py +++ b/score/scripts/generate_mbtiles.py @@ -22,8 +22,6 @@ for file in os.listdir(geojson_path): if file.endswith(".json"): geojson_list += f"/home/data/census/geojson/{file} " -breakpoint() - if geojson_list == "": print("No GeoJson files found. Please run download_cbg.py first") @@ -35,7 +33,7 @@ else: cmd = ( 'docker run --rm -it -v "' + pwd - + '"/:/home klokantech/tippecanoe tippecanoe -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 ' + + '"/:/home klokantech/tippecanoe tippecanoe -s_srs EPSG:4269 -t_srs EPSG:4326 --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 ' + geojson_list ) print(cmd) @@ -51,7 +49,7 @@ else: cmd = ( 'docker run --rm -it -v "' + pwd - + '"/:/home klokantech/tippecanoe tippecanoe --no-tile-compression -zg -e /home/data/tiles/mvt /home/data/census/geojson/01.json ' + + '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed --no-tile-compression -zg -e /home/data/tiles/mvt ' + geojson_list ) print(cmd)