From 244b3663d11355c6f5d68e945f93a28f38fca021 Mon Sep 17 00:00:00 2001 From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Date: Wed, 16 Jun 2021 13:47:27 -0400 Subject: [PATCH] Starting etl for score (#141) * starting etl for score * projection fix * projection flags --- .gitignore | 1 + score/etl/__init__.oy | 0 score/etl/datasets/__init__.py | 0 score/etl/datasets/ejscreen_2020.py | 1 + score/ipython/ejscreen_etl.ipynb | 376 ++++++++++++++++++++++++++++ score/scripts/download_cbg.py | 8 +- score/scripts/generate_mbtiles.py | 6 +- 7 files changed, 385 insertions(+), 7 deletions(-) create mode 100644 score/etl/__init__.oy create mode 100644 score/etl/datasets/__init__.py create mode 100644 score/etl/datasets/ejscreen_2020.py create mode 100644 score/ipython/ejscreen_etl.ipynb diff --git a/.gitignore b/.gitignore index 85e43381..6d4c3ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,4 @@ cython_debug/ # temporary census data score/data/census score/data/tiles +score/data/tmp diff --git a/score/etl/__init__.oy b/score/etl/__init__.oy new file mode 100644 index 00000000..e69de29b diff --git a/score/etl/datasets/__init__.py b/score/etl/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/etl/datasets/ejscreen_2020.py b/score/etl/datasets/ejscreen_2020.py new file mode 100644 index 00000000..12f433f1 --- /dev/null +++ b/score/etl/datasets/ejscreen_2020.py @@ -0,0 +1 @@ +# https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip diff --git a/score/ipython/ejscreen_etl.ipynb b/score/ipython/ejscreen_etl.ipynb new file mode 100644 index 00000000..3b24556b --- /dev/null +++ b/score/ipython/ejscreen_etl.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "f4d63367", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "data_path = Path.cwd().parent / \"data\" / \"tmp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0e6eb55e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "WindowsPath('C:/opt/justice40-tool/score/data/tmp')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_path" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a1431996", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gaftp.epa.gov'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import requests\n", + "download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n", + "file_contents = download.content\n", + "zip_file_path = data_path / \"downloaded.zip\"\n", + "zip_file = open(zip_file_path, \"wb\")\n", + "zip_file.write(file_contents)\n", + "zip_file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bc5f3466", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(data_path)\n", + "ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "392ccb67", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | OBJECTID | \n", + "ID | \n", + "STATE_NAME | \n", + "ST_ABBREV | \n", + "REGION | \n", + "ACSTOTPOP | \n", + "D_PM25_2 | \n", + "B_PM25_D2 | \n", + "P_PM25_D2 | \n", + "D_OZONE_2 | \n", + "... | \n", + "T_PNPL | \n", + "T_PNPL_D2 | \n", + "T_PRMP | \n", + "T_PRMP_D2 | \n", + "T_PTSDF | \n", + "T_PTSDF_D2 | \n", + "T_PWDIS | \n", + "T_PWDIS_D2 | \n", + "Shape_Length | \n", + "Shape_Area | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "10010201001 | \n", + "Alabama | \n", + "AL | \n", + "4 | \n", + "636 | \n", + "-492.025529412 | \n", + "6 | \n", + "52.0 | \n", + "-1866.38637046 | \n", + "... | \n", + "0.071 facilities/km distance (79%ile) | \n", + "40%ile | \n", + "0.085 facilities/km distance (23%ile) | \n", + "53%ile | \n", + "0.59 facilities/km distance (57%ile) | \n", + "38%ile | \n", + "None | \n", + "None | \n", + "13443.155206 | \n", + "6.040790e+06 | \n", + "
1 | \n", + "2 | \n", + "10010201002 | \n", + "Alabama | \n", + "AL | \n", + "4 | \n", + "1287 | \n", + "-2053.08341364 | \n", + "4 | \n", + "30.0 | \n", + "-7787.90260177 | \n", + "... | \n", + "0.064 facilities/km distance (76%ile) | \n", + "19%ile | \n", + "0.074 facilities/km distance (17%ile) | \n", + "42%ile | \n", + "0.45 facilities/km distance (52%ile) | \n", + "23%ile | \n", + "None | \n", + "None | \n", + "11917.089598 | \n", + "7.834160e+06 | \n", + "
2 | \n", + "3 | \n", + "10010202001 | \n", + "Alabama | \n", + "AL | \n", + "4 | \n", + "810 | \n", + "1846.12693767 | \n", + "8 | \n", + "75.0 | \n", + "7002.78371663 | \n", + "... | \n", + "0.069 facilities/km distance (78%ile) | \n", + "85%ile | \n", + "0.078 facilities/km distance (20%ile) | \n", + "67%ile | \n", + "0.65 facilities/km distance (59%ile) | \n", + "77%ile | \n", + "None | \n", + "None | \n", + "7770.915121 | \n", + "2.900774e+06 | \n", + "
3 | \n", + "4 | \n", + "10010202002 | \n", + "Alabama | \n", + "AL | \n", + "4 | \n", + "1218 | \n", + "1392.07530488 | \n", + "8 | \n", + "72.0 | \n", + "5280.46153188 | \n", + "... | \n", + "0.076 facilities/km distance (81%ile) | \n", + "83%ile | \n", + "0.087 facilities/km distance (24%ile) | \n", + "66%ile | \n", + "1 facilities/km distance (69%ile) | \n", + "78%ile | \n", + "None | \n", + "None | \n", + "6506.804784 | \n", + "1.793332e+06 | \n", + "
4 | \n", + "5 | \n", + "10010203001 | \n", + "Alabama | \n", + "AL | \n", + "4 | \n", + "2641 | \n", + "-769.374640358 | \n", + "5 | \n", + "48.0 | \n", + "-2911.8926061 | \n", + "... | \n", + "0.074 facilities/km distance (80%ile) | \n", + "32%ile | \n", + "0.08 facilities/km distance (21%ile) | \n", + "51%ile | \n", + "1.2 facilities/km distance (74%ile) | \n", + "24%ile | \n", + "None | \n", + "None | \n", + "11070.367848 | \n", + "5.461602e+06 | \n", + "
5 rows × 124 columns
\n", + "