mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 09:41:26 -08:00
Starting etl for score (#141)
* starting etl for score * projection fix * projection flags
This commit is contained in:
parent
6f568b0e20
commit
244b3663d1
7 changed files with 385 additions and 7 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -131,3 +131,4 @@ cython_debug/
|
|||
# temporary census data
|
||||
score/data/census
|
||||
score/data/tiles
|
||||
score/data/tmp
|
||||
|
|
0
score/etl/__init__.oy
Normal file
0
score/etl/__init__.oy
Normal file
0
score/etl/datasets/__init__.py
Normal file
0
score/etl/datasets/__init__.py
Normal file
1
score/etl/datasets/ejscreen_2020.py
Normal file
1
score/etl/datasets/ejscreen_2020.py
Normal file
|
@ -0,0 +1 @@
|
|||
# https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip
|
376
score/ipython/ejscreen_etl.ipynb
Normal file
376
score/ipython/ejscreen_etl.ipynb
Normal file
|
@ -0,0 +1,376 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f4d63367",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"data_path = Path.cwd().parent / \"data\" / \"tmp\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0e6eb55e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"WindowsPath('C:/opt/justice40-tool/score/data/tmp')"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data_path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "a1431996",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gaftp.epa.gov'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n",
|
||||
"file_contents = download.content\n",
|
||||
"zip_file_path = data_path / \"downloaded.zip\"\n",
|
||||
"zip_file = open(zip_file_path, \"wb\")\n",
|
||||
"zip_file.write(file_contents)\n",
|
||||
"zip_file.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "bc5f3466",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import zipfile\n",
|
||||
"with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(data_path)\n",
|
||||
"ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "392ccb67",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>OBJECTID</th>\n",
|
||||
" <th>ID</th>\n",
|
||||
" <th>STATE_NAME</th>\n",
|
||||
" <th>ST_ABBREV</th>\n",
|
||||
" <th>REGION</th>\n",
|
||||
" <th>ACSTOTPOP</th>\n",
|
||||
" <th>D_PM25_2</th>\n",
|
||||
" <th>B_PM25_D2</th>\n",
|
||||
" <th>P_PM25_D2</th>\n",
|
||||
" <th>D_OZONE_2</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>T_PNPL</th>\n",
|
||||
" <th>T_PNPL_D2</th>\n",
|
||||
" <th>T_PRMP</th>\n",
|
||||
" <th>T_PRMP_D2</th>\n",
|
||||
" <th>T_PTSDF</th>\n",
|
||||
" <th>T_PTSDF_D2</th>\n",
|
||||
" <th>T_PWDIS</th>\n",
|
||||
" <th>T_PWDIS_D2</th>\n",
|
||||
" <th>Shape_Length</th>\n",
|
||||
" <th>Shape_Area</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>10010201001</td>\n",
|
||||
" <td>Alabama</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>636</td>\n",
|
||||
" <td>-492.025529412</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>52.0</td>\n",
|
||||
" <td>-1866.38637046</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.071 facilities/km distance (79%ile)</td>\n",
|
||||
" <td>40%ile</td>\n",
|
||||
" <td>0.085 facilities/km distance (23%ile)</td>\n",
|
||||
" <td>53%ile</td>\n",
|
||||
" <td>0.59 facilities/km distance (57%ile)</td>\n",
|
||||
" <td>38%ile</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>13443.155206</td>\n",
|
||||
" <td>6.040790e+06</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>10010201002</td>\n",
|
||||
" <td>Alabama</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>1287</td>\n",
|
||||
" <td>-2053.08341364</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>30.0</td>\n",
|
||||
" <td>-7787.90260177</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.064 facilities/km distance (76%ile)</td>\n",
|
||||
" <td>19%ile</td>\n",
|
||||
" <td>0.074 facilities/km distance (17%ile)</td>\n",
|
||||
" <td>42%ile</td>\n",
|
||||
" <td>0.45 facilities/km distance (52%ile)</td>\n",
|
||||
" <td>23%ile</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>11917.089598</td>\n",
|
||||
" <td>7.834160e+06</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>10010202001</td>\n",
|
||||
" <td>Alabama</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>810</td>\n",
|
||||
" <td>1846.12693767</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>75.0</td>\n",
|
||||
" <td>7002.78371663</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.069 facilities/km distance (78%ile)</td>\n",
|
||||
" <td>85%ile</td>\n",
|
||||
" <td>0.078 facilities/km distance (20%ile)</td>\n",
|
||||
" <td>67%ile</td>\n",
|
||||
" <td>0.65 facilities/km distance (59%ile)</td>\n",
|
||||
" <td>77%ile</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>7770.915121</td>\n",
|
||||
" <td>2.900774e+06</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>10010202002</td>\n",
|
||||
" <td>Alabama</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>1218</td>\n",
|
||||
" <td>1392.07530488</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>72.0</td>\n",
|
||||
" <td>5280.46153188</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.076 facilities/km distance (81%ile)</td>\n",
|
||||
" <td>83%ile</td>\n",
|
||||
" <td>0.087 facilities/km distance (24%ile)</td>\n",
|
||||
" <td>66%ile</td>\n",
|
||||
" <td>1 facilities/km distance (69%ile)</td>\n",
|
||||
" <td>78%ile</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>6506.804784</td>\n",
|
||||
" <td>1.793332e+06</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>10010203001</td>\n",
|
||||
" <td>Alabama</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>2641</td>\n",
|
||||
" <td>-769.374640358</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>48.0</td>\n",
|
||||
" <td>-2911.8926061</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.074 facilities/km distance (80%ile)</td>\n",
|
||||
" <td>32%ile</td>\n",
|
||||
" <td>0.08 facilities/km distance (21%ile)</td>\n",
|
||||
" <td>51%ile</td>\n",
|
||||
" <td>1.2 facilities/km distance (74%ile)</td>\n",
|
||||
" <td>24%ile</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>11070.367848</td>\n",
|
||||
" <td>5.461602e+06</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 124 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
|
||||
"0 1 10010201001 Alabama AL 4 636 \n",
|
||||
"1 2 10010201002 Alabama AL 4 1287 \n",
|
||||
"2 3 10010202001 Alabama AL 4 810 \n",
|
||||
"3 4 10010202002 Alabama AL 4 1218 \n",
|
||||
"4 5 10010203001 Alabama AL 4 2641 \n",
|
||||
"\n",
|
||||
" D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
|
||||
"0 -492.025529412 6 52.0 -1866.38637046 ... \n",
|
||||
"1 -2053.08341364 4 30.0 -7787.90260177 ... \n",
|
||||
"2 1846.12693767 8 75.0 7002.78371663 ... \n",
|
||||
"3 1392.07530488 8 72.0 5280.46153188 ... \n",
|
||||
"4 -769.374640358 5 48.0 -2911.8926061 ... \n",
|
||||
"\n",
|
||||
" T_PNPL T_PNPL_D2 \\\n",
|
||||
"0 0.071 facilities/km distance (79%ile) 40%ile \n",
|
||||
"1 0.064 facilities/km distance (76%ile) 19%ile \n",
|
||||
"2 0.069 facilities/km distance (78%ile) 85%ile \n",
|
||||
"3 0.076 facilities/km distance (81%ile) 83%ile \n",
|
||||
"4 0.074 facilities/km distance (80%ile) 32%ile \n",
|
||||
"\n",
|
||||
" T_PRMP T_PRMP_D2 \\\n",
|
||||
"0 0.085 facilities/km distance (23%ile) 53%ile \n",
|
||||
"1 0.074 facilities/km distance (17%ile) 42%ile \n",
|
||||
"2 0.078 facilities/km distance (20%ile) 67%ile \n",
|
||||
"3 0.087 facilities/km distance (24%ile) 66%ile \n",
|
||||
"4 0.08 facilities/km distance (21%ile) 51%ile \n",
|
||||
"\n",
|
||||
" T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n",
|
||||
"0 0.59 facilities/km distance (57%ile) 38%ile None None \n",
|
||||
"1 0.45 facilities/km distance (52%ile) 23%ile None None \n",
|
||||
"2 0.65 facilities/km distance (59%ile) 77%ile None None \n",
|
||||
"3 1 facilities/km distance (69%ile) 78%ile None None \n",
|
||||
"4 1.2 facilities/km distance (74%ile) 24%ile None None \n",
|
||||
"\n",
|
||||
" Shape_Length Shape_Area \n",
|
||||
"0 13443.155206 6.040790e+06 \n",
|
||||
"1 11917.089598 7.834160e+06 \n",
|
||||
"2 7770.915121 2.900774e+06 \n",
|
||||
"3 6506.804784 1.793332e+06 \n",
|
||||
"4 11070.367848 5.461602e+06 \n",
|
||||
"\n",
|
||||
"[5 rows x 124 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"df = pd.read_csv(ejscreen_csv, low_memory=False)\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "0ce9e22a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<bound method DataFrame.count of ID ACSTOTPOP LESSHSPCT LOWINCPCT\n",
|
||||
"0 10010201001 636 0.208134 0.385220\n",
|
||||
"1 10010201002 1287 0.040678 0.163170\n",
|
||||
"2 10010202001 810 0.135563 0.501247\n",
|
||||
"3 10010202002 1218 0.192000 0.393701\n",
|
||||
"4 10010203001 2641 0.125473 0.308217\n",
|
||||
"... ... ... ... ...\n",
|
||||
"220328 721537506011 699 0.391389 0.902718\n",
|
||||
"220329 721537506012 2432 0.185852 0.783717\n",
|
||||
"220330 721537506013 976 0.018116 0.776639\n",
|
||||
"220331 721537506021 1707 0.375422 0.867377\n",
|
||||
"220332 721537506022 804 0.162791 0.942786\n",
|
||||
"\n",
|
||||
"[220333 rows x 4 columns]>"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n",
|
||||
"df.head()\n",
|
||||
"df.count"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e051623b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -10,7 +10,7 @@ data_path = Path.cwd() / "data"
|
|||
with requests.Session() as s:
|
||||
# the fips_states_2010.csv is generated from data here
|
||||
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
|
||||
fips_csv_path = data_path.joinpath("fips_states_2010.csv")
|
||||
fips_csv_path = data_path / "fips_states_2010.csv"
|
||||
with open(fips_csv_path) as csv_file:
|
||||
csv_reader = csv.reader(csv_file, delimiter=",")
|
||||
line_count = 0
|
||||
|
@ -27,10 +27,12 @@ with requests.Session() as s:
|
|||
if not os.path.isfile(shp_file_path):
|
||||
print(f"downloading {row[1]}")
|
||||
|
||||
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
|
||||
# But using 2010 for now
|
||||
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
|
||||
download = s.get(cbg_state_url)
|
||||
file_contents = download.content
|
||||
zip_file_path = data_path.joinpath("census", "downloaded.zip")
|
||||
zip_file_path = data_path / "census" / "downloaded.zip"
|
||||
zip_file = open(zip_file_path, "wb")
|
||||
zip_file.write(file_contents)
|
||||
zip_file.close()
|
||||
|
@ -38,7 +40,7 @@ with requests.Session() as s:
|
|||
print(f"extracting {row[1]}")
|
||||
|
||||
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
|
||||
shp_dir_path = data_path.joinpath("census", "shp", fips)
|
||||
shp_dir_path = data_path / "census" / "shp" / fips
|
||||
zip_ref.extractall(shp_dir_path)
|
||||
|
||||
geojson_dir_path = data_path.joinpath(
|
||||
|
|
|
@ -22,8 +22,6 @@ for file in os.listdir(geojson_path):
|
|||
if file.endswith(".json"):
|
||||
geojson_list += f"/home/data/census/geojson/{file} "
|
||||
|
||||
breakpoint()
|
||||
|
||||
if geojson_list == "":
|
||||
print("No GeoJson files found. Please run download_cbg.py first")
|
||||
|
||||
|
@ -35,7 +33,7 @@ else:
|
|||
cmd = (
|
||||
'docker run --rm -it -v "'
|
||||
+ pwd
|
||||
+ '"/:/home klokantech/tippecanoe tippecanoe -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 '
|
||||
+ '"/:/home klokantech/tippecanoe tippecanoe -s_srs EPSG:4269 -t_srs EPSG:4326 --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 '
|
||||
+ geojson_list
|
||||
)
|
||||
print(cmd)
|
||||
|
@ -51,7 +49,7 @@ else:
|
|||
cmd = (
|
||||
'docker run --rm -it -v "'
|
||||
+ pwd
|
||||
+ '"/:/home klokantech/tippecanoe tippecanoe --no-tile-compression -zg -e /home/data/tiles/mvt /home/data/census/geojson/01.json '
|
||||
+ '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed --no-tile-compression -zg -e /home/data/tiles/mvt '
|
||||
+ geojson_list
|
||||
)
|
||||
print(cmd)
|
||||
|
|
Loading…
Add table
Reference in a new issue