Starting etl for score (#141)

* starting etl for score

* projection fix

* projection flags
This commit is contained in:
Jorge Escobar 2021-06-16 13:47:27 -04:00 committed by GitHub
parent 6f568b0e20
commit 244b3663d1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 385 additions and 7 deletions

1
.gitignore vendored
View file

@@ -131,3 +131,4 @@ cython_debug/
# temporary census data
score/data/census
score/data/tiles
score/data/tmp

0
score/etl/__init__.oy Normal file
View file

View file

View file

@@ -0,0 +1 @@
# https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip

View file

@@ -0,0 +1,376 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "f4d63367",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"data_path = Path.cwd().parent / \"data\" / \"tmp\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0e6eb55e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('C:/opt/justice40-tool/score/data/tmp')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_path"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a1431996",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gaftp.epa.gov'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
" warnings.warn(\n"
]
}
],
"source": [
"import requests\n",
"download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n",
"file_contents = download.content\n",
"zip_file_path = data_path / \"downloaded.zip\"\n",
"zip_file = open(zip_file_path, \"wb\")\n",
"zip_file.write(file_contents)\n",
"zip_file.close()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "bc5f3466",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
" zip_ref.extractall(data_path)\n",
"ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\""
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "392ccb67",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>OBJECTID</th>\n",
" <th>ID</th>\n",
" <th>STATE_NAME</th>\n",
" <th>ST_ABBREV</th>\n",
" <th>REGION</th>\n",
" <th>ACSTOTPOP</th>\n",
" <th>D_PM25_2</th>\n",
" <th>B_PM25_D2</th>\n",
" <th>P_PM25_D2</th>\n",
" <th>D_OZONE_2</th>\n",
" <th>...</th>\n",
" <th>T_PNPL</th>\n",
" <th>T_PNPL_D2</th>\n",
" <th>T_PRMP</th>\n",
" <th>T_PRMP_D2</th>\n",
" <th>T_PTSDF</th>\n",
" <th>T_PTSDF_D2</th>\n",
" <th>T_PWDIS</th>\n",
" <th>T_PWDIS_D2</th>\n",
" <th>Shape_Length</th>\n",
" <th>Shape_Area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>10010201001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>636</td>\n",
" <td>-492.025529412</td>\n",
" <td>6</td>\n",
" <td>52.0</td>\n",
" <td>-1866.38637046</td>\n",
" <td>...</td>\n",
" <td>0.071 facilities/km distance (79%ile)</td>\n",
" <td>40%ile</td>\n",
" <td>0.085 facilities/km distance (23%ile)</td>\n",
" <td>53%ile</td>\n",
" <td>0.59 facilities/km distance (57%ile)</td>\n",
" <td>38%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>13443.155206</td>\n",
" <td>6.040790e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>10010201002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1287</td>\n",
" <td>-2053.08341364</td>\n",
" <td>4</td>\n",
" <td>30.0</td>\n",
" <td>-7787.90260177</td>\n",
" <td>...</td>\n",
" <td>0.064 facilities/km distance (76%ile)</td>\n",
" <td>19%ile</td>\n",
" <td>0.074 facilities/km distance (17%ile)</td>\n",
" <td>42%ile</td>\n",
" <td>0.45 facilities/km distance (52%ile)</td>\n",
" <td>23%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11917.089598</td>\n",
" <td>7.834160e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>10010202001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>810</td>\n",
" <td>1846.12693767</td>\n",
" <td>8</td>\n",
" <td>75.0</td>\n",
" <td>7002.78371663</td>\n",
" <td>...</td>\n",
" <td>0.069 facilities/km distance (78%ile)</td>\n",
" <td>85%ile</td>\n",
" <td>0.078 facilities/km distance (20%ile)</td>\n",
" <td>67%ile</td>\n",
" <td>0.65 facilities/km distance (59%ile)</td>\n",
" <td>77%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>7770.915121</td>\n",
" <td>2.900774e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>10010202002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1218</td>\n",
" <td>1392.07530488</td>\n",
" <td>8</td>\n",
" <td>72.0</td>\n",
" <td>5280.46153188</td>\n",
" <td>...</td>\n",
" <td>0.076 facilities/km distance (81%ile)</td>\n",
" <td>83%ile</td>\n",
" <td>0.087 facilities/km distance (24%ile)</td>\n",
" <td>66%ile</td>\n",
" <td>1 facilities/km distance (69%ile)</td>\n",
" <td>78%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>6506.804784</td>\n",
" <td>1.793332e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>10010203001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>2641</td>\n",
" <td>-769.374640358</td>\n",
" <td>5</td>\n",
" <td>48.0</td>\n",
" <td>-2911.8926061</td>\n",
" <td>...</td>\n",
" <td>0.074 facilities/km distance (80%ile)</td>\n",
" <td>32%ile</td>\n",
" <td>0.08 facilities/km distance (21%ile)</td>\n",
" <td>51%ile</td>\n",
" <td>1.2 facilities/km distance (74%ile)</td>\n",
" <td>24%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11070.367848</td>\n",
" <td>5.461602e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 124 columns</p>\n",
"</div>"
],
"text/plain": [
" OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
"0 1 10010201001 Alabama AL 4 636 \n",
"1 2 10010201002 Alabama AL 4 1287 \n",
"2 3 10010202001 Alabama AL 4 810 \n",
"3 4 10010202002 Alabama AL 4 1218 \n",
"4 5 10010203001 Alabama AL 4 2641 \n",
"\n",
" D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
"0 -492.025529412 6 52.0 -1866.38637046 ... \n",
"1 -2053.08341364 4 30.0 -7787.90260177 ... \n",
"2 1846.12693767 8 75.0 7002.78371663 ... \n",
"3 1392.07530488 8 72.0 5280.46153188 ... \n",
"4 -769.374640358 5 48.0 -2911.8926061 ... \n",
"\n",
" T_PNPL T_PNPL_D2 \\\n",
"0 0.071 facilities/km distance (79%ile) 40%ile \n",
"1 0.064 facilities/km distance (76%ile) 19%ile \n",
"2 0.069 facilities/km distance (78%ile) 85%ile \n",
"3 0.076 facilities/km distance (81%ile) 83%ile \n",
"4 0.074 facilities/km distance (80%ile) 32%ile \n",
"\n",
" T_PRMP T_PRMP_D2 \\\n",
"0 0.085 facilities/km distance (23%ile) 53%ile \n",
"1 0.074 facilities/km distance (17%ile) 42%ile \n",
"2 0.078 facilities/km distance (20%ile) 67%ile \n",
"3 0.087 facilities/km distance (24%ile) 66%ile \n",
"4 0.08 facilities/km distance (21%ile) 51%ile \n",
"\n",
" T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n",
"0 0.59 facilities/km distance (57%ile) 38%ile None None \n",
"1 0.45 facilities/km distance (52%ile) 23%ile None None \n",
"2 0.65 facilities/km distance (59%ile) 77%ile None None \n",
"3 1 facilities/km distance (69%ile) 78%ile None None \n",
"4 1.2 facilities/km distance (74%ile) 24%ile None None \n",
"\n",
" Shape_Length Shape_Area \n",
"0 13443.155206 6.040790e+06 \n",
"1 11917.089598 7.834160e+06 \n",
"2 7770.915121 2.900774e+06 \n",
"3 6506.804784 1.793332e+06 \n",
"4 11070.367848 5.461602e+06 \n",
"\n",
"[5 rows x 124 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"df = pd.read_csv(ejscreen_csv, low_memory=False)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0ce9e22a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method DataFrame.count of ID ACSTOTPOP LESSHSPCT LOWINCPCT\n",
"0 10010201001 636 0.208134 0.385220\n",
"1 10010201002 1287 0.040678 0.163170\n",
"2 10010202001 810 0.135563 0.501247\n",
"3 10010202002 1218 0.192000 0.393701\n",
"4 10010203001 2641 0.125473 0.308217\n",
"... ... ... ... ...\n",
"220328 721537506011 699 0.391389 0.902718\n",
"220329 721537506012 2432 0.185852 0.783717\n",
"220330 721537506013 976 0.018116 0.776639\n",
"220331 721537506021 1707 0.375422 0.867377\n",
"220332 721537506022 804 0.162791 0.942786\n",
"\n",
"[220333 rows x 4 columns]>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n",
"df.head()\n",
"df.count"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e051623b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@@ -10,7 +10,7 @@ data_path = Path.cwd() / "data"
with requests.Session() as s:
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
fips_csv_path = data_path.joinpath("fips_states_2010.csv")
fips_csv_path = data_path / "fips_states_2010.csv"
with open(fips_csv_path) as csv_file:
csv_reader = csv.reader(csv_file, delimiter=",")
line_count = 0
@@ -27,10 +27,12 @@ with requests.Session() as s:
if not os.path.isfile(shp_file_path):
print(f"downloading {row[1]}")
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
# But using 2010 for now
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
download = s.get(cbg_state_url)
file_contents = download.content
zip_file_path = data_path.joinpath("census", "downloaded.zip")
zip_file_path = data_path / "census" / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
@@ -38,7 +40,7 @@ with requests.Session() as s:
print(f"extracting {row[1]}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
shp_dir_path = data_path.joinpath("census", "shp", fips)
shp_dir_path = data_path / "census" / "shp" / fips
zip_ref.extractall(shp_dir_path)
geojson_dir_path = data_path.joinpath(

View file

@@ -22,8 +22,6 @@ for file in os.listdir(geojson_path):
if file.endswith(".json"):
geojson_list += f"/home/data/census/geojson/{file} "
breakpoint()
if geojson_list == "":
print("No GeoJson files found. Please run download_cbg.py first")
@@ -35,7 +33,7 @@ else:
cmd = (
'docker run --rm -it -v "'
+ pwd
+ '"/:/home klokantech/tippecanoe tippecanoe -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 '
+ '"/:/home klokantech/tippecanoe tippecanoe -s_srs EPSG:4269 -t_srs EPSG:4326 --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 '
+ geojson_list
)
print(cmd)
@@ -51,7 +49,7 @@ else:
cmd = (
'docker run --rm -it -v "'
+ pwd
+ '"/:/home klokantech/tippecanoe tippecanoe --no-tile-compression -zg -e /home/data/tiles/mvt /home/data/census/geojson/01.json '
+ '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed --no-tile-compression -zg -e /home/data/tiles/mvt '
+ geojson_list
)
print(cmd)