ACS data baked in for map (#153)

* starting etl for score

* projection fix

* projection flags

* proper ejscreen etl csv generation

* failing CSV merge -- investigating

* checkpoint

* some etl changes

* completed ticket

* small typo
This commit is contained in:
Jorge Escobar 2021-06-17 18:12:39 -04:00 committed by GitHub
commit 78615e9b1a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 321 additions and 356 deletions

View file

@ -2,41 +2,27 @@
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "f4d63367",
"execution_count": 1,
"id": "20aa3891",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import requests\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import csv\n",
"\n",
"data_path = Path.cwd().parent / \"data\" / \"tmp\""
"data_path = Path.cwd().parent / \"data\"\n",
"fips_csv_path = data_path / \"fips_states_2010.csv\"\n",
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0e6eb55e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('C:/opt/justice40-tool/score/data/tmp')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_path"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a1431996",
"execution_count": 3,
"id": "67a58c24",
"metadata": {},
"outputs": [
{
@ -49,304 +35,144 @@
}
],
"source": [
"import requests\n",
"download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n",
"file_contents = download.content\n",
"zip_file_path = data_path / \"downloaded.zip\"\n",
"zip_file = open(zip_file_path, \"wb\")\n",
"zip_file_path = data_path / \"tmp\"\n",
"zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n",
"zip_file.write(file_contents)\n",
"zip_file.close()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "bc5f3466",
"execution_count": 4,
"id": "cc3fb9ec",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
" zip_ref.extractall(data_path)\n",
"ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\""
"with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n",
" zip_ref.extractall(zip_file_path)\n",
"ejscreen_csv = data_path / \"tmp\" / \"EJSCREEN_2020_StatePctile.csv\""
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "392ccb67",
"execution_count": 5,
"id": "b25738bb",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>OBJECTID</th>\n",
" <th>ID</th>\n",
" <th>STATE_NAME</th>\n",
" <th>ST_ABBREV</th>\n",
" <th>REGION</th>\n",
" <th>ACSTOTPOP</th>\n",
" <th>D_PM25_2</th>\n",
" <th>B_PM25_D2</th>\n",
" <th>P_PM25_D2</th>\n",
" <th>D_OZONE_2</th>\n",
" <th>...</th>\n",
" <th>T_PNPL</th>\n",
" <th>T_PNPL_D2</th>\n",
" <th>T_PRMP</th>\n",
" <th>T_PRMP_D2</th>\n",
" <th>T_PTSDF</th>\n",
" <th>T_PTSDF_D2</th>\n",
" <th>T_PWDIS</th>\n",
" <th>T_PWDIS_D2</th>\n",
" <th>Shape_Length</th>\n",
" <th>Shape_Area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>10010201001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>636</td>\n",
" <td>-492.025529412</td>\n",
" <td>6</td>\n",
" <td>52.0</td>\n",
" <td>-1866.38637046</td>\n",
" <td>...</td>\n",
" <td>0.071 facilities/km distance (79%ile)</td>\n",
" <td>40%ile</td>\n",
" <td>0.085 facilities/km distance (23%ile)</td>\n",
" <td>53%ile</td>\n",
" <td>0.59 facilities/km distance (57%ile)</td>\n",
" <td>38%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>13443.155206</td>\n",
" <td>6.040790e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>10010201002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1287</td>\n",
" <td>-2053.08341364</td>\n",
" <td>4</td>\n",
" <td>30.0</td>\n",
" <td>-7787.90260177</td>\n",
" <td>...</td>\n",
" <td>0.064 facilities/km distance (76%ile)</td>\n",
" <td>19%ile</td>\n",
" <td>0.074 facilities/km distance (17%ile)</td>\n",
" <td>42%ile</td>\n",
" <td>0.45 facilities/km distance (52%ile)</td>\n",
" <td>23%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11917.089598</td>\n",
" <td>7.834160e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>10010202001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>810</td>\n",
" <td>1846.12693767</td>\n",
" <td>8</td>\n",
" <td>75.0</td>\n",
" <td>7002.78371663</td>\n",
" <td>...</td>\n",
" <td>0.069 facilities/km distance (78%ile)</td>\n",
" <td>85%ile</td>\n",
" <td>0.078 facilities/km distance (20%ile)</td>\n",
" <td>67%ile</td>\n",
" <td>0.65 facilities/km distance (59%ile)</td>\n",
" <td>77%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>7770.915121</td>\n",
" <td>2.900774e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>10010202002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1218</td>\n",
" <td>1392.07530488</td>\n",
" <td>8</td>\n",
" <td>72.0</td>\n",
" <td>5280.46153188</td>\n",
" <td>...</td>\n",
" <td>0.076 facilities/km distance (81%ile)</td>\n",
" <td>83%ile</td>\n",
" <td>0.087 facilities/km distance (24%ile)</td>\n",
" <td>66%ile</td>\n",
" <td>1 facilities/km distance (69%ile)</td>\n",
" <td>78%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>6506.804784</td>\n",
" <td>1.793332e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>10010203001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>2641</td>\n",
" <td>-769.374640358</td>\n",
" <td>5</td>\n",
" <td>48.0</td>\n",
" <td>-2911.8926061</td>\n",
" <td>...</td>\n",
" <td>0.074 facilities/km distance (80%ile)</td>\n",
" <td>32%ile</td>\n",
" <td>0.08 facilities/km distance (21%ile)</td>\n",
" <td>51%ile</td>\n",
" <td>1.2 facilities/km distance (74%ile)</td>\n",
" <td>24%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11070.367848</td>\n",
" <td>5.461602e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 124 columns</p>\n",
"</div>"
],
"text/plain": [
" OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
"0 1 10010201001 Alabama AL 4 636 \n",
"1 2 10010201002 Alabama AL 4 1287 \n",
"2 3 10010202001 Alabama AL 4 810 \n",
"3 4 10010202002 Alabama AL 4 1218 \n",
"4 5 10010203001 Alabama AL 4 2641 \n",
"\n",
" D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
"0 -492.025529412 6 52.0 -1866.38637046 ... \n",
"1 -2053.08341364 4 30.0 -7787.90260177 ... \n",
"2 1846.12693767 8 75.0 7002.78371663 ... \n",
"3 1392.07530488 8 72.0 5280.46153188 ... \n",
"4 -769.374640358 5 48.0 -2911.8926061 ... \n",
"\n",
" T_PNPL T_PNPL_D2 \\\n",
"0 0.071 facilities/km distance (79%ile) 40%ile \n",
"1 0.064 facilities/km distance (76%ile) 19%ile \n",
"2 0.069 facilities/km distance (78%ile) 85%ile \n",
"3 0.076 facilities/km distance (81%ile) 83%ile \n",
"4 0.074 facilities/km distance (80%ile) 32%ile \n",
"\n",
" T_PRMP T_PRMP_D2 \\\n",
"0 0.085 facilities/km distance (23%ile) 53%ile \n",
"1 0.074 facilities/km distance (17%ile) 42%ile \n",
"2 0.078 facilities/km distance (20%ile) 67%ile \n",
"3 0.087 facilities/km distance (24%ile) 66%ile \n",
"4 0.08 facilities/km distance (21%ile) 51%ile \n",
"\n",
" T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n",
"0 0.59 facilities/km distance (57%ile) 38%ile None None \n",
"1 0.45 facilities/km distance (52%ile) 23%ile None None \n",
"2 0.65 facilities/km distance (59%ile) 77%ile None None \n",
"3 1 facilities/km distance (69%ile) 78%ile None None \n",
"4 1.2 facilities/km distance (74%ile) 24%ile None None \n",
"\n",
" Shape_Length Shape_Area \n",
"0 13443.155206 6.040790e+06 \n",
"1 11917.089598 7.834160e+06 \n",
"2 7770.915121 2.900774e+06 \n",
"3 6506.804784 1.793332e+06 \n",
"4 11070.367848 5.461602e+06 \n",
"\n",
"[5 rows x 124 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"df = pd.read_csv(ejscreen_csv, low_memory=False)\n",
"df.head()"
"df = pd.read_csv(ejscreen_csv, dtype={'ID': 'string'}, low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0ce9e22a",
"execution_count": 6,
"id": "e6994f2d",
"metadata": {},
"outputs": [],
"source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9fa2077a",
"metadata": {},
"outputs": [],
"source": [
"# write nationwide csv\n",
"# to_csv does not create missing directories, so make sure the target exists\n",
"csv_path.mkdir(parents=True, exist_ok=True)\n",
"df.to_csv(csv_path / \"usa.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5e5cc12a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method DataFrame.count of ID ACSTOTPOP LESSHSPCT LOWINCPCT\n",
"0 10010201001 636 0.208134 0.385220\n",
"1 10010201002 1287 0.040678 0.163170\n",
"2 10010202001 810 0.135563 0.501247\n",
"3 10010202002 1218 0.192000 0.393701\n",
"4 10010203001 2641 0.125473 0.308217\n",
"... ... ... ... ...\n",
"220328 721537506011 699 0.391389 0.902718\n",
"220329 721537506012 2432 0.185852 0.783717\n",
"220330 721537506013 976 0.018116 0.776639\n",
"220331 721537506021 1707 0.375422 0.867377\n",
"220332 721537506022 804 0.162791 0.942786\n",
"\n",
"[220333 rows x 4 columns]>"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"Generating data01 csv\n",
"Generating data02 csv\n",
"Generating data04 csv\n",
"Generating data05 csv\n",
"Generating data06 csv\n",
"Generating data08 csv\n",
"Generating data09 csv\n",
"Generating data10 csv\n",
"Generating data11 csv\n",
"Generating data12 csv\n",
"Generating data13 csv\n",
"Generating data15 csv\n",
"Generating data16 csv\n",
"Generating data17 csv\n",
"Generating data18 csv\n",
"Generating data19 csv\n",
"Generating data20 csv\n",
"Generating data21 csv\n",
"Generating data22 csv\n",
"Generating data23 csv\n",
"Generating data24 csv\n",
"Generating data25 csv\n",
"Generating data26 csv\n",
"Generating data27 csv\n",
"Generating data28 csv\n",
"Generating data29 csv\n",
"Generating data30 csv\n",
"Generating data31 csv\n",
"Generating data32 csv\n",
"Generating data33 csv\n",
"Generating data34 csv\n",
"Generating data35 csv\n",
"Generating data36 csv\n",
"Generating data37 csv\n",
"Generating data38 csv\n",
"Generating data39 csv\n",
"Generating data40 csv\n",
"Generating data41 csv\n",
"Generating data42 csv\n",
"Generating data44 csv\n",
"Generating data45 csv\n",
"Generating data46 csv\n",
"Generating data47 csv\n",
"Generating data48 csv\n",
"Generating data49 csv\n",
"Generating data50 csv\n",
"Generating data51 csv\n",
"Generating data53 csv\n",
"Generating data54 csv\n",
"Generating data55 csv\n",
"Generating data56 csv\n"
]
}
],
"source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n",
"df.head()\n",
"df.count"
"# write per state csvs\n",
"# newline=\"\" is the csv module's recommended mode for files passed to csv.reader\n",
"with open(fips_csv_path, newline=\"\") as csv_file:\n",
"    csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
"    next(csv_reader, None)  # skip the header row\n",
"\n",
"    for row in csv_reader:\n",
"        fips = row[0].strip()\n",
"        print(f\"Generating data{fips} csv\")\n",
"        # keep only the rows whose ID starts with this two-character code\n",
"        state_df = df[df.ID.str[:2] == fips]\n",
"        # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
"        state_df.to_csv(csv_path / f\"data{fips}.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e051623b",
"id": "2674fb20",
"metadata": {},
"outputs": [],
"source": []

View file

@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "a664f981",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), 'C:\\\\opt\\\\justice40-tool\\\\score\\\\ipython', 'C:\\\\Python39\\\\python39.zip', 'C:\\\\Python39\\\\DLLs', 'C:\\\\Python39\\\\lib', 'C:\\\\Python39', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv', '', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32\\\\lib', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\Pythonwin', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\IPython\\\\extensions', 'C:\\\\Users\\\\j\\\\.ipython']\n"
]
},
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'utils'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-e0c1285d1cc1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mutils\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mdata_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcwd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparent\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;34m\"data\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'utils'"
]
}
],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import csv\n",
"import sys\n",
"\n",
"script_path = Path.cwd().parent / \"scripts\"\n",
"# sys.path entries must be plain strings: the import system ignores any\n",
"# other type, so inserting the Path object itself leaves `utils` unimportable.\n",
"# The membership check keeps re-running this cell from stacking duplicates.\n",
"if str(script_path) not in sys.path:\n",
"    sys.path.insert(0, str(script_path))\n",
"print(sys.path)\n",
"\n",
"from utils import *\n",
"\n",
"data_path = Path.cwd().parent / \"data\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1b750f0e",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'get_state_fips_codes' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-fec7b31c5df6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# store all fips codes in list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mfips_state_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_state_fips_codes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mfips_state_list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'get_state_fips_codes' is not defined"
]
}
],
"source": [
"# store all fips codes in list\n",
"# The helper has to be called; a bare reference stores the function object.\n",
"# NOTE(review): assumes get_state_fips_codes takes the data directory as its\n",
"# only argument - confirm against scripts/utils.py\n",
"fips_state_list = get_state_fips_codes(data_path)\n",
"fips_state_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7df430cb",
"metadata": {},
"outputs": [],
"source": [
"# EJSCreen ETL Load\n",
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}