ACS data baked in for map (#153)

* starting etl for score

* projection fix

* projection flags

* proper ejscreen etl csv generation

* failing CSV merge -- investigating

* checkpoint

* some etl changes

* completed ticket

* small typo
This commit is contained in:
Jorge Escobar 2021-06-17 18:12:39 -04:00 committed by GitHub
parent eed9bd311d
commit 78615e9b1a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 321 additions and 356 deletions

2
.gitignore vendored
View file

@ -132,3 +132,5 @@ cython_debug/
score/data/census score/data/census
score/data/tiles score/data/tiles
score/data/tmp score/data/tmp
score/data/dataset
score/data/score

0
score/__init__.py Normal file
View file

View file

View file

View file

@ -2,41 +2,27 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 1,
"id": "f4d63367", "id": "20aa3891",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from pathlib import Path\n", "from pathlib import Path\n",
"import requests\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import csv\n",
"\n", "\n",
"data_path = Path.cwd().parent / \"data\" / \"tmp\"" "data_path = Path.cwd().parent / \"data\"\n",
"fips_csv_path = data_path / \"fips_states_2010.csv\"\n",
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 3,
"id": "0e6eb55e", "id": "67a58c24",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('C:/opt/justice40-tool/score/data/tmp')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_path"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a1431996",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -49,304 +35,144 @@
} }
], ],
"source": [ "source": [
"import requests\n",
"download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n", "download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n",
"file_contents = download.content\n", "file_contents = download.content\n",
"zip_file_path = data_path / \"downloaded.zip\"\n", "zip_file_path = data_path / \"tmp\"\n",
"zip_file = open(zip_file_path, \"wb\")\n", "zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n",
"zip_file.write(file_contents)\n", "zip_file.write(file_contents)\n",
"zip_file.close()" "zip_file.close()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 4,
"id": "bc5f3466", "id": "cc3fb9ec",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import zipfile\n", "with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n",
"with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", " zip_ref.extractall(zip_file_path)\n",
" zip_ref.extractall(data_path)\n", "ejscreen_csv = data_path / \"tmp\" / \"EJSCREEN_2020_StatePctile.csv\""
"ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\""
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 5,
"id": "392ccb67", "id": "b25738bb",
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
"outputs": [ "outputs": [],
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>OBJECTID</th>\n",
" <th>ID</th>\n",
" <th>STATE_NAME</th>\n",
" <th>ST_ABBREV</th>\n",
" <th>REGION</th>\n",
" <th>ACSTOTPOP</th>\n",
" <th>D_PM25_2</th>\n",
" <th>B_PM25_D2</th>\n",
" <th>P_PM25_D2</th>\n",
" <th>D_OZONE_2</th>\n",
" <th>...</th>\n",
" <th>T_PNPL</th>\n",
" <th>T_PNPL_D2</th>\n",
" <th>T_PRMP</th>\n",
" <th>T_PRMP_D2</th>\n",
" <th>T_PTSDF</th>\n",
" <th>T_PTSDF_D2</th>\n",
" <th>T_PWDIS</th>\n",
" <th>T_PWDIS_D2</th>\n",
" <th>Shape_Length</th>\n",
" <th>Shape_Area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>10010201001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>636</td>\n",
" <td>-492.025529412</td>\n",
" <td>6</td>\n",
" <td>52.0</td>\n",
" <td>-1866.38637046</td>\n",
" <td>...</td>\n",
" <td>0.071 facilities/km distance (79%ile)</td>\n",
" <td>40%ile</td>\n",
" <td>0.085 facilities/km distance (23%ile)</td>\n",
" <td>53%ile</td>\n",
" <td>0.59 facilities/km distance (57%ile)</td>\n",
" <td>38%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>13443.155206</td>\n",
" <td>6.040790e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>10010201002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1287</td>\n",
" <td>-2053.08341364</td>\n",
" <td>4</td>\n",
" <td>30.0</td>\n",
" <td>-7787.90260177</td>\n",
" <td>...</td>\n",
" <td>0.064 facilities/km distance (76%ile)</td>\n",
" <td>19%ile</td>\n",
" <td>0.074 facilities/km distance (17%ile)</td>\n",
" <td>42%ile</td>\n",
" <td>0.45 facilities/km distance (52%ile)</td>\n",
" <td>23%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11917.089598</td>\n",
" <td>7.834160e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>10010202001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>810</td>\n",
" <td>1846.12693767</td>\n",
" <td>8</td>\n",
" <td>75.0</td>\n",
" <td>7002.78371663</td>\n",
" <td>...</td>\n",
" <td>0.069 facilities/km distance (78%ile)</td>\n",
" <td>85%ile</td>\n",
" <td>0.078 facilities/km distance (20%ile)</td>\n",
" <td>67%ile</td>\n",
" <td>0.65 facilities/km distance (59%ile)</td>\n",
" <td>77%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>7770.915121</td>\n",
" <td>2.900774e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>10010202002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1218</td>\n",
" <td>1392.07530488</td>\n",
" <td>8</td>\n",
" <td>72.0</td>\n",
" <td>5280.46153188</td>\n",
" <td>...</td>\n",
" <td>0.076 facilities/km distance (81%ile)</td>\n",
" <td>83%ile</td>\n",
" <td>0.087 facilities/km distance (24%ile)</td>\n",
" <td>66%ile</td>\n",
" <td>1 facilities/km distance (69%ile)</td>\n",
" <td>78%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>6506.804784</td>\n",
" <td>1.793332e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>10010203001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>2641</td>\n",
" <td>-769.374640358</td>\n",
" <td>5</td>\n",
" <td>48.0</td>\n",
" <td>-2911.8926061</td>\n",
" <td>...</td>\n",
" <td>0.074 facilities/km distance (80%ile)</td>\n",
" <td>32%ile</td>\n",
" <td>0.08 facilities/km distance (21%ile)</td>\n",
" <td>51%ile</td>\n",
" <td>1.2 facilities/km distance (74%ile)</td>\n",
" <td>24%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11070.367848</td>\n",
" <td>5.461602e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 124 columns</p>\n",
"</div>"
],
"text/plain": [
" OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
"0 1 10010201001 Alabama AL 4 636 \n",
"1 2 10010201002 Alabama AL 4 1287 \n",
"2 3 10010202001 Alabama AL 4 810 \n",
"3 4 10010202002 Alabama AL 4 1218 \n",
"4 5 10010203001 Alabama AL 4 2641 \n",
"\n",
" D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
"0 -492.025529412 6 52.0 -1866.38637046 ... \n",
"1 -2053.08341364 4 30.0 -7787.90260177 ... \n",
"2 1846.12693767 8 75.0 7002.78371663 ... \n",
"3 1392.07530488 8 72.0 5280.46153188 ... \n",
"4 -769.374640358 5 48.0 -2911.8926061 ... \n",
"\n",
" T_PNPL T_PNPL_D2 \\\n",
"0 0.071 facilities/km distance (79%ile) 40%ile \n",
"1 0.064 facilities/km distance (76%ile) 19%ile \n",
"2 0.069 facilities/km distance (78%ile) 85%ile \n",
"3 0.076 facilities/km distance (81%ile) 83%ile \n",
"4 0.074 facilities/km distance (80%ile) 32%ile \n",
"\n",
" T_PRMP T_PRMP_D2 \\\n",
"0 0.085 facilities/km distance (23%ile) 53%ile \n",
"1 0.074 facilities/km distance (17%ile) 42%ile \n",
"2 0.078 facilities/km distance (20%ile) 67%ile \n",
"3 0.087 facilities/km distance (24%ile) 66%ile \n",
"4 0.08 facilities/km distance (21%ile) 51%ile \n",
"\n",
" T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n",
"0 0.59 facilities/km distance (57%ile) 38%ile None None \n",
"1 0.45 facilities/km distance (52%ile) 23%ile None None \n",
"2 0.65 facilities/km distance (59%ile) 77%ile None None \n",
"3 1 facilities/km distance (69%ile) 78%ile None None \n",
"4 1.2 facilities/km distance (74%ile) 24%ile None None \n",
"\n",
" Shape_Length Shape_Area \n",
"0 13443.155206 6.040790e+06 \n",
"1 11917.089598 7.834160e+06 \n",
"2 7770.915121 2.900774e+06 \n",
"3 6506.804784 1.793332e+06 \n",
"4 11070.367848 5.461602e+06 \n",
"\n",
"[5 rows x 124 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import numpy as np\n", "df = pd.read_csv(ejscreen_csv, dtype={'ID': 'string'}, low_memory=False)"
"import pandas as pd\n",
"df = pd.read_csv(ejscreen_csv, low_memory=False)\n",
"df.head()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 6,
"id": "0ce9e22a", "id": "e6994f2d",
"metadata": {},
"outputs": [],
"source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9fa2077a",
"metadata": {},
"outputs": [],
"source": [
"# write nationwide csv\n",
"df.to_csv(csv_path / f\"usa.csv\", index = False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5e5cc12a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "name": "stdout",
"text/plain": [ "output_type": "stream",
"<bound method DataFrame.count of ID ACSTOTPOP LESSHSPCT LOWINCPCT\n", "text": [
"0 10010201001 636 0.208134 0.385220\n", "Generating data01 csv\n",
"1 10010201002 1287 0.040678 0.163170\n", "Generating data02 csv\n",
"2 10010202001 810 0.135563 0.501247\n", "Generating data04 csv\n",
"3 10010202002 1218 0.192000 0.393701\n", "Generating data05 csv\n",
"4 10010203001 2641 0.125473 0.308217\n", "Generating data06 csv\n",
"... ... ... ... ...\n", "Generating data08 csv\n",
"220328 721537506011 699 0.391389 0.902718\n", "Generating data09 csv\n",
"220329 721537506012 2432 0.185852 0.783717\n", "Generating data10 csv\n",
"220330 721537506013 976 0.018116 0.776639\n", "Generating data11 csv\n",
"220331 721537506021 1707 0.375422 0.867377\n", "Generating data12 csv\n",
"220332 721537506022 804 0.162791 0.942786\n", "Generating data13 csv\n",
"\n", "Generating data15 csv\n",
"[220333 rows x 4 columns]>" "Generating data16 csv\n",
] "Generating data17 csv\n",
}, "Generating data18 csv\n",
"execution_count": 32, "Generating data19 csv\n",
"metadata": {}, "Generating data20 csv\n",
"output_type": "execute_result" "Generating data21 csv\n",
"Generating data22 csv\n",
"Generating data23 csv\n",
"Generating data24 csv\n",
"Generating data25 csv\n",
"Generating data26 csv\n",
"Generating data27 csv\n",
"Generating data28 csv\n",
"Generating data29 csv\n",
"Generating data30 csv\n",
"Generating data31 csv\n",
"Generating data32 csv\n",
"Generating data33 csv\n",
"Generating data34 csv\n",
"Generating data35 csv\n",
"Generating data36 csv\n",
"Generating data37 csv\n",
"Generating data38 csv\n",
"Generating data39 csv\n",
"Generating data40 csv\n",
"Generating data41 csv\n",
"Generating data42 csv\n",
"Generating data44 csv\n",
"Generating data45 csv\n",
"Generating data46 csv\n",
"Generating data47 csv\n",
"Generating data48 csv\n",
"Generating data49 csv\n",
"Generating data50 csv\n",
"Generating data51 csv\n",
"Generating data53 csv\n",
"Generating data54 csv\n",
"Generating data55 csv\n",
"Generating data56 csv\n"
]
} }
], ],
"source": [ "source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n", "# write per state csvs\n",
"df.head()\n", "with open(fips_csv_path) as csv_file:\n",
"df.count" " csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
" line_count = 0\n",
"\n",
" for row in csv_reader:\n",
" if line_count == 0:\n",
" line_count += 1\n",
" else:\n",
" fips = row[0].strip()\n",
" print(f\"Generating data{fips} csv\")\n",
" df1 = df[df.ID.str[:2] == fips]\n",
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
" df1.to_csv(csv_path / f\"data{fips}.csv\", index = False)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "e051623b", "id": "2674fb20",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []

View file

@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "a664f981",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), 'C:\\\\opt\\\\justice40-tool\\\\score\\\\ipython', 'C:\\\\Python39\\\\python39.zip', 'C:\\\\Python39\\\\DLLs', 'C:\\\\Python39\\\\lib', 'C:\\\\Python39', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv', '', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32\\\\lib', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\Pythonwin', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\IPython\\\\extensions', 'C:\\\\Users\\\\j\\\\.ipython']\n"
]
},
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'utils'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-e0c1285d1cc1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mutils\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mdata_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcwd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparent\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;34m\"data\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'utils'"
]
}
],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import csv\n",
"import sys\n",
"\n",
"script_path = Path.cwd().parent / \"scripts\"\n",
"sys.path.insert(0, script_path)\n",
"print(sys.path)\n",
"\n",
"from utils import *\n",
"\n",
"data_path = Path.cwd().parent / \"data\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1b750f0e",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'get_state_fips_codes' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-fec7b31c5df6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# store all fips codes in list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mfips_state_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_state_fips_codes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mfips_state_list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'get_state_fips_codes' is not defined"
]
}
],
"source": [
"# store all fips codes in list\n",
"fips_state_list = get_state_fips_codes\n",
"fips_state_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7df430cb",
"metadata": {},
"outputs": [],
"source": [
"# EJSCreen ETL Load\n",
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

View file

@ -5,70 +5,64 @@ import os
import json import json
from pathlib import Path from pathlib import Path
from utils import get_state_fips_codes
data_path = Path.cwd() / "data" data_path = Path.cwd() / "data"
with requests.Session() as s: with requests.Session() as s:
# the fips_states_2010.csv is generated from data here # the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
fips_csv_path = data_path / "fips_states_2010.csv" state_fips_codes = get_state_fips_codes()
with open(fips_csv_path) as csv_file: for fips in state_fips_codes:
csv_reader = csv.reader(csv_file, delimiter=",") # check if file exists
line_count = 0 shp_file_path = data_path.joinpath(
for row in csv_reader: "census", "shp", fips, f"tl_2010_{fips}_bg10.shp"
if line_count == 0: )
line_count += 1 if not os.path.isfile(shp_file_path):
print(f"downloading {row[1]}")
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
# But using 2010 for now
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
download = s.get(cbg_state_url)
file_contents = download.content
zip_file_path = data_path / "census" / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
print(f"extracting {row[1]}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
shp_dir_path = data_path / "census" / "shp" / fips
zip_ref.extractall(shp_dir_path)
geojson_dir_path = data_path.joinpath(
"census",
"geojson",
)
if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")):
# ogr2ogr
print(f"encoding GeoJSON for {row[1]}")
# PWD is different for Windows
if os.name == "nt":
pwd = "%cd%"
else: else:
fips = row[0].strip() pwd = "${PWD}"
cmd = (
# check if file exists 'docker run --rm -it -v "'
shp_file_path = data_path.joinpath( + pwd
"census", "shp", fips, f"tl_2010_{fips}_bg10.shp" + '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/'
) + fips
if not os.path.isfile(shp_file_path): + ".json /home/data/census/shp/"
print(f"downloading {row[1]}") + fips
+ "/tl_2010_"
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/ + fips
# But using 2010 for now + "_bg10.shp"
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip" )
download = s.get(cbg_state_url) print(cmd)
file_contents = download.content os.system(cmd)
zip_file_path = data_path / "census" / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
print(f"extracting {row[1]}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
shp_dir_path = data_path / "census" / "shp" / fips
zip_ref.extractall(shp_dir_path)
geojson_dir_path = data_path.joinpath(
"census",
"geojson",
)
if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")):
# ogr2ogr
print(f"encoding GeoJSON for {row[1]}")
# PWD is different for Windows
if os.name == "nt":
pwd = "%cd%"
else:
pwd = "${PWD}"
cmd = (
'docker run --rm -it -v "'
+ pwd
+ '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/'
+ fips
+ ".json /home/data/census/shp/"
+ fips
+ "/tl_2010_"
+ fips
+ "_bg10.shp"
)
print(cmd)
os.system(cmd)
# generate CBG CSV table for pandas # generate CBG CSV table for pandas
## load in memory ## load in memory
@ -87,10 +81,7 @@ with requests.Session() as s:
cbg_per_state_list[geoid10_state_id] = [] cbg_per_state_list[geoid10_state_id] = []
cbg_per_state_list[geoid10_state_id].append(geoid10) cbg_per_state_list[geoid10_state_id].append(geoid10)
csv_dir_path = data_path.joinpath( csv_dir_path = data_path / "census" / "csv"
"census",
"csv",
)
## write to individual state csv ## write to individual state csv
for state_id in cbg_per_state_list: for state_id in cbg_per_state_list:
geoid10_list = cbg_per_state_list[state_id] geoid10_list = cbg_per_state_list[state_id]

View file

@ -2,6 +2,8 @@ import os
from pathlib import Path from pathlib import Path
import shutil import shutil
from utils import get_state_fips_codes
data_path = Path.cwd() / "data" data_path = Path.cwd() / "data"
# remove existing mbtiles file # remove existing mbtiles file
@ -14,17 +16,41 @@ mvt_tiles_path = data_path / "tiles" / "mvt"
if os.path.exists(mvt_tiles_path): if os.path.exists(mvt_tiles_path):
shutil.rmtree(mvt_tiles_path) shutil.rmtree(mvt_tiles_path)
# Merge scores into json
# TODO: for this first pass, just merging ACS EJScren indicators
# Per https://github.com/usds/justice40-tool/issues/102
if os.name == "nt":
pwd = "%cd%"
else:
pwd = "${PWD}"
state_fips_codes = get_state_fips_codes()
for fips in state_fips_codes:
cmd = (
'docker run --rm -v "'
+ pwd
+ '"/:/home '
+ "osgeo/gdal:alpine-small-latest ogr2ogr -f GeoJSON "
+ f"-sql \"SELECT * FROM tl_2010_{fips}_bg10 LEFT JOIN '/home/data/dataset/ejscreen_2020/data{fips}.csv'.data{fips} ON tl_2010_{fips}_bg10.GEOID10 = data{fips}.ID\" "
+ f"/home/data/score/geojson/{fips}.json /home/data/census/shp/{fips}/tl_2010_{fips}_bg10.dbf"
)
print(cmd)
os.system(cmd)
# get a list of all json files to plug in the docker commands below # get a list of all json files to plug in the docker commands below
# (workaround since *.json doesn't seem to work) # (workaround since *.json doesn't seem to work)
geojson_list = "" geojson_list = ""
geojson_path = data_path / "census" / "geojson" geojson_path = data_path / "score" / "geojson"
for file in os.listdir(geojson_path): for file in os.listdir(geojson_path):
if file.endswith(".json"): if file.endswith(".json"):
geojson_list += f"/home/data/census/geojson/{file} " geojson_list += f"/home/data/score/geojson/{file} "
if geojson_list == "": if geojson_list == "":
print("No GeoJson files found. Please run download_cbg.py first") print("No GeoJson files found. Please run scripts/download_cbg.py first")
# generate mbtiles file
# PWD is different for Windows # PWD is different for Windows
if os.name == "nt": if os.name == "nt":
pwd = "%cd%" pwd = "%cd%"
@ -33,7 +59,7 @@ else:
cmd = ( cmd = (
'docker run --rm -it -v "' 'docker run --rm -it -v "'
+ pwd + pwd
+ '"/:/home klokantech/tippecanoe tippecanoe -s_srs EPSG:4269 -t_srs EPSG:4326 --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 ' + '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 '
+ geojson_list + geojson_list
) )
print(cmd) print(cmd)

20
score/scripts/utils.py Normal file
View file

@ -0,0 +1,20 @@
# common usage functions
import csv
from pathlib import Path
def get_state_fips_codes() -> list[str]:
    """Return the list of state FIPS codes from data/fips_states_2010.csv.

    The CSV path is resolved relative to the current working directory
    (callers are expected to run from the ``score`` directory — TODO confirm).
    The header row is skipped; the first column of every remaining row is
    returned with surrounding whitespace stripped.
    """
    data_path = Path.cwd() / "data"
    fips_csv_path = data_path / "fips_states_2010.csv"
    fips_state_list = []
    # newline="" is required by the csv module docs; an explicit encoding
    # avoids platform-dependent defaults (the file is plain ASCII data).
    with open(fips_csv_path, newline="", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader, None)  # skip the header row
        for row in csv_reader:
            if row:  # tolerate blank lines instead of raising IndexError
                fips_state_list.append(row[0].strip())
    return fips_state_list