From 78615e9b1ac63cb4ea44d5ffc08acbdeb984e70f Mon Sep 17 00:00:00 2001 From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Date: Thu, 17 Jun 2021 18:12:39 -0400 Subject: [PATCH] ACS data baked in for map (#153) * starting etl for score * projection fix * projection flags * proper ejscreen etl csv generation * failing CSV merge -- investigating * checkpoint * some etl changes * completed ticket * small typo --- .gitignore | 2 + score/__init__.py | 0 score/data/dataset/ejscreen_2020/__init__.py | 0 score/data/score/geojson/__init__.py | 0 score/data/tmp/__init__.py | 0 score/ipython/ejscreen_etl.ipynb | 406 ++++++------------- score/ipython/score_calc_0.1.ipynb | 100 +++++ score/scripts/__init__.py | 0 score/scripts/download_cbg.py | 115 +++--- score/scripts/generate_mbtiles.py | 34 +- score/scripts/utils.py | 20 + 11 files changed, 321 insertions(+), 356 deletions(-) create mode 100644 score/__init__.py create mode 100644 score/data/dataset/ejscreen_2020/__init__.py create mode 100644 score/data/score/geojson/__init__.py create mode 100644 score/data/tmp/__init__.py create mode 100644 score/ipython/score_calc_0.1.ipynb create mode 100644 score/scripts/__init__.py create mode 100644 score/scripts/utils.py diff --git a/.gitignore b/.gitignore index 6d4c3ea1..12dd1862 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,5 @@ cython_debug/ score/data/census score/data/tiles score/data/tmp +score/data/dataset +score/data/score diff --git a/score/__init__.py b/score/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/data/dataset/ejscreen_2020/__init__.py b/score/data/dataset/ejscreen_2020/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/data/score/geojson/__init__.py b/score/data/score/geojson/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/data/tmp/__init__.py b/score/data/tmp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/ipython/ejscreen_etl.ipynb b/score/ipython/ejscreen_etl.ipynb index 3b24556b..5a95a935 100644 --- a/score/ipython/ejscreen_etl.ipynb +++ b/score/ipython/ejscreen_etl.ipynb @@ -2,41 +2,27 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, - "id": "f4d63367", + "execution_count": 1, + "id": "20aa3891", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", + "import requests\n", + "import zipfile\n", + "import numpy as np\n", + "import pandas as pd\n", + "import csv\n", "\n", - "data_path = Path.cwd().parent / \"data\" / \"tmp\"" + "data_path = Path.cwd().parent / \"data\"\n", + "fips_csv_path = data_path / \"fips_states_2010.csv\"\n", + "csv_path = data_path / \"dataset\" / \"ejscreen_2020\"" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "0e6eb55e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "WindowsPath('C:/opt/justice40-tool/score/data/tmp')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_path" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a1431996", + "execution_count": 3, + "id": "67a58c24", "metadata": {}, "outputs": [ { @@ -49,304 +35,144 @@ } ], "source": [ - "import requests\n", "download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n", "file_contents = download.content\n", - "zip_file_path = data_path / \"downloaded.zip\"\n", - "zip_file = open(zip_file_path, \"wb\")\n", + "zip_file_path = data_path / \"tmp\"\n", + 
"zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n", "zip_file.write(file_contents)\n", "zip_file.close()" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "bc5f3466", + "execution_count": 4, + "id": "cc3fb9ec", "metadata": {}, "outputs": [], "source": [ - "import zipfile\n", - "with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", - " zip_ref.extractall(data_path)\n", - "ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\"" + "with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n", + " zip_ref.extractall(zip_file_path)\n", + "ejscreen_csv = data_path / \"tmp\" / \"EJSCREEN_2020_StatePctile.csv\"" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "392ccb67", + "execution_count": 5, + "id": "b25738bb", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
OBJECTIDIDSTATE_NAMEST_ABBREVREGIONACSTOTPOPD_PM25_2B_PM25_D2P_PM25_D2D_OZONE_2...T_PNPLT_PNPL_D2T_PRMPT_PRMP_D2T_PTSDFT_PTSDF_D2T_PWDIST_PWDIS_D2Shape_LengthShape_Area
0110010201001AlabamaAL4636-492.025529412652.0-1866.38637046...0.071 facilities/km distance (79%ile)40%ile0.085 facilities/km distance (23%ile)53%ile0.59 facilities/km distance (57%ile)38%ileNoneNone13443.1552066.040790e+06
1210010201002AlabamaAL41287-2053.08341364430.0-7787.90260177...0.064 facilities/km distance (76%ile)19%ile0.074 facilities/km distance (17%ile)42%ile0.45 facilities/km distance (52%ile)23%ileNoneNone11917.0895987.834160e+06
2310010202001AlabamaAL48101846.12693767875.07002.78371663...0.069 facilities/km distance (78%ile)85%ile0.078 facilities/km distance (20%ile)67%ile0.65 facilities/km distance (59%ile)77%ileNoneNone7770.9151212.900774e+06
3410010202002AlabamaAL412181392.07530488872.05280.46153188...0.076 facilities/km distance (81%ile)83%ile0.087 facilities/km distance (24%ile)66%ile1 facilities/km distance (69%ile)78%ileNoneNone6506.8047841.793332e+06
4510010203001AlabamaAL42641-769.374640358548.0-2911.8926061...0.074 facilities/km distance (80%ile)32%ile0.08 facilities/km distance (21%ile)51%ile1.2 facilities/km distance (74%ile)24%ileNoneNone11070.3678485.461602e+06
\n", - "

5 rows × 124 columns

\n", - "
" - ], - "text/plain": [ - " OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n", - "0 1 10010201001 Alabama AL 4 636 \n", - "1 2 10010201002 Alabama AL 4 1287 \n", - "2 3 10010202001 Alabama AL 4 810 \n", - "3 4 10010202002 Alabama AL 4 1218 \n", - "4 5 10010203001 Alabama AL 4 2641 \n", - "\n", - " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n", - "0 -492.025529412 6 52.0 -1866.38637046 ... \n", - "1 -2053.08341364 4 30.0 -7787.90260177 ... \n", - "2 1846.12693767 8 75.0 7002.78371663 ... \n", - "3 1392.07530488 8 72.0 5280.46153188 ... \n", - "4 -769.374640358 5 48.0 -2911.8926061 ... \n", - "\n", - " T_PNPL T_PNPL_D2 \\\n", - "0 0.071 facilities/km distance (79%ile) 40%ile \n", - "1 0.064 facilities/km distance (76%ile) 19%ile \n", - "2 0.069 facilities/km distance (78%ile) 85%ile \n", - "3 0.076 facilities/km distance (81%ile) 83%ile \n", - "4 0.074 facilities/km distance (80%ile) 32%ile \n", - "\n", - " T_PRMP T_PRMP_D2 \\\n", - "0 0.085 facilities/km distance (23%ile) 53%ile \n", - "1 0.074 facilities/km distance (17%ile) 42%ile \n", - "2 0.078 facilities/km distance (20%ile) 67%ile \n", - "3 0.087 facilities/km distance (24%ile) 66%ile \n", - "4 0.08 facilities/km distance (21%ile) 51%ile \n", - "\n", - " T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n", - "0 0.59 facilities/km distance (57%ile) 38%ile None None \n", - "1 0.45 facilities/km distance (52%ile) 23%ile None None \n", - "2 0.65 facilities/km distance (59%ile) 77%ile None None \n", - "3 1 facilities/km distance (69%ile) 78%ile None None \n", - "4 1.2 facilities/km distance (74%ile) 24%ile None None \n", - "\n", - " Shape_Length Shape_Area \n", - "0 13443.155206 6.040790e+06 \n", - "1 11917.089598 7.834160e+06 \n", - "2 7770.915121 2.900774e+06 \n", - "3 6506.804784 1.793332e+06 \n", - "4 11070.367848 5.461602e+06 \n", - "\n", - "[5 rows x 124 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "df = pd.read_csv(ejscreen_csv, low_memory=False)\n", - "df.head()" + "df = pd.read_csv(ejscreen_csv, dtype={'ID': 'string'}, low_memory=False)" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "0ce9e22a", + "execution_count": 6, + "id": "e6994f2d", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9fa2077a", + "metadata": {}, + "outputs": [], + "source": [ + "# write nationwide csv\n", + "df.to_csv(csv_path / f\"usa.csv\", index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5e5cc12a", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating data01 csv\n", + "Generating data02 csv\n", + "Generating data04 csv\n", + "Generating data05 csv\n", + "Generating data06 csv\n", + "Generating data08 csv\n", + "Generating data09 csv\n", + "Generating data10 csv\n", + "Generating data11 csv\n", + "Generating data12 csv\n", + "Generating data13 csv\n", + "Generating data15 csv\n", + "Generating data16 csv\n", + "Generating data17 csv\n", + "Generating data18 csv\n", + "Generating data19 csv\n", + "Generating data20 csv\n", + "Generating data21 csv\n", + "Generating data22 csv\n", + "Generating data23 csv\n", + "Generating data24 csv\n", + "Generating data25 csv\n", + 
"Generating data26 csv\n", + "Generating data27 csv\n", + "Generating data28 csv\n", + "Generating data29 csv\n", + "Generating data30 csv\n", + "Generating data31 csv\n", + "Generating data32 csv\n", + "Generating data33 csv\n", + "Generating data34 csv\n", + "Generating data35 csv\n", + "Generating data36 csv\n", + "Generating data37 csv\n", + "Generating data38 csv\n", + "Generating data39 csv\n", + "Generating data40 csv\n", + "Generating data41 csv\n", + "Generating data42 csv\n", + "Generating data44 csv\n", + "Generating data45 csv\n", + "Generating data46 csv\n", + "Generating data47 csv\n", + "Generating data48 csv\n", + "Generating data49 csv\n", + "Generating data50 csv\n", + "Generating data51 csv\n", + "Generating data53 csv\n", + "Generating data54 csv\n", + "Generating data55 csv\n", + "Generating data56 csv\n" + ] } ], "source": [ - "df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n", - "df.head()\n", - "df.count" + "# write per state csvs\n", + "with open(fips_csv_path) as csv_file:\n", + " csv_reader = csv.reader(csv_file, delimiter=\",\")\n", + " line_count = 0\n", + "\n", + " for row in csv_reader:\n", + " if line_count == 0:\n", + " line_count += 1\n", + " else:\n", + " fips = row[0].strip()\n", + " print(f\"Generating data{fips} csv\")\n", + " df1 = df[df.ID.str[:2] == fips]\n", + " # we need to name the file data01.csv for ogr2ogr csv merge to work\n", + " df1.to_csv(csv_path / f\"data{fips}.csv\", index = False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "e051623b", + "id": "2674fb20", "metadata": {}, "outputs": [], "source": [] diff --git a/score/ipython/score_calc_0.1.ipynb b/score/ipython/score_calc_0.1.ipynb new file mode 100644 index 00000000..781bac64 --- /dev/null +++ b/score/ipython/score_calc_0.1.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "a664f981", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), 'C:\\\\opt\\\\justice40-tool\\\\score\\\\ipython', 'C:\\\\Python39\\\\python39.zip', 'C:\\\\Python39\\\\DLLs', 'C:\\\\Python39\\\\lib', 'C:\\\\Python39', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv', '', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32\\\\lib', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\Pythonwin', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\IPython\\\\extensions', 'C:\\\\Users\\\\j\\\\.ipython']\n" + ] + }, + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'utils'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mutils\u001b[0m 
\u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mdata_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcwd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparent\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;34m\"data\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'utils'" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import pandas as pd\n", + "import csv\n", + "import sys\n", + "\n", + "script_path = Path.cwd().parent / \"scripts\"\n", + "sys.path.insert(0, script_path)\n", + "print(sys.path)\n", + "\n", + "from utils import *\n", + "\n", + "data_path = Path.cwd().parent / \"data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1b750f0e", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'get_state_fips_codes' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# store all fips codes in list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mfips_state_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_state_fips_codes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mfips_state_list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'get_state_fips_codes' is not defined" + ] + } + ], + "source": [ + "# store all fips codes in list\n", + "fips_state_list = get_state_fips_codes\n", + "fips_state_list" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7df430cb", + "metadata": {}, + "outputs": [], + "source": [ + "# EJSCreen ETL Load\n", + "csv_path = data_path / \"dataset\" / \"ejscreen_2020\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/scripts/__init__.py b/score/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/score/scripts/download_cbg.py b/score/scripts/download_cbg.py index ff58451c..b091d431 100644 --- a/score/scripts/download_cbg.py +++ b/score/scripts/download_cbg.py @@ -5,70 +5,64 @@ import os import json from pathlib import Path +from utils import get_state_fips_codes + data_path = Path.cwd() / "data" with requests.Session() as s: # the fips_states_2010.csv is generated from data here # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html - fips_csv_path = data_path / "fips_states_2010.csv" - with open(fips_csv_path) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - line_count = 0 - for row in csv_reader: - if line_count == 0: - line_count += 1 + state_fips_codes = 
get_state_fips_codes() + for fips in state_fips_codes: + # check if file exists + shp_file_path = data_path.joinpath( + "census", "shp", fips, f"tl_2010_{fips}_bg10.shp" + ) + if not os.path.isfile(shp_file_path): + print(f"downloading {fips}") + + # 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/ + # But using 2010 for now + cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip" + download = s.get(cbg_state_url) + file_contents = download.content + zip_file_path = data_path / "census" / "downloaded.zip" + zip_file = open(zip_file_path, "wb") + zip_file.write(file_contents) + zip_file.close() + + print(f"extracting {fips}") + + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + shp_dir_path = data_path / "census" / "shp" / fips + zip_ref.extractall(shp_dir_path) + + geojson_dir_path = data_path.joinpath( + "census", + "geojson", + ) + if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")): + # ogr2ogr + print(f"encoding GeoJSON for {fips}") + + # PWD is different for Windows + if os.name == "nt": + pwd = "%cd%" else: - fips = row[0].strip() - - # check if file exists - shp_file_path = data_path.joinpath( - "census", "shp", fips, f"tl_2010_{fips}_bg10.shp" - ) - if not os.path.isfile(shp_file_path): - print(f"downloading {row[1]}") - - # 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/ - # But using 2010 for now - cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip" - download = s.get(cbg_state_url) - file_contents = download.content - zip_file_path = data_path / "census" / "downloaded.zip" - zip_file = open(zip_file_path, "wb") - zip_file.write(file_contents) - zip_file.close() - - print(f"extracting {row[1]}") - - with zipfile.ZipFile(zip_file_path, "r") as zip_ref: - shp_dir_path = data_path / "census" / "shp" / fips - zip_ref.extractall(shp_dir_path) - - geojson_dir_path = data_path.joinpath( - "census", - "geojson", - ) - if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")): - # ogr2ogr - print(f"encoding GeoJSON for {row[1]}") - - # PWD is different for Windows - if os.name == "nt": - pwd = "%cd%" - else: - pwd = "${PWD}" - cmd = ( - 'docker run --rm -it -v "' - + pwd - + '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/' - + fips - + ".json /home/data/census/shp/" - + fips - + "/tl_2010_" - + fips - + "_bg10.shp" - ) - print(cmd) - os.system(cmd) + pwd = "${PWD}" + cmd = ( + 'docker run --rm -it -v "' + + pwd + + '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/' + + fips + + ".json /home/data/census/shp/" + + fips + + "/tl_2010_" + + fips + + "_bg10.shp" + ) + print(cmd) + os.system(cmd) # generate CBG CSV table for pandas ## load in memory @@ -87,10 +81,7 @@ with requests.Session() as s: cbg_per_state_list[geoid10_state_id] = [] cbg_per_state_list[geoid10_state_id].append(geoid10) - csv_dir_path = data_path.joinpath( - "census", - "csv", - ) + csv_dir_path = data_path / "census" / "csv" ## write to individual state csv for state_id in cbg_per_state_list: geoid10_list = cbg_per_state_list[state_id] diff --git a/score/scripts/generate_mbtiles.py b/score/scripts/generate_mbtiles.py index 22678438..ad85ea76 100644 --- a/score/scripts/generate_mbtiles.py +++ b/score/scripts/generate_mbtiles.py @@ -2,6 +2,8 @@ import os from pathlib import Path import shutil +from utils import get_state_fips_codes + data_path = Path.cwd() / "data" # remove 
existing mbtiles file @@ -14,17 +16,41 @@ mvt_tiles_path = data_path / "tiles" / "mvt" if os.path.exists(mvt_tiles_path): shutil.rmtree(mvt_tiles_path) +# Merge scores into json +# TODO: for this first pass, just merging ACS EJScren indicators +# Per https://github.com/usds/justice40-tool/issues/102 + +if os.name == "nt": + pwd = "%cd%" +else: + pwd = "${PWD}" + +state_fips_codes = get_state_fips_codes() +for fips in state_fips_codes: + cmd = ( + 'docker run --rm -v "' + + pwd + + '"/:/home ' + + "osgeo/gdal:alpine-small-latest ogr2ogr -f GeoJSON " + + f"-sql \"SELECT * FROM tl_2010_{fips}_bg10 LEFT JOIN '/home/data/dataset/ejscreen_2020/data{fips}.csv'.data{fips} ON tl_2010_{fips}_bg10.GEOID10 = data{fips}.ID\" " + + f"/home/data/score/geojson/{fips}.json /home/data/census/shp/{fips}/tl_2010_{fips}_bg10.dbf" + ) + print(cmd) + os.system(cmd) + # get a list of all json files to plug in the docker commands below # (workaround since *.json doesn't seem to work) geojson_list = "" -geojson_path = data_path / "census" / "geojson" +geojson_path = data_path / "score" / "geojson" for file in os.listdir(geojson_path): if file.endswith(".json"): - geojson_list += f"/home/data/census/geojson/{file} " + geojson_list += f"/home/data/score/geojson/{file} " if geojson_list == "": - print("No GeoJson files found. Please run download_cbg.py first") + print("No GeoJson files found. Please run scripts/download_cbg.py first") + +# generate mbtiles file # PWD is different for Windows if os.name == "nt": pwd = "%cd%" @@ -33,7 +59,7 @@ else: cmd = ( 'docker run --rm -it -v "' + pwd - + '"/:/home klokantech/tippecanoe tippecanoe -s_srs EPSG:4269 -t_srs EPSG:4326 --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 ' + + '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 ' + geojson_list ) print(cmd) diff --git a/score/scripts/utils.py b/score/scripts/utils.py new file mode 100644 index 00000000..48b656db --- /dev/null +++ b/score/scripts/utils.py @@ -0,0 +1,20 @@ +# common usage functions +import csv +from pathlib import Path + + +def get_state_fips_codes(): + data_path = Path.cwd() / "data" + fips_csv_path = data_path / "fips_states_2010.csv" + fips_state_list = [] + with open(fips_csv_path) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=",") + line_count = 0 + + for row in csv_reader: + if line_count == 0: + line_count += 1 + else: + fips = row[0].strip() + fips_state_list.append(fips) + return fips_state_list
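
A rough sketch of the per-state flow this patch wires up, for orientation only: it is not part of the commit, and it simply strings together the pieces the diff adds (get_state_fips_codes() from score/scripts/utils.py, the EJSCREEN column selection from ejscreen_etl.ipynb, and the data{fips}.csv naming that the ogr2ogr join in generate_mbtiles.py expects). It assumes it runs from the score/ directory with score/scripts on the import path, that fips_states_2010.csv is already in score/data, and that the extracted EJSCREEN CSV sits in data/tmp as in the notebook.

    from pathlib import Path
    import pandas as pd

    from utils import get_state_fips_codes  # added by this patch in score/scripts/utils.py

    data_path = Path.cwd() / "data"
    csv_path = data_path / "dataset" / "ejscreen_2020"

    # Keep ID as a string so the 2-digit state FIPS prefix (leading zeros included) survives.
    df = pd.read_csv(
        data_path / "tmp" / "EJSCREEN_2020_StatePctile.csv",
        dtype={"ID": "string"},
        low_memory=False,
    )[["ID", "ACSTOTPOP", "LESSHSPCT", "LOWINCPCT"]]

    # Nationwide file, then one data{fips}.csv per state; the data{fips} name is what
    # the LEFT JOIN built in generate_mbtiles.py references as its CSV layer.
    df.to_csv(csv_path / "usa.csv", index=False)
    for fips in get_state_fips_codes():
        df[df.ID.str[:2] == fips].to_csv(csv_path / f"data{fips}.csv", index=False)

From there, download_cbg.py fetches the 2010 block-group shapefiles per state and generate_mbtiles.py joins each data{fips}.csv onto its shapefile before tippecanoe builds block2010.mbtiles.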