ACS data baked in for map (#153)

* starting etl for score

* projection fix

* projection flags

* proper ejscreen etl csv generation

* failing CSV merge -- investigating

* checkpoint

* some etl changes

* completed ticket

* small typo
This commit is contained in:
Jorge Escobar 2021-06-17 18:12:39 -04:00 committed by GitHub
parent eed9bd311d
commit 78615e9b1a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 321 additions and 356 deletions

2
.gitignore vendored
View file

@ -132,3 +132,5 @@ cython_debug/
score/data/census score/data/census
score/data/tiles score/data/tiles
score/data/tmp score/data/tmp
score/data/dataset
score/data/score

0
score/__init__.py Normal file
View file

View file

View file

View file

@ -2,41 +2,27 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 1,
"id": "f4d63367", "id": "20aa3891",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from pathlib import Path\n", "from pathlib import Path\n",
"import requests\n",
"import zipfile\n",
"import numpy as np\n",
"import pandas as pd\n",
"import csv\n",
"\n", "\n",
"data_path = Path.cwd().parent / \"data\" / \"tmp\"" "data_path = Path.cwd().parent / \"data\"\n",
"fips_csv_path = data_path / \"fips_states_2010.csv\"\n",
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 3,
"id": "0e6eb55e", "id": "67a58c24",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('C:/opt/justice40-tool/score/data/tmp')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_path"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a1431996",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -49,304 +35,144 @@
} }
], ],
"source": [ "source": [
"import requests\n",
"download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n", "download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n",
"file_contents = download.content\n", "file_contents = download.content\n",
"zip_file_path = data_path / \"downloaded.zip\"\n", "zip_file_path = data_path / \"tmp\"\n",
"zip_file = open(zip_file_path, \"wb\")\n", "zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n",
"zip_file.write(file_contents)\n", "zip_file.write(file_contents)\n",
"zip_file.close()" "zip_file.close()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 4,
"id": "bc5f3466", "id": "cc3fb9ec",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import zipfile\n", "with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n",
"with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n", " zip_ref.extractall(zip_file_path)\n",
" zip_ref.extractall(data_path)\n", "ejscreen_csv = data_path / \"tmp\" / \"EJSCREEN_2020_StatePctile.csv\""
"ejscreen_csv = data_path / \"EJSCREEN_2020_StatePctile.csv\""
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 5,
"id": "392ccb67", "id": "b25738bb",
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
"outputs": [ "outputs": [],
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>OBJECTID</th>\n",
" <th>ID</th>\n",
" <th>STATE_NAME</th>\n",
" <th>ST_ABBREV</th>\n",
" <th>REGION</th>\n",
" <th>ACSTOTPOP</th>\n",
" <th>D_PM25_2</th>\n",
" <th>B_PM25_D2</th>\n",
" <th>P_PM25_D2</th>\n",
" <th>D_OZONE_2</th>\n",
" <th>...</th>\n",
" <th>T_PNPL</th>\n",
" <th>T_PNPL_D2</th>\n",
" <th>T_PRMP</th>\n",
" <th>T_PRMP_D2</th>\n",
" <th>T_PTSDF</th>\n",
" <th>T_PTSDF_D2</th>\n",
" <th>T_PWDIS</th>\n",
" <th>T_PWDIS_D2</th>\n",
" <th>Shape_Length</th>\n",
" <th>Shape_Area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>10010201001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>636</td>\n",
" <td>-492.025529412</td>\n",
" <td>6</td>\n",
" <td>52.0</td>\n",
" <td>-1866.38637046</td>\n",
" <td>...</td>\n",
" <td>0.071 facilities/km distance (79%ile)</td>\n",
" <td>40%ile</td>\n",
" <td>0.085 facilities/km distance (23%ile)</td>\n",
" <td>53%ile</td>\n",
" <td>0.59 facilities/km distance (57%ile)</td>\n",
" <td>38%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>13443.155206</td>\n",
" <td>6.040790e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>10010201002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1287</td>\n",
" <td>-2053.08341364</td>\n",
" <td>4</td>\n",
" <td>30.0</td>\n",
" <td>-7787.90260177</td>\n",
" <td>...</td>\n",
" <td>0.064 facilities/km distance (76%ile)</td>\n",
" <td>19%ile</td>\n",
" <td>0.074 facilities/km distance (17%ile)</td>\n",
" <td>42%ile</td>\n",
" <td>0.45 facilities/km distance (52%ile)</td>\n",
" <td>23%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11917.089598</td>\n",
" <td>7.834160e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>10010202001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>810</td>\n",
" <td>1846.12693767</td>\n",
" <td>8</td>\n",
" <td>75.0</td>\n",
" <td>7002.78371663</td>\n",
" <td>...</td>\n",
" <td>0.069 facilities/km distance (78%ile)</td>\n",
" <td>85%ile</td>\n",
" <td>0.078 facilities/km distance (20%ile)</td>\n",
" <td>67%ile</td>\n",
" <td>0.65 facilities/km distance (59%ile)</td>\n",
" <td>77%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>7770.915121</td>\n",
" <td>2.900774e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>10010202002</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>1218</td>\n",
" <td>1392.07530488</td>\n",
" <td>8</td>\n",
" <td>72.0</td>\n",
" <td>5280.46153188</td>\n",
" <td>...</td>\n",
" <td>0.076 facilities/km distance (81%ile)</td>\n",
" <td>83%ile</td>\n",
" <td>0.087 facilities/km distance (24%ile)</td>\n",
" <td>66%ile</td>\n",
" <td>1 facilities/km distance (69%ile)</td>\n",
" <td>78%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>6506.804784</td>\n",
" <td>1.793332e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>10010203001</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" <td>4</td>\n",
" <td>2641</td>\n",
" <td>-769.374640358</td>\n",
" <td>5</td>\n",
" <td>48.0</td>\n",
" <td>-2911.8926061</td>\n",
" <td>...</td>\n",
" <td>0.074 facilities/km distance (80%ile)</td>\n",
" <td>32%ile</td>\n",
" <td>0.08 facilities/km distance (21%ile)</td>\n",
" <td>51%ile</td>\n",
" <td>1.2 facilities/km distance (74%ile)</td>\n",
" <td>24%ile</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>11070.367848</td>\n",
" <td>5.461602e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 124 columns</p>\n",
"</div>"
],
"text/plain": [
" OBJECTID ID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
"0 1 10010201001 Alabama AL 4 636 \n",
"1 2 10010201002 Alabama AL 4 1287 \n",
"2 3 10010202001 Alabama AL 4 810 \n",
"3 4 10010202002 Alabama AL 4 1218 \n",
"4 5 10010203001 Alabama AL 4 2641 \n",
"\n",
" D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
"0 -492.025529412 6 52.0 -1866.38637046 ... \n",
"1 -2053.08341364 4 30.0 -7787.90260177 ... \n",
"2 1846.12693767 8 75.0 7002.78371663 ... \n",
"3 1392.07530488 8 72.0 5280.46153188 ... \n",
"4 -769.374640358 5 48.0 -2911.8926061 ... \n",
"\n",
" T_PNPL T_PNPL_D2 \\\n",
"0 0.071 facilities/km distance (79%ile) 40%ile \n",
"1 0.064 facilities/km distance (76%ile) 19%ile \n",
"2 0.069 facilities/km distance (78%ile) 85%ile \n",
"3 0.076 facilities/km distance (81%ile) 83%ile \n",
"4 0.074 facilities/km distance (80%ile) 32%ile \n",
"\n",
" T_PRMP T_PRMP_D2 \\\n",
"0 0.085 facilities/km distance (23%ile) 53%ile \n",
"1 0.074 facilities/km distance (17%ile) 42%ile \n",
"2 0.078 facilities/km distance (20%ile) 67%ile \n",
"3 0.087 facilities/km distance (24%ile) 66%ile \n",
"4 0.08 facilities/km distance (21%ile) 51%ile \n",
"\n",
" T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n",
"0 0.59 facilities/km distance (57%ile) 38%ile None None \n",
"1 0.45 facilities/km distance (52%ile) 23%ile None None \n",
"2 0.65 facilities/km distance (59%ile) 77%ile None None \n",
"3 1 facilities/km distance (69%ile) 78%ile None None \n",
"4 1.2 facilities/km distance (74%ile) 24%ile None None \n",
"\n",
" Shape_Length Shape_Area \n",
"0 13443.155206 6.040790e+06 \n",
"1 11917.089598 7.834160e+06 \n",
"2 7770.915121 2.900774e+06 \n",
"3 6506.804784 1.793332e+06 \n",
"4 11070.367848 5.461602e+06 \n",
"\n",
"[5 rows x 124 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import numpy as np\n", "df = pd.read_csv(ejscreen_csv, dtype={'ID': 'string'}, low_memory=False)"
"import pandas as pd\n",
"df = pd.read_csv(ejscreen_csv, low_memory=False)\n",
"df.head()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 6,
"id": "0ce9e22a", "id": "e6994f2d",
"metadata": {},
"outputs": [],
"source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9fa2077a",
"metadata": {},
"outputs": [],
"source": [
"# write nationwide csv\n",
"df.to_csv(csv_path / f\"usa.csv\", index = False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5e5cc12a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "name": "stdout",
"text/plain": [ "output_type": "stream",
"<bound method DataFrame.count of ID ACSTOTPOP LESSHSPCT LOWINCPCT\n", "text": [
"0 10010201001 636 0.208134 0.385220\n", "Generating data01 csv\n",
"1 10010201002 1287 0.040678 0.163170\n", "Generating data02 csv\n",
"2 10010202001 810 0.135563 0.501247\n", "Generating data04 csv\n",
"3 10010202002 1218 0.192000 0.393701\n", "Generating data05 csv\n",
"4 10010203001 2641 0.125473 0.308217\n", "Generating data06 csv\n",
"... ... ... ... ...\n", "Generating data08 csv\n",
"220328 721537506011 699 0.391389 0.902718\n", "Generating data09 csv\n",
"220329 721537506012 2432 0.185852 0.783717\n", "Generating data10 csv\n",
"220330 721537506013 976 0.018116 0.776639\n", "Generating data11 csv\n",
"220331 721537506021 1707 0.375422 0.867377\n", "Generating data12 csv\n",
"220332 721537506022 804 0.162791 0.942786\n", "Generating data13 csv\n",
"\n", "Generating data15 csv\n",
"[220333 rows x 4 columns]>" "Generating data16 csv\n",
] "Generating data17 csv\n",
}, "Generating data18 csv\n",
"execution_count": 32, "Generating data19 csv\n",
"metadata": {}, "Generating data20 csv\n",
"output_type": "execute_result" "Generating data21 csv\n",
"Generating data22 csv\n",
"Generating data23 csv\n",
"Generating data24 csv\n",
"Generating data25 csv\n",
"Generating data26 csv\n",
"Generating data27 csv\n",
"Generating data28 csv\n",
"Generating data29 csv\n",
"Generating data30 csv\n",
"Generating data31 csv\n",
"Generating data32 csv\n",
"Generating data33 csv\n",
"Generating data34 csv\n",
"Generating data35 csv\n",
"Generating data36 csv\n",
"Generating data37 csv\n",
"Generating data38 csv\n",
"Generating data39 csv\n",
"Generating data40 csv\n",
"Generating data41 csv\n",
"Generating data42 csv\n",
"Generating data44 csv\n",
"Generating data45 csv\n",
"Generating data46 csv\n",
"Generating data47 csv\n",
"Generating data48 csv\n",
"Generating data49 csv\n",
"Generating data50 csv\n",
"Generating data51 csv\n",
"Generating data53 csv\n",
"Generating data54 csv\n",
"Generating data55 csv\n",
"Generating data56 csv\n"
]
} }
], ],
"source": [ "source": [
"df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]\n", "# write per state csvs\n",
"df.head()\n", "with open(fips_csv_path) as csv_file:\n",
"df.count" " csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
" line_count = 0\n",
"\n",
" for row in csv_reader:\n",
" if line_count == 0:\n",
" line_count += 1\n",
" else:\n",
" fips = row[0].strip()\n",
" print(f\"Generating data{fips} csv\")\n",
" df1 = df[df.ID.str[:2] == fips]\n",
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
" df1.to_csv(csv_path / f\"data{fips}.csv\", index = False)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "e051623b", "id": "2674fb20",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []

View file

@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "a664f981",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), WindowsPath('C:/opt/justice40-tool/score/scripts'), 'C:\\\\opt\\\\justice40-tool\\\\score\\\\ipython', 'C:\\\\Python39\\\\python39.zip', 'C:\\\\Python39\\\\DLLs', 'C:\\\\Python39\\\\lib', 'C:\\\\Python39', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv', '', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\win32\\\\lib', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\Pythonwin', 'c:\\\\opt\\\\justice40-tool\\\\score\\\\venv\\\\lib\\\\site-packages\\\\IPython\\\\extensions', 'C:\\\\Users\\\\j\\\\.ipython']\n"
]
},
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'utils'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-e0c1285d1cc1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mutils\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[0mdata_path\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcwd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparent\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;34m\"data\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'utils'"
]
}
],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import csv\n",
"import sys\n",
"\n",
"script_path = Path.cwd().parent / \"scripts\"\n",
"sys.path.insert(0, script_path)\n",
"print(sys.path)\n",
"\n",
"from utils import *\n",
"\n",
"data_path = Path.cwd().parent / \"data\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1b750f0e",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'get_state_fips_codes' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-fec7b31c5df6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# store all fips codes in list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mfips_state_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_state_fips_codes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mfips_state_list\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'get_state_fips_codes' is not defined"
]
}
],
"source": [
"# store all fips codes in list\n",
"fips_state_list = get_state_fips_codes\n",
"fips_state_list"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7df430cb",
"metadata": {},
"outputs": [],
"source": [
"# EJSCreen ETL Load\n",
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

View file

@ -5,70 +5,64 @@ import os
import json import json
from pathlib import Path from pathlib import Path
from utils import get_state_fips_codes
data_path = Path.cwd() / "data" data_path = Path.cwd() / "data"
with requests.Session() as s: with requests.Session() as s:
# the fips_states_2010.csv is generated from data here # the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
fips_csv_path = data_path / "fips_states_2010.csv" state_fips_codes = get_state_fips_codes()
with open(fips_csv_path) as csv_file: for fips in state_fips_codes:
csv_reader = csv.reader(csv_file, delimiter=",") # check if file exists
line_count = 0 shp_file_path = data_path.joinpath(
for row in csv_reader: "census", "shp", fips, f"tl_2010_{fips}_bg10.shp"
if line_count == 0: )
line_count += 1 if not os.path.isfile(shp_file_path):
print(f"downloading {row[1]}")
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
# But using 2010 for now
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
download = s.get(cbg_state_url)
file_contents = download.content
zip_file_path = data_path / "census" / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
print(f"extracting {row[1]}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
shp_dir_path = data_path / "census" / "shp" / fips
zip_ref.extractall(shp_dir_path)
geojson_dir_path = data_path.joinpath(
"census",
"geojson",
)
if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")):
# ogr2ogr
print(f"encoding GeoJSON for {row[1]}")
# PWD is different for Windows
if os.name == "nt":
pwd = "%cd%"
else: else:
fips = row[0].strip() pwd = "${PWD}"
cmd = (
# check if file exists 'docker run --rm -it -v "'
shp_file_path = data_path.joinpath( + pwd
"census", "shp", fips, f"tl_2010_{fips}_bg10.shp" + '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/'
) + fips
if not os.path.isfile(shp_file_path): + ".json /home/data/census/shp/"
print(f"downloading {row[1]}") + fips
+ "/tl_2010_"
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/ + fips
# But using 2010 for now + "_bg10.shp"
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip" )
download = s.get(cbg_state_url) print(cmd)
file_contents = download.content os.system(cmd)
zip_file_path = data_path / "census" / "downloaded.zip"
zip_file = open(zip_file_path, "wb")
zip_file.write(file_contents)
zip_file.close()
print(f"extracting {row[1]}")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
shp_dir_path = data_path / "census" / "shp" / fips
zip_ref.extractall(shp_dir_path)
geojson_dir_path = data_path.joinpath(
"census",
"geojson",
)
if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")):
# ogr2ogr
print(f"encoding GeoJSON for {row[1]}")
# PWD is different for Windows
if os.name == "nt":
pwd = "%cd%"
else:
pwd = "${PWD}"
cmd = (
'docker run --rm -it -v "'
+ pwd
+ '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/'
+ fips
+ ".json /home/data/census/shp/"
+ fips
+ "/tl_2010_"
+ fips
+ "_bg10.shp"
)
print(cmd)
os.system(cmd)
# generate CBG CSV table for pandas # generate CBG CSV table for pandas
## load in memory ## load in memory
@ -87,10 +81,7 @@ with requests.Session() as s:
cbg_per_state_list[geoid10_state_id] = [] cbg_per_state_list[geoid10_state_id] = []
cbg_per_state_list[geoid10_state_id].append(geoid10) cbg_per_state_list[geoid10_state_id].append(geoid10)
csv_dir_path = data_path.joinpath( csv_dir_path = data_path / "census" / "csv"
"census",
"csv",
)
## write to individual state csv ## write to individual state csv
for state_id in cbg_per_state_list: for state_id in cbg_per_state_list:
geoid10_list = cbg_per_state_list[state_id] geoid10_list = cbg_per_state_list[state_id]

View file

@ -2,6 +2,8 @@ import os
from pathlib import Path from pathlib import Path
import shutil import shutil
from utils import get_state_fips_codes
data_path = Path.cwd() / "data" data_path = Path.cwd() / "data"
# remove existing mbtiles file # remove existing mbtiles file
@ -14,17 +16,41 @@ mvt_tiles_path = data_path / "tiles" / "mvt"
if os.path.exists(mvt_tiles_path): if os.path.exists(mvt_tiles_path):
shutil.rmtree(mvt_tiles_path) shutil.rmtree(mvt_tiles_path)
# Merge scores into json
# TODO: for this first pass, just merging ACS EJScren indicators
# Per https://github.com/usds/justice40-tool/issues/102
if os.name == "nt":
pwd = "%cd%"
else:
pwd = "${PWD}"
state_fips_codes = get_state_fips_codes()
for fips in state_fips_codes:
cmd = (
'docker run --rm -v "'
+ pwd
+ '"/:/home '
+ "osgeo/gdal:alpine-small-latest ogr2ogr -f GeoJSON "
+ f"-sql \"SELECT * FROM tl_2010_{fips}_bg10 LEFT JOIN '/home/data/dataset/ejscreen_2020/data{fips}.csv'.data{fips} ON tl_2010_{fips}_bg10.GEOID10 = data{fips}.ID\" "
+ f"/home/data/score/geojson/{fips}.json /home/data/census/shp/{fips}/tl_2010_{fips}_bg10.dbf"
)
print(cmd)
os.system(cmd)
# get a list of all json files to plug in the docker commands below # get a list of all json files to plug in the docker commands below
# (workaround since *.json doesn't seem to work) # (workaround since *.json doesn't seem to work)
geojson_list = "" geojson_list = ""
geojson_path = data_path / "census" / "geojson" geojson_path = data_path / "score" / "geojson"
for file in os.listdir(geojson_path): for file in os.listdir(geojson_path):
if file.endswith(".json"): if file.endswith(".json"):
geojson_list += f"/home/data/census/geojson/{file} " geojson_list += f"/home/data/score/geojson/{file} "
if geojson_list == "": if geojson_list == "":
print("No GeoJson files found. Please run download_cbg.py first") print("No GeoJson files found. Please run scripts/download_cbg.py first")
# generate mbtiles file
# PWD is different for Windows # PWD is different for Windows
if os.name == "nt": if os.name == "nt":
pwd = "%cd%" pwd = "%cd%"
@ -33,7 +59,7 @@ else:
cmd = ( cmd = (
'docker run --rm -it -v "' 'docker run --rm -it -v "'
+ pwd + pwd
+ '"/:/home klokantech/tippecanoe tippecanoe -s_srs EPSG:4269 -t_srs EPSG:4326 --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 ' + '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 '
+ geojson_list + geojson_list
) )
print(cmd) print(cmd)

20
score/scripts/utils.py Normal file
View file

@ -0,0 +1,20 @@
# common usage functions
import csv
from pathlib import Path
def get_state_fips_codes() -> list[str]:
    """Return the list of state FIPS codes from data/fips_states_2010.csv.

    The CSV path is resolved relative to the current working directory
    (callers are expected to run from the ``score`` directory — TODO confirm).
    The header row is skipped; the first column of every remaining row is
    returned with surrounding whitespace stripped.
    """
    data_path = Path.cwd() / "data"
    fips_csv_path = data_path / "fips_states_2010.csv"
    fips_state_list = []
    # newline="" is required by the csv module docs; an explicit encoding
    # avoids platform-dependent defaults (the file is plain ASCII data).
    with open(fips_csv_path, newline="", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        next(csv_reader, None)  # skip the header row
        for row in csv_reader:
            if row:  # tolerate blank lines instead of raising IndexError
                fips_state_list.append(row[0].strip())
    return fips_state_list