Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-22 01:31:25 -08:00
Modularization + Poetry + Docker (#213)
* reorg
* added configuration management; initial click cmds
* reset dirs completed
* major modularization effort
* prepping mbtiles
* first round of PR review updates
* round 2 of feedback review
* checkpoint
* habemus dockerfile 🎉
* updated docker-compose with long-running container
* census generation works
* logging working
* updated README
* updated README
* last small update to README
* added instructions for log visualization
* census etl update for reusable fips module
* ejscreen etl updated
* further modularization
* score modularization
* tmp cleanup
This commit is contained in:
parent 6f4087d247
commit 67c73dde2a
29 changed files with 2383 additions and 433 deletions

.gitignore (vendored, 7 lines changed)
@@ -1,5 +1,6 @@
*.env
.idea
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/

@@ -128,7 +129,11 @@ dmypy.json
# Cython debug symbols
cython_debug/

# temporary census data
# Ignore dynaconf secret files
score/.secrets.*

# ignore data
score/data
score/data/census
score/data/tiles
score/data/tmp

docker-compose.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
version: "3.4"
services:
  score:
    image: j40_score
    container_name: j40_score_1
    build: score
    ports:
      - 8888:8888
    volumes:
      - ./score:/score
    stdin_open: true
    tty: true
    environment:
      ENV_FOR_DYNACONF: development
      PYTHONUNBUFFERED: 1

score/.vscode/settings.json (vendored, deleted, 4 lines)
@@ -1,4 +0,0 @@
{
    "python.pythonPath": "venv\\Scripts\\python.exe",
    "python.dataScience.sendSelectionToInteractiveWindow": false
}

score/Dockerfile (new file, 33 lines)
@@ -0,0 +1,33 @@
FROM ubuntu:20.04

# Install packages
RUN apt-get update && apt-get install -y \
    build-essential \
    make \
    gcc \
    git \
    unzip \
    wget \
    python3-dev \
    python3-pip

# tippecanoe
ENV TZ=America/Los_Angeles
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt-get install -y software-properties-common libsqlite3-dev zlib1g-dev
RUN apt-add-repository -y ppa:git-core/ppa
RUN mkdir -p /tmp/tippecanoe-src && git clone https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe-src
WORKDIR /tmp/tippecanoe-src
RUN /bin/sh -c make && make install

## gdal
RUN add-apt-repository ppa:ubuntugis/ppa
RUN apt-get -y install gdal-bin

# Prepare python packages
WORKDIR /score
RUN pip3 install --upgrade pip setuptools wheel
COPY . .

COPY requirements.txt .
RUN pip3 install -r requirements.txt

@@ -1,26 +1,63 @@
# Justice 40 Score generator
# Justice 40 Score application

## Setup
## Running using Docker

We use Docker to install the necessary libraries in a container that can be run in any operating system.

To build the docker container the first time, make sure you're in the root directory of the repository and run `docker-compose build`

Then run commands, opening a new terminal window or tab:

- Get help: `docker run --rm -it j40_score /bin/sh -c "python3 application.py --help"`
- Clean up the data directories: `docker run --rm -it j40_score /bin/sh -c "python3 application.py data-cleanup"`
- Generate census data: `docker run --rm -it j40_score /bin/sh -c "python3 application.py census-data-download"`

## Log visualization

If you want to visualize logs, the following temporary workaround can be used:

- Run `docker-compose up` in the root of the repo
- Open a new tab in your terminal
- Then run any command for the application using this format: `docker exec j40_score_1 python3 application.py [command]`

## Local development

You can run the Python code locally to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. Also, to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe).

- Start a terminal
- Make sure you have Python 3.9 installed: `python -V` or `python3 -V`
- Create a `virtualenv` in this folder: `python -m venv venv`
- Activate the virtualenv
  - Windows: `./venv/Scripts/activate`
  - Mac/Linux: `source venv/bin/activate`
- Install packages: `pip install -r requirements.txt`
  - If you are a Windows user, you might need to install Build Tools for Visual Studio. [Instructions here](https://stackoverflow.com/a/54136652)
- We use [Poetry](https://python-poetry.org/) for managing dependencies and building the application. Please follow the instructions on their site to download.
- Install Poetry requirements with `poetry install`

## Running the Jupyter notebook
### Downloading Census Block Groups GeoJSON and Generating CBG CSVs

- Make sure you have Docker running on your machine
- Start a terminal
- Change to this directory (i.e. `cd score`)
- If you want to clear out all data and tiles from all directories, you can run: `poetry run python application.py data-cleanup`.
- Then run `poetry run python application.py census-data-download`

Note: Census files are not kept in the repository and the download directories are ignored by Git

### Generating mbtiles

- TBD

### Serve the map locally

- Start a terminal
- Change to this directory (i.e. `cd score`)
- Activate your virtualenv (see above)
- Type `jupyter notebook`. Your browser should open with a Jupyter Notebook tab
- Run: `docker run --rm -it -v ${PWD}/data/tiles:/data -p 8080:80 maptiler/tileserver-gl`

## Activating variable-enabled Markdown for Jupyter notebooks
### Running Jupyter notebooks

- Start a terminal
- Change to this directory (i.e. `cd score`)
- Run `poetry run jupyter notebook`. Your browser should open with a Jupyter Notebook tab

### Activating variable-enabled Markdown for Jupyter notebooks

- Change to this directory (i.e. `cd score`)
- Activate a Poetry Shell (see above)
- Run `jupyter contrib nbextension install --user`
- Run `jupyter nbextension enable python-markdown/main`
- Make sure you've loaded the Jupyter notebook in a "Trusted" state. (See button near

@@ -29,21 +66,6 @@
For more information, see [nbextensions docs](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html) and
see [python-markdown docs](https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tree/master/src/jupyter_contrib_nbextensions/nbextensions/python-markdown).

## Downloading Census Block Groups GeoJSON and Generating CBG CSVs
## Miscellaneous

- Make sure you have Docker running on your machine
- Start a terminal
- Change to this directory (i.e. `cd score`)
- Activate your virtualenv (see above)
- Run `python scripts/download_cbg.py`

Note: Census files are not kept in the repository and the download directories are ignored by Git

## Generating mbtiles

- Change to this directory (i.e. `cd score`)
- Activate your virtualenv (see above)
- Run the following script: `python .\scripts\generate_mbtiles.py`

## Serve the map locally

- Run: `docker run --rm -it -v ${PWD}/data/tiles:/data -p 8080:80 klokantech/tileserver-gl`
- To export packages from Poetry to `requirements.txt` run `poetry export --without-hashes > requirements.txt`

score/application.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from config import settings
import click
from pathlib import Path
import sys

from etl.sources.census.etl_utils import reset_data_directories as census_reset
from utils import remove_files_from_dir, remove_all_from_dir, get_module_logger
from etl.sources.census.etl import download_census_csvs


settings.APP_ROOT = Path.cwd()
logger = get_module_logger(__name__)


@click.group()
def cli():
    pass


@cli.command(
    help="Clean up all data folders",
)
def data_cleanup():

    data_path = settings.APP_ROOT / "data"

    # census directories
    logger.info("Initializing all census data")
    census_reset(data_path)

    # dataset directory
    logger.info("Initializing all dataset directories")
    remove_all_from_dir(data_path / "dataset")

    # score directory
    logger.info("Initializing all score data")
    remove_files_from_dir(data_path / "score" / "csv", ".csv")
    remove_files_from_dir(data_path / "score" / "geojson", ".json")

    # cleanup tmp dir
    logger.info("Initializing all temp directories")
    remove_all_from_dir(data_path / "tmp")

    logger.info("Cleaned up all data files")


@cli.command(
    help="Census data download",
)
def census_data_download():
    logger.info("Downloading census data")
    data_path = settings.APP_ROOT / "data"
    download_census_csvs(data_path)

    logger.info("Completed downloading census data")
    sys.exit()


if __name__ == "__main__":
    cli()
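
This click group is the entry point that the Docker commands in the README invoke (click turns `data_cleanup` into the `data-cleanup` command, and so on). As a hedged sketch only, a further ETL step could be registered on the same group as below; the `ejscreen-etl` command and the commented-out module import are assumptions, since this commit ships the EJSCREEN logic only as a notebook.

```python
# Hypothetical sketch: registering an additional subcommand on the click group
# used by application.py. Nothing below is part of this commit.
from pathlib import Path

import click

from utils import get_module_logger

logger = get_module_logger(__name__)


@click.group()
def cli():
    pass


@cli.command(
    help="Run the EJSCREEN ETL (illustrative only)",
)
def ejscreen_etl():
    data_path = Path.cwd() / "data"
    logger.info("Running the EJSCREEN ETL")
    # A module such as etl.sources.ejscreen.etl is assumed here; this commit
    # only adds an empty ejscreen package and a notebook, not this function.
    # from etl.sources.ejscreen.etl import extract_ejscreen
    # extract_ejscreen(data_path)


if __name__ == "__main__":
    cli()
```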

score/config.py (new file, 11 lines)
@@ -0,0 +1,11 @@
from dynaconf import Dynaconf

settings = Dynaconf(
    envvar_prefix="DYNACONF",
    settings_files=["settings.toml", ".secrets.toml"],
    environments=True,
)

# To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging
# Windows: set ENV_FOR_DYNACONF=staging
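
A short sketch of how these settings are consumed elsewhere in this commit, for example by the census ETL utilities that download the FIPS lookup table. It assumes the code runs from the `score/` directory (so `settings.toml` is found) and that `ENV_FOR_DYNACONF` has already been exported, as docker-compose.yml does.

```python
# Minimal usage sketch of the Dynaconf settings object defined in config.py.
from config import settings

# AWS_JUSTICE40_DATA_URL lives under [default] in settings.toml, so it is
# visible from every environment (development/staging/production).
fips_zip_url = settings.AWS_JUSTICE40_DATA_URL + "/Census/fips_states_2010.zip"
print(f"FIPS lookup table will be downloaded from: {fips_zip_url}")
```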

@@ -1,52 +0,0 @@
fips,state_name
01 ,Alabama
02 ,Alaska
04 ,Arizona
05 ,Arkansas
06 ,California
08 ,Colorado
09 ,Connecticut
10 ,Delaware
11 ,District of Columbia
12 ,Florida
13 ,Georgia
15 ,Hawaii
16 ,Idaho
17 ,Illinois
18 ,Indiana
19 ,Iowa
20 ,Kansas
21 ,Kentucky
22 ,Louisiana
23 ,Maine
24 ,Maryland
25 ,Massachusetts
26 ,Michigan
27 ,Minnesota
28 ,Mississippi
29 ,Missouri
30 ,Montana
31 ,Nebraska
32 ,Nevada
33 ,New Hampshire
34 ,New Jersey
35 ,New Mexico
36 ,New York
37 ,North Carolina
38 ,North Dakota
39 ,Ohio
40 ,Oklahoma
41 ,Oregon
42 ,Pennsylvania
44 ,Rhode Island
45 ,South Carolina
46 ,South Dakota
47 ,Tennessee
48 ,Texas
49 ,Utah
50 ,Vermont
51 ,Virginia
53 ,Washington
54 ,West Virginia
55 ,Wisconsin
56 ,Wyoming

score/etl/sources/census/__init__.py (new file, empty)

score/etl/sources/census/etl.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import csv
import os
import json
from pathlib import Path

from .etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, get_module_logger

logger = get_module_logger(__name__)


def download_census_csvs(data_path: Path) -> None:
    # the fips_states_2010.csv is generated from data here
    # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
    state_fips_codes = get_state_fips_codes(data_path)
    for fips in state_fips_codes:
        # check if file exists
        shp_file_path = data_path / "census" / "shp" / fips / f"tl_2010_{fips}_bg10.shp"

        if not os.path.isfile(shp_file_path):
            logger.info(f"Downloading {fips}")

            # 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
            # But using 2010 for now
            cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
            unzip_file_from_url(
                cbg_state_url, data_path / "tmp", data_path / "census" / "shp" / fips
            )

        geojson_dir_path = data_path / "census" / "geojson"

        cmd = (
            "ogr2ogr -f GeoJSON data/census/geojson/"
            + fips
            + ".json data/census/shp/"
            + fips
            + "/tl_2010_"
            + fips
            + "_bg10.shp"
        )
        os.system(cmd)

    # generate CBG CSV table for pandas
    ## load in memory
    cbg_national = []  # in-memory global list
    cbg_per_state: dict = {}  # in-memory dict per state
    for file in os.listdir(geojson_dir_path):
        if file.endswith(".json"):
            logger.info(f"Ingesting geoid10 for file {file}")
            with open(geojson_dir_path / file) as f:
                geojson = json.load(f)
                for feature in geojson["features"]:
                    geoid10 = feature["properties"]["GEOID10"]
                    cbg_national.append(str(geoid10))
                    geoid10_state_id = geoid10[:2]
                    if not cbg_per_state.get(geoid10_state_id):
                        cbg_per_state[geoid10_state_id] = []
                    cbg_per_state[geoid10_state_id].append(geoid10)

    csv_dir_path = data_path / "census" / "csv"
    ## write to individual state csv
    for state_id in cbg_per_state:
        geoid10_list = cbg_per_state[state_id]
        with open(
            csv_dir_path / f"{state_id}.csv", mode="w", newline=""
        ) as cbg_csv_file:
            cbg_csv_file_writer = csv.writer(
                cbg_csv_file,
                delimiter=",",
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
            )

            for geoid10 in geoid10_list:
                cbg_csv_file_writer.writerow(
                    [
                        geoid10,
                    ]
                )

    ## write US csv
    with open(csv_dir_path / "us.csv", mode="w", newline="") as cbg_csv_file:
        cbg_csv_file_writer = csv.writer(
            cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for geoid10 in cbg_national:
            cbg_csv_file_writer.writerow(
                [
                    geoid10,
                ]
            )

    logger.info("Census block groups downloading complete")
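
The shapefile-to-GeoJSON step above shells out to `ogr2ogr` through `os.system` with string concatenation. As a hedged alternative sketch (not part of this commit; the function name is illustrative), the same conversion can be expressed with `subprocess.run` and an argument list, which avoids quoting issues and surfaces failures:

```python
# Illustrative alternative to the os.system call in download_census_csvs():
# build the ogr2ogr invocation as an argument list.
import subprocess
from pathlib import Path


def shapefile_to_geojson(data_path: Path, fips: str) -> None:
    """Convert one state's 2010 block-group shapefile to GeoJSON with ogr2ogr."""
    source_shp = data_path / "census" / "shp" / fips / f"tl_2010_{fips}_bg10.shp"
    target_json = data_path / "census" / "geojson" / f"{fips}.json"
    subprocess.run(
        ["ogr2ogr", "-f", "GeoJSON", str(target_json), str(source_shp)],
        check=True,  # raise if ogr2ogr exits non-zero instead of failing silently
    )
```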

score/etl/sources/census/etl_utils.py (new file, 47 lines)
@@ -0,0 +1,47 @@
from pathlib import Path
import csv
import os
from config import settings

from utils import remove_files_from_dir, remove_all_dirs_from_dir, unzip_file_from_url


def reset_data_directories(data_path: Path) -> None:
    census_data_path = data_path / "census"

    # csv
    csv_path = census_data_path / "csv"
    remove_files_from_dir(csv_path, ".csv")

    # geojson
    geojson_path = census_data_path / "geojson"
    remove_files_from_dir(geojson_path, ".json")

    # shp
    shp_path = census_data_path / "shp"
    remove_all_dirs_from_dir(shp_path)


def get_state_fips_codes(data_path: Path) -> list:
    fips_csv_path = data_path / "census" / "csv" / "fips_states_2010.csv"

    # check if file exists
    if not os.path.isfile(fips_csv_path):
        unzip_file_from_url(
            settings.AWS_JUSTICE40_DATA_URL + "/Census/fips_states_2010.zip",
            data_path / "tmp",
            data_path / "census" / "csv",
        )

    fips_state_list = []
    with open(fips_csv_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        line_count = 0

        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            else:
                fips = row[0].strip()
                fips_state_list.append(fips)
    return fips_state_list
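
A brief usage sketch of the helper the commit message calls the "reusable fips module"; the print statement is only illustrative, but the call pattern mirrors how the census ETL and the updated notebooks use it.

```python
# Usage sketch: list the states/territories the ETL will iterate over.
# Assumes the working directory is score/, matching application.py.
from pathlib import Path

from etl.sources.census.etl_utils import get_state_fips_codes

data_path = Path.cwd() / "data"

# On first use this downloads fips_states_2010.zip from AWS_JUSTICE40_DATA_URL,
# then returns the two-character codes from the CSV's first column.
for fips in get_state_fips_codes(data_path):
    print(f"Will process block groups for FIPS code {fips}")
```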

score/etl/sources/ejscreen/__init__.py (new file, empty)
|
@ -12,11 +12,17 @@
|
|||
"import csv\n",
|
||||
"from pathlib import Path\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
"\n",
|
||||
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
|
||||
"\n",
|
||||
"ACS_YEAR = 2019\n",
|
||||
"\n",
|
||||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||||
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
|
||||
"OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
|
||||
"\n",
|
||||
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
||||
|
@ -57,27 +63,19 @@
|
|||
"\n",
|
||||
"\n",
|
||||
"dfs = []\n",
|
||||
"with open(FIPS_CSV_PATH) as csv_file:\n",
|
||||
" csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
|
||||
" line_count = 0\n",
|
||||
"for fips in get_state_fips_codes(DATA_PATH):\n",
|
||||
" print(f\"Downloading data for state/territory with FIPS code {fips}\")\n",
|
||||
"\n",
|
||||
" for row in csv_reader:\n",
|
||||
" if line_count == 0:\n",
|
||||
" line_count += 1\n",
|
||||
" else:\n",
|
||||
" fips = row[0].strip()\n",
|
||||
" print(f\"Downloading data for state/territory with FIPS code {fips}\")\n",
|
||||
"\n",
|
||||
" dfs.append(\n",
|
||||
" censusdata.download(\n",
|
||||
" src=\"acs5\",\n",
|
||||
" year=ACS_YEAR,\n",
|
||||
" geo=censusdata.censusgeo(\n",
|
||||
" [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n",
|
||||
" ),\n",
|
||||
" var=[\"B23025_005E\", \"B23025_003E\"],\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" dfs.append(\n",
|
||||
" censusdata.download(\n",
|
||||
" src=\"acs5\",\n",
|
||||
" year=ACS_YEAR,\n",
|
||||
" geo=censusdata.censusgeo(\n",
|
||||
" [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n",
|
||||
" ),\n",
|
||||
" var=[\"B23025_005E\", \"B23025_003E\"],\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"df = pd.concat(dfs)\n",
|
||||
"\n",
|
||||
|
|
|
@ -8,33 +8,25 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import csv\n",
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"data_path = Path.cwd().parent / \"data\"\n",
|
||||
"fips_csv_path = data_path / \"fips_states_2010.csv\"\n",
|
||||
"csv_path = data_path / \"dataset\" / \"ejscreen_2020\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "67a58c24",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"download = requests.get(\n",
|
||||
" \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\",\n",
|
||||
" verify=False,\n",
|
||||
")\n",
|
||||
"file_contents = download.content\n",
|
||||
"zip_file_path = data_path / \"tmp\"\n",
|
||||
"zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n",
|
||||
"zip_file.write(file_contents)\n",
|
||||
"zip_file.close()"
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
"\n",
|
||||
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
|
||||
"from utils import unzip_file_from_url, remove_all_from_dir\n",
|
||||
"\n",
|
||||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||||
"TMP_PATH = DATA_PATH / \"tmp\"\n",
|
||||
"EJSCREEN_FTP_URL = \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\"\n",
|
||||
"EJSCREEN_CSV = TMP_PATH / \"EJSCREEN_2020_StatePctile.csv\"\n",
|
||||
"CSV_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2020\"\n",
|
||||
"print(DATA_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -44,9 +36,8 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with zipfile.ZipFile(zip_file_path / \"downloaded.zip\", \"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(zip_file_path)\n",
|
||||
"ejscreen_csv = data_path / \"tmp\" / \"EJSCREEN_2020_StatePctile.csv\""
|
||||
"# download file from ejscreen ftp\n",
|
||||
"unzip_file_from_url(EJSCREEN_FTP_URL, TMP_PATH, TMP_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -58,7 +49,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)"
|
||||
"df = pd.read_csv(EJSCREEN_CSV, dtype={\"ID\": \"string\"}, low_memory=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -69,8 +60,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# write nationwide csv\n",
|
||||
"csv_path.mkdir(parents=True, exist_ok=True)\n",
|
||||
"df.to_csv(csv_path / f\"usa.csv\", index=False)"
|
||||
"CSV_PATH.mkdir(parents=True, exist_ok=True)\n",
|
||||
"df.to_csv(CSV_PATH / f\"usa.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -81,19 +72,11 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# write per state csvs\n",
|
||||
"with open(fips_csv_path) as csv_file:\n",
|
||||
" csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
|
||||
" line_count = 0\n",
|
||||
"\n",
|
||||
" for row in csv_reader:\n",
|
||||
" if line_count == 0:\n",
|
||||
" line_count += 1\n",
|
||||
" else:\n",
|
||||
" fips = row[0].strip()\n",
|
||||
" print(f\"Generating data{fips} csv\")\n",
|
||||
" df1 = df[df.ID.str[:2] == fips]\n",
|
||||
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
|
||||
" df1.to_csv(csv_path / f\"data{fips}.csv\", index=False)"
|
||||
"for fips in get_state_fips_codes(DATA_PATH):\n",
|
||||
" print(f\"Generating data{fips} csv\")\n",
|
||||
" df1 = df[df.ID.str[:2] == fips]\n",
|
||||
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
|
||||
" df1.to_csv(CSV_PATH / f\"data{fips}.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -102,6 +85,17 @@
|
|||
"id": "81b977f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# cleanup\n",
|
||||
"remove_all_from_dir(TMP_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6d4f74d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
|
|
|
@ -10,15 +10,22 @@
|
|||
"import pandas as pd\n",
|
||||
"import censusdata\n",
|
||||
"import csv\n",
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
"\n",
|
||||
"from pathlib import Path\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
"\n",
|
||||
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
|
||||
"from utils import unzip_file_from_url, remove_all_from_dir\n",
|
||||
"\n",
|
||||
"ACS_YEAR = 2019\n",
|
||||
"\n",
|
||||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||||
"FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
|
||||
"TMP_PATH = DATA_PATH / \"tmp\"\n",
|
||||
"HOUSING_FTP_URL = \"https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid=\"\n",
|
||||
"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"housing_and_transportation_index\"\n",
|
||||
"\n",
|
||||
"GEOID_FIELD_NAME = \"GEOID10\""
|
||||
|
@ -31,44 +38,18 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid=01\n",
|
||||
"\n",
|
||||
"# Download each state / territory individually\n",
|
||||
"dfs = []\n",
|
||||
"with open(FIPS_CSV_PATH) as csv_file:\n",
|
||||
" csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
|
||||
" line_count = 0\n",
|
||||
"zip_file_dir = TMP_PATH / \"housing_and_transportation_index\"\n",
|
||||
"for fips in get_state_fips_codes(DATA_PATH):\n",
|
||||
" print(f\"Downloading housing data for state/territory with FIPS code {fips}\")\n",
|
||||
" unzip_file_from_url(f\"{HOUSING_FTP_URL}{fips}\", TMP_PATH, zip_file_dir)\n",
|
||||
" \n",
|
||||
" # New file name:\n",
|
||||
" tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
|
||||
" tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
|
||||
"\n",
|
||||
" for row in csv_reader:\n",
|
||||
" if line_count == 0:\n",
|
||||
" line_count += 1\n",
|
||||
" else:\n",
|
||||
" fips = row[0].strip()\n",
|
||||
"\n",
|
||||
" print(f\"Downloading data for state/territory with FIPS code {fips}\")\n",
|
||||
"\n",
|
||||
" download = requests.get(\n",
|
||||
" f\"https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid={fips}\",\n",
|
||||
" verify=False,\n",
|
||||
" )\n",
|
||||
" file_contents = download.content\n",
|
||||
" zip_file_dir = DATA_PATH / \"tmp\" / \"housing_and_transportation_index\"\n",
|
||||
"\n",
|
||||
" # Make the directory if it doesn't exist\n",
|
||||
" zip_file_dir.mkdir(parents=True, exist_ok=True)\n",
|
||||
" zip_file_path = zip_file_dir / f\"{fips}-downloaded.zip\"\n",
|
||||
" zip_file = open(zip_file_path, \"wb\")\n",
|
||||
" zip_file.write(file_contents)\n",
|
||||
" zip_file.close()\n",
|
||||
"\n",
|
||||
" with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(zip_file_dir)\n",
|
||||
"\n",
|
||||
" # New file name:\n",
|
||||
" tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
|
||||
" tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
|
||||
"\n",
|
||||
" dfs.append(tmp_df)\n",
|
||||
" dfs.append(tmp_df)\n",
|
||||
"\n",
|
||||
"df = pd.concat(dfs)\n",
|
||||
"\n",
|
||||
|
@ -105,6 +86,17 @@
|
|||
"id": "ef5bb862",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# cleanup\n",
|
||||
"remove_all_from_dir(TMP_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9269e497",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
|
|
|
@ -17,6 +17,14 @@
|
|||
"from pathlib import Path\n",
|
||||
"import pandas as pd\n",
|
||||
"import csv\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
"\n",
|
||||
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
|
||||
"\n",
|
||||
"# Define some global parameters\n",
|
||||
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
||||
|
@ -37,9 +45,8 @@
|
|||
"\n",
|
||||
"PERCENTILE_FIELD_SUFFIX = \" (percentile)\"\n",
|
||||
"\n",
|
||||
"data_path = Path.cwd().parent / \"data\"\n",
|
||||
"fips_csv_path = data_path / \"fips_states_2010.csv\"\n",
|
||||
"score_csv_path = data_path / \"score\" / \"csv\"\n",
|
||||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||||
"SCORE_CSV_PATH = DATA_PATH / \"score\" / \"csv\"\n",
|
||||
"\n",
|
||||
"# Tell pandas to display all columns\n",
|
||||
"pd.set_option(\"display.max_columns\", None)"
|
||||
|
@ -55,7 +62,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# EJSCreen csv Load\n",
|
||||
"ejscreen_csv = data_path / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n",
|
||||
"ejscreen_csv = DATA_PATH / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n",
|
||||
"ejscreen_df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)\n",
|
||||
"ejscreen_df.rename(columns={\"ID\": GEOID_FIELD_NAME}, inplace=True)\n",
|
||||
"ejscreen_df.head()"
|
||||
|
@ -69,7 +76,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Load census data\n",
|
||||
"census_csv = data_path / \"dataset\" / \"census_acs_2019\" / \"usa.csv\"\n",
|
||||
"census_csv = DATA_PATH / \"dataset\" / \"census_acs_2019\" / \"usa.csv\"\n",
|
||||
"census_df = pd.read_csv(\n",
|
||||
" census_csv, dtype={GEOID_FIELD_NAME: \"string\"}, low_memory=False\n",
|
||||
")\n",
|
||||
|
@ -85,7 +92,7 @@
|
|||
"source": [
|
||||
"# Load housing and transportation data\n",
|
||||
"housing_and_transportation_index_csv = (\n",
|
||||
" data_path / \"dataset\" / \"housing_and_transportation_index\" / \"usa.csv\"\n",
|
||||
" DATA_PATH / \"dataset\" / \"housing_and_transportation_index\" / \"usa.csv\"\n",
|
||||
")\n",
|
||||
"housing_and_transportation_df = pd.read_csv(\n",
|
||||
" housing_and_transportation_index_csv,\n",
|
||||
|
@ -352,7 +359,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# write nationwide csv\n",
|
||||
"df.to_csv(score_csv_path / f\"usa.csv\", index=False)"
|
||||
"df.to_csv(SCORE_CSV_PATH / f\"usa.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -363,19 +370,11 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# write per state csvs\n",
|
||||
"with open(fips_csv_path) as csv_file:\n",
|
||||
" csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
|
||||
" line_count = 0\n",
|
||||
"\n",
|
||||
" for row in csv_reader:\n",
|
||||
" if line_count == 0:\n",
|
||||
" line_count += 1\n",
|
||||
" else:\n",
|
||||
" states_fips = row[0].strip()\n",
|
||||
" print(f\"Generating data{states_fips} csv\")\n",
|
||||
" df1 = df[df[\"GEOID10\"].str[:2] == states_fips]\n",
|
||||
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
|
||||
" df1.to_csv(score_csv_path / f\"data{states_fips}.csv\", index=False)"
|
||||
"for states_fips in get_state_fips_codes(DATA_PATH):\n",
|
||||
" print(f\"Generating data{states_fips} csv\")\n",
|
||||
" df1 = df[df[\"GEOID10\"].str[:2] == states_fips]\n",
|
||||
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
|
||||
" df1.to_csv(SCORE_CSV_PATH / f\"data{states_fips}.csv\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|

score/poetry.lock (generated, new file, 1766 lines)
File diff suppressed because it is too large

score/pyproject.toml (new file, 26 lines)
@@ -0,0 +1,26 @@
[tool.poetry]
name = "score"
version = "0.1.0"
description = "ETL and Generation of Justice 40 Score"
authors = ["Your Name <you@example.com>"]

[tool.poetry.dependencies]
python = "^3.9"
ipython = "^7.24.1"
jupyter = "^1.0.0"
jupyter-contrib-nbextensions = "^0.5.1"
numpy = "^1.21.0"
pandas = "^1.2.5"
requests = "^2.25.1"
click = "^8.0.1"
dynaconf = "^3.1.4"
types-requests = "^2.25.0"
CensusData = "^1.13"

[tool.poetry.dev-dependencies]
mypy = "^0.910"
black = {version = "^21.6b0", allow-prereleases = true}

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Binary file not shown.
|
@ -1,114 +0,0 @@
|
|||
import csv
|
||||
import requests
|
||||
import zipfile
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from utils import get_state_fips_codes
|
||||
|
||||
data_path = Path.cwd() / "data"
|
||||
|
||||
with requests.Session() as s:
|
||||
# the fips_states_2010.csv is generated from data here
|
||||
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
|
||||
state_fips_codes = get_state_fips_codes()
|
||||
for fips in state_fips_codes:
|
||||
# check if file exists
|
||||
shp_file_path = data_path.joinpath(
|
||||
"census", "shp", fips, f"tl_2010_{fips}_bg10.shp"
|
||||
)
|
||||
if not os.path.isfile(shp_file_path):
|
||||
print(f"downloading {row[1]}")
|
||||
|
||||
# 2020 tiger data is here: https://www2.census.gov/geo/tiger/TIGER2020/BG/
|
||||
# But using 2010 for now
|
||||
cbg_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip"
|
||||
download = s.get(cbg_state_url)
|
||||
file_contents = download.content
|
||||
zip_file_path = data_path / "census" / "downloaded.zip"
|
||||
zip_file = open(zip_file_path, "wb")
|
||||
zip_file.write(file_contents)
|
||||
zip_file.close()
|
||||
|
||||
print(f"extracting {row[1]}")
|
||||
|
||||
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
|
||||
shp_dir_path = data_path / "census" / "shp" / fips
|
||||
zip_ref.extractall(shp_dir_path)
|
||||
|
||||
geojson_dir_path = data_path.joinpath(
|
||||
"census",
|
||||
"geojson",
|
||||
)
|
||||
if not os.path.isfile(geojson_dir_path.joinpath(fips + ".json")):
|
||||
# ogr2ogr
|
||||
print(f"encoding GeoJSON for {row[1]}")
|
||||
|
||||
# PWD is different for Windows
|
||||
if os.name == "nt":
|
||||
pwd = "%cd%"
|
||||
else:
|
||||
pwd = "${PWD}"
|
||||
cmd = (
|
||||
'docker run --rm -it -v "'
|
||||
+ pwd
|
||||
+ '"/:/home osgeo/gdal:alpine-ultrasmall-latest ogr2ogr -f GeoJSON /home/data/census/geojson/'
|
||||
+ fips
|
||||
+ ".json /home/data/census/shp/"
|
||||
+ fips
|
||||
+ "/tl_2010_"
|
||||
+ fips
|
||||
+ "_bg10.shp"
|
||||
)
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
|
||||
# generate CBG CSV table for pandas
|
||||
## load in memory
|
||||
cbg_national_list = [] # in-memory global list
|
||||
cbg_per_state_list = {} # in-memory dict per state
|
||||
for file in os.listdir(geojson_dir_path):
|
||||
if file.endswith(".json"):
|
||||
print(f"ingesting geoid10 for file {file}")
|
||||
with open(geojson_dir_path.joinpath(file)) as f:
|
||||
geojson = json.load(f)
|
||||
for feature in geojson["features"]:
|
||||
geoid10 = feature["properties"]["GEOID10"]
|
||||
cbg_national_list.append(str(geoid10))
|
||||
geoid10_state_id = geoid10[:2]
|
||||
if not cbg_per_state_list.get(geoid10_state_id):
|
||||
cbg_per_state_list[geoid10_state_id] = []
|
||||
cbg_per_state_list[geoid10_state_id].append(geoid10)
|
||||
|
||||
csv_dir_path = data_path / "census" / "csv"
|
||||
## write to individual state csv
|
||||
for state_id in cbg_per_state_list:
|
||||
geoid10_list = cbg_per_state_list[state_id]
|
||||
with open(
|
||||
csv_dir_path.joinpath(f"{state_id}.csv"), mode="w", newline=""
|
||||
) as cbg_csv_file:
|
||||
cbg_csv_file_writer = csv.writer(
|
||||
cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
|
||||
)
|
||||
|
||||
for geoid10 in geoid10_list:
|
||||
cbg_csv_file_writer.writerow(
|
||||
[
|
||||
geoid10,
|
||||
]
|
||||
)
|
||||
|
||||
## write US csv
|
||||
with open(csv_dir_path.joinpath("us.csv"), mode="w", newline="") as cbg_csv_file:
|
||||
cbg_csv_file_writer = csv.writer(
|
||||
cbg_csv_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
|
||||
)
|
||||
for geoid10 in cbg_national_list:
|
||||
cbg_csv_file_writer.writerow(
|
||||
[
|
||||
geoid10,
|
||||
]
|
||||
)
|
||||
|
||||
print("Census block groups downloading complete")
|
|
@ -1,91 +0,0 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
|
||||
from utils import get_state_fips_codes
|
||||
|
||||
data_path = Path.cwd() / "data"
|
||||
|
||||
# remove existing mbtiles file
|
||||
mb_tiles_path = data_path / "tiles" / "block2010.mbtiles"
|
||||
if os.path.exists(mb_tiles_path):
|
||||
os.remove(mb_tiles_path)
|
||||
|
||||
# remove existing mvt directory
|
||||
mvt_tiles_path = data_path / "tiles" / "mvt"
|
||||
if os.path.exists(mvt_tiles_path):
|
||||
shutil.rmtree(mvt_tiles_path)
|
||||
|
||||
# Merge scores into json
|
||||
# TODO: for this first pass, just merging ACS EJScren indicators
|
||||
# Per https://github.com/usds/justice40-tool/issues/102
|
||||
|
||||
if os.name == "nt":
|
||||
pwd = "%cd%"
|
||||
else:
|
||||
pwd = "${PWD}"
|
||||
|
||||
# remove existing score json files
|
||||
score_geojson_dir = data_path / "score" / "geojson"
|
||||
files_in_directory = os.listdir(score_geojson_dir)
|
||||
filtered_files = [file for file in files_in_directory if file.endswith(".json")]
|
||||
for file in filtered_files:
|
||||
path_to_file = os.path.join(score_geojson_dir, file)
|
||||
os.remove(path_to_file)
|
||||
|
||||
# join the state shape sqllite with the score csv
|
||||
state_fips_codes = get_state_fips_codes()
|
||||
for fips in state_fips_codes:
|
||||
cmd = (
|
||||
'docker run --rm -v "'
|
||||
+ pwd
|
||||
+ '"/:/home '
|
||||
+ "osgeo/gdal:alpine-small-latest ogr2ogr -f GeoJSON "
|
||||
+ f"-sql \"SELECT * FROM tl_2010_{fips}_bg10 LEFT JOIN '/home/data/score/csv/data{fips}.csv'.data{fips} ON tl_2010_{fips}_bg10.GEOID10 = data{fips}.ID\" "
|
||||
+ f"/home/data/score/geojson/{fips}.json /home/data/census/shp/{fips}/tl_2010_{fips}_bg10.dbf"
|
||||
)
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
|
||||
# get a list of all json files to plug in the docker commands below
|
||||
# (workaround since *.json doesn't seem to work)
|
||||
geojson_list = ""
|
||||
geojson_path = data_path / "score" / "geojson"
|
||||
for file in os.listdir(geojson_path):
|
||||
if file.endswith(".json"):
|
||||
geojson_list += f"/home/data/score/geojson/{file} "
|
||||
|
||||
if geojson_list == "":
|
||||
print("No GeoJson files found. Please run scripts/download_cbg.py first")
|
||||
|
||||
|
||||
# generate mbtiles file
|
||||
# PWD is different for Windows
|
||||
if os.name == "nt":
|
||||
pwd = "%cd%"
|
||||
else:
|
||||
pwd = "${PWD}"
|
||||
cmd = (
|
||||
'docker run --rm -it -v "'
|
||||
+ pwd
|
||||
+ '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 '
|
||||
+ geojson_list
|
||||
)
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
|
||||
# if AWS creds are present, generate uncompressed toles
|
||||
# docker run --rm -it -v ${PWD}:/data tippecanoe tippecanoe --no-tile-compression -zg -e /data/tiles_custom -l blocks /data/tabblock2010_01_pophu_joined.json
|
||||
# PWD is different for Windows
|
||||
if os.name == "nt":
|
||||
pwd = "%cd%"
|
||||
else:
|
||||
pwd = "${PWD}"
|
||||
cmd = (
|
||||
'docker run --rm -it -v "'
|
||||
+ pwd
|
||||
+ '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed --no-tile-compression -zg -e /home/data/tiles/mvt '
|
||||
+ geojson_list
|
||||
)
|
||||
print(cmd)
|
||||
os.system(cmd)
|
|

@@ -1,20 +0,0 @@
# common usage functions
import csv
from pathlib import Path


def get_state_fips_codes():
    data_path = Path.cwd() / "data"
    fips_csv_path = data_path / "fips_states_2010.csv"
    fips_state_list = []
    with open(fips_csv_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        line_count = 0

        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            else:
                fips = row[0].strip()
                fips_state_list.append(fips)
    return fips_state_list

score/settings.toml (new file, 8 lines)
@@ -0,0 +1,8 @@
[default]
AWS_JUSTICE40_DATA_URL = "https://justice40-data.s3.amazonaws.com"

[development]

[staging]

[production]

score/tile/__init__.py (new file, empty)

score/tile/generate.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import os
from pathlib import Path
import shutil
import logging

from etl.sources.census.etl_utils import get_state_fips_codes


def generate_tiles(data_path: Path) -> None:

    # remove existing mbtiles file
    mb_tiles_path = data_path / "tiles" / "block2010.mbtiles"
    if os.path.exists(mb_tiles_path):
        os.remove(mb_tiles_path)

    # remove existing mvt directory
    mvt_tiles_path = data_path / "tiles" / "mvt"
    if os.path.exists(mvt_tiles_path):
        shutil.rmtree(mvt_tiles_path)

    # Merge scores into json

    if os.name == "nt":
        pwd = "%cd%"
    else:
        pwd = "${PWD}"

    # remove existing score json files
    score_geojson_dir = data_path / "score" / "geojson"
    files_in_directory = os.listdir(score_geojson_dir)
    filtered_files = [file for file in files_in_directory if file.endswith(".json")]
    for file in filtered_files:
        path_to_file = os.path.join(score_geojson_dir, file)
        os.remove(path_to_file)

    # join the state shape sqlite with the score csv
    state_fips_codes = get_state_fips_codes(data_path)
    for fips in state_fips_codes:
        cmd = (
            'docker run --rm -v "'
            + pwd
            + '"/:/home '
            + "osgeo/gdal:alpine-small-latest ogr2ogr -f GeoJSON "
            + f"-sql \"SELECT * FROM tl_2010_{fips}_bg10 LEFT JOIN '/home/data/score/csv/data{fips}.csv'.data{fips} ON tl_2010_{fips}_bg10.GEOID10 = data{fips}.ID\" "
            + f"/home/data/score/geojson/{fips}.json /home/data/census/shp/{fips}/tl_2010_{fips}_bg10.dbf"
        )
        os.system(cmd)

    # get a list of all json files to plug in the docker commands below
    # (workaround since *.json doesn't seem to work)
    geojson_list = ""
    geojson_path = data_path / "score" / "geojson"
    for file in os.listdir(geojson_path):
        if file.endswith(".json"):
            geojson_list += f"/home/data/score/geojson/{file} "

    if geojson_list == "":
        logging.error(
            "No GeoJson files found. Please run scripts/download_cbg.py first"
        )

    # generate mbtiles file
    # PWD is different for Windows
    if os.name == "nt":
        pwd = "%cd%"
    else:
        pwd = "${PWD}"
    cmd = (
        'docker run --rm -it -v "'
        + pwd
        + '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed -zg -o /home/data/tiles/block2010.mbtiles --extend-zooms-if-still-dropping -l cbg2010 -s_srs EPSG:4269 -t_srs EPSG:4326 '
        + geojson_list
    )
    os.system(cmd)

    # PWD is different for Windows
    if os.name == "nt":
        pwd = "%cd%"
    else:
        pwd = "${PWD}"
    cmd = (
        'docker run --rm -it -v "'
        + pwd
        + '"/:/home klokantech/tippecanoe tippecanoe --drop-densest-as-needed --no-tile-compression -zg -e /home/data/tiles/mvt '
        + geojson_list
    )
    os.system(cmd)
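
The tile step assembles long docker/tippecanoe command strings by concatenation. As a hedged sketch under the same assumptions (the klokantech/tippecanoe image and the flags used above), the mbtiles invocation could also be built as an argument list; the function name is illustrative and not part of this commit.

```python
# Illustrative sketch only: the tippecanoe-in-docker call that generate_tiles()
# issues for block2010.mbtiles, assembled with subprocess instead of os.system.
import subprocess
from pathlib import Path


def run_tippecanoe_mbtiles(geojson_files: list, host_dir: Path) -> None:
    """Build data/tiles/block2010.mbtiles from the score GeoJSON files."""
    cmd = [
        "docker", "run", "--rm", "-it",   # -it matches the original command; drop it for non-interactive runs
        "-v", f"{host_dir}:/home",
        "klokantech/tippecanoe", "tippecanoe",
        "--drop-densest-as-needed",
        "--extend-zooms-if-still-dropping",
        "-zg",
        "-o", "/home/data/tiles/block2010.mbtiles",
        "-l", "cbg2010",
        "-s_srs", "EPSG:4269",
        "-t_srs", "EPSG:4326",
    ] + geojson_files  # container paths like /home/data/score/geojson/01.json
    subprocess.run(cmd, check=True)
```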

score/utils.py (new file, 76 lines)
@@ -0,0 +1,76 @@
from pathlib import Path
import os
import logging
import shutil
import requests
import zipfile


def get_module_logger(module_name):
    """
    To use this, do logger = get_module_logger(__name__)
    """
    logger = logging.getLogger(module_name)
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s [%(name)-12s] %(levelname)-8s %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger


logger = get_module_logger(__name__)


def remove_files_from_dir(files_path: Path, extension: str = None) -> None:
    for file in os.listdir(files_path):
        if extension:
            if not file.endswith(extension):
                continue
        else:
            # don't remove __init__ files as they conserve dir structure
            if file == "__init__.py":
                continue
        os.remove(files_path / file)
        logger.info(f"Removing {file}")


def remove_all_from_dir(files_path: Path) -> None:
    for file in os.listdir(files_path):
        # don't remove __init__ files as they conserve dir structure
        if file == "__init__.py":
            continue
        if os.path.isfile(files_path / file):
            os.remove(files_path / file)
        else:
            shutil.rmtree(files_path / file)
        logger.info(f"Removing {file}")


def remove_all_dirs_from_dir(dir_path: Path) -> None:
    for filename in os.listdir(dir_path):
        file_path = os.path.join(dir_path, filename)
        if os.path.isdir(file_path):
            shutil.rmtree(file_path)
            logging.info(f"Removing directory {file_path}")


def unzip_file_from_url(
    file_url: str, download_path: Path, zip_file_directory: Path, verify: bool = False
) -> None:
    logger.info(f"Downloading {file_url}")
    download = requests.get(file_url, verify=verify)
    file_contents = download.content
    zip_file_path = download_path / "downloaded.zip"
    zip_file = open(zip_file_path, "wb")
    zip_file.write(file_contents)
    zip_file.close()

    logger.info(f"Extracting {zip_file_path}")
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(zip_file_directory)

    # cleanup temporary file
    os.remove(zip_file_path)
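
A brief usage sketch of the shared download helper, mirroring how `download_census_csvs()` fetches one state's TIGER block-group zip; the FIPS code "01" is just an example value.

```python
# Usage sketch: fetch and unpack one state's 2010 block-group shapefile,
# using the same call pattern as the census ETL. Assumes data/tmp exists.
from pathlib import Path

from utils import unzip_file_from_url

data_path = Path.cwd() / "data"
fips = "01"  # example: Alabama

unzip_file_from_url(
    f"https://www2.census.gov/geo/tiger/TIGER2010/BG/2010/tl_2010_{fips}_bg10.zip",
    data_path / "tmp",                    # where downloaded.zip is written
    data_path / "census" / "shp" / fips,  # where the archive is extracted
)
```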