From b404fdcc4316d76e25d4b5da31d184dcde229d01 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com>
Date: Wed, 28 Jul 2021 16:07:28 -0400
Subject: [PATCH] Generate Geo-aware scores for all zoom levels (#391)
* generate Geo-aware scores for all zoom levels
* usa high progress
* testing dissolve
* checkpoint
* changing type
* removing breakpoint
* validation notebooks
* quick update
* score validation
* fixes for county merge
* code completed
---
data/data-pipeline/README.md | 6 +-
data/data-pipeline/application.py | 10 +-
data/data-pipeline/etl/runner.py | 18 +
data/data-pipeline/etl/score/etl_score_geo.py | 168 +++
.../data-pipeline/etl/score/etl_score_post.py | 45 +-
.../etl/sources/calenviroscreen/etl.py | 8 +-
data/data-pipeline/etl/sources/census/etl.py | 16 +-
.../etl/sources/census_acs/etl.py | 5 +
data/data-pipeline/ipython/ACS Validate.ipynb | 567 +++++++++
.../ipython/EJScreen Validate.ipynb | 1121 +++++++++++++++++
.../ipython/Score Validate.ipynb | 777 ++++++++++++
.../ipython/Score_Dissolve_Script.ipynb | 535 ++++----
.../data-pipeline/ipython/county_lookup.ipynb | 2 +-
data/data-pipeline/requirements.txt | 15 +-
14 files changed, 3023 insertions(+), 270 deletions(-)
create mode 100644 data/data-pipeline/etl/score/etl_score_geo.py
create mode 100644 data/data-pipeline/ipython/ACS Validate.ipynb
create mode 100644 data/data-pipeline/ipython/EJScreen Validate.ipynb
create mode 100644 data/data-pipeline/ipython/Score Validate.ipynb
diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md
index 4920f2d8..6f1e1fb8 100644
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@@ -44,7 +44,7 @@ TODO add mermaid diagram
#### Step 0: Set up your environment
-1. After cloning the project locally, change to this directory: `cd score`
+1. After cloning the project locally, change to this directory: `cd data/data-pipeline`
1. Choose whether you'd like to run this application using Docker or if you'd like to install the dependencies locally so you can contribute to the project.
- **With Docker:** Follow these [installation instructions](https://docs.docker.com/get-docker/) and skip down to the [Running with Docker section](#running-with-docker) for more information
- **For Local Development:** Skip down to the [Local Development section](#local-development) for more detailed installation instructions
@@ -53,7 +53,7 @@ TODO add mermaid diagram
#### Step 1: Run the ETL script for each data source
1. Call the `etl-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
- - With Docker: `docker run --rm -it j40_score /bin/sh -c "python3 application.py etl-run"`
+ - With Docker: `docker run --rm -it j40_data_pipeline /bin/sh -c "python3 application.py etl-run"`
- With Poetry: `poetry run python application.py etl-run`
1. The `etl-run` command will execute the corresponding ETL script for each data source in `etl/sources/`. For example, `etl/sources/ejscreen/etl.py` is the ETL script for EJSCREEN data.
1. Each ETL script will extract the data from its original source, then format the data into `.csv` files that get stored in the relevant folder in `data/dataset/`. For example, HUD Housing data is stored in `data/dataset/hud_housing/usa.csv`
@@ -64,7 +64,7 @@ _For example: `poetry run python application.py etl-run ejscreen` would only run
#### Step 2: Calculate the Justice40 score experiments
1. Call the `score-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute.
- - With Docker: `docker run --rm -it j40_score /bin/sh -c "python3 application.py score-run"`
+ - With Docker: `docker run --rm -it j40_data_pipeline /bin/sh -c "python3 application.py score-run"`
- With Poetry: `poetry run python application.py score-run`
1. The `score-run` command will execute the `etl/score/etl.py` script which loads the data from each of the source files added to the `data/dataset/` directory by the ETL scripts in Step 1.
1. These data sets are merged into a single dataframe using their Census Block Group GEOID as a common key, and the data in each of the columns is standardized in two ways:
diff --git a/data/data-pipeline/application.py b/data/data-pipeline/application.py
index 89ee45bf..abf16ef5 100644
--- a/data/data-pipeline/application.py
+++ b/data/data-pipeline/application.py
@@ -9,7 +9,7 @@ from utils import (
temp_folder_cleanup,
)
from etl.sources.census.etl import download_census_csvs
-from etl.runner import etl_runner, score_generate
+from etl.runner import etl_runner, score_generate, score_geo
logger = get_module_logger(__name__)
@@ -88,5 +88,13 @@ def score_run():
score_generate()
+@cli.command(
+ help="Generate Geojson files with scores baked in",
+)
+def geo_score():
+ """CLI command to generate the GeoJSON files with the score baked in"""
+ score_geo()
+
+
if __name__ == "__main__":
cli()
diff --git a/data/data-pipeline/etl/runner.py b/data/data-pipeline/etl/runner.py
index f6ec09b4..90be2f48 100644
--- a/data/data-pipeline/etl/runner.py
+++ b/data/data-pipeline/etl/runner.py
@@ -2,6 +2,7 @@ import importlib
from etl.score.etl_score import ScoreETL
from etl.score.etl_score_post import PostScoreETL
+from etl.score.etl_score_geo import GeoScoreETL
def etl_runner(dataset_to_run: str = None) -> None:
@@ -112,6 +113,23 @@ def score_generate() -> None:
score_post.cleanup()
+def score_geo() -> None:
+ """Generates the geojson files with score data baked in
+
+ Args:
+ None
+
+ Returns:
+ None
+ """
+
+ # Score Geo
+ score_geo = GeoScoreETL()
+ score_geo.extract()
+ score_geo.transform()
+ score_geo.load()
+
+
def _find_dataset_index(dataset_list, key, value):
for i, element in enumerate(dataset_list):
if element[key] == value:
diff --git a/data/data-pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/etl/score/etl_score_geo.py
new file mode 100644
index 00000000..78715e31
--- /dev/null
+++ b/data/data-pipeline/etl/score/etl_score_geo.py
@@ -0,0 +1,168 @@
+import pandas as pd
+import geopandas as gpd
+import math
+
+from etl.base import ExtractTransformLoad
+from utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class GeoScoreETL(ExtractTransformLoad):
+ """
+ A class used to generate the national high- and low-resolution GeoJSON files with the score baked in
+ """
+
+ def __init__(self):
+ self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson"
+ self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json"
+ self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json"
+
+ self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
+ self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv"
+
+ self.CENSUS_USA_GEOJSON = (
+ self.DATA_PATH / "census" / "geojson" / "us.json"
+ )
+
+ self.TARGET_SCORE_NAME = "Score E (percentile)"
+ self.TARGET_SCORE_RENAME_TO = "E_SCORE"
+
+ self.NUMBER_OF_BUCKETS = 10
+
+ self.geojson_usa_df: gpd.GeoDataFrame
+ self.score_usa_df: pd.DataFrame
+ self.geojson_score_usa_high: gpd.GeoDataFrame
+ self.geojson_score_usa_low: gpd.GeoDataFrame
+
+ def extract(self) -> None:
+ logger.info(f"Reading US GeoJSON (~6 minutes)")
+ self.geojson_usa_df = gpd.read_file(
+ self.CENSUS_USA_GEOJSON,
+ dtype={"GEOID10": "string"},
+ usecols=["GEOID10", "geometry"],
+ low_memory=False,
+ )
+ self.geojson_usa_df.head()
+
+ logger.info(f"Reading score CSV")
+ self.score_usa_df = pd.read_csv(
+ self.TILE_SCORE_CSV,
+ dtype={"GEOID10": "string"},
+ low_memory=False,
+ )
+
+ def transform(self) -> None:
+ logger.info(f"Pruning Census GeoJSON")
+ fields = ["GEOID10", "geometry"]
+ self.geojson_usa_df = self.geojson_usa_df[fields]
+
+ logger.info(f"Merging and compressing score CSV with USA GeoJSON")
+ self.geojson_score_usa_high = self.score_usa_df.merge(
+ self.geojson_usa_df, on="GEOID10", how="left"
+ )
+
+ self.geojson_score_usa_high = gpd.GeoDataFrame(
+ self.geojson_score_usa_high, crs="EPSG:4326"
+ )
+
+ usa_simplified = self.geojson_score_usa_high[
+ ["GEOID10", self.TARGET_SCORE_NAME, "geometry"]
+ ].reset_index(drop=True)
+
+ usa_simplified.rename(
+ columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO},
+ inplace=True,
+ )
+
+ logger.info(f"Aggregating into tracts (~5 minutes)")
+ usa_tracts = self._aggregate_to_tracts(usa_simplified)
+
+ usa_tracts = gpd.GeoDataFrame(
+ usa_tracts,
+ columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
+ crs="EPSG:4326",
+ )
+
+ logger.info(f"Creating buckets from tracts")
+ usa_bucketed = self._create_buckets_from_tracts(
+ usa_tracts, self.NUMBER_OF_BUCKETS
+ )
+
+ logger.info(f"Aggregating buckets")
+ usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean")
+
+ compressed = self._breakup_multipolygons(
+ usa_aggregated, self.NUMBER_OF_BUCKETS
+ )
+
+ self.geojson_score_usa_low = gpd.GeoDataFrame(
+ compressed,
+ columns=[self.TARGET_SCORE_RENAME_TO, "geometry"],
+ crs="EPSG:4326",
+ )
+
+ def _aggregate_to_tracts(
+ self, block_group_df: gpd.GeoDataFrame
+ ) -> gpd.GeoDataFrame:
+ # The tract identifier is the first 11 digits of the GEOID
+ block_group_df["tract"] = block_group_df.apply(
+ lambda row: row["GEOID10"][0:11], axis=1
+ )
+ state_tracts = block_group_df.dissolve(by="tract", aggfunc="mean")
+ return state_tracts
+
+ def _create_buckets_from_tracts(
+ self, state_tracts: gpd.GeoDataFrame, num_buckets: int
+ ) -> gpd.GeoDataFrame:
+ # assign tracts to equally sized buckets, ordered by the target score (E_SCORE)
+ state_tracts.sort_values(self.TARGET_SCORE_RENAME_TO, inplace=True)
+ SCORE_bucket = []
+ bucket_size = math.ceil(
+ len(state_tracts.index) / self.NUMBER_OF_BUCKETS
+ )
+ for i in range(len(state_tracts.index)):
+ SCORE_bucket.extend([math.floor(i / bucket_size)])
+ state_tracts[f"{self.TARGET_SCORE_RENAME_TO}_bucket"] = SCORE_bucket
+ return state_tracts
+
+ def _aggregate_buckets(self, state_tracts: gpd.GeoDataFrame, agg_func: str):
+ # dissolve tracts by bucket
+ state_attr = state_tracts[
+ [
+ self.TARGET_SCORE_RENAME_TO,
+ f"{self.TARGET_SCORE_RENAME_TO}_bucket",
+ "geometry",
+ ]
+ ].reset_index(drop=True)
+ state_dissolve = state_attr.dissolve(
+ by=f"{self.TARGET_SCORE_RENAME_TO}_bucket", aggfunc=agg_func
+ )
+ return state_dissolve
+
+ def _breakup_multipolygons(
+ self, state_bucketed_df: gpd.GeoDataFrame, num_buckets: int
+ ) -> gpd.GeoDataFrame:
+ compressed = []
+ for i in range(num_buckets):
+ for j in range(len(state_bucketed_df["geometry"][i].geoms)):
+ compressed.append(
+ [
+ state_bucketed_df[self.TARGET_SCORE_RENAME_TO][i],
+ state_bucketed_df["geometry"][i].geoms[j],
+ ]
+ )
+ return compressed
+
+ def load(self) -> None:
+ logger.info(f"Writing usa-high (~9 minutes)")
+ self.geojson_score_usa_high.to_file(
+ self.SCORE_HIGH_GEOJSON, driver="GeoJSON"
+ )
+ logger.info(f"Completed writing usa-high")
+
+ logger.info(f"Writing usa-low (~9 minutes)")
+ self.geojson_score_usa_low.to_file(
+ self.SCORE_LOW_GEOJSON, driver="GeoJSON"
+ )
+ logger.info(f"Completed writing usa-low")
diff --git a/data/data-pipeline/etl/score/etl_score_post.py b/data/data-pipeline/etl/score/etl_score_post.py
index 41c837a4..f2e1e376 100644
--- a/data/data-pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/etl/score/etl_score_post.py
@@ -16,10 +16,13 @@ class PostScoreETL(ExtractTransformLoad):
self.CENSUS_COUNTIES_ZIP_URL = "https://www2.census.gov/geo/docs/maps-data/data/gazetteer/Gaz_counties_national.zip"
self.CENSUS_COUNTIES_TXT = self.TMP_PATH / "Gaz_counties_national.txt"
self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"]
+ self.CENSUS_USA_CSV = self.DATA_PATH / "census" / "csv" / "us.csv"
self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv"
+
self.STATE_CSV = (
self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv"
)
+
self.FULL_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv"
self.TILR_SCORE_CSV = self.SCORE_CSV_PATH / "tile" / "usa.csv"
@@ -87,17 +90,43 @@ class PostScoreETL(ExtractTransformLoad):
# add the tract level column
self.score_df["GEOID"] = self.score_df.GEOID10.str[:5]
- # merge state and counties
- county_state_merged = self.counties_df.join(
- self.states_df, rsuffix=" Other"
+ # merge state with counties
+ county_state_merged = self.counties_df.merge(
+ self.states_df, on="State Abbreviation", how="left"
)
- del county_state_merged["State Abbreviation Other"]
- # merge county and score
- self.score_county_state_merged = self.score_df.join(
- county_state_merged, rsuffix="_OTHER"
+ # merge state + county with score
+ self.score_county_state_merged = self.score_df.merge(
+ county_state_merged, on="GEOID", how="left"
)
- del self.score_county_state_merged["GEOID_OTHER"]
+
+ # check if there are census cbgs without score
+ logger.info(f"Removing CBG rows without score")
+
+ ## load cbgs
+ cbg_usa_df = pd.read_csv(
+ self.CENSUS_USA_CSV,
+ names=["GEOID10"],
+ dtype={"GEOID10": "string"},
+ low_memory=False,
+ header=None,
+ )
+
+ # merge census cbgs with score
+ merged_df = cbg_usa_df.merge(
+ self.score_county_state_merged, on="GEOID10", how="left"
+ )
+
+ # list the null score cbgs
+ null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()]
+
+ # subtract data sets: drop the CBG rows that have no score
+ removed_df = pd.concat(
+ [merged_df, null_cbg_df, null_cbg_df]
+ ).drop_duplicates(keep=False)
+
+ # set the score to the new df
+ self.score_county_state_merged = removed_df
def load(self) -> None:
logger.info(f"Saving Full Score CSV with County Information")
diff --git a/data/data-pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
index ad56b26a..cb3a01c3 100644
--- a/data/data-pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/etl/sources/calenviroscreen/etl.py
@@ -9,12 +9,16 @@ logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip"
- self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
+ self.CALENVIROSCREEN_CSV = (
+ self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
+ )
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Definining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
- self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile"
+ self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
+ "calenviroscreen_percentile"
+ )
self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = (
"calenviroscreen_priority_community"
)
diff --git a/data/data-pipeline/etl/sources/census/etl.py b/data/data-pipeline/etl/sources/census/etl.py
index c652700e..62c6420c 100644
--- a/data/data-pipeline/etl/sources/census/etl.py
+++ b/data/data-pipeline/etl/sources/census/etl.py
@@ -2,6 +2,7 @@ import csv
import os
import json
from pathlib import Path
+import geopandas as gpd
from .etl_utils import get_state_fips_codes
from utils import unzip_file_from_url, get_module_logger
@@ -11,7 +12,7 @@ logger = get_module_logger(__name__)
def download_census_csvs(data_path: Path) -> None:
"""Download all census shape files from the Census FTP and extract the geojson
- to generate national and by state Census Block Group CSVs
+ to generate national and by state Census Block Group CSVs and GeoJSONs
Args:
data_path (pathlib.Path): Name of the directory where the files and directories will
@@ -108,4 +109,17 @@ def download_census_csvs(data_path: Path) -> None:
]
)
+ ## create national geojson
+ logger.info(f"Generating national geojson file")
+ usa_df = gpd.GeoDataFrame()
+
+ for file_name in geojson_dir_path.rglob("*.json"):
+ logger.info(f"Ingesting {file_name}")
+ state_gdf = gpd.read_file(file_name)
+ usa_df = usa_df.append(state_gdf)
+
+ usa_df = usa_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
+ logger.info(f"Writing national geojson file")
+ usa_df.to_file(geojson_dir_path / "us.json", driver="GeoJSON")
+
logger.info("Census block groups downloading complete")
diff --git a/data/data-pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/etl/sources/census_acs/etl.py
index 39db151c..18c58693 100644
--- a/data/data-pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/etl/sources/census_acs/etl.py
@@ -106,3 +106,8 @@ class CensusACSETL(ExtractTransformLoad):
self.df[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
+
+ def validate(self) -> None:
+ logger.info(f"Validating Census ACS Data")
+
+ pass
diff --git a/data/data-pipeline/ipython/ACS Validate.ipynb b/data/data-pipeline/ipython/ACS Validate.ipynb
new file mode 100644
index 00000000..ac5baca4
--- /dev/null
+++ b/data/data-pipeline/ipython/ACS Validate.ipynb
@@ -0,0 +1,567 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "43c5dbee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import csv\n",
+ "from pathlib import Path\n",
+ "import os\n",
+ "import sys"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "f97c95f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.abspath(os.path.join(\"..\"))\n",
+ "if module_path not in sys.path:\n",
+ " sys.path.append(module_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "b8a2b53e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_PATH = Path.cwd().parent / \"data\"\n",
+ "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
+ "ACS_YEAR = \"2019\"\n",
+ "OUTPUT_PATH = (\n",
+ " DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
+ " )\n",
+ "CENSUS_USA_CSV = (\n",
+ " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "0d33e8db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cbg_usa_df = pd.read_csv(\n",
+ " CENSUS_USA_CSV,\n",
+ " names=['GEOID10'],\n",
+ " dtype={\"GEOID10\": \"string\"},\n",
+ " low_memory=False,\n",
+ " header=None\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "01e6dbe3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10\n",
+ "0 100010414002\n",
+ "1 100010415002\n",
+ "2 100010417011\n",
+ "3 100010417012\n",
+ "4 100010422011"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cbg_usa_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "341dbcb6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GEOID10 string\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cbg_usa_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "eb25d4bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "acs_df = pd.read_csv(\n",
+ " OUTPUT_PATH / \"usa.csv\",\n",
+ " dtype={\"GEOID10\": \"string\"},\n",
+ " low_memory=False,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "d4c9d010",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " Unemployed civilians (percent) | \n",
+ " Linguistic isolation (percent) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 010399620002 | \n",
+ " 0.077108 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 010399618002 | \n",
+ " 0.126214 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 010399616004 | \n",
+ " 0.133172 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 010399616002 | \n",
+ " 0.028249 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 010399616001 | \n",
+ " 0.063037 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 Unemployed civilians (percent) \\\n",
+ "0 010399620002 0.077108 \n",
+ "1 010399618002 0.126214 \n",
+ "2 010399616004 0.133172 \n",
+ "3 010399616002 0.028249 \n",
+ "4 010399616001 0.063037 \n",
+ "\n",
+ " Linguistic isolation (percent) \n",
+ "0 0.0 \n",
+ "1 0.0 \n",
+ "2 0.0 \n",
+ "3 0.0 \n",
+ "4 0.0 "
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "acs_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "dd390179",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GEOID10 string\n",
+ "Unemployed civilians (percent) float64\n",
+ "Linguistic isolation (percent) float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "acs_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "236eb093",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = cbg_usa_df.merge(\n",
+ " acs_df, on=\"GEOID10\", how=\"left\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "4fff1845",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " Unemployed civilians (percent) | \n",
+ " Linguistic isolation (percent) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ " 0.030612 | \n",
+ " 0.065963 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ " 0.118056 | \n",
+ " 0.010283 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ " 0.042373 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ " 0.042473 | \n",
+ " 0.010435 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ " 0.054358 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 Unemployed civilians (percent) \\\n",
+ "0 100010414002 0.030612 \n",
+ "1 100010415002 0.118056 \n",
+ "2 100010417011 0.042373 \n",
+ "3 100010417012 0.042473 \n",
+ "4 100010422011 0.054358 \n",
+ "\n",
+ " Linguistic isolation (percent) \n",
+ "0 0.065963 \n",
+ "1 0.010283 \n",
+ "2 0.000000 \n",
+ "3 0.010435 \n",
+ "4 0.000000 "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "f8903557",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " Unemployed civilians (percent) | \n",
+ " Linguistic isolation (percent) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 34 | \n",
+ " 100019900000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 377 | \n",
+ " 100030169041 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 392 | \n",
+ " 100059900000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 400 | \n",
+ " 100039901000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 416 | \n",
+ " 100039801001 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 219505 | \n",
+ " 340057048013 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 219508 | \n",
+ " 340057048024 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 219758 | \n",
+ " 340258047001 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 219807 | \n",
+ " 340259900000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 220134 | \n",
+ " 340076113001 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1462 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 Unemployed civilians (percent) \\\n",
+ "34 100019900000 NaN \n",
+ "377 100030169041 NaN \n",
+ "392 100059900000 NaN \n",
+ "400 100039901000 NaN \n",
+ "416 100039801001 NaN \n",
+ "... ... ... \n",
+ "219505 340057048013 NaN \n",
+ "219508 340057048024 NaN \n",
+ "219758 340258047001 NaN \n",
+ "219807 340259900000 NaN \n",
+ "220134 340076113001 NaN \n",
+ "\n",
+ " Linguistic isolation (percent) \n",
+ "34 NaN \n",
+ "377 NaN \n",
+ "392 NaN \n",
+ "400 NaN \n",
+ "416 NaN \n",
+ "... ... \n",
+ "219505 NaN \n",
+ "219508 NaN \n",
+ "219758 NaN \n",
+ "219807 NaN \n",
+ "220134 0.0 \n",
+ "\n",
+ "[1462 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b870a21f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/data-pipeline/ipython/EJScreen Validate.ipynb b/data/data-pipeline/ipython/EJScreen Validate.ipynb
new file mode 100644
index 00000000..4c2826d0
--- /dev/null
+++ b/data/data-pipeline/ipython/EJScreen Validate.ipynb
@@ -0,0 +1,1121 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "3ab8f7c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import csv\n",
+ "from pathlib import Path\n",
+ "import os\n",
+ "import sys"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8c22494f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.abspath(os.path.join(\"..\"))\n",
+ "if module_path not in sys.path:\n",
+ " sys.path.append(module_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "eb31e9a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_PATH = Path.cwd().parent / \"data\"\n",
+ "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
+ "OUTPUT_PATH = (\n",
+ " DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
+ " )\n",
+ "CENSUS_USA_CSV = (\n",
+ " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "95a5f8d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cbg_usa_df = pd.read_csv(\n",
+ " CENSUS_USA_CSV,\n",
+ " names=['GEOID10'],\n",
+ " dtype={\"GEOID10\": \"string\"},\n",
+ " low_memory=False,\n",
+ " header=None\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "bdd9ab60",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10\n",
+ "0 100010414002\n",
+ "1 100010415002\n",
+ "2 100010417011\n",
+ "3 100010417012\n",
+ "4 100010422011"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cbg_usa_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "05a40080",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GEOID10 string\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cbg_usa_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "114af777",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ejscreen_df = pd.read_csv(\n",
+ " OUTPUT_PATH / \"usa.csv\",\n",
+ " dtype={\"ID\": \"string\"},\n",
+ " low_memory=False,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "4f070999",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ejscreen_df.rename(\n",
+ " columns={\"ID\": \"GEOID10\"},\n",
+ " inplace=True,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "d5f3ebd4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " OBJECTID | \n",
+ " GEOID10 | \n",
+ " STATE_NAME | \n",
+ " ST_ABBREV | \n",
+ " REGION | \n",
+ " ACSTOTPOP | \n",
+ " D_PM25_2 | \n",
+ " B_PM25_D2 | \n",
+ " P_PM25_D2 | \n",
+ " D_OZONE_2 | \n",
+ " ... | \n",
+ " T_PNPL | \n",
+ " T_PNPL_D2 | \n",
+ " T_PRMP | \n",
+ " T_PRMP_D2 | \n",
+ " T_PTSDF | \n",
+ " T_PTSDF_D2 | \n",
+ " T_PWDIS | \n",
+ " T_PWDIS_D2 | \n",
+ " Shape_Length | \n",
+ " Shape_Area | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 010010201001 | \n",
+ " Alabama | \n",
+ " AL | \n",
+ " 4 | \n",
+ " 692 | \n",
+ " -1161.544049 | \n",
+ " 5 | \n",
+ " 43.0 | \n",
+ " -4661.186378 | \n",
+ " ... | \n",
+ " 0.071 facilities/km distance (79%ile) | \n",
+ " 26%ile | \n",
+ " 0.085 facilities/km distance (24%ile) | \n",
+ " 47%ile | \n",
+ " 0.066 facilities/km distance (21%ile) | \n",
+ " 48%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 62%ile | \n",
+ " 13435.975560 | \n",
+ " 6.026828e+06 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 010010201002 | \n",
+ " Alabama | \n",
+ " AL | \n",
+ " 4 | \n",
+ " 1153 | \n",
+ " -2084.690717 | \n",
+ " 4 | \n",
+ " 31.0 | \n",
+ " -8365.702519 | \n",
+ " ... | \n",
+ " 0.064 facilities/km distance (76%ile) | \n",
+ " 19%ile | \n",
+ " 0.074 facilities/km distance (18%ile) | \n",
+ " 41%ile | \n",
+ " 0.06 facilities/km distance (18%ile) | \n",
+ " 42%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 62%ile | \n",
+ " 11945.584679 | \n",
+ " 7.848121e+06 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 010010202001 | \n",
+ " Alabama | \n",
+ " AL | \n",
+ " 4 | \n",
+ " 1020 | \n",
+ " 2641.389659 | \n",
+ " 9 | \n",
+ " 81.0 | \n",
+ " 10550.793324 | \n",
+ " ... | \n",
+ " 0.069 facilities/km distance (78%ile) | \n",
+ " 87%ile | \n",
+ " 0.078 facilities/km distance (20%ile) | \n",
+ " 71%ile | \n",
+ " 0.065 facilities/km distance (20%ile) | \n",
+ " 71%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 62%ile | \n",
+ " 7770.915121 | \n",
+ " 2.900774e+06 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 010010202002 | \n",
+ " Alabama | \n",
+ " AL | \n",
+ " 4 | \n",
+ " 1152 | \n",
+ " 693.118534 | \n",
+ " 7 | \n",
+ " 65.0 | \n",
+ " 2768.599617 | \n",
+ " ... | \n",
+ " 0.076 facilities/km distance (81%ile) | \n",
+ " 75%ile | \n",
+ " 0.087 facilities/km distance (25%ile) | \n",
+ " 63%ile | \n",
+ " 0.07 facilities/km distance (23%ile) | \n",
+ " 63%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 62%ile | \n",
+ " 6506.804784 | \n",
+ " 1.793332e+06 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 010010203001 | \n",
+ " Alabama | \n",
+ " AL | \n",
+ " 4 | \n",
+ " 2555 | \n",
+ " 1034.343525 | \n",
+ " 7 | \n",
+ " 68.0 | \n",
+ " 4120.531837 | \n",
+ " ... | \n",
+ " 0.074 facilities/km distance (80%ile) | \n",
+ " 79%ile | \n",
+ " 0.08 facilities/km distance (21%ile) | \n",
+ " 64%ile | \n",
+ " 0.07 facilities/km distance (23%ile) | \n",
+ " 65%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 62%ile | \n",
+ " 11070.367848 | \n",
+ " 5.461602e+06 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 128 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " OBJECTID GEOID10 STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
+ "0 1 010010201001 Alabama AL 4 692 \n",
+ "1 2 010010201002 Alabama AL 4 1153 \n",
+ "2 3 010010202001 Alabama AL 4 1020 \n",
+ "3 4 010010202002 Alabama AL 4 1152 \n",
+ "4 5 010010203001 Alabama AL 4 2555 \n",
+ "\n",
+ " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
+ "0 -1161.544049 5 43.0 -4661.186378 ... \n",
+ "1 -2084.690717 4 31.0 -8365.702519 ... \n",
+ "2 2641.389659 9 81.0 10550.793324 ... \n",
+ "3 693.118534 7 65.0 2768.599617 ... \n",
+ "4 1034.343525 7 68.0 4120.531837 ... \n",
+ "\n",
+ " T_PNPL T_PNPL_D2 \\\n",
+ "0 0.071 facilities/km distance (79%ile) 26%ile \n",
+ "1 0.064 facilities/km distance (76%ile) 19%ile \n",
+ "2 0.069 facilities/km distance (78%ile) 87%ile \n",
+ "3 0.076 facilities/km distance (81%ile) 75%ile \n",
+ "4 0.074 facilities/km distance (80%ile) 79%ile \n",
+ "\n",
+ " T_PRMP T_PRMP_D2 \\\n",
+ "0 0.085 facilities/km distance (24%ile) 47%ile \n",
+ "1 0.074 facilities/km distance (18%ile) 41%ile \n",
+ "2 0.078 facilities/km distance (20%ile) 71%ile \n",
+ "3 0.087 facilities/km distance (25%ile) 63%ile \n",
+ "4 0.08 facilities/km distance (21%ile) 64%ile \n",
+ "\n",
+ " T_PTSDF T_PTSDF_D2 \\\n",
+ "0 0.066 facilities/km distance (21%ile) 48%ile \n",
+ "1 0.06 facilities/km distance (18%ile) 42%ile \n",
+ "2 0.065 facilities/km distance (20%ile) 71%ile \n",
+ "3 0.07 facilities/km distance (23%ile) 63%ile \n",
+ "4 0.07 facilities/km distance (23%ile) 65%ile \n",
+ "\n",
+ " T_PWDIS T_PWDIS_D2 \\\n",
+ "0 0 toxicity-weighted concentration/meters dista... 62%ile \n",
+ "1 0 toxicity-weighted concentration/meters dista... 62%ile \n",
+ "2 0 toxicity-weighted concentration/meters dista... 62%ile \n",
+ "3 0 toxicity-weighted concentration/meters dista... 62%ile \n",
+ "4 0 toxicity-weighted concentration/meters dista... 62%ile \n",
+ "\n",
+ " Shape_Length Shape_Area \n",
+ "0 13435.975560 6.026828e+06 \n",
+ "1 11945.584679 7.848121e+06 \n",
+ "2 7770.915121 2.900774e+06 \n",
+ "3 6506.804784 1.793332e+06 \n",
+ "4 11070.367848 5.461602e+06 \n",
+ "\n",
+ "[5 rows x 128 columns]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ejscreen_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "f84f9e1d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "OBJECTID int64\n",
+ "GEOID10 string\n",
+ "STATE_NAME object\n",
+ "ST_ABBREV object\n",
+ "REGION int64\n",
+ " ... \n",
+ "T_PTSDF_D2 object\n",
+ "T_PWDIS object\n",
+ "T_PWDIS_D2 object\n",
+ "Shape_Length float64\n",
+ "Shape_Area float64\n",
+ "Length: 128, dtype: object"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ejscreen_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "8d61e29e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = cbg_usa_df.merge(\n",
+ " ejscreen_df, on=\"GEOID10\", how=\"left\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "7e8c2f2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " OBJECTID | \n",
+ " STATE_NAME | \n",
+ " ST_ABBREV | \n",
+ " REGION | \n",
+ " ACSTOTPOP | \n",
+ " D_PM25_2 | \n",
+ " B_PM25_D2 | \n",
+ " P_PM25_D2 | \n",
+ " D_OZONE_2 | \n",
+ " ... | \n",
+ " T_PNPL | \n",
+ " T_PNPL_D2 | \n",
+ " T_PRMP | \n",
+ " T_PRMP_D2 | \n",
+ " T_PTSDF | \n",
+ " T_PTSDF_D2 | \n",
+ " T_PWDIS | \n",
+ " T_PWDIS_D2 | \n",
+ " Shape_Length | \n",
+ " Shape_Area | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ " 39652.0 | \n",
+ " Delaware | \n",
+ " DE | \n",
+ " 3.0 | \n",
+ " 1187.0 | \n",
+ " 3655.279721 | \n",
+ " 10.0 | \n",
+ " 90.0 | \n",
+ " 22778.314495 | \n",
+ " ... | \n",
+ " 1.7 facilities/km distance (99%ile) | \n",
+ " 100%ile | \n",
+ " 0.23 facilities/km distance (40%ile) | \n",
+ " 80%ile | \n",
+ " 1.6 facilities/km distance (63%ile) | \n",
+ " 87%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 69%ile | \n",
+ " 4866.135943 | \n",
+ " 1.156165e+06 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ " 39654.0 | \n",
+ " Delaware | \n",
+ " DE | \n",
+ " 3.0 | \n",
+ " 1088.0 | \n",
+ " 100.877666 | \n",
+ " 7.0 | \n",
+ " 65.0 | \n",
+ " 629.604923 | \n",
+ " ... | \n",
+ " 0.32 facilities/km distance (69%ile) | \n",
+ " 66%ile | \n",
+ " 0.14 facilities/km distance (20%ile) | \n",
+ " 64%ile | \n",
+ " 1 facilities/km distance (52%ile) | \n",
+ " 66%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 69%ile | \n",
+ " 7972.275657 | \n",
+ " 2.821805e+06 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ " 39656.0 | \n",
+ " Delaware | \n",
+ " DE | \n",
+ " 3.0 | \n",
+ " 1554.0 | \n",
+ " -1256.221548 | \n",
+ " 5.0 | \n",
+ " 45.0 | \n",
+ " -7833.701886 | \n",
+ " ... | \n",
+ " 0.21 facilities/km distance (52%ile) | \n",
+ " 31%ile | \n",
+ " 0.11 facilities/km distance (11%ile) | \n",
+ " 53%ile | \n",
+ " 1.3 facilities/km distance (58%ile) | \n",
+ " 22%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 69%ile | \n",
+ " 17643.717513 | \n",
+ " 8.143206e+06 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ " 39657.0 | \n",
+ " Delaware | \n",
+ " DE | \n",
+ " 3.0 | \n",
+ " 4543.0 | \n",
+ " -2095.065215 | \n",
+ " 4.0 | \n",
+ " 32.0 | \n",
+ " -13064.667094 | \n",
+ " ... | \n",
+ " 0.17 facilities/km distance (43%ile) | \n",
+ " 25%ile | \n",
+ " 0.1 facilities/km distance (7%ile) | \n",
+ " 48%ile | \n",
+ " 1.1 facilities/km distance (54%ile) | \n",
+ " 18%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 69%ile | \n",
+ " 15645.341219 | \n",
+ " 9.723460e+06 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ " 39671.0 | \n",
+ " Delaware | \n",
+ " DE | \n",
+ " 3.0 | \n",
+ " 5153.0 | \n",
+ " -723.497337 | \n",
+ " 6.0 | \n",
+ " 53.0 | \n",
+ " -4534.212814 | \n",
+ " ... | \n",
+ " 0.24 facilities/km distance (58%ile) | \n",
+ " 41%ile | \n",
+ " 0.11 facilities/km distance (8%ile) | \n",
+ " 58%ile | \n",
+ " 0.3 facilities/km distance (33%ile) | \n",
+ " 50%ile | \n",
+ " 0 toxicity-weighted concentration/meters dista... | \n",
+ " 69%ile | \n",
+ " 20959.959236 | \n",
+ " 2.066192e+07 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 128 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 OBJECTID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
+ "0 100010414002 39652.0 Delaware DE 3.0 1187.0 \n",
+ "1 100010415002 39654.0 Delaware DE 3.0 1088.0 \n",
+ "2 100010417011 39656.0 Delaware DE 3.0 1554.0 \n",
+ "3 100010417012 39657.0 Delaware DE 3.0 4543.0 \n",
+ "4 100010422011 39671.0 Delaware DE 3.0 5153.0 \n",
+ "\n",
+ " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n",
+ "0 3655.279721 10.0 90.0 22778.314495 ... \n",
+ "1 100.877666 7.0 65.0 629.604923 ... \n",
+ "2 -1256.221548 5.0 45.0 -7833.701886 ... \n",
+ "3 -2095.065215 4.0 32.0 -13064.667094 ... \n",
+ "4 -723.497337 6.0 53.0 -4534.212814 ... \n",
+ "\n",
+ " T_PNPL T_PNPL_D2 \\\n",
+ "0 1.7 facilities/km distance (99%ile) 100%ile \n",
+ "1 0.32 facilities/km distance (69%ile) 66%ile \n",
+ "2 0.21 facilities/km distance (52%ile) 31%ile \n",
+ "3 0.17 facilities/km distance (43%ile) 25%ile \n",
+ "4 0.24 facilities/km distance (58%ile) 41%ile \n",
+ "\n",
+ " T_PRMP T_PRMP_D2 \\\n",
+ "0 0.23 facilities/km distance (40%ile) 80%ile \n",
+ "1 0.14 facilities/km distance (20%ile) 64%ile \n",
+ "2 0.11 facilities/km distance (11%ile) 53%ile \n",
+ "3 0.1 facilities/km distance (7%ile) 48%ile \n",
+ "4 0.11 facilities/km distance (8%ile) 58%ile \n",
+ "\n",
+ " T_PTSDF T_PTSDF_D2 \\\n",
+ "0 1.6 facilities/km distance (63%ile) 87%ile \n",
+ "1 1 facilities/km distance (52%ile) 66%ile \n",
+ "2 1.3 facilities/km distance (58%ile) 22%ile \n",
+ "3 1.1 facilities/km distance (54%ile) 18%ile \n",
+ "4 0.3 facilities/km distance (33%ile) 50%ile \n",
+ "\n",
+ " T_PWDIS T_PWDIS_D2 \\\n",
+ "0 0 toxicity-weighted concentration/meters dista... 69%ile \n",
+ "1 0 toxicity-weighted concentration/meters dista... 69%ile \n",
+ "2 0 toxicity-weighted concentration/meters dista... 69%ile \n",
+ "3 0 toxicity-weighted concentration/meters dista... 69%ile \n",
+ "4 0 toxicity-weighted concentration/meters dista... 69%ile \n",
+ "\n",
+ " Shape_Length Shape_Area \n",
+ "0 4866.135943 1.156165e+06 \n",
+ "1 7972.275657 2.821805e+06 \n",
+ "2 17643.717513 8.143206e+06 \n",
+ "3 15645.341219 9.723460e+06 \n",
+ "4 20959.959236 2.066192e+07 \n",
+ "\n",
+ "[5 rows x 128 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "e81b1321",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " OBJECTID | \n",
+ " STATE_NAME | \n",
+ " ST_ABBREV | \n",
+ " REGION | \n",
+ " ACSTOTPOP | \n",
+ " D_PM25_2 | \n",
+ " B_PM25_D2 | \n",
+ " P_PM25_D2 | \n",
+ " D_OZONE_2 | \n",
+ " ... | \n",
+ " T_PNPL | \n",
+ " T_PNPL_D2 | \n",
+ " T_PRMP | \n",
+ " T_PRMP_D2 | \n",
+ " T_PTSDF | \n",
+ " T_PTSDF_D2 | \n",
+ " T_PWDIS | \n",
+ " T_PWDIS_D2 | \n",
+ " Shape_Length | \n",
+ " Shape_Area | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 10614 | \n",
+ " 515150501002 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10615 | \n",
+ " 515150501003 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10627 | \n",
+ " 515150501001 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10628 | \n",
+ " 515150501005 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10629 | \n",
+ " 515150501004 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 174140 | \n",
+ " 040190029031 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174143 | \n",
+ " 040190027012 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174184 | \n",
+ " 040190027011 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174242 | \n",
+ " 040194105021 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174243 | \n",
+ " 040194105011 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
73 rows × 128 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 OBJECTID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n",
+ "10614 515150501002 NaN NaN NaN NaN NaN \n",
+ "10615 515150501003 NaN NaN NaN NaN NaN \n",
+ "10627 515150501001 NaN NaN NaN NaN NaN \n",
+ "10628 515150501005 NaN NaN NaN NaN NaN \n",
+ "10629 515150501004 NaN NaN NaN NaN NaN \n",
+ "... ... ... ... ... ... ... \n",
+ "174140 040190029031 NaN NaN NaN NaN NaN \n",
+ "174143 040190027012 NaN NaN NaN NaN NaN \n",
+ "174184 040190027011 NaN NaN NaN NaN NaN \n",
+ "174242 040194105021 NaN NaN NaN NaN NaN \n",
+ "174243 040194105011 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... T_PNPL T_PNPL_D2 \\\n",
+ "10614 NaN NaN NaN NaN ... NaN NaN \n",
+ "10615 NaN NaN NaN NaN ... NaN NaN \n",
+ "10627 NaN NaN NaN NaN ... NaN NaN \n",
+ "10628 NaN NaN NaN NaN ... NaN NaN \n",
+ "10629 NaN NaN NaN NaN ... NaN NaN \n",
+ "... ... ... ... ... ... ... ... \n",
+ "174140 NaN NaN NaN NaN ... NaN NaN \n",
+ "174143 NaN NaN NaN NaN ... NaN NaN \n",
+ "174184 NaN NaN NaN NaN ... NaN NaN \n",
+ "174242 NaN NaN NaN NaN ... NaN NaN \n",
+ "174243 NaN NaN NaN NaN ... NaN NaN \n",
+ "\n",
+ " T_PRMP T_PRMP_D2 T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n",
+ "10614 NaN NaN NaN NaN NaN NaN \n",
+ "10615 NaN NaN NaN NaN NaN NaN \n",
+ "10627 NaN NaN NaN NaN NaN NaN \n",
+ "10628 NaN NaN NaN NaN NaN NaN \n",
+ "10629 NaN NaN NaN NaN NaN NaN \n",
+ "... ... ... ... ... ... ... \n",
+ "174140 NaN NaN NaN NaN NaN NaN \n",
+ "174143 NaN NaN NaN NaN NaN NaN \n",
+ "174184 NaN NaN NaN NaN NaN NaN \n",
+ "174242 NaN NaN NaN NaN NaN NaN \n",
+ "174243 NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " Shape_Length Shape_Area \n",
+ "10614 NaN NaN \n",
+ "10615 NaN NaN \n",
+ "10627 NaN NaN \n",
+ "10628 NaN NaN \n",
+ "10629 NaN NaN \n",
+ "... ... ... \n",
+ "174140 NaN NaN \n",
+ "174143 NaN NaN \n",
+ "174184 NaN NaN \n",
+ "174242 NaN NaN \n",
+ "174243 NaN NaN \n",
+ "\n",
+ "[73 rows x 128 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df[merged_df[\"Shape_Area\"].isnull()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1a7b71d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/data-pipeline/ipython/Score Validate.ipynb b/data/data-pipeline/ipython/Score Validate.ipynb
new file mode 100644
index 00000000..aa65eafe
--- /dev/null
+++ b/data/data-pipeline/ipython/Score Validate.ipynb
@@ -0,0 +1,777 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "3ab8f7c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import csv\n",
+ "from pathlib import Path\n",
+ "import os\n",
+ "import sys"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8c22494f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.abspath(os.path.join(\"..\"))\n",
+ "if module_path not in sys.path:\n",
+ " sys.path.append(module_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "eb31e9a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_PATH = Path.cwd().parent / \"data\"\n",
+ "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
+ "OUTPUT_PATH = (\n",
+ " DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
+ " )\n",
+ "CENSUS_USA_CSV = (\n",
+ " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "95a5f8d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cbg_usa_df = pd.read_csv(\n",
+ " CENSUS_USA_CSV,\n",
+ " names=['GEOID10'],\n",
+ " dtype={\"GEOID10\": \"string\"},\n",
+ " low_memory=False,\n",
+ " header=None\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "bdd9ab60",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10\n",
+ "0 100010414002\n",
+ "1 100010415002\n",
+ "2 100010417011\n",
+ "3 100010417012\n",
+ "4 100010422011"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cbg_usa_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "05a40080",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GEOID10 string\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cbg_usa_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "114af777",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "score_df = pd.read_csv(\n",
+ " OUTPUT_PATH / \"usa.csv\",\n",
+ " dtype={\"GEOID10\": \"string\"},\n",
+ " low_memory=False,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "d5f3ebd4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " Score E (percentile) | \n",
+ " Score E (top 25th percentile) | \n",
+ " GEOID | \n",
+ " State Abbreviation | \n",
+ " County Name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ " 0.808889 | \n",
+ " True | \n",
+ " 10001 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ " 0.555160 | \n",
+ " False | \n",
+ " 10001 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ " 0.272392 | \n",
+ " False | \n",
+ " 10001 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ " 0.345686 | \n",
+ " False | \n",
+ " 10001 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ " 0.472567 | \n",
+ " False | \n",
+ " 10001 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 220256 | \n",
+ " 340076020004 | \n",
+ " 0.921941 | \n",
+ " True | \n",
+ " 34007 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220257 | \n",
+ " 340076017002 | \n",
+ " 0.934490 | \n",
+ " True | \n",
+ " 34007 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220258 | \n",
+ " 340076015005 | \n",
+ " 0.889613 | \n",
+ " True | \n",
+ " 34007 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220259 | \n",
+ " 340076091032 | \n",
+ " 0.627822 | \n",
+ " False | \n",
+ " 34007 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220260 | \n",
+ " 340076053002 | \n",
+ " 0.762237 | \n",
+ " True | \n",
+ " 34007 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
220261 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
+ "0 100010414002 0.808889 True \n",
+ "1 100010415002 0.555160 False \n",
+ "2 100010417011 0.272392 False \n",
+ "3 100010417012 0.345686 False \n",
+ "4 100010422011 0.472567 False \n",
+ "... ... ... ... \n",
+ "220256 340076020004 0.921941 True \n",
+ "220257 340076017002 0.934490 True \n",
+ "220258 340076015005 0.889613 True \n",
+ "220259 340076091032 0.627822 False \n",
+ "220260 340076053002 0.762237 True \n",
+ "\n",
+ " GEOID State Abbreviation County Name \n",
+ "0 10001 DE Kent County \n",
+ "1 10001 DE Kent County \n",
+ "2 10001 DE Kent County \n",
+ "3 10001 DE Kent County \n",
+ "4 10001 DE Kent County \n",
+ "... ... ... ... \n",
+ "220256 34007 NJ Camden County \n",
+ "220257 34007 NJ Camden County \n",
+ "220258 34007 NJ Camden County \n",
+ "220259 34007 NJ Camden County \n",
+ "220260 34007 NJ Camden County \n",
+ "\n",
+ "[220261 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "score_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f84f9e1d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GEOID10 string\n",
+ "Score E (percentile) float64\n",
+ "Score E (top 25th percentile) bool\n",
+ "GEOID int64\n",
+ "State Abbreviation object\n",
+ "County Name object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "score_df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "8d61e29e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "merged_df = cbg_usa_df.merge(\n",
+ " score_df, on=\"GEOID10\", how=\"left\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "7e8c2f2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " Score E (percentile) | \n",
+ " Score E (top 25th percentile) | \n",
+ " GEOID | \n",
+ " State Abbreviation | \n",
+ " County Name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 100010414002 | \n",
+ " 0.808889 | \n",
+ " True | \n",
+ " 10001.0 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 100010415002 | \n",
+ " 0.555160 | \n",
+ " False | \n",
+ " 10001.0 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 100010417011 | \n",
+ " 0.272392 | \n",
+ " False | \n",
+ " 10001.0 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 100010417012 | \n",
+ " 0.345686 | \n",
+ " False | \n",
+ " 10001.0 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100010422011 | \n",
+ " 0.472567 | \n",
+ " False | \n",
+ " 10001.0 | \n",
+ " DE | \n",
+ " Kent County | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 220329 | \n",
+ " 340076020004 | \n",
+ " 0.921941 | \n",
+ " True | \n",
+ " 34007.0 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220330 | \n",
+ " 340076017002 | \n",
+ " 0.934490 | \n",
+ " True | \n",
+ " 34007.0 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220331 | \n",
+ " 340076015005 | \n",
+ " 0.889613 | \n",
+ " True | \n",
+ " 34007.0 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220332 | \n",
+ " 340076091032 | \n",
+ " 0.627822 | \n",
+ " False | \n",
+ " 34007.0 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ " 220333 | \n",
+ " 340076053002 | \n",
+ " 0.762237 | \n",
+ " True | \n",
+ " 34007.0 | \n",
+ " NJ | \n",
+ " Camden County | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
220334 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
+ "0 100010414002 0.808889 True \n",
+ "1 100010415002 0.555160 False \n",
+ "2 100010417011 0.272392 False \n",
+ "3 100010417012 0.345686 False \n",
+ "4 100010422011 0.472567 False \n",
+ "... ... ... ... \n",
+ "220329 340076020004 0.921941 True \n",
+ "220330 340076017002 0.934490 True \n",
+ "220331 340076015005 0.889613 True \n",
+ "220332 340076091032 0.627822 False \n",
+ "220333 340076053002 0.762237 True \n",
+ "\n",
+ " GEOID State Abbreviation County Name \n",
+ "0 10001.0 DE Kent County \n",
+ "1 10001.0 DE Kent County \n",
+ "2 10001.0 DE Kent County \n",
+ "3 10001.0 DE Kent County \n",
+ "4 10001.0 DE Kent County \n",
+ "... ... ... ... \n",
+ "220329 34007.0 NJ Camden County \n",
+ "220330 34007.0 NJ Camden County \n",
+ "220331 34007.0 NJ Camden County \n",
+ "220332 34007.0 NJ Camden County \n",
+ "220333 34007.0 NJ Camden County \n",
+ "\n",
+ "[220334 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "e81b1321",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GEOID10 | \n",
+ " Score E (percentile) | \n",
+ " Score E (top 25th percentile) | \n",
+ " GEOID | \n",
+ " State Abbreviation | \n",
+ " County Name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 10614 | \n",
+ " 515150501002 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10615 | \n",
+ " 515150501003 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10627 | \n",
+ " 515150501001 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10628 | \n",
+ " 515150501005 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10629 | \n",
+ " 515150501004 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 174140 | \n",
+ " 040190029031 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174143 | \n",
+ " 040190027012 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174184 | \n",
+ " 040190027011 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174242 | \n",
+ " 040194105021 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 174243 | \n",
+ " 040194105011 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
73 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
+ "10614 515150501002 NaN NaN \n",
+ "10615 515150501003 NaN NaN \n",
+ "10627 515150501001 NaN NaN \n",
+ "10628 515150501005 NaN NaN \n",
+ "10629 515150501004 NaN NaN \n",
+ "... ... ... ... \n",
+ "174140 040190029031 NaN NaN \n",
+ "174143 040190027012 NaN NaN \n",
+ "174184 040190027011 NaN NaN \n",
+ "174242 040194105021 NaN NaN \n",
+ "174243 040194105011 NaN NaN \n",
+ "\n",
+ " GEOID State Abbreviation County Name \n",
+ "10614 NaN NaN NaN \n",
+ "10615 NaN NaN NaN \n",
+ "10627 NaN NaN NaN \n",
+ "10628 NaN NaN NaN \n",
+ "10629 NaN NaN NaN \n",
+ "... ... ... ... \n",
+ "174140 NaN NaN NaN \n",
+ "174143 NaN NaN NaN \n",
+ "174184 NaN NaN NaN \n",
+ "174242 NaN NaN NaN \n",
+ "174243 NaN NaN NaN \n",
+ "\n",
+ "[73 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "merged_df[merged_df[\"Score E (percentile)\"].isnull()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1a7b71d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb b/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb
index 747796aa..8e57da6d 100644
--- a/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb
+++ b/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb
@@ -2,7 +2,9 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
@@ -10,24 +12,24 @@
"import pathlib\n",
"import os\n",
"import sys"
- ],
- "outputs": [],
- "metadata": {}
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
- ],
- "outputs": [],
- "metadata": {}
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
"source": [
"def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
" state_gdf = gpd.read_file(file_name)\n",
@@ -100,104 +102,133 @@
" state_bucketed_df = aggregate_buckets(state_tracts, \"mean\")\n",
" compressed = breakup_multipolygons(state_bucketed_df, num_buckets)\n",
" write_to_file(compressed, file_name)"
- ],
- "outputs": [],
- "metadata": {}
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
+ "metadata": {
+ "id": "Ia5bqxS2LJqe"
+ },
+ "outputs": [],
"source": [
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
- "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n",
- "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})"
- ],
- "outputs": [],
- "metadata": {
- "id": "Ia5bqxS2LJqe"
- }
+ "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
+ "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
- "source": [
- "master_df = gpd.GeoDataFrame()"
- ],
- "outputs": [],
+ "execution_count": 7,
"metadata": {
"id": "Dtf5qD50JvCw"
- }
+ },
+ "outputs": [],
+ "source": [
+ "master_df = gpd.GeoDataFrame()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty GeoDataFrame\n",
+ "Columns: []\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "master_df.head()"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "metadata": {
+ "id": "PNdw8bERJyKk"
+ },
+ "outputs": [],
"source": [
"for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
- ],
- "outputs": [],
- "metadata": {
- "id": "PNdw8bERJyKk"
- }
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "source": [
- "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
- ],
- "outputs": [],
"metadata": {
"id": "B5SS9y2pLwks"
- }
+ },
+ "outputs": [],
+ "source": [
+ "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "source": [
- "master_df.shape"
- ],
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "(220742, 13)"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 68
- }
- ],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_C6vaR9HQeLa",
"outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4"
- }
+ },
+ "outputs": [],
+ "source": [
+ "master_df.shape"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
- "source": [
- "master_df.head(2)"
- ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "oMoubjqCQiw5",
+ "outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0"
+ },
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " STATEFP10 ... geometry\n",
- "0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n",
- "1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n",
- "\n",
- "[2 rows x 13 columns]"
- ],
"text/html": [
"\n",
"