From b404fdcc4316d76e25d4b5da31d184dcde229d01 Mon Sep 17 00:00:00 2001 From: Jorge Escobar <83969469+esfoobar-usds@users.noreply.github.com> Date: Wed, 28 Jul 2021 16:07:28 -0400 Subject: [PATCH] Generate Geo-aware scores for all zoom levels (#391) * generate Geo-aware scores for all zoom levels * usa high progress * testing dissolve * checkpoint * changing type * removing breakpoint * validation notebooks * quick update * score validation * fixes for county merge * code completed --- data/data-pipeline/README.md | 6 +- data/data-pipeline/application.py | 10 +- data/data-pipeline/etl/runner.py | 18 + data/data-pipeline/etl/score/etl_score_geo.py | 168 +++ .../data-pipeline/etl/score/etl_score_post.py | 45 +- .../etl/sources/calenviroscreen/etl.py | 8 +- data/data-pipeline/etl/sources/census/etl.py | 16 +- .../etl/sources/census_acs/etl.py | 5 + data/data-pipeline/ipython/ACS Validate.ipynb | 567 +++++++++ .../ipython/EJScreen Validate.ipynb | 1121 +++++++++++++++++ .../ipython/Score Validate.ipynb | 777 ++++++++++++ .../ipython/Score_Dissolve_Script.ipynb | 535 ++++---- .../data-pipeline/ipython/county_lookup.ipynb | 2 +- data/data-pipeline/requirements.txt | 15 +- 14 files changed, 3023 insertions(+), 270 deletions(-) create mode 100644 data/data-pipeline/etl/score/etl_score_geo.py create mode 100644 data/data-pipeline/ipython/ACS Validate.ipynb create mode 100644 data/data-pipeline/ipython/EJScreen Validate.ipynb create mode 100644 data/data-pipeline/ipython/Score Validate.ipynb diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index 4920f2d8..6f1e1fb8 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -44,7 +44,7 @@ TODO add mermaid diagram #### Step 0: Set up your environment -1. After cloning the project locally, change to this directory: `cd score` +1. After cloning the project locally, change to this directory: `cd data/data-pipeline` 1. Choose whether you'd like to run this application using Docker or if you'd like to install the dependencies locally so you can contribute to the project. - **With Docker:** Follow these [installation instructions](https://docs.docker.com/get-docker/) and skip down to the [Running with Docker section](#running-with-docker) for more information - **For Local Development:** Skip down to the [Local Development section](#local-development) for more detailed installation instructions @@ -53,7 +53,7 @@ TODO add mermaid diagram #### Step 1: Run the ETL script for each data source 1. Call the `etl-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute. - - With Docker: `docker run --rm -it j40_score /bin/sh -c "python3 application.py etl-run"` + - With Docker: `docker run --rm -it j40_data_pipeline /bin/sh -c "python3 application.py etl-run"` - With Poetry: `poetry run python application.py etl-run` 1. The `etl-run` command will execute the corresponding ETL script for each data source in `etl/sources/`. For example, `etl/sources/ejscreen/etl.py` is the ETL script for EJSCREEN data. 1. Each ETL script will extract the data from its original source, then format the data into `.csv` files that get stored in the relevant folder in `data/dataset/`. For example, HUD Housing data is stored in `data/dataset/hud_housing/usa.csv` @@ -64,7 +64,7 @@ _For example: `poetry run python application.py etl-run ejscreen` would only run #### Step 2: Calculate the Justice40 score experiments 1. Call the `score-run` command using the application manager `application.py` **NOTE:** This may take several minutes to execute. - - With Docker: `docker run --rm -it j40_score /bin/sh -c "python3 application.py score-run"` + - With Docker: `docker run --rm -it j40_data_pipeline /bin/sh -c "python3 application.py score-run"` - With Poetry: `poetry run python application.py score-run` 1. The `score-run` command will execute the `etl/score/etl.py` script which loads the data from each of the source files added to the `data/dataset/` directory by the ETL scripts in Step 1. 1. These data sets are merged into a single dataframe using their Census Block Group GEOID as a common key, and the data in each of the columns is standardized in two ways: diff --git a/data/data-pipeline/application.py b/data/data-pipeline/application.py index 89ee45bf..abf16ef5 100644 --- a/data/data-pipeline/application.py +++ b/data/data-pipeline/application.py @@ -9,7 +9,7 @@ from utils import ( temp_folder_cleanup, ) from etl.sources.census.etl import download_census_csvs -from etl.runner import etl_runner, score_generate +from etl.runner import etl_runner, score_generate, score_geo logger = get_module_logger(__name__) @@ -88,5 +88,13 @@ def score_run(): score_generate() +@cli.command( + help="Generate Geojson files with scores baked in", +) +def geo_score(): + """CLI command to generate the score""" + score_geo() + + if __name__ == "__main__": cli() diff --git a/data/data-pipeline/etl/runner.py b/data/data-pipeline/etl/runner.py index f6ec09b4..90be2f48 100644 --- a/data/data-pipeline/etl/runner.py +++ b/data/data-pipeline/etl/runner.py @@ -2,6 +2,7 @@ import importlib from etl.score.etl_score import ScoreETL from etl.score.etl_score_post import PostScoreETL +from etl.score.etl_score_geo import GeoScoreETL def etl_runner(dataset_to_run: str = None) -> None: @@ -112,6 +113,23 @@ def score_generate() -> None: score_post.cleanup() +def score_geo() -> None: + """Generates the geojson files with score data baked in + + Args: + None + + Returns: + None + """ + + # Score Geo + score_geo = GeoScoreETL() + score_geo.extract() + score_geo.transform() + score_geo.load() + + def _find_dataset_index(dataset_list, key, value): for i, element in enumerate(dataset_list): if element[key] == value: diff --git a/data/data-pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/etl/score/etl_score_geo.py new file mode 100644 index 00000000..78715e31 --- /dev/null +++ b/data/data-pipeline/etl/score/etl_score_geo.py @@ -0,0 +1,168 @@ +import pandas as pd +import geopandas as gpd +import math + +from etl.base import ExtractTransformLoad +from utils import get_module_logger + +logger = get_module_logger(__name__) + + +class GeoScoreETL(ExtractTransformLoad): + """ + A class used to generate per state and national GeoJson files with the score baked in + """ + + def __init__(self): + self.SCORE_GEOJSON_PATH = self.DATA_PATH / "score" / "geojson" + self.SCORE_LOW_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-low.json" + self.SCORE_HIGH_GEOJSON = self.SCORE_GEOJSON_PATH / "usa-high.json" + + self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" + self.TILE_SCORE_CSV = self.SCORE_CSV_PATH / "tiles" / "usa.csv" + + self.CENSUS_USA_GEOJSON = ( + self.DATA_PATH / "census" / "geojson" / "us.json" + ) + + self.TARGET_SCORE_NAME = "Score E (percentile)" + self.TARGET_SCORE_RENAME_TO = "E_SCORE" + + self.NUMBER_OF_BUCKETS = 10 + + self.geojson_usa_df: gpd.GeoDataFrame + self.score_usa_df: pd.DataFrame + self.geojson_score_usa_high: gpd.GeoDataFrame + self.geojson_score_usa_low: gpd.GeoDataFrame + + def extract(self) -> None: + logger.info(f"Reading US GeoJSON (~6 minutes)") + self.geojson_usa_df = gpd.read_file( + self.CENSUS_USA_GEOJSON, + dtype={"GEOID10": "string"}, + usecols=["GEOID10", "geometry"], + low_memory=False, + ) + self.geojson_usa_df.head() + + logger.info(f"Reading score CSV") + self.score_usa_df = pd.read_csv( + self.TILE_SCORE_CSV, + dtype={"GEOID10": "string"}, + low_memory=False, + ) + + def transform(self) -> None: + logger.info(f"Pruning Census GeoJSON") + fields = ["GEOID10", "geometry"] + self.geojson_usa_df = self.geojson_usa_df[fields] + + logger.info(f"Merging and compressing score CSV with USA GeoJSON") + self.geojson_score_usa_high = self.score_usa_df.merge( + self.geojson_usa_df, on="GEOID10", how="left" + ) + + self.geojson_score_usa_high = gpd.GeoDataFrame( + self.geojson_score_usa_high, crs="EPSG:4326" + ) + + usa_simplified = self.geojson_score_usa_high[ + ["GEOID10", self.TARGET_SCORE_NAME, "geometry"] + ].reset_index(drop=True) + + usa_simplified.rename( + columns={self.TARGET_SCORE_NAME: self.TARGET_SCORE_RENAME_TO}, + inplace=True, + ) + + logger.info(f"Aggregating into tracts (~5 minutes)") + usa_tracts = self._aggregate_to_tracts(usa_simplified) + + usa_tracts = gpd.GeoDataFrame( + usa_tracts, + columns=[self.TARGET_SCORE_RENAME_TO, "geometry"], + crs="EPSG:4326", + ) + + logger.info(f"Creating buckets from tracts") + usa_bucketed = self._create_buckets_from_tracts( + usa_tracts, self.NUMBER_OF_BUCKETS + ) + + logger.info(f"Aggregating buckets") + usa_aggregated = self._aggregate_buckets(usa_bucketed, agg_func="mean") + + compressed = self._breakup_multipolygons( + usa_aggregated, self.NUMBER_OF_BUCKETS + ) + + self.geojson_score_usa_low = gpd.GeoDataFrame( + compressed, + columns=[self.TARGET_SCORE_RENAME_TO, "geometry"], + crs="EPSG:4326", + ) + + def _aggregate_to_tracts( + self, block_group_df: gpd.GeoDataFrame + ) -> gpd.GeoDataFrame: + # The tract identifier is the first 11 digits of the GEOID + block_group_df["tract"] = block_group_df.apply( + lambda row: row["GEOID10"][0:11], axis=1 + ) + state_tracts = block_group_df.dissolve(by="tract", aggfunc="mean") + return state_tracts + + def _create_buckets_from_tracts( + self, state_tracts: gpd.GeoDataFrame, num_buckets: int + ) -> gpd.GeoDataFrame: + # assign tracts to buckets by D_SCORE + state_tracts.sort_values(self.TARGET_SCORE_RENAME_TO, inplace=True) + SCORE_bucket = [] + bucket_size = math.ceil( + len(state_tracts.index) / self.NUMBER_OF_BUCKETS + ) + for i in range(len(state_tracts.index)): + SCORE_bucket.extend([math.floor(i / bucket_size)]) + state_tracts[f"{self.TARGET_SCORE_RENAME_TO}_bucket"] = SCORE_bucket + return state_tracts + + def _aggregate_buckets(self, state_tracts: gpd.GeoDataFrame, agg_func: str): + # dissolve tracts by bucket + state_attr = state_tracts[ + [ + self.TARGET_SCORE_RENAME_TO, + f"{self.TARGET_SCORE_RENAME_TO}_bucket", + "geometry", + ] + ].reset_index(drop=True) + state_dissolve = state_attr.dissolve( + by=f"{self.TARGET_SCORE_RENAME_TO}_bucket", aggfunc=agg_func + ) + return state_dissolve + + def _breakup_multipolygons( + self, state_bucketed_df: gpd.GeoDataFrame, num_buckets: int + ) -> gpd.GeoDataFrame: + compressed = [] + for i in range(num_buckets): + for j in range(len(state_bucketed_df["geometry"][i].geoms)): + compressed.append( + [ + state_bucketed_df[self.TARGET_SCORE_RENAME_TO][i], + state_bucketed_df["geometry"][i].geoms[j], + ] + ) + return compressed + + def load(self) -> None: + logger.info(f"Writing usa-high (~9 minutes)") + self.geojson_score_usa_high.to_file( + self.SCORE_HIGH_GEOJSON, driver="GeoJSON" + ) + logger.info(f"Completed writing usa-high") + + logger.info(f"Writing usa-low (~9 minutes)") + self.geojson_score_usa_low.to_file( + self.SCORE_LOW_GEOJSON, driver="GeoJSON" + ) + logger.info(f"Completed writing usa-low") diff --git a/data/data-pipeline/etl/score/etl_score_post.py b/data/data-pipeline/etl/score/etl_score_post.py index 41c837a4..f2e1e376 100644 --- a/data/data-pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/etl/score/etl_score_post.py @@ -16,10 +16,13 @@ class PostScoreETL(ExtractTransformLoad): self.CENSUS_COUNTIES_ZIP_URL = "https://www2.census.gov/geo/docs/maps-data/data/gazetteer/Gaz_counties_national.zip" self.CENSUS_COUNTIES_TXT = self.TMP_PATH / "Gaz_counties_national.txt" self.CENSUS_COUNTIES_COLS = ["USPS", "GEOID", "NAME"] + self.CENSUS_USA_CSV = self.DATA_PATH / "census" / "csv" / "us.csv" self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" + self.STATE_CSV = ( self.DATA_PATH / "census" / "csv" / "fips_states_2010.csv" ) + self.FULL_SCORE_CSV = self.SCORE_CSV_PATH / "full" / "usa.csv" self.TILR_SCORE_CSV = self.SCORE_CSV_PATH / "tile" / "usa.csv" @@ -87,17 +90,43 @@ class PostScoreETL(ExtractTransformLoad): # add the tract level column self.score_df["GEOID"] = self.score_df.GEOID10.str[:5] - # merge state and counties - county_state_merged = self.counties_df.join( - self.states_df, rsuffix=" Other" + # merge state with counties + county_state_merged = self.counties_df.merge( + self.states_df, on="State Abbreviation", how="left" ) - del county_state_merged["State Abbreviation Other"] - # merge county and score - self.score_county_state_merged = self.score_df.join( - county_state_merged, rsuffix="_OTHER" + # merge state + county with score + self.score_county_state_merged = self.score_df.merge( + county_state_merged, on="GEOID", how="left" ) - del self.score_county_state_merged["GEOID_OTHER"] + + # check if there are census cbgs without score + logger.info(f"Removing CBG rows without score") + + ## load cbgs + cbg_usa_df = pd.read_csv( + self.CENSUS_USA_CSV, + names=["GEOID10"], + dtype={"GEOID10": "string"}, + low_memory=False, + header=None, + ) + + # merge census cbgs with score + merged_df = cbg_usa_df.merge( + self.score_county_state_merged, on="GEOID10", how="left" + ) + + # list the null score cbgs + null_cbg_df = merged_df[merged_df["Score E (percentile)"].isnull()] + + # subsctract data sets + removed_df = pd.concat( + [merged_df, null_cbg_df, null_cbg_df] + ).drop_duplicates(keep=False) + + # set the score to the new df + self.score_county_state_merged = removed_df def load(self) -> None: logger.info(f"Saving Full Score CSV with County Information") diff --git a/data/data-pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/etl/sources/calenviroscreen/etl.py index ad56b26a..cb3a01c3 100644 --- a/data/data-pipeline/etl/sources/calenviroscreen/etl.py +++ b/data/data-pipeline/etl/sources/calenviroscreen/etl.py @@ -9,12 +9,16 @@ logger = get_module_logger(__name__) class CalEnviroScreenETL(ExtractTransformLoad): def __init__(self): self.CALENVIROSCREEN_FTP_URL = "https://justice40-data.s3.amazonaws.com/data-sources/CalEnviroScreen_4.0_2021.zip" - self.CALENVIROSCREEN_CSV = self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv" + self.CALENVIROSCREEN_CSV = ( + self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv" + ) self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4" # Definining some variable names self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score" - self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = "calenviroscreen_percentile" + self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = ( + "calenviroscreen_percentile" + ) self.CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = ( "calenviroscreen_priority_community" ) diff --git a/data/data-pipeline/etl/sources/census/etl.py b/data/data-pipeline/etl/sources/census/etl.py index c652700e..62c6420c 100644 --- a/data/data-pipeline/etl/sources/census/etl.py +++ b/data/data-pipeline/etl/sources/census/etl.py @@ -2,6 +2,7 @@ import csv import os import json from pathlib import Path +import geopandas as gpd from .etl_utils import get_state_fips_codes from utils import unzip_file_from_url, get_module_logger @@ -11,7 +12,7 @@ logger = get_module_logger(__name__) def download_census_csvs(data_path: Path) -> None: """Download all census shape files from the Census FTP and extract the geojson - to generate national and by state Census Block Group CSVs + to generate national and by state Census Block Group CSVs and GeoJSONs Args: data_path (pathlib.Path): Name of the directory where the files and directories will @@ -108,4 +109,17 @@ def download_census_csvs(data_path: Path) -> None: ] ) + ## create national geojson + logger.info(f"Generating national geojson file") + usa_df = gpd.GeoDataFrame() + + for file_name in geojson_dir_path.rglob("*.json"): + logger.info(f"Ingesting {file_name}") + state_gdf = gpd.read_file(file_name) + usa_df = usa_df.append(state_gdf) + + usa_df = usa_df.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs") + logger.info(f"Writing national geojson file") + usa_df.to_file(geojson_dir_path / "us.json", driver="GeoJSON") + logger.info("Census block groups downloading complete") diff --git a/data/data-pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/etl/sources/census_acs/etl.py index 39db151c..18c58693 100644 --- a/data/data-pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/etl/sources/census_acs/etl.py @@ -106,3 +106,8 @@ class CensusACSETL(ExtractTransformLoad): self.df[columns_to_include].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False ) + + def validate(self) -> None: + logger.info(f"Validating Census ACS Data") + + pass diff --git a/data/data-pipeline/ipython/ACS Validate.ipynb b/data/data-pipeline/ipython/ACS Validate.ipynb new file mode 100644 index 00000000..ac5baca4 --- /dev/null +++ b/data/data-pipeline/ipython/ACS Validate.ipynb @@ -0,0 +1,567 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "43c5dbee", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import csv\n", + "from pathlib import Path\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f97c95f6", + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "b8a2b53e", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", + "ACS_YEAR = \"2019\"\n", + "OUTPUT_PATH = (\n", + " DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", + " )\n", + "CENSUS_USA_CSV = (\n", + " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "0d33e8db", + "metadata": {}, + "outputs": [], + "source": [ + "cbg_usa_df = pd.read_csv(\n", + " CENSUS_USA_CSV,\n", + " names=['GEOID10'],\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " header=None\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "01e6dbe3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10
0100010414002
1100010415002
2100010417011
3100010417012
4100010422011
\n", + "
" + ], + "text/plain": [ + " GEOID10\n", + "0 100010414002\n", + "1 100010415002\n", + "2 100010417011\n", + "3 100010417012\n", + "4 100010422011" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbg_usa_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "341dbcb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GEOID10 string\n", + "dtype: object" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbg_usa_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "eb25d4bf", + "metadata": {}, + "outputs": [], + "source": [ + "acs_df = pd.read_csv(\n", + " OUTPUT_PATH / \"usa.csv\",\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d4c9d010", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10Unemployed civilians (percent)Linguistic isolation (percent)
00103996200020.0771080.0
10103996180020.1262140.0
20103996160040.1331720.0
30103996160020.0282490.0
40103996160010.0630370.0
\n", + "
" + ], + "text/plain": [ + " GEOID10 Unemployed civilians (percent) \\\n", + "0 010399620002 0.077108 \n", + "1 010399618002 0.126214 \n", + "2 010399616004 0.133172 \n", + "3 010399616002 0.028249 \n", + "4 010399616001 0.063037 \n", + "\n", + " Linguistic isolation (percent) \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acs_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "dd390179", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GEOID10 string\n", + "Unemployed civilians (percent) float64\n", + "Linguistic isolation (percent) float64\n", + "dtype: object" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acs_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "236eb093", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = cbg_usa_df.merge(\n", + " acs_df, on=\"GEOID10\", how=\"left\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "4fff1845", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10Unemployed civilians (percent)Linguistic isolation (percent)
01000104140020.0306120.065963
11000104150020.1180560.010283
21000104170110.0423730.000000
31000104170120.0424730.010435
41000104220110.0543580.000000
\n", + "
" + ], + "text/plain": [ + " GEOID10 Unemployed civilians (percent) \\\n", + "0 100010414002 0.030612 \n", + "1 100010415002 0.118056 \n", + "2 100010417011 0.042373 \n", + "3 100010417012 0.042473 \n", + "4 100010422011 0.054358 \n", + "\n", + " Linguistic isolation (percent) \n", + "0 0.065963 \n", + "1 0.010283 \n", + "2 0.000000 \n", + "3 0.010435 \n", + "4 0.000000 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "f8903557", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10Unemployed civilians (percent)Linguistic isolation (percent)
34100019900000NaNNaN
377100030169041NaNNaN
392100059900000NaNNaN
400100039901000NaNNaN
416100039801001NaNNaN
............
219505340057048013NaNNaN
219508340057048024NaNNaN
219758340258047001NaNNaN
219807340259900000NaNNaN
220134340076113001NaN0.0
\n", + "

1462 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 Unemployed civilians (percent) \\\n", + "34 100019900000 NaN \n", + "377 100030169041 NaN \n", + "392 100059900000 NaN \n", + "400 100039901000 NaN \n", + "416 100039801001 NaN \n", + "... ... ... \n", + "219505 340057048013 NaN \n", + "219508 340057048024 NaN \n", + "219758 340258047001 NaN \n", + "219807 340259900000 NaN \n", + "220134 340076113001 NaN \n", + "\n", + " Linguistic isolation (percent) \n", + "34 NaN \n", + "377 NaN \n", + "392 NaN \n", + "400 NaN \n", + "416 NaN \n", + "... ... \n", + "219505 NaN \n", + "219508 NaN \n", + "219758 NaN \n", + "219807 NaN \n", + "220134 0.0 \n", + "\n", + "[1462 rows x 3 columns]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b870a21f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/ipython/EJScreen Validate.ipynb b/data/data-pipeline/ipython/EJScreen Validate.ipynb new file mode 100644 index 00000000..4c2826d0 --- /dev/null +++ b/data/data-pipeline/ipython/EJScreen Validate.ipynb @@ -0,0 +1,1121 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3ab8f7c1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import csv\n", + "from pathlib import Path\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8c22494f", + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "eb31e9a1", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", + "OUTPUT_PATH = (\n", + " DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n", + " )\n", + "CENSUS_USA_CSV = (\n", + " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "95a5f8d8", + "metadata": {}, + "outputs": [], + "source": [ + "cbg_usa_df = pd.read_csv(\n", + " CENSUS_USA_CSV,\n", + " names=['GEOID10'],\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " header=None\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bdd9ab60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10
0100010414002
1100010415002
2100010417011
3100010417012
4100010422011
\n", + "
" + ], + "text/plain": [ + " GEOID10\n", + "0 100010414002\n", + "1 100010415002\n", + "2 100010417011\n", + "3 100010417012\n", + "4 100010422011" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbg_usa_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "05a40080", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GEOID10 string\n", + "dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbg_usa_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "114af777", + "metadata": {}, + "outputs": [], + "source": [ + "ejscreen_df = pd.read_csv(\n", + " OUTPUT_PATH / \"usa.csv\",\n", + " dtype={\"ID\": \"string\"},\n", + " low_memory=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4f070999", + "metadata": {}, + "outputs": [], + "source": [ + "ejscreen_df.rename(\n", + " columns={\"ID\": \"GEOID10\"},\n", + " inplace=True,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d5f3ebd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OBJECTIDGEOID10STATE_NAMEST_ABBREVREGIONACSTOTPOPD_PM25_2B_PM25_D2P_PM25_D2D_OZONE_2...T_PNPLT_PNPL_D2T_PRMPT_PRMP_D2T_PTSDFT_PTSDF_D2T_PWDIST_PWDIS_D2Shape_LengthShape_Area
01010010201001AlabamaAL4692-1161.544049543.0-4661.186378...0.071 facilities/km distance (79%ile)26%ile0.085 facilities/km distance (24%ile)47%ile0.066 facilities/km distance (21%ile)48%ile0 toxicity-weighted concentration/meters dista...62%ile13435.9755606.026828e+06
12010010201002AlabamaAL41153-2084.690717431.0-8365.702519...0.064 facilities/km distance (76%ile)19%ile0.074 facilities/km distance (18%ile)41%ile0.06 facilities/km distance (18%ile)42%ile0 toxicity-weighted concentration/meters dista...62%ile11945.5846797.848121e+06
23010010202001AlabamaAL410202641.389659981.010550.793324...0.069 facilities/km distance (78%ile)87%ile0.078 facilities/km distance (20%ile)71%ile0.065 facilities/km distance (20%ile)71%ile0 toxicity-weighted concentration/meters dista...62%ile7770.9151212.900774e+06
34010010202002AlabamaAL41152693.118534765.02768.599617...0.076 facilities/km distance (81%ile)75%ile0.087 facilities/km distance (25%ile)63%ile0.07 facilities/km distance (23%ile)63%ile0 toxicity-weighted concentration/meters dista...62%ile6506.8047841.793332e+06
45010010203001AlabamaAL425551034.343525768.04120.531837...0.074 facilities/km distance (80%ile)79%ile0.08 facilities/km distance (21%ile)64%ile0.07 facilities/km distance (23%ile)65%ile0 toxicity-weighted concentration/meters dista...62%ile11070.3678485.461602e+06
\n", + "

5 rows × 128 columns

\n", + "
" + ], + "text/plain": [ + " OBJECTID GEOID10 STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n", + "0 1 010010201001 Alabama AL 4 692 \n", + "1 2 010010201002 Alabama AL 4 1153 \n", + "2 3 010010202001 Alabama AL 4 1020 \n", + "3 4 010010202002 Alabama AL 4 1152 \n", + "4 5 010010203001 Alabama AL 4 2555 \n", + "\n", + " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n", + "0 -1161.544049 5 43.0 -4661.186378 ... \n", + "1 -2084.690717 4 31.0 -8365.702519 ... \n", + "2 2641.389659 9 81.0 10550.793324 ... \n", + "3 693.118534 7 65.0 2768.599617 ... \n", + "4 1034.343525 7 68.0 4120.531837 ... \n", + "\n", + " T_PNPL T_PNPL_D2 \\\n", + "0 0.071 facilities/km distance (79%ile) 26%ile \n", + "1 0.064 facilities/km distance (76%ile) 19%ile \n", + "2 0.069 facilities/km distance (78%ile) 87%ile \n", + "3 0.076 facilities/km distance (81%ile) 75%ile \n", + "4 0.074 facilities/km distance (80%ile) 79%ile \n", + "\n", + " T_PRMP T_PRMP_D2 \\\n", + "0 0.085 facilities/km distance (24%ile) 47%ile \n", + "1 0.074 facilities/km distance (18%ile) 41%ile \n", + "2 0.078 facilities/km distance (20%ile) 71%ile \n", + "3 0.087 facilities/km distance (25%ile) 63%ile \n", + "4 0.08 facilities/km distance (21%ile) 64%ile \n", + "\n", + " T_PTSDF T_PTSDF_D2 \\\n", + "0 0.066 facilities/km distance (21%ile) 48%ile \n", + "1 0.06 facilities/km distance (18%ile) 42%ile \n", + "2 0.065 facilities/km distance (20%ile) 71%ile \n", + "3 0.07 facilities/km distance (23%ile) 63%ile \n", + "4 0.07 facilities/km distance (23%ile) 65%ile \n", + "\n", + " T_PWDIS T_PWDIS_D2 \\\n", + "0 0 toxicity-weighted concentration/meters dista... 62%ile \n", + "1 0 toxicity-weighted concentration/meters dista... 62%ile \n", + "2 0 toxicity-weighted concentration/meters dista... 62%ile \n", + "3 0 toxicity-weighted concentration/meters dista... 62%ile \n", + "4 0 toxicity-weighted concentration/meters dista... 62%ile \n", + "\n", + " Shape_Length Shape_Area \n", + "0 13435.975560 6.026828e+06 \n", + "1 11945.584679 7.848121e+06 \n", + "2 7770.915121 2.900774e+06 \n", + "3 6506.804784 1.793332e+06 \n", + "4 11070.367848 5.461602e+06 \n", + "\n", + "[5 rows x 128 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ejscreen_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f84f9e1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OBJECTID int64\n", + "GEOID10 string\n", + "STATE_NAME object\n", + "ST_ABBREV object\n", + "REGION int64\n", + " ... \n", + "T_PTSDF_D2 object\n", + "T_PWDIS object\n", + "T_PWDIS_D2 object\n", + "Shape_Length float64\n", + "Shape_Area float64\n", + "Length: 128, dtype: object" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ejscreen_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8d61e29e", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = cbg_usa_df.merge(\n", + " ejscreen_df, on=\"GEOID10\", how=\"left\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7e8c2f2a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10OBJECTIDSTATE_NAMEST_ABBREVREGIONACSTOTPOPD_PM25_2B_PM25_D2P_PM25_D2D_OZONE_2...T_PNPLT_PNPL_D2T_PRMPT_PRMP_D2T_PTSDFT_PTSDF_D2T_PWDIST_PWDIS_D2Shape_LengthShape_Area
010001041400239652.0DelawareDE3.01187.03655.27972110.090.022778.314495...1.7 facilities/km distance (99%ile)100%ile0.23 facilities/km distance (40%ile)80%ile1.6 facilities/km distance (63%ile)87%ile0 toxicity-weighted concentration/meters dista...69%ile4866.1359431.156165e+06
110001041500239654.0DelawareDE3.01088.0100.8776667.065.0629.604923...0.32 facilities/km distance (69%ile)66%ile0.14 facilities/km distance (20%ile)64%ile1 facilities/km distance (52%ile)66%ile0 toxicity-weighted concentration/meters dista...69%ile7972.2756572.821805e+06
210001041701139656.0DelawareDE3.01554.0-1256.2215485.045.0-7833.701886...0.21 facilities/km distance (52%ile)31%ile0.11 facilities/km distance (11%ile)53%ile1.3 facilities/km distance (58%ile)22%ile0 toxicity-weighted concentration/meters dista...69%ile17643.7175138.143206e+06
310001041701239657.0DelawareDE3.04543.0-2095.0652154.032.0-13064.667094...0.17 facilities/km distance (43%ile)25%ile0.1 facilities/km distance (7%ile)48%ile1.1 facilities/km distance (54%ile)18%ile0 toxicity-weighted concentration/meters dista...69%ile15645.3412199.723460e+06
410001042201139671.0DelawareDE3.05153.0-723.4973376.053.0-4534.212814...0.24 facilities/km distance (58%ile)41%ile0.11 facilities/km distance (8%ile)58%ile0.3 facilities/km distance (33%ile)50%ile0 toxicity-weighted concentration/meters dista...69%ile20959.9592362.066192e+07
\n", + "

5 rows × 128 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 OBJECTID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n", + "0 100010414002 39652.0 Delaware DE 3.0 1187.0 \n", + "1 100010415002 39654.0 Delaware DE 3.0 1088.0 \n", + "2 100010417011 39656.0 Delaware DE 3.0 1554.0 \n", + "3 100010417012 39657.0 Delaware DE 3.0 4543.0 \n", + "4 100010422011 39671.0 Delaware DE 3.0 5153.0 \n", + "\n", + " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... \\\n", + "0 3655.279721 10.0 90.0 22778.314495 ... \n", + "1 100.877666 7.0 65.0 629.604923 ... \n", + "2 -1256.221548 5.0 45.0 -7833.701886 ... \n", + "3 -2095.065215 4.0 32.0 -13064.667094 ... \n", + "4 -723.497337 6.0 53.0 -4534.212814 ... \n", + "\n", + " T_PNPL T_PNPL_D2 \\\n", + "0 1.7 facilities/km distance (99%ile) 100%ile \n", + "1 0.32 facilities/km distance (69%ile) 66%ile \n", + "2 0.21 facilities/km distance (52%ile) 31%ile \n", + "3 0.17 facilities/km distance (43%ile) 25%ile \n", + "4 0.24 facilities/km distance (58%ile) 41%ile \n", + "\n", + " T_PRMP T_PRMP_D2 \\\n", + "0 0.23 facilities/km distance (40%ile) 80%ile \n", + "1 0.14 facilities/km distance (20%ile) 64%ile \n", + "2 0.11 facilities/km distance (11%ile) 53%ile \n", + "3 0.1 facilities/km distance (7%ile) 48%ile \n", + "4 0.11 facilities/km distance (8%ile) 58%ile \n", + "\n", + " T_PTSDF T_PTSDF_D2 \\\n", + "0 1.6 facilities/km distance (63%ile) 87%ile \n", + "1 1 facilities/km distance (52%ile) 66%ile \n", + "2 1.3 facilities/km distance (58%ile) 22%ile \n", + "3 1.1 facilities/km distance (54%ile) 18%ile \n", + "4 0.3 facilities/km distance (33%ile) 50%ile \n", + "\n", + " T_PWDIS T_PWDIS_D2 \\\n", + "0 0 toxicity-weighted concentration/meters dista... 69%ile \n", + "1 0 toxicity-weighted concentration/meters dista... 69%ile \n", + "2 0 toxicity-weighted concentration/meters dista... 69%ile \n", + "3 0 toxicity-weighted concentration/meters dista... 69%ile \n", + "4 0 toxicity-weighted concentration/meters dista... 69%ile \n", + "\n", + " Shape_Length Shape_Area \n", + "0 4866.135943 1.156165e+06 \n", + "1 7972.275657 2.821805e+06 \n", + "2 17643.717513 8.143206e+06 \n", + "3 15645.341219 9.723460e+06 \n", + "4 20959.959236 2.066192e+07 \n", + "\n", + "[5 rows x 128 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e81b1321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10OBJECTIDSTATE_NAMEST_ABBREVREGIONACSTOTPOPD_PM25_2B_PM25_D2P_PM25_D2D_OZONE_2...T_PNPLT_PNPL_D2T_PRMPT_PRMP_D2T_PTSDFT_PTSDF_D2T_PWDIST_PWDIS_D2Shape_LengthShape_Area
10614515150501002NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10615515150501003NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10627515150501001NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10628515150501005NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
10629515150501004NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
174140040190029031NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
174143040190027012NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
174184040190027011NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
174242040194105021NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
174243040194105011NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

73 rows × 128 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 OBJECTID STATE_NAME ST_ABBREV REGION ACSTOTPOP \\\n", + "10614 515150501002 NaN NaN NaN NaN NaN \n", + "10615 515150501003 NaN NaN NaN NaN NaN \n", + "10627 515150501001 NaN NaN NaN NaN NaN \n", + "10628 515150501005 NaN NaN NaN NaN NaN \n", + "10629 515150501004 NaN NaN NaN NaN NaN \n", + "... ... ... ... ... ... ... \n", + "174140 040190029031 NaN NaN NaN NaN NaN \n", + "174143 040190027012 NaN NaN NaN NaN NaN \n", + "174184 040190027011 NaN NaN NaN NaN NaN \n", + "174242 040194105021 NaN NaN NaN NaN NaN \n", + "174243 040194105011 NaN NaN NaN NaN NaN \n", + "\n", + " D_PM25_2 B_PM25_D2 P_PM25_D2 D_OZONE_2 ... T_PNPL T_PNPL_D2 \\\n", + "10614 NaN NaN NaN NaN ... NaN NaN \n", + "10615 NaN NaN NaN NaN ... NaN NaN \n", + "10627 NaN NaN NaN NaN ... NaN NaN \n", + "10628 NaN NaN NaN NaN ... NaN NaN \n", + "10629 NaN NaN NaN NaN ... NaN NaN \n", + "... ... ... ... ... ... ... ... \n", + "174140 NaN NaN NaN NaN ... NaN NaN \n", + "174143 NaN NaN NaN NaN ... NaN NaN \n", + "174184 NaN NaN NaN NaN ... NaN NaN \n", + "174242 NaN NaN NaN NaN ... NaN NaN \n", + "174243 NaN NaN NaN NaN ... NaN NaN \n", + "\n", + " T_PRMP T_PRMP_D2 T_PTSDF T_PTSDF_D2 T_PWDIS T_PWDIS_D2 \\\n", + "10614 NaN NaN NaN NaN NaN NaN \n", + "10615 NaN NaN NaN NaN NaN NaN \n", + "10627 NaN NaN NaN NaN NaN NaN \n", + "10628 NaN NaN NaN NaN NaN NaN \n", + "10629 NaN NaN NaN NaN NaN NaN \n", + "... ... ... ... ... ... ... \n", + "174140 NaN NaN NaN NaN NaN NaN \n", + "174143 NaN NaN NaN NaN NaN NaN \n", + "174184 NaN NaN NaN NaN NaN NaN \n", + "174242 NaN NaN NaN NaN NaN NaN \n", + "174243 NaN NaN NaN NaN NaN NaN \n", + "\n", + " Shape_Length Shape_Area \n", + "10614 NaN NaN \n", + "10615 NaN NaN \n", + "10627 NaN NaN \n", + "10628 NaN NaN \n", + "10629 NaN NaN \n", + "... ... ... \n", + "174140 NaN NaN \n", + "174143 NaN NaN \n", + "174184 NaN NaN \n", + "174242 NaN NaN \n", + "174243 NaN NaN \n", + "\n", + "[73 rows x 128 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df[merged_df[\"Shape_Area\"].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1a7b71d", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/ipython/Score Validate.ipynb b/data/data-pipeline/ipython/Score Validate.ipynb new file mode 100644 index 00000000..aa65eafe --- /dev/null +++ b/data/data-pipeline/ipython/Score Validate.ipynb @@ -0,0 +1,777 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3ab8f7c1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import csv\n", + "from pathlib import Path\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8c22494f", + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "eb31e9a1", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", + "OUTPUT_PATH = (\n", + " DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n", + " )\n", + "CENSUS_USA_CSV = (\n", + " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "95a5f8d8", + "metadata": {}, + "outputs": [], + "source": [ + "cbg_usa_df = pd.read_csv(\n", + " CENSUS_USA_CSV,\n", + " names=['GEOID10'],\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " header=None\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bdd9ab60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10
0100010414002
1100010415002
2100010417011
3100010417012
4100010422011
\n", + "
" + ], + "text/plain": [ + " GEOID10\n", + "0 100010414002\n", + "1 100010415002\n", + "2 100010417011\n", + "3 100010417012\n", + "4 100010422011" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbg_usa_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "05a40080", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GEOID10 string\n", + "dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbg_usa_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "114af777", + "metadata": {}, + "outputs": [], + "source": [ + "score_df = pd.read_csv(\n", + " OUTPUT_PATH / \"usa.csv\",\n", + " dtype={\"GEOID10\": \"string\"},\n", + " low_memory=False,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d5f3ebd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10Score E (percentile)Score E (top 25th percentile)GEOIDState AbbreviationCounty Name
01000104140020.808889True10001DEKent County
11000104150020.555160False10001DEKent County
21000104170110.272392False10001DEKent County
31000104170120.345686False10001DEKent County
41000104220110.472567False10001DEKent County
.....................
2202563400760200040.921941True34007NJCamden County
2202573400760170020.934490True34007NJCamden County
2202583400760150050.889613True34007NJCamden County
2202593400760910320.627822False34007NJCamden County
2202603400760530020.762237True34007NJCamden County
\n", + "

220261 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n", + "0 100010414002 0.808889 True \n", + "1 100010415002 0.555160 False \n", + "2 100010417011 0.272392 False \n", + "3 100010417012 0.345686 False \n", + "4 100010422011 0.472567 False \n", + "... ... ... ... \n", + "220256 340076020004 0.921941 True \n", + "220257 340076017002 0.934490 True \n", + "220258 340076015005 0.889613 True \n", + "220259 340076091032 0.627822 False \n", + "220260 340076053002 0.762237 True \n", + "\n", + " GEOID State Abbreviation County Name \n", + "0 10001 DE Kent County \n", + "1 10001 DE Kent County \n", + "2 10001 DE Kent County \n", + "3 10001 DE Kent County \n", + "4 10001 DE Kent County \n", + "... ... ... ... \n", + "220256 34007 NJ Camden County \n", + "220257 34007 NJ Camden County \n", + "220258 34007 NJ Camden County \n", + "220259 34007 NJ Camden County \n", + "220260 34007 NJ Camden County \n", + "\n", + "[220261 rows x 6 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score_df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f84f9e1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GEOID10 string\n", + "Score E (percentile) float64\n", + "Score E (top 25th percentile) bool\n", + "GEOID int64\n", + "State Abbreviation object\n", + "County Name object\n", + "dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8d61e29e", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = cbg_usa_df.merge(\n", + " score_df, on=\"GEOID10\", how=\"left\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7e8c2f2a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10Score E (percentile)Score E (top 25th percentile)GEOIDState AbbreviationCounty Name
01000104140020.808889True10001.0DEKent County
11000104150020.555160False10001.0DEKent County
21000104170110.272392False10001.0DEKent County
31000104170120.345686False10001.0DEKent County
41000104220110.472567False10001.0DEKent County
.....................
2203293400760200040.921941True34007.0NJCamden County
2203303400760170020.934490True34007.0NJCamden County
2203313400760150050.889613True34007.0NJCamden County
2203323400760910320.627822False34007.0NJCamden County
2203333400760530020.762237True34007.0NJCamden County
\n", + "

220334 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n", + "0 100010414002 0.808889 True \n", + "1 100010415002 0.555160 False \n", + "2 100010417011 0.272392 False \n", + "3 100010417012 0.345686 False \n", + "4 100010422011 0.472567 False \n", + "... ... ... ... \n", + "220329 340076020004 0.921941 True \n", + "220330 340076017002 0.934490 True \n", + "220331 340076015005 0.889613 True \n", + "220332 340076091032 0.627822 False \n", + "220333 340076053002 0.762237 True \n", + "\n", + " GEOID State Abbreviation County Name \n", + "0 10001.0 DE Kent County \n", + "1 10001.0 DE Kent County \n", + "2 10001.0 DE Kent County \n", + "3 10001.0 DE Kent County \n", + "4 10001.0 DE Kent County \n", + "... ... ... ... \n", + "220329 34007.0 NJ Camden County \n", + "220330 34007.0 NJ Camden County \n", + "220331 34007.0 NJ Camden County \n", + "220332 34007.0 NJ Camden County \n", + "220333 34007.0 NJ Camden County \n", + "\n", + "[220334 rows x 6 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e81b1321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GEOID10Score E (percentile)Score E (top 25th percentile)GEOIDState AbbreviationCounty Name
10614515150501002NaNNaNNaNNaNNaN
10615515150501003NaNNaNNaNNaNNaN
10627515150501001NaNNaNNaNNaNNaN
10628515150501005NaNNaNNaNNaNNaN
10629515150501004NaNNaNNaNNaNNaN
.....................
174140040190029031NaNNaNNaNNaNNaN
174143040190027012NaNNaNNaNNaNNaN
174184040190027011NaNNaNNaNNaNNaN
174242040194105021NaNNaNNaNNaNNaN
174243040194105011NaNNaNNaNNaNNaN
\n", + "

73 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n", + "10614 515150501002 NaN NaN \n", + "10615 515150501003 NaN NaN \n", + "10627 515150501001 NaN NaN \n", + "10628 515150501005 NaN NaN \n", + "10629 515150501004 NaN NaN \n", + "... ... ... ... \n", + "174140 040190029031 NaN NaN \n", + "174143 040190027012 NaN NaN \n", + "174184 040190027011 NaN NaN \n", + "174242 040194105021 NaN NaN \n", + "174243 040194105011 NaN NaN \n", + "\n", + " GEOID State Abbreviation County Name \n", + "10614 NaN NaN NaN \n", + "10615 NaN NaN NaN \n", + "10627 NaN NaN NaN \n", + "10628 NaN NaN NaN \n", + "10629 NaN NaN NaN \n", + "... ... ... ... \n", + "174140 NaN NaN NaN \n", + "174143 NaN NaN NaN \n", + "174184 NaN NaN NaN \n", + "174242 NaN NaN NaN \n", + "174243 NaN NaN NaN \n", + "\n", + "[73 rows x 6 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df[merged_df[\"Score E (percentile)\"].isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1a7b71d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb b/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb index 747796aa..8e57da6d 100644 --- a/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb +++ b/data/data-pipeline/ipython/Score_Dissolve_Script.ipynb @@ -2,7 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], "source": [ "import pandas as pd\n", "import geopandas as gpd\n", @@ -10,24 +12,24 @@ "import pathlib\n", "import os\n", "import sys" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": {}, + "outputs": [], "source": [ "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n", " state_gdf = gpd.read_file(file_name)\n", @@ -100,104 +102,133 @@ " state_bucketed_df = aggregate_buckets(state_tracts, \"mean\")\n", " compressed = breakup_multipolygons(state_bucketed_df, num_buckets)\n", " write_to_file(compressed, file_name)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, + "metadata": { + "id": "Ia5bqxS2LJqe" + }, + "outputs": [], "source": [ "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", "CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n", - "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n", - "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"})" - ], - "outputs": [], - "metadata": { - "id": "Ia5bqxS2LJqe" - } + "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n", + "score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)" + ] }, { "cell_type": "code", - "execution_count": null, - "source": [ - "master_df = gpd.GeoDataFrame()" - ], - "outputs": [], + "execution_count": 7, "metadata": { "id": "Dtf5qD50JvCw" - } + }, + "outputs": [], + "source": [ + "master_df = gpd.GeoDataFrame()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty GeoDataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "master_df.head()" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "id": "PNdw8bERJyKk" + }, + "outputs": [], "source": [ "for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n", " state_gdf = gpd.read_file(file_name)\n", " master_df = master_df.append(state_gdf)" - ], - "outputs": [], - "metadata": { - "id": "PNdw8bERJyKk" - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")" - ], - "outputs": [], "metadata": { "id": "B5SS9y2pLwks" - } + }, + "outputs": [], + "source": [ + "master_df = master_df.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "master_df.shape" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(220742, 13)" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 68 - } - ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_C6vaR9HQeLa", "outputId": "fab3bc7f-e716-431e-bc76-bd26289ea4a4" - } + }, + "outputs": [], + "source": [ + "master_df.shape" + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "master_df.head(2)" - ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oMoubjqCQiw5", + "outputId": "6195ffbc-6275-40c6-bb6a-e0a6bd1e71f0" + }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " STATEFP10 ... geometry\n", - "0 01 ... POLYGON ((-85.17240 31.82508, -85.17334 31.824...\n", - "1 01 ... POLYGON ((-85.16283 31.81051, -85.16284 31.813...\n", - "\n", - "[2 rows x 13 columns]" - ], "text/html": [ "
\n", "