j40-cejst-2/data/data-pipeline/data_pipeline/ipython/Score Validate.ipynb
Lucas Merrill Brown b1a4d26be8
Adding persistent poverty tracts (#738)
* persistent poverty working

* fixing left-padding

* running black and adding persistent poverty to comp tool

* fixing bug

* running black and fixing linter

* fixing linter

* fixing linter error
2021-09-22 17:57:08 -04:00

771 lines
23 KiB
Text
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3ab8f7c1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8c22494f",
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "eb31e9a1",
"metadata": {},
"outputs": [],
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "95a5f8d8",
"metadata": {},
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bdd9ab60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "05a40080",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "114af777",
"metadata": {},
"outputs": [],
"source": [
"score_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d5f3ebd4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.808889</td>\n",
" <td>True</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.555160</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.272392</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.345686</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.472567</td>\n",
" <td>False</td>\n",
" <td>10001</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220256</th>\n",
" <td>340076020004</td>\n",
" <td>0.921941</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220257</th>\n",
" <td>340076017002</td>\n",
" <td>0.934490</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220258</th>\n",
" <td>340076015005</td>\n",
" <td>0.889613</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220259</th>\n",
" <td>340076091032</td>\n",
" <td>0.627822</td>\n",
" <td>False</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220260</th>\n",
" <td>340076053002</td>\n",
" <td>0.762237</td>\n",
" <td>True</td>\n",
" <td>34007</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>220261 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"0 100010414002 0.808889 True \n",
"1 100010415002 0.555160 False \n",
"2 100010417011 0.272392 False \n",
"3 100010417012 0.345686 False \n",
"4 100010422011 0.472567 False \n",
"... ... ... ... \n",
"220256 340076020004 0.921941 True \n",
"220257 340076017002 0.934490 True \n",
"220258 340076015005 0.889613 True \n",
"220259 340076091032 0.627822 False \n",
"220260 340076053002 0.762237 True \n",
"\n",
" GEOID State Abbreviation County Name \n",
"0 10001 DE Kent County \n",
"1 10001 DE Kent County \n",
"2 10001 DE Kent County \n",
"3 10001 DE Kent County \n",
"4 10001 DE Kent County \n",
"... ... ... ... \n",
"220256 34007 NJ Camden County \n",
"220257 34007 NJ Camden County \n",
"220258 34007 NJ Camden County \n",
"220259 34007 NJ Camden County \n",
"220260 34007 NJ Camden County \n",
"\n",
"[220261 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f84f9e1d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Score E (percentile) float64\n",
"Score E (top 25th percentile) bool\n",
"GEOID int64\n",
"State Abbreviation object\n",
"County Name object\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8d61e29e",
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(score_df, on=\"GEOID10\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7e8c2f2a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.808889</td>\n",
" <td>True</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.555160</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.272392</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.345686</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.472567</td>\n",
" <td>False</td>\n",
" <td>10001.0</td>\n",
" <td>DE</td>\n",
" <td>Kent County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220329</th>\n",
" <td>340076020004</td>\n",
" <td>0.921941</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220330</th>\n",
" <td>340076017002</td>\n",
" <td>0.934490</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220331</th>\n",
" <td>340076015005</td>\n",
" <td>0.889613</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220332</th>\n",
" <td>340076091032</td>\n",
" <td>0.627822</td>\n",
" <td>False</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220333</th>\n",
" <td>340076053002</td>\n",
" <td>0.762237</td>\n",
" <td>True</td>\n",
" <td>34007.0</td>\n",
" <td>NJ</td>\n",
" <td>Camden County</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>220334 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"0 100010414002 0.808889 True \n",
"1 100010415002 0.555160 False \n",
"2 100010417011 0.272392 False \n",
"3 100010417012 0.345686 False \n",
"4 100010422011 0.472567 False \n",
"... ... ... ... \n",
"220329 340076020004 0.921941 True \n",
"220330 340076017002 0.934490 True \n",
"220331 340076015005 0.889613 True \n",
"220332 340076091032 0.627822 False \n",
"220333 340076053002 0.762237 True \n",
"\n",
" GEOID State Abbreviation County Name \n",
"0 10001.0 DE Kent County \n",
"1 10001.0 DE Kent County \n",
"2 10001.0 DE Kent County \n",
"3 10001.0 DE Kent County \n",
"4 10001.0 DE Kent County \n",
"... ... ... ... \n",
"220329 34007.0 NJ Camden County \n",
"220330 34007.0 NJ Camden County \n",
"220331 34007.0 NJ Camden County \n",
"220332 34007.0 NJ Camden County \n",
"220333 34007.0 NJ Camden County \n",
"\n",
"[220334 rows x 6 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e81b1321",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Score E (percentile)</th>\n",
" <th>Score E (top 25th percentile)</th>\n",
" <th>GEOID</th>\n",
" <th>State Abbreviation</th>\n",
" <th>County Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10614</th>\n",
" <td>515150501002</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10615</th>\n",
" <td>515150501003</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10627</th>\n",
" <td>515150501001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10628</th>\n",
" <td>515150501005</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10629</th>\n",
" <td>515150501004</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174140</th>\n",
" <td>040190029031</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174143</th>\n",
" <td>040190027012</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174184</th>\n",
" <td>040190027011</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174242</th>\n",
" <td>040194105021</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174243</th>\n",
" <td>040194105011</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Score E (percentile) Score E (top 25th percentile) \\\n",
"10614 515150501002 NaN NaN \n",
"10615 515150501003 NaN NaN \n",
"10627 515150501001 NaN NaN \n",
"10628 515150501005 NaN NaN \n",
"10629 515150501004 NaN NaN \n",
"... ... ... ... \n",
"174140 040190029031 NaN NaN \n",
"174143 040190027012 NaN NaN \n",
"174184 040190027011 NaN NaN \n",
"174242 040194105021 NaN NaN \n",
"174243 040194105011 NaN NaN \n",
"\n",
" GEOID State Abbreviation County Name \n",
"10614 NaN NaN NaN \n",
"10615 NaN NaN NaN \n",
"10627 NaN NaN NaN \n",
"10628 NaN NaN NaN \n",
"10629 NaN NaN NaN \n",
"... ... ... ... \n",
"174140 NaN NaN NaN \n",
"174143 NaN NaN NaN \n",
"174184 NaN NaN NaN \n",
"174242 NaN NaN NaN \n",
"174243 NaN NaN NaN \n",
"\n",
"[73 rows x 6 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Score E (percentile)\"].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1a7b71d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}