mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-10-24 00:13:51 -07:00
* Fixes #456 - Our data directory should adopt standard python package structure * a few missed references * updating readme * updating requirements * Running Black * Fixes for flake8 * updating pylint
567 lines
15 KiB
Text
567 lines
15 KiB
Text
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "43c5dbee",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import csv\n",
|
||
"from pathlib import Path\n",
|
||
"import os\n",
|
||
"import sys"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "f97c95f6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
|
||
"if module_path not in sys.path:\n",
|
||
" sys.path.append(module_path)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "b8a2b53e",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"DATA_PATH = Path.cwd().parent / \"data\"\n",
|
||
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
|
||
"ACS_YEAR = \"2019\"\n",
|
||
"OUTPUT_PATH = (\n",
|
||
" DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
|
||
" )\n",
|
||
"CENSUS_USA_CSV = (\n",
|
||
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
|
||
" )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"id": "0d33e8db",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"cbg_usa_df = pd.read_csv(\n",
|
||
" CENSUS_USA_CSV,\n",
|
||
" names=['GEOID10'],\n",
|
||
" dtype={\"GEOID10\": \"string\"},\n",
|
||
" low_memory=False,\n",
|
||
" header=None\n",
|
||
" )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"id": "01e6dbe3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>100010414002</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>100010415002</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>100010417011</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>100010417012</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>100010422011</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10\n",
|
||
"0 100010414002\n",
|
||
"1 100010415002\n",
|
||
"2 100010417011\n",
|
||
"3 100010417012\n",
|
||
"4 100010422011"
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"cbg_usa_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"id": "341dbcb6",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"GEOID10 string\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"cbg_usa_df.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"id": "eb25d4bf",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"acs_df = pd.read_csv(\n",
|
||
" OUTPUT_PATH / \"usa.csv\",\n",
|
||
" dtype={\"GEOID10\": \"string\"},\n",
|
||
" low_memory=False,\n",
|
||
" )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"id": "d4c9d010",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10</th>\n",
|
||
" <th>Unemployed civilians (percent)</th>\n",
|
||
" <th>Linguistic isolation (percent)</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>010399620002</td>\n",
|
||
" <td>0.077108</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>010399618002</td>\n",
|
||
" <td>0.126214</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>010399616004</td>\n",
|
||
" <td>0.133172</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>010399616002</td>\n",
|
||
" <td>0.028249</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>010399616001</td>\n",
|
||
" <td>0.063037</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10 Unemployed civilians (percent) \\\n",
|
||
"0 010399620002 0.077108 \n",
|
||
"1 010399618002 0.126214 \n",
|
||
"2 010399616004 0.133172 \n",
|
||
"3 010399616002 0.028249 \n",
|
||
"4 010399616001 0.063037 \n",
|
||
"\n",
|
||
" Linguistic isolation (percent) \n",
|
||
"0 0.0 \n",
|
||
"1 0.0 \n",
|
||
"2 0.0 \n",
|
||
"3 0.0 \n",
|
||
"4 0.0 "
|
||
]
|
||
},
|
||
"execution_count": 42,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"acs_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "dd390179",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"GEOID10 string\n",
|
||
"Unemployed civilians (percent) float64\n",
|
||
"Linguistic isolation (percent) float64\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"acs_df.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"id": "236eb093",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"merged_df = cbg_usa_df.merge(\n",
|
||
" acs_df, on=\"GEOID10\", how=\"left\"\n",
|
||
" )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"id": "4fff1845",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10</th>\n",
|
||
" <th>Unemployed civilians (percent)</th>\n",
|
||
" <th>Linguistic isolation (percent)</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>100010414002</td>\n",
|
||
" <td>0.030612</td>\n",
|
||
" <td>0.065963</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>100010415002</td>\n",
|
||
" <td>0.118056</td>\n",
|
||
" <td>0.010283</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>100010417011</td>\n",
|
||
" <td>0.042373</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>100010417012</td>\n",
|
||
" <td>0.042473</td>\n",
|
||
" <td>0.010435</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>100010422011</td>\n",
|
||
" <td>0.054358</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10 Unemployed civilians (percent) \\\n",
|
||
"0 100010414002 0.030612 \n",
|
||
"1 100010415002 0.118056 \n",
|
||
"2 100010417011 0.042373 \n",
|
||
"3 100010417012 0.042473 \n",
|
||
"4 100010422011 0.054358 \n",
|
||
"\n",
|
||
" Linguistic isolation (percent) \n",
|
||
"0 0.065963 \n",
|
||
"1 0.010283 \n",
|
||
"2 0.000000 \n",
|
||
"3 0.010435 \n",
|
||
"4 0.000000 "
|
||
]
|
||
},
|
||
"execution_count": 45,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"merged_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 64,
|
||
"id": "f8903557",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10</th>\n",
|
||
" <th>Unemployed civilians (percent)</th>\n",
|
||
" <th>Linguistic isolation (percent)</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>34</th>\n",
|
||
" <td>100019900000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>377</th>\n",
|
||
" <td>100030169041</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>392</th>\n",
|
||
" <td>100059900000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>400</th>\n",
|
||
" <td>100039901000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>416</th>\n",
|
||
" <td>100039801001</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>219505</th>\n",
|
||
" <td>340057048013</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>219508</th>\n",
|
||
" <td>340057048024</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>219758</th>\n",
|
||
" <td>340258047001</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>219807</th>\n",
|
||
" <td>340259900000</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>220134</th>\n",
|
||
" <td>340076113001</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1462 rows × 3 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10 Unemployed civilians (percent) \\\n",
|
||
"34 100019900000 NaN \n",
|
||
"377 100030169041 NaN \n",
|
||
"392 100059900000 NaN \n",
|
||
"400 100039901000 NaN \n",
|
||
"416 100039801001 NaN \n",
|
||
"... ... ... \n",
|
||
"219505 340057048013 NaN \n",
|
||
"219508 340057048024 NaN \n",
|
||
"219758 340258047001 NaN \n",
|
||
"219807 340259900000 NaN \n",
|
||
"220134 340076113001 NaN \n",
|
||
"\n",
|
||
" Linguistic isolation (percent) \n",
|
||
"34 NaN \n",
|
||
"377 NaN \n",
|
||
"392 NaN \n",
|
||
"400 NaN \n",
|
||
"416 NaN \n",
|
||
"... ... \n",
|
||
"219505 NaN \n",
|
||
"219508 NaN \n",
|
||
"219758 NaN \n",
|
||
"219807 NaN \n",
|
||
"220134 0.0 \n",
|
||
"\n",
|
||
"[1462 rows x 3 columns]"
|
||
]
|
||
},
|
||
"execution_count": 64,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "b870a21f",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.2"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|