{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "43c5dbee",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f97c95f6",
"metadata": {},
"outputs": [],
"source": [
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b8a2b53e",
"metadata": {},
"outputs": [],
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"ACS_YEAR = \"2019\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0d33e8db",
"metadata": {},
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "01e6dbe3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GEOID10 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100010414002 | \n",
"
\n",
" \n",
" 1 | \n",
" 100010415002 | \n",
"
\n",
" \n",
" 2 | \n",
" 100010417011 | \n",
"
\n",
" \n",
" 3 | \n",
" 100010417012 | \n",
"
\n",
" \n",
" 4 | \n",
" 100010422011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "341dbcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "eb25d4bf",
"metadata": {},
"outputs": [],
"source": [
"acs_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d4c9d010",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GEOID10 | \n",
" Unemployed civilians (percent) | \n",
" Linguistic isolation (percent) | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 010399620002 | \n",
" 0.077108 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 010399618002 | \n",
" 0.126214 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 010399616004 | \n",
" 0.133172 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 010399616002 | \n",
" 0.028249 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 010399616001 | \n",
" 0.063037 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 010399620002 0.077108 \n",
"1 010399618002 0.126214 \n",
"2 010399616004 0.133172 \n",
"3 010399616002 0.028249 \n",
"4 010399616001 0.063037 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "dd390179",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Unemployed civilians (percent) float64\n",
"Linguistic isolation (percent) float64\n",
"dtype: object"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "236eb093",
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" acs_df, on=\"GEOID10\", how=\"left\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "4fff1845",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GEOID10 | \n",
" Unemployed civilians (percent) | \n",
" Linguistic isolation (percent) | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100010414002 | \n",
" 0.030612 | \n",
" 0.065963 | \n",
"
\n",
" \n",
" 1 | \n",
" 100010415002 | \n",
" 0.118056 | \n",
" 0.010283 | \n",
"
\n",
" \n",
" 2 | \n",
" 100010417011 | \n",
" 0.042373 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" 100010417012 | \n",
" 0.042473 | \n",
" 0.010435 | \n",
"
\n",
" \n",
" 4 | \n",
" 100010422011 | \n",
" 0.054358 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 100010414002 0.030612 \n",
"1 100010415002 0.118056 \n",
"2 100010417011 0.042373 \n",
"3 100010417012 0.042473 \n",
"4 100010422011 0.054358 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.065963 \n",
"1 0.010283 \n",
"2 0.000000 \n",
"3 0.010435 \n",
"4 0.000000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "f8903557",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GEOID10 | \n",
" Unemployed civilians (percent) | \n",
" Linguistic isolation (percent) | \n",
"
\n",
" \n",
" \n",
" \n",
" 34 | \n",
" 100019900000 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 377 | \n",
" 100030169041 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 392 | \n",
" 100059900000 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 400 | \n",
" 100039901000 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 416 | \n",
" 100039801001 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 219505 | \n",
" 340057048013 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 219508 | \n",
" 340057048024 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 219758 | \n",
" 340258047001 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 219807 | \n",
" 340259900000 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 220134 | \n",
" 340076113001 | \n",
" NaN | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
1462 rows × 3 columns
\n",
"
"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"34 100019900000 NaN \n",
"377 100030169041 NaN \n",
"392 100059900000 NaN \n",
"400 100039901000 NaN \n",
"416 100039801001 NaN \n",
"... ... ... \n",
"219505 340057048013 NaN \n",
"219508 340057048024 NaN \n",
"219758 340258047001 NaN \n",
"219807 340259900000 NaN \n",
"220134 340076113001 NaN \n",
"\n",
" Linguistic isolation (percent) \n",
"34 NaN \n",
"377 NaN \n",
"392 NaN \n",
"400 NaN \n",
"416 NaN \n",
"... ... \n",
"219505 NaN \n",
"219508 NaN \n",
"219758 NaN \n",
"219807 NaN \n",
"220134 0.0 \n",
"\n",
"[1462 rows x 3 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b870a21f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}