{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "43c5dbee", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import csv\n", "from pathlib import Path\n", "import os\n", "import sys" ] }, { "cell_type": "code", "execution_count": 2, "id": "f97c95f6", "metadata": {}, "outputs": [], "source": [ "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)" ] }, { "cell_type": "code", "execution_count": 25, "id": "b8a2b53e", "metadata": {}, "outputs": [], "source": [ "DATA_PATH = Path.cwd().parent / \"data\"\n", "TMP_PATH: Path = DATA_PATH / \"tmp\"\n", "ACS_YEAR = \"2019\"\n", "OUTPUT_PATH = (\n", " DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", " )\n", "CENSUS_USA_CSV = (\n", " DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n", " )" ] }, { "cell_type": "code", "execution_count": 29, "id": "0d33e8db", "metadata": {}, "outputs": [], "source": [ "cbg_usa_df = pd.read_csv(\n", " CENSUS_USA_CSV,\n", " names=['GEOID10'],\n", " dtype={\"GEOID10\": \"string\"},\n", " low_memory=False,\n", " header=None\n", " )" ] }, { "cell_type": "code", "execution_count": 30, "id": "01e6dbe3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GEOID10
0100010414002
1100010415002
2100010417011
3100010417012
4100010422011
\n", "
" ], "text/plain": [ " GEOID10\n", "0 100010414002\n", "1 100010415002\n", "2 100010417011\n", "3 100010417012\n", "4 100010422011" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cbg_usa_df.head()" ] }, { "cell_type": "code", "execution_count": 31, "id": "341dbcb6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GEOID10 string\n", "dtype: object" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cbg_usa_df.dtypes" ] }, { "cell_type": "code", "execution_count": 41, "id": "eb25d4bf", "metadata": {}, "outputs": [], "source": [ "acs_df = pd.read_csv(\n", " OUTPUT_PATH / \"usa.csv\",\n", " dtype={\"GEOID10\": \"string\"},\n", " low_memory=False,\n", " )" ] }, { "cell_type": "code", "execution_count": 42, "id": "d4c9d010", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GEOID10Unemployed civilians (percent)Linguistic isolation (percent)
00103996200020.0771080.0
10103996180020.1262140.0
20103996160040.1331720.0
30103996160020.0282490.0
40103996160010.0630370.0
\n", "
" ], "text/plain": [ " GEOID10 Unemployed civilians (percent) \\\n", "0 010399620002 0.077108 \n", "1 010399618002 0.126214 \n", "2 010399616004 0.133172 \n", "3 010399616002 0.028249 \n", "4 010399616001 0.063037 \n", "\n", " Linguistic isolation (percent) \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "acs_df.head()" ] }, { "cell_type": "code", "execution_count": 43, "id": "dd390179", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GEOID10 string\n", "Unemployed civilians (percent) float64\n", "Linguistic isolation (percent) float64\n", "dtype: object" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "acs_df.dtypes" ] }, { "cell_type": "code", "execution_count": 44, "id": "236eb093", "metadata": {}, "outputs": [], "source": [ "merged_df = cbg_usa_df.merge(\n", " acs_df, on=\"GEOID10\", how=\"left\"\n", " )" ] }, { "cell_type": "code", "execution_count": 45, "id": "4fff1845", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GEOID10Unemployed civilians (percent)Linguistic isolation (percent)
01000104140020.0306120.065963
11000104150020.1180560.010283
21000104170110.0423730.000000
31000104170120.0424730.010435
41000104220110.0543580.000000
\n", "
" ], "text/plain": [ " GEOID10 Unemployed civilians (percent) \\\n", "0 100010414002 0.030612 \n", "1 100010415002 0.118056 \n", "2 100010417011 0.042373 \n", "3 100010417012 0.042473 \n", "4 100010422011 0.054358 \n", "\n", " Linguistic isolation (percent) \n", "0 0.065963 \n", "1 0.010283 \n", "2 0.000000 \n", "3 0.010435 \n", "4 0.000000 " ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df.head()" ] }, { "cell_type": "code", "execution_count": 64, "id": "f8903557", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GEOID10Unemployed civilians (percent)Linguistic isolation (percent)
34100019900000NaNNaN
377100030169041NaNNaN
392100059900000NaNNaN
400100039901000NaNNaN
416100039801001NaNNaN
............
219505340057048013NaNNaN
219508340057048024NaNNaN
219758340258047001NaNNaN
219807340259900000NaNNaN
220134340076113001NaN0.0
\n", "

1462 rows × 3 columns

\n", "
" ], "text/plain": [ " GEOID10 Unemployed civilians (percent) \\\n", "34 100019900000 NaN \n", "377 100030169041 NaN \n", "392 100059900000 NaN \n", "400 100039901000 NaN \n", "416 100039801001 NaN \n", "... ... ... \n", "219505 340057048013 NaN \n", "219508 340057048024 NaN \n", "219758 340258047001 NaN \n", "219807 340259900000 NaN \n", "220134 340076113001 NaN \n", "\n", " Linguistic isolation (percent) \n", "34 NaN \n", "377 NaN \n", "392 NaN \n", "400 NaN \n", "416 NaN \n", "... ... \n", "219505 NaN \n", "219508 NaN \n", "219758 NaN \n", "219807 NaN \n", "220134 0.0 \n", "\n", "[1462 rows x 3 columns]" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df[merged_df[\"Unemployed civilians (percent)\"].isnull()]" ] }, { "cell_type": "code", "execution_count": null, "id": "b870a21f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 5 }