Data directory should adopt standard Poetry-suggested python package structure (#457)

* Fixes #456 - Our data directory should adopt standard python package structure
* a few missed references
* updating readme
* updating requirements
* Running Black
* Fixes for flake8
* updating pylint
This commit is contained in:
Nat Hillard 2021-08-05 15:35:54 -04:00 committed by GitHub
commit c1568e87c0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
61 changed files with 1273 additions and 1256 deletions

View file

@ -0,0 +1,567 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "43c5dbee",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f97c95f6",
"metadata": {},
"outputs": [],
"source": [
"# Put the parent directory on sys.path (only once) so code located there can be imported.\n",
"module_path = os.path.abspath(\"..\")\n",
"if module_path not in sys.path:\n",
"    sys.path.append(module_path)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b8a2b53e",
"metadata": {},
"outputs": [],
"source": [
"# All locations are derived from the parent of the current working directory.\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"\n",
"# The year is embedded in the dataset folder name (census_acs_<year>).\n",
"ACS_YEAR = \"2019\"\n",
"OUTPUT_PATH = DATA_PATH.joinpath(\"dataset\", f\"census_acs_{ACS_YEAR}\")\n",
"\n",
"# CSV that is loaded into cbg_usa_df further down.\n",
"CENSUS_USA_CSV = DATA_PATH.joinpath(\"census\", \"csv\", \"us.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0d33e8db",
"metadata": {},
"outputs": [],
"source": [
"# The file is read without a header row; the column is named GEOID10 and kept\n",
"# as a pandas string so the identifiers are never coerced to integers.\n",
"cbg_usa_df = pd.read_csv(\n",
"    CENSUS_USA_CSV,\n",
"    header=None,\n",
"    names=[\"GEOID10\"],\n",
"    dtype={\"GEOID10\": \"string\"},\n",
"    low_memory=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "01e6dbe3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10\n",
"0 100010414002\n",
"1 100010415002\n",
"2 100010417011\n",
"3 100010417012\n",
"4 100010422011"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of cbg_usa_df.\n",
"cbg_usa_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "341dbcb6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cbg_usa_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "eb25d4bf",
"metadata": {},
"outputs": [],
"source": [
"# Load usa.csv from the ACS dataset folder; GEOID10 is read as a string so it\n",
"# has the same dtype as the GEOID10 column of cbg_usa_df.\n",
"acs_df = pd.read_csv(\n",
"    OUTPUT_PATH / \"usa.csv\",\n",
"    low_memory=False,\n",
"    dtype={\"GEOID10\": \"string\"},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d4c9d010",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010399620002</td>\n",
" <td>0.077108</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010399618002</td>\n",
" <td>0.126214</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010399616004</td>\n",
" <td>0.133172</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010399616002</td>\n",
" <td>0.028249</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010399616001</td>\n",
" <td>0.063037</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 010399620002 0.077108 \n",
"1 010399618002 0.126214 \n",
"2 010399616004 0.133172 \n",
"3 010399616002 0.028249 \n",
"4 010399616001 0.063037 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of acs_df.\n",
"acs_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "dd390179",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GEOID10 string\n",
"Unemployed civilians (percent) float64\n",
"Linguistic isolation (percent) float64\n",
"dtype: object"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"acs_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "236eb093",
"metadata": {},
"outputs": [],
"source": [
"# Left join: every row of cbg_usa_df is kept, and GEOID10 values that do not\n",
"# appear in acs_df end up with NaN in the ACS columns.\n",
"merged_df = pd.merge(cbg_usa_df, acs_df, how=\"left\", on=\"GEOID10\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "4fff1845",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100010414002</td>\n",
" <td>0.030612</td>\n",
" <td>0.065963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100010415002</td>\n",
" <td>0.118056</td>\n",
" <td>0.010283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100010417011</td>\n",
" <td>0.042373</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100010417012</td>\n",
" <td>0.042473</td>\n",
" <td>0.010435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100010422011</td>\n",
" <td>0.054358</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"0 100010414002 0.030612 \n",
"1 100010415002 0.118056 \n",
"2 100010417011 0.042373 \n",
"3 100010417012 0.042473 \n",
"4 100010422011 0.054358 \n",
"\n",
" Linguistic isolation (percent) \n",
"0 0.065963 \n",
"1 0.010283 \n",
"2 0.000000 \n",
"3 0.010435 \n",
"4 0.000000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the merged frame.\n",
"merged_df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "f8903557",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>GEOID10</th>\n",
" <th>Unemployed civilians (percent)</th>\n",
" <th>Linguistic isolation (percent)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>100019900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>377</th>\n",
" <td>100030169041</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>392</th>\n",
" <td>100059900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400</th>\n",
" <td>100039901000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>100039801001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219505</th>\n",
" <td>340057048013</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219508</th>\n",
" <td>340057048024</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219758</th>\n",
" <td>340258047001</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219807</th>\n",
" <td>340259900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220134</th>\n",
" <td>340076113001</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1462 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" GEOID10 Unemployed civilians (percent) \\\n",
"34 100019900000 NaN \n",
"377 100030169041 NaN \n",
"392 100059900000 NaN \n",
"400 100039901000 NaN \n",
"416 100039801001 NaN \n",
"... ... ... \n",
"219505 340057048013 NaN \n",
"219508 340057048024 NaN \n",
"219758 340258047001 NaN \n",
"219807 340259900000 NaN \n",
"220134 340076113001 NaN \n",
"\n",
" Linguistic isolation (percent) \n",
"34 NaN \n",
"377 NaN \n",
"392 NaN \n",
"400 NaN \n",
"416 NaN \n",
"... ... \n",
"219505 NaN \n",
"219508 NaN \n",
"219758 NaN \n",
"219807 NaN \n",
"220134 0.0 \n",
"\n",
"[1462 rows x 3 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of merged_df whose unemployment value is missing after the left join.\n",
"merged_df.loc[merged_df[\"Unemployed civilians (percent)\"].isna()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b870a21f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}