Data directory should adopt standard Poetry-suggested python package structure (#457)

* Fixes #456 - Our data directory should adopt standard python package structure * a few missed references * updating readme * updating requirements * Running Black * Fixes for flake8 * updating pylint
2025-07-28 13:11:17 -07:00 · 2021-08-05 15:35:54 -04:00 · 2021-08-05 15:35:54 -04:00 · c1568e87c0
commit c1568e87c0
parent 4d7465c833
61 changed files with 1273 additions and 1256 deletions
--- a/data/data-pipeline/data_pipeline/ipython/ACS
+++ b/data/data-pipeline/data_pipeline/ipython/ACS
@ -0,0 +1,567 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "43c5dbee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import csv\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f97c95f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make the parent of the current working directory importable (added only once).\n",
+    "module_path = os.path.abspath(os.pardir)\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "b8a2b53e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All locations are resolved relative to the parent of the current working directory.\n",
+    "ACS_YEAR = \"2019\"\n",
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
+    "# Directory holding the ACS dataset for ACS_YEAR.\n",
+    "OUTPUT_PATH = DATA_PATH.joinpath(\"dataset\", f\"census_acs_{ACS_YEAR}\")\n",
+    "# Headerless CSV that is loaded below as a single GEOID10 column.\n",
+    "CENSUS_USA_CSV = DATA_PATH.joinpath(\"census\", \"csv\", \"us.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "0d33e8db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# header=None treats the first line as data, so the single column is named here.\n",
+    "# GEOID10 is read with the string dtype rather than being parsed as a number.\n",
+    "cbg_usa_df = pd.read_csv(\n",
+    "    CENSUS_USA_CSV,\n",
+    "    header=None,\n",
+    "    names=[\"GEOID10\"],\n",
+    "    dtype={\"GEOID10\": \"string\"},\n",
+    "    low_memory=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "01e6dbe3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GEOID10</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>100010414002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>100010415002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>100010417011</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>100010417012</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>100010422011</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        GEOID10\n",
+       "0  100010414002\n",
+       "1  100010415002\n",
+       "2  100010417011\n",
+       "3  100010417012\n",
+       "4  100010422011"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows of the GEOID10 list.\n",
+    "cbg_usa_df.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "341dbcb6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "GEOID10    string\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cbg_usa_df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "eb25d4bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the ACS table; GEOID10 is read as a string so the join key keeps its exact text\n",
+    "# (including any leading zero) and matches the dtype used for cbg_usa_df.\n",
+    "acs_df = pd.read_csv(\n",
+    "    OUTPUT_PATH.joinpath(\"usa.csv\"),\n",
+    "    low_memory=False,\n",
+    "    dtype={\"GEOID10\": \"string\"},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "d4c9d010",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GEOID10</th>\n",
+       "      <th>Unemployed civilians (percent)</th>\n",
+       "      <th>Linguistic isolation (percent)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>010399620002</td>\n",
+       "      <td>0.077108</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>010399618002</td>\n",
+       "      <td>0.126214</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>010399616004</td>\n",
+       "      <td>0.133172</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>010399616002</td>\n",
+       "      <td>0.028249</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>010399616001</td>\n",
+       "      <td>0.063037</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        GEOID10  Unemployed civilians (percent)  \\\n",
+       "0  010399620002                        0.077108   \n",
+       "1  010399618002                        0.126214   \n",
+       "2  010399616004                        0.133172   \n",
+       "3  010399616002                        0.028249   \n",
+       "4  010399616001                        0.063037   \n",
+       "\n",
+       "   Linguistic isolation (percent)  \n",
+       "0                             0.0  \n",
+       "1                             0.0  \n",
+       "2                             0.0  \n",
+       "3                             0.0  \n",
+       "4                             0.0  "
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five ACS rows.\n",
+    "acs_df.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "dd390179",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "GEOID10                            string\n",
+       "Unemployed civilians (percent)    float64\n",
+       "Linguistic isolation (percent)    float64\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "acs_df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "236eb093",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Left join on GEOID10: every row of cbg_usa_df is kept, and rows without a\n",
+    "# matching ACS record get NaN in the ACS columns.\n",
+    "merged_df = pd.merge(cbg_usa_df, acs_df, how=\"left\", on=\"GEOID10\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "4fff1845",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GEOID10</th>\n",
+       "      <th>Unemployed civilians (percent)</th>\n",
+       "      <th>Linguistic isolation (percent)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>100010414002</td>\n",
+       "      <td>0.030612</td>\n",
+       "      <td>0.065963</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>100010415002</td>\n",
+       "      <td>0.118056</td>\n",
+       "      <td>0.010283</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>100010417011</td>\n",
+       "      <td>0.042373</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>100010417012</td>\n",
+       "      <td>0.042473</td>\n",
+       "      <td>0.010435</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>100010422011</td>\n",
+       "      <td>0.054358</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        GEOID10  Unemployed civilians (percent)  \\\n",
+       "0  100010414002                        0.030612   \n",
+       "1  100010415002                        0.118056   \n",
+       "2  100010417011                        0.042373   \n",
+       "3  100010417012                        0.042473   \n",
+       "4  100010422011                        0.054358   \n",
+       "\n",
+       "   Linguistic isolation (percent)  \n",
+       "0                        0.065963  \n",
+       "1                        0.010283  \n",
+       "2                        0.000000  \n",
+       "3                        0.010435  \n",
+       "4                        0.000000  "
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows of the joined table.\n",
+    "merged_df.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "f8903557",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GEOID10</th>\n",
+       "      <th>Unemployed civilians (percent)</th>\n",
+       "      <th>Linguistic isolation (percent)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>100019900000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>377</th>\n",
+       "      <td>100030169041</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>392</th>\n",
+       "      <td>100059900000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>400</th>\n",
+       "      <td>100039901000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>416</th>\n",
+       "      <td>100039801001</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>219505</th>\n",
+       "      <td>340057048013</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>219508</th>\n",
+       "      <td>340057048024</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>219758</th>\n",
+       "      <td>340258047001</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>219807</th>\n",
+       "      <td>340259900000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>220134</th>\n",
+       "      <td>340076113001</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1462 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             GEOID10  Unemployed civilians (percent)  \\\n",
+       "34      100019900000                             NaN   \n",
+       "377     100030169041                             NaN   \n",
+       "392     100059900000                             NaN   \n",
+       "400     100039901000                             NaN   \n",
+       "416     100039801001                             NaN   \n",
+       "...              ...                             ...   \n",
+       "219505  340057048013                             NaN   \n",
+       "219508  340057048024                             NaN   \n",
+       "219758  340258047001                             NaN   \n",
+       "219807  340259900000                             NaN   \n",
+       "220134  340076113001                             NaN   \n",
+       "\n",
+       "        Linguistic isolation (percent)  \n",
+       "34                                 NaN  \n",
+       "377                                NaN  \n",
+       "392                                NaN  \n",
+       "400                                NaN  \n",
+       "416                                NaN  \n",
+       "...                                ...  \n",
+       "219505                             NaN  \n",
+       "219508                             NaN  \n",
+       "219758                             NaN  \n",
+       "219807                             NaN  \n",
+       "220134                             0.0  \n",
+       "\n",
+       "[1462 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Rows with no unemployment value. These are either GEOID10s with no ACS match or\n",
+    "# rows where the ACS data itself is missing the value (a row can still carry a\n",
+    "# linguistic-isolation figure while unemployment is NaN).\n",
+    "merged_df.loc[merged_df[\"Unemployed civilians (percent)\"].isna()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b870a21f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}