Data directory should adopt standard Poetry-suggested python package structure (#457)

* Fixes #456 - Our data directory should adopt standard python package structure * a few missed references * updating readme * updating requirements * Running Black * Fixes for flake8 * updating pylint
2025-09-13 18:28:17 -07:00 · 2021-08-05 15:35:54 -04:00 · 2021-08-05 15:35:54 -04:00 · c1568e87c0
commit c1568e87c0
parent 4d7465c833
61 changed files with 1273 additions and 1256 deletions
--- a/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb
@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "import pandas as pd\n",
+    "import csv\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "import sys"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "module_path = os.path.abspath(os.path.join(\"..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "    \n",
+    "from data_pipeline.utils import unzip_file_from_url\n",
+    "from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
+    "STATE_CSV = DATA_PATH / \"census\" / \"csv\" / \"fips_states_2010.csv\"\n",
+    "SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa.csv\"\n",
+    "COUNTY_SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa-county.csv\"\n",
+    "CENSUS_COUNTIES_ZIP_URL = \"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip\"\n",
+    "CENSUS_COUNTIES_TXT = TMP_PATH / \"2020_Gaz_counties_national.txt\""
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "unzip_file_from_url(CENSUS_COUNTIES_ZIP_URL, TMP_PATH, TMP_PATH)"
+   ],
+   "outputs": [],
+   "metadata": {
+    "scrolled": true
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
+    "counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
+    "counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
+    "counties_df.head()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
+    "states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
+    "states_df.head()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n",
+    "del county_state_merged[\"State Abbreviation Other\"]\n",
+    "county_state_merged.head()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "score_df = pd.read_csv(SCORE_CSV, dtype={\"GEOID10\": \"string\"})\n",
+    "score_df[\"GEOID\"] = score_df.GEOID10.str[:5]\n",
+    "score_df.head()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n",
+    "del score_county_state_merged[\"GEOID_OTHER\"]\n",
+    "score_county_state_merged.head()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "score_county_state_merged.to_csv(COUNTY_SCORE_CSV, index=False)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}