{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e3b0c442",
   "metadata": {},
   "source": [
    "# Add county and state names to the score CSV\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. downloads the 2020 Census Gazetteer counties file,\n",
    "2. attaches state information to each county by matching on the state abbreviation,\n",
    "3. derives a 5-character county `GEOID` from the first five characters of each score row's `GEOID10`,\n",
    "4. attaches the county/state columns to the score rows by matching on `GEOID`, and\n",
    "5. writes the result to `data/score/csv/usa-county.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7185e18d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import csv\n",
    "from pathlib import Path\n",
    "import os\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "174bbd09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the parent directory on sys.path so the project-local imports below resolve\n",
    "# when the kernel's working directory is the notebook folder.\n",
    "module_path = os.path.abspath(os.path.join(\"..\"))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "from utils import unzip_file_from_url\n",
    "from etl.sources.census.etl_utils import get_state_fips_codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd090fcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All paths are resolved relative to the parent of the current working directory.\n",
    "DATA_PATH = Path.cwd().parent / \"data\"\n",
    "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
    "\n",
    "# Inputs\n",
    "STATE_CSV = DATA_PATH / \"census\" / \"csv\" / \"fips_states_2010.csv\"\n",
    "SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa.csv\"\n",
    "CENSUS_COUNTIES_ZIP_URL = \"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip\"\n",
    "CENSUS_COUNTIES_TXT = TMP_PATH / \"2020_Gaz_counties_national.txt\"\n",
    "\n",
    "# Output\n",
    "COUNTY_SCORE_CSV = DATA_PATH / \"score\" / \"csv\" / \"usa-county.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf2e266b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): presumably downloads the zip into the first path and extracts it into\n",
    "# the second, leaving CENSUS_COUNTIES_TXT in TMP_PATH -- confirm against utils.\n",
    "unzip_file_from_url(CENSUS_COUNTIES_ZIP_URL, TMP_PATH, TMP_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ff96da8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the tab-separated counties file. GEOID and USPS are read as strings so the\n",
    "# identifiers are not converted to numbers (which would drop leading zeros).\n",
    "counties_df = (\n",
    "    pd.read_csv(\n",
    "        CENSUS_COUNTIES_TXT,\n",
    "        sep=\"\\t\",\n",
    "        dtype={\"GEOID\": \"string\", \"USPS\": \"string\"},\n",
    "        low_memory=False,\n",
    "    )[[\"USPS\", \"GEOID\", \"NAME\"]]\n",
    "    # Chained rename returns a new frame instead of mutating a column slice in place.\n",
    "    .rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"})\n",
    ")\n",
    "counties_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5af103da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the state lookup table and rename its columns to match the labels used for\n",
    "# the counties frame, so both share a \"State Abbreviation\" key column.\n",
    "states_df = pd.read_csv(\n",
    "    STATE_CSV,\n",
    "    dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"},\n",
    ").rename(\n",
    "    columns={\n",
    "        \"fips\": \"State Code\",\n",
    "        \"state_name\": \"State Name\",\n",
    "        \"state_abbreviation\": \"State Abbreviation\",\n",
    "    }\n",
    ")\n",
    "states_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8680258",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match each county to its state on the shared \"State Abbreviation\" key.\n",
    "# DataFrame.join aligns on the row index by default, which would pair counties and\n",
    "# states by row position; merge(on=...) matches on the key values instead.\n",
    "county_state_merged = counties_df.merge(\n",
    "    states_df,\n",
    "    on=\"State Abbreviation\",\n",
    "    how=\"left\",  # keep every county, even if its state is missing from the lookup\n",
    "    suffixes=(\"\", \" Other\"),  # same suffix the notebook used for any other overlapping column\n",
    "    validate=\"many_to_one\",  # fail loudly if a state abbreviation is duplicated in states_df\n",
    ")\n",
    "county_state_merged.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58dca55a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the score file with GEOID10 as a string, then derive the county-level key\n",
    "# from its first five characters.\n",
    "score_df = pd.read_csv(SCORE_CSV, dtype={\"GEOID10\": \"string\"}).assign(\n",
    "    GEOID=lambda scores: scores[\"GEOID10\"].str[:5]\n",
    ")\n",
    "score_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45e04d42",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach county/state columns to every score row by matching on the county GEOID.\n",
    "# As above, a key-based merge replaces the index-aligned join, which matched rows\n",
    "# by position rather than by GEOID.\n",
    "score_county_state_merged = score_df.merge(\n",
    "    county_state_merged,\n",
    "    on=\"GEOID\",\n",
    "    how=\"left\",  # keep every score row, even without a matching county\n",
    "    suffixes=(\"\", \"_OTHER\"),  # same suffix the notebook used for any other overlapping column\n",
    "    validate=\"many_to_one\",  # fail loudly if a county GEOID is duplicated in the lookup\n",
    ")\n",
    "score_county_state_merged.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5a0b32b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the enriched score rows without the pandas row index.\n",
    "score_county_state_merged.to_csv(COUNTY_SCORE_CSV, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}