j40-cejst-2/data/data-pipeline/data_pipeline/ipython/county_lookup.ipynb
Lucas Merrill Brown b1a4d26be8
Adding persistent poverty tracts (#738)
* persistent poverty working

* fixing left-padding

* running black and adding persistent poverty to comp tool

* fixing bug

* running black and fixing linter

* fixing linter

* fixing linter error
2021-09-22 17:57:08 -04:00

166 lines
No EOL
4.1 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"source": [
"import pandas as pd\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import sys"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Put the parent of the current working directory on sys.path (only once),\n",
"# then pull in the project helpers used further down.\n",
"module_path = os.path.abspath(os.pardir)\n",
"if sys.path.count(module_path) == 0:\n",
"    sys.path.append(module_path)\n",
"\n",
"from data_pipeline.utils import unzip_file_from_url\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Directories, resolved relative to the current working directory.\n",
"DATA_PATH = Path.cwd().parent.joinpath(\"data\")\n",
"TMP_PATH: Path = DATA_PATH.joinpath(\"tmp\")\n",
"\n",
"# CSV inputs, and the county-level output written at the end of the notebook.\n",
"STATE_CSV = DATA_PATH.joinpath(\"census\", \"csv\", \"fips_states_2010.csv\")\n",
"SCORE_CSV = DATA_PATH.joinpath(\"score\", \"csv\", \"usa.csv\")\n",
"COUNTY_SCORE_CSV = SCORE_CSV.with_name(\"usa-county.csv\")\n",
"\n",
"# County gazetteer archive and the text file expected under TMP_PATH after unzipping.\n",
"CENSUS_COUNTIES_ZIP_URL = \"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip\"\n",
"CENSUS_COUNTIES_TXT = TMP_PATH.joinpath(\"2020_Gaz_counties_national.txt\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Fetch and unpack the county gazetteer archive at CENSUS_COUNTIES_ZIP_URL.\n",
"# TMP_PATH is passed for both remaining arguments (presumably the download and\n",
"# extraction directories -- confirm against unzip_file_from_url).\n",
"unzip_file_from_url(CENSUS_COUNTIES_ZIP_URL, TMP_PATH, TMP_PATH)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Load the tab-separated county gazetteer, reading GEOID and USPS as strings so\n",
"# pandas does not parse them as numbers, then keep and relabel the three\n",
"# columns used downstream.\n",
"counties_df = pd.read_csv(\n",
"    CENSUS_COUNTIES_TXT,\n",
"    sep=\"\\t\",\n",
"    dtype={\"GEOID\": \"string\", \"USPS\": \"string\"},\n",
"    low_memory=False,\n",
")[[\"USPS\", \"GEOID\", \"NAME\"]].rename(\n",
"    columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}\n",
")\n",
"counties_df.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Load the state lookup table with string-typed codes and give its columns the\n",
"# display names shared with counties_df.\n",
"state_column_labels = {\n",
"    \"fips\": \"State Code\",\n",
"    \"state_name\": \"State Name\",\n",
"    \"state_abbreviation\": \"State Abbreviation\",\n",
"}\n",
"states_df = pd.read_csv(\n",
"    STATE_CSV,\n",
"    dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"},\n",
").rename(columns=state_column_labels)\n",
"states_df.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Attach the state columns to every county by matching on the state abbreviation.\n",
"# DataFrame.join aligns on the row index, which would pair county row i with\n",
"# state row i regardless of the state the county belongs to, so a key-based\n",
"# merge is used instead.\n",
"county_state_merged = counties_df.merge(\n",
"    states_df,\n",
"    how=\"left\",  # keep every county, even if its state is missing from the lookup\n",
"    on=\"State Abbreviation\",\n",
"    suffixes=(\"\", \" Other\"),  # only applies to non-key columns present in both frames\n",
"    validate=\"many_to_one\",  # fail loudly if a state abbreviation is duplicated\n",
")\n",
"county_state_merged.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Read the score file with GEOID10 as a string and derive GEOID from its first\n",
"# five characters (presumably the county-level prefix -- confirm against the\n",
"# score schema).\n",
"score_df = pd.read_csv(SCORE_CSV, dtype={\"GEOID10\": \"string\"})\n",
"score_df = score_df.assign(GEOID=score_df[\"GEOID10\"].str.slice(stop=5))\n",
"score_df.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Add the county and state columns to each score row by matching on GEOID.\n",
"# DataFrame.join aligns on the row index, which would pair score row i with\n",
"# county row i rather than with the county that shares its GEOID, so a\n",
"# key-based merge is used instead.\n",
"score_county_state_merged = score_df.merge(\n",
"    county_state_merged,\n",
"    how=\"left\",  # keep every score row, matched or not\n",
"    on=\"GEOID\",\n",
"    suffixes=(\"\", \"_OTHER\"),  # only applies to non-key columns present in both frames\n",
"    validate=\"many_to_one\",  # fail loudly if a county GEOID is duplicated\n",
")\n",
"score_county_state_merged.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Write the merged frame to COUNTY_SCORE_CSV, leaving out the DataFrame index.\n",
"score_county_state_merged.to_csv(COUNTY_SCORE_CSV, index=False)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}