This commit is contained in:
lucasmbrown-usds 2021-08-09 22:24:14 -05:00
commit ebe6180f7c
3 changed files with 180 additions and 4 deletions

View file

@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1b0b3db8",
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import functools\n",
"import IPython\n",
"import itertools\n",
"import matplotlib\n",
"import numpy as np\n",
"import os\n",
"import pandas as pd\n",
"import pathlib\n",
"import pypandoc\n",
"import requests\n",
"import string\n",
"import sys\n",
"import typing\n",
"import us\n",
"import zipfile\n",
"\n",
"from datetime import datetime\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be9bff9f",
"metadata": {},
"outputs": [],
"source": [
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
"pd.options.display.float_format = \"{:.2f}\".format\n",
"\n",
"# Set some global parameters\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94407baa",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Load CEJST score data\n",
"cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
"cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
"\n",
"cejst_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29babd55",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"columns_to_plot = [\n",
" \"Respiratory hazard index\",\n",
" \"Particulate matter (PM2.5)\",\n",
" \"Poverty (Less than 200% of federal poverty line)\",\n",
" \"Percent individuals age 25 or over with less than high school degree\",\n",
" \"Unemployed civilians (percent)\",\n",
" \"Linguistic isolation (percent)\"\n",
"]\n",
"\n",
"column_to_plot = columns_to_plot[0]\n",
"print(f\"Plotting {column_to_plot}\")\n",
"print(cejst_df[\n",
" column_to_plot\n",
"].hist())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e5c8dcf",
"metadata": {},
"outputs": [],
"source": [
"for i in cejst_df.columns:\n",
" print(i)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}