j40-cejst-2/score/ipython/score_calc_0.1.ipynb
2021-06-22 11:57:59 -04:00

424 lines
13 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a664f981",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import csv\n",
"\n",
"# Input and output locations, all resolved from the parent of the working directory.\n",
"data_path = Path.cwd().parent.joinpath(\"data\")\n",
"fips_csv_path = data_path.joinpath(\"fips_states_2010.csv\")\n",
"csv_path = data_path.joinpath(\"score\", \"csv\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7df430cb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>ACSTOTPOP</th>\n",
" <th>LESSHSPCT</th>\n",
" <th>LOWINCPCT</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010201001</td>\n",
" <td>636</td>\n",
" <td>0.208134</td>\n",
" <td>0.385220</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010201002</td>\n",
" <td>1287</td>\n",
" <td>0.040678</td>\n",
" <td>0.163170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010202001</td>\n",
" <td>810</td>\n",
" <td>0.135563</td>\n",
" <td>0.501247</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010202002</td>\n",
" <td>1218</td>\n",
" <td>0.192000</td>\n",
" <td>0.393701</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010203001</td>\n",
" <td>2641</td>\n",
" <td>0.125473</td>\n",
" <td>0.308217</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID ACSTOTPOP LESSHSPCT LOWINCPCT\n",
"0 010010201001 636 0.208134 0.385220\n",
"1 010010201002 1287 0.040678 0.163170\n",
"2 010010202001 810 0.135563 0.501247\n",
"3 010010202002 1218 0.192000 0.393701\n",
"4 010010203001 2641 0.125473 0.308217"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the EJSCREEN 2020 CSV.\n",
"ejscreen_csv = data_path.joinpath(\"dataset\", \"ejscreen_2020\", \"usa.csv\")\n",
"# Read ID as a string so leading zeros survive (e.g. 010010201001).\n",
"df = pd.read_csv(\n",
"    ejscreen_csv,\n",
"    dtype={\"ID\": \"string\"},\n",
"    low_memory=False,\n",
")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "27677132",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Convert each raw indicator into its percentile rank across all rows.\n",
"df[\"lesshs_percentile\"] = df[\"LESSHSPCT\"].rank(pct=True)\n",
"df[\"lowin_percentile\"] = df[\"LOWINCPCT\"].rank(pct=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1f7b864f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>ACSTOTPOP</th>\n",
" <th>LESSHSPCT</th>\n",
" <th>LOWINCPCT</th>\n",
" <th>lesshs_percentile</th>\n",
" <th>lowin_percentile</th>\n",
" <th>score_a</th>\n",
" <th>score_b</th>\n",
" <th>score_a_percentile</th>\n",
" <th>score_b_percentile</th>\n",
" <th>score_a_top_percentile_25</th>\n",
" <th>score_b_top_percentile_25</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>010010201001</td>\n",
" <td>636</td>\n",
" <td>0.208134</td>\n",
" <td>0.385220</td>\n",
" <td>0.793292</td>\n",
" <td>0.625015</td>\n",
" <td>0.709154</td>\n",
" <td>0.495820</td>\n",
" <td>0.739540</td>\n",
" <td>0.743311</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>010010201002</td>\n",
" <td>1287</td>\n",
" <td>0.040678</td>\n",
" <td>0.163170</td>\n",
" <td>0.238550</td>\n",
" <td>0.246722</td>\n",
" <td>0.242636</td>\n",
" <td>0.058856</td>\n",
" <td>0.206805</td>\n",
" <td>0.249590</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>010010202001</td>\n",
" <td>810</td>\n",
" <td>0.135563</td>\n",
" <td>0.501247</td>\n",
" <td>0.634390</td>\n",
" <td>0.772002</td>\n",
" <td>0.703196</td>\n",
" <td>0.489750</td>\n",
" <td>0.733009</td>\n",
" <td>0.738859</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>010010202002</td>\n",
" <td>1218</td>\n",
" <td>0.192000</td>\n",
" <td>0.393701</td>\n",
" <td>0.765126</td>\n",
" <td>0.637158</td>\n",
" <td>0.701142</td>\n",
" <td>0.487506</td>\n",
" <td>0.730848</td>\n",
" <td>0.737357</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>010010203001</td>\n",
" <td>2641</td>\n",
" <td>0.125473</td>\n",
" <td>0.308217</td>\n",
" <td>0.603841</td>\n",
" <td>0.504977</td>\n",
" <td>0.554409</td>\n",
" <td>0.304925</td>\n",
" <td>0.568571</td>\n",
" <td>0.586058</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID ACSTOTPOP LESSHSPCT LOWINCPCT lesshs_percentile \\\n",
"0 010010201001 636 0.208134 0.385220 0.793292 \n",
"1 010010201002 1287 0.040678 0.163170 0.238550 \n",
"2 010010202001 810 0.135563 0.501247 0.634390 \n",
"3 010010202002 1218 0.192000 0.393701 0.765126 \n",
"4 010010203001 2641 0.125473 0.308217 0.603841 \n",
"\n",
" lowin_percentile score_a score_b score_a_percentile \\\n",
"0 0.625015 0.709154 0.495820 0.739540 \n",
"1 0.246722 0.242636 0.058856 0.206805 \n",
"2 0.772002 0.703196 0.489750 0.733009 \n",
"3 0.637158 0.701142 0.487506 0.730848 \n",
"4 0.504977 0.554409 0.304925 0.568571 \n",
"\n",
" score_b_percentile score_a_top_percentile_25 score_b_top_percentile_25 \n",
"0 0.743311 False False \n",
"1 0.249590 False False \n",
"2 0.738859 False False \n",
"3 0.737357 False False \n",
"4 0.586058 False False "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Score A is the mean of the two indicator percentiles; score B is their product.\n",
"df[\"score_a\"] = df[[\"lesshs_percentile\", \"lowin_percentile\"]].mean(axis=\"columns\")\n",
"df[\"score_b\"] = df[\"lesshs_percentile\"] * df[\"lowin_percentile\"]\n",
"\n",
"# Percentile-rank each score, then flag rows at or above the 75th percentile.\n",
"df[\"score_a_percentile\"] = df[\"score_a\"].rank(pct=True)\n",
"df[\"score_b_percentile\"] = df[\"score_b\"].rank(pct=True)\n",
"df[\"score_a_top_percentile_25\"] = df[\"score_a_percentile\"].ge(0.75)\n",
"df[\"score_b_top_percentile_25\"] = df[\"score_b_percentile\"].ge(0.75)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "91755bcf",
"metadata": {},
"outputs": [],
"source": [
"# Drop the intermediate indicator columns; keep only what is exported.\n",
"output_columns = [\n",
"    \"ID\",\n",
"    \"ACSTOTPOP\",\n",
"    \"score_a\",\n",
"    \"score_b\",\n",
"    \"score_a_percentile\",\n",
"    \"score_b_percentile\",\n",
"    \"score_a_top_percentile_25\",\n",
"    \"score_b_top_percentile_25\",\n",
"]\n",
"df = df[output_columns]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b3a65af4",
"metadata": {},
"outputs": [],
"source": [
"# Write the nationwide CSV. Create the output directory first, because\n",
"# DataFrame.to_csv does not create missing parent directories.\n",
"csv_path.mkdir(parents=True, exist_ok=True)\n",
"df.to_csv(csv_path / \"usa.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "58ddd8b3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating data01 csv\n",
"Generating data02 csv\n",
"Generating data04 csv\n",
"Generating data05 csv\n",
"Generating data06 csv\n",
"Generating data08 csv\n",
"Generating data09 csv\n",
"Generating data10 csv\n",
"Generating data11 csv\n",
"Generating data12 csv\n",
"Generating data13 csv\n",
"Generating data15 csv\n",
"Generating data16 csv\n",
"Generating data17 csv\n",
"Generating data18 csv\n",
"Generating data19 csv\n",
"Generating data20 csv\n",
"Generating data21 csv\n",
"Generating data22 csv\n",
"Generating data23 csv\n",
"Generating data24 csv\n",
"Generating data25 csv\n",
"Generating data26 csv\n",
"Generating data27 csv\n",
"Generating data28 csv\n",
"Generating data29 csv\n",
"Generating data30 csv\n",
"Generating data31 csv\n",
"Generating data32 csv\n",
"Generating data33 csv\n",
"Generating data34 csv\n",
"Generating data35 csv\n",
"Generating data36 csv\n",
"Generating data37 csv\n",
"Generating data38 csv\n",
"Generating data39 csv\n",
"Generating data40 csv\n",
"Generating data41 csv\n",
"Generating data42 csv\n",
"Generating data44 csv\n",
"Generating data45 csv\n",
"Generating data46 csv\n",
"Generating data47 csv\n",
"Generating data48 csv\n",
"Generating data49 csv\n",
"Generating data50 csv\n",
"Generating data51 csv\n",
"Generating data53 csv\n",
"Generating data54 csv\n",
"Generating data55 csv\n",
"Generating data56 csv\n"
]
}
],
"source": [
"# Write one CSV per state. The state codes come from the first column of the\n",
"# FIPS file; each output holds the rows whose ID starts with that code.\n",
"# newline=\"\" is what the csv module expects when it is handed a file object.\n",
"with open(fips_csv_path, newline=\"\") as csv_file:\n",
"    csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
"    next(csv_reader, None)  # skip the header row (no error if the file is empty)\n",
"\n",
"    for row in csv_reader:\n",
"        fips = row[0].strip() if row else \"\"\n",
"        if not fips:\n",
"            # blank line or empty first field: nothing to export\n",
"            continue\n",
"        print(f\"Generating data{fips} csv\")\n",
"        # select the rows whose ID begins with this two-character code\n",
"        df1 = df[df.ID.str[:2] == fips]\n",
"        # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
"        df1.to_csv(csv_path / f\"data{fips}.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e545623b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}