diff --git a/score/.vscode/settings.json b/score/.vscode/settings.json index 84a192ce..3111938d 100644 --- a/score/.vscode/settings.json +++ b/score/.vscode/settings.json @@ -1,3 +1,4 @@ { - "python.pythonPath": "venv\\Scripts\\python.exe" -} \ No newline at end of file + "python.pythonPath": "venv\\Scripts\\python.exe", + "python.dataScience.sendSelectionToInteractiveWindow": false +} diff --git a/score/ipython/ejscreen_etl.ipynb b/score/ipython/ejscreen_etl.ipynb index 5a95a935..48807596 100644 --- a/score/ipython/ejscreen_etl.ipynb +++ b/score/ipython/ejscreen_etl.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "20aa3891", "metadata": {}, "outputs": [], @@ -21,21 +21,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "67a58c24", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gaftp.epa.gov'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ - "download = requests.get(\"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\", verify=False)\n", + "download = requests.get(\n", + " \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\",\n", + " verify=False,\n", + ")\n", "file_contents = download.content\n", "zip_file_path = data_path / \"tmp\"\n", "zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n", @@ -45,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "cc3fb9ec", "metadata": {}, "outputs": [], @@ -57,101 +51,33 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "b25738bb", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "df = pd.read_csv(ejscreen_csv, dtype={'ID': 'string'}, low_memory=False)" + "df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "e6994f2d", - "metadata": {}, - "outputs": [], - "source": [ - "df = df[[\"ID\", \"ACSTOTPOP\", \"LESSHSPCT\", \"LOWINCPCT\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "9fa2077a", "metadata": {}, "outputs": [], "source": [ "# write nationwide csv\n", - "df.to_csv(csv_path / f\"usa.csv\", index = False)" + "df.to_csv(csv_path / f\"usa.csv\", index=False)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "5e5cc12a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generating data01 csv\n", - "Generating data02 csv\n", - "Generating data04 csv\n", - "Generating data05 csv\n", - "Generating data06 csv\n", - "Generating data08 csv\n", - "Generating data09 csv\n", - "Generating data10 csv\n", - "Generating data11 csv\n", - "Generating data12 csv\n", - 
"Generating data13 csv\n", - "Generating data15 csv\n", - "Generating data16 csv\n", - "Generating data17 csv\n", - "Generating data18 csv\n", - "Generating data19 csv\n", - "Generating data20 csv\n", - "Generating data21 csv\n", - "Generating data22 csv\n", - "Generating data23 csv\n", - "Generating data24 csv\n", - "Generating data25 csv\n", - "Generating data26 csv\n", - "Generating data27 csv\n", - "Generating data28 csv\n", - "Generating data29 csv\n", - "Generating data30 csv\n", - "Generating data31 csv\n", - "Generating data32 csv\n", - "Generating data33 csv\n", - "Generating data34 csv\n", - "Generating data35 csv\n", - "Generating data36 csv\n", - "Generating data37 csv\n", - "Generating data38 csv\n", - "Generating data39 csv\n", - "Generating data40 csv\n", - "Generating data41 csv\n", - "Generating data42 csv\n", - "Generating data44 csv\n", - "Generating data45 csv\n", - "Generating data46 csv\n", - "Generating data47 csv\n", - "Generating data48 csv\n", - "Generating data49 csv\n", - "Generating data50 csv\n", - "Generating data51 csv\n", - "Generating data53 csv\n", - "Generating data54 csv\n", - "Generating data55 csv\n", - "Generating data56 csv\n" - ] - } - ], + "outputs": [], "source": [ "# write per state csvs\n", "with open(fips_csv_path) as csv_file:\n", @@ -166,16 +92,8 @@ " print(f\"Generating data{fips} csv\")\n", " df1 = df[df.ID.str[:2] == fips]\n", " # we need to name the file data01.csv for ogr2ogr csv merge to work\n", - " df1.to_csv(csv_path / f\"data{fips}.csv\", index = False)" + " df1.to_csv(csv_path / f\"data{fips}.csv\", index=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2674fb20", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -194,7 +112,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.7.1" } }, "nbformat": 4, diff --git a/score/ipython/score_calc.ipynb b/score/ipython/score_calc.ipynb new file 
mode 100644 index 00000000..f2f032c8 --- /dev/null +++ b/score/ipython/score_calc.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a664f981", + "metadata": {}, + "outputs": [], + "source": [ + "# Before running this notebook, you must run the notebook `ejscreen_etl.ipynb`.\n", + "\n", + "import collections\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import csv\n", + "\n", + "# Define some global parameters\n", + "BUCKET_SOCIOECONOMIC = \"Socioeconomic Factors\"\n", + "BUCKET_SENSITIVE = \"Sensitive populations\"\n", + "BUCKET_ENVIRONMENTAL = \"Environmental effects\"\n", + "BUCKET_EXPOSURES = \"Exposures\"\n", + "BUCKETS = [\n", + " BUCKET_SOCIOECONOMIC,\n", + " BUCKET_SENSITIVE,\n", + " BUCKET_ENVIRONMENTAL,\n", + " BUCKET_EXPOSURES,\n", + "]\n", + "\n", + "# There's another aggregation level (a second level of \"buckets\").\n", + "AGGREGATION_POLLUTION = \"Pollution Burden\"\n", + "AGGREGATION_POPULATION = \"Population Characteristics\"\n", + "\n", + "PERCENTILE_FIELD_SUFFIX = \" (percentile)\"\n", + "\n", + "data_path = Path.cwd().parent / \"data\"\n", + "fips_csv_path = data_path / \"fips_states_2010.csv\"\n", + "score_csv_path = data_path / \"score\" / \"csv\"\n", + "\n", + "# Tell pandas to display all columns\n", + "pd.set_option(\"display.max_columns\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7df430cb", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# EJSCreen csv Load\n", + "ejscreen_csv = data_path / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n", + "df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8567900", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a named tuple that will be used for each data set input.\n", + "DataSet = collections.namedtuple(\n", + " typename=\"DataSet\", 
field_names=[\"input_field\", \"renamed_field\", \"bucket\"]\n", + ")\n", + "\n", + "data_sets = [\n", + " # The following data sets have `bucket=None`, because it's not used in the score.\n", + " DataSet(\n", + " input_field=\"ID\", \n", + " # Use the name `GEOID10` to enable geoplatform.gov's workflow.\n", + " renamed_field=\"GEOID10\", bucket=None\n", + " ),\n", + " DataSet(input_field=\"ACSTOTPOP\", renamed_field=\"Total population\", bucket=None),\n", + " # The following data sets have buckets, because they're used in the score\n", + " DataSet(\n", + " input_field=\"CANCER\",\n", + " renamed_field=\"Air toxics cancer risk\",\n", + " bucket=BUCKET_EXPOSURES,\n", + " ),\n", + " DataSet(\n", + " input_field=\"RESP\",\n", + " renamed_field=\"Respiratory hazard index\",\n", + " bucket=BUCKET_EXPOSURES,\n", + " ),\n", + " DataSet(\n", + " input_field=\"DSLPM\",\n", + " renamed_field=\"Diesel particulate matter\",\n", + " bucket=BUCKET_EXPOSURES,\n", + " ),\n", + " DataSet(\n", + " input_field=\"PM25\",\n", + " renamed_field=\"Particulate matter (PM2.5)\",\n", + " bucket=BUCKET_EXPOSURES,\n", + " ),\n", + " DataSet(input_field=\"OZONE\", renamed_field=\"Ozone\", bucket=BUCKET_EXPOSURES),\n", + " DataSet(\n", + " input_field=\"PTRAF\",\n", + " renamed_field=\"Traffic proximity and volume\",\n", + " bucket=BUCKET_EXPOSURES,\n", + " ),\n", + " DataSet(\n", + " input_field=\"PRMP\",\n", + " renamed_field=\"Proximity to RMP sites\",\n", + " bucket=BUCKET_ENVIRONMENTAL,\n", + " ),\n", + " DataSet(\n", + " input_field=\"PTSDF\",\n", + " renamed_field=\"Proximity to TSDF sites\",\n", + " bucket=BUCKET_ENVIRONMENTAL,\n", + " ),\n", + " DataSet(\n", + " input_field=\"PNPL\",\n", + " renamed_field=\"Proximity to NPL sites\",\n", + " bucket=BUCKET_ENVIRONMENTAL,\n", + " ),\n", + " DataSet(\n", + " input_field=\"PWDIS\",\n", + " renamed_field=\"Wastewater discharge\",\n", + " bucket=BUCKET_ENVIRONMENTAL,\n", + " ),\n", + " DataSet(\n", + " input_field=\"PRE1960PCT\",\n", + " 
renamed_field=\"Percent pre-1960s housing (lead paint indicator)\",\n", + " bucket=BUCKET_ENVIRONMENTAL,\n", + " ),\n", + " DataSet(\n", + " input_field=\"UNDER5PCT\",\n", + " renamed_field=\"Individuals under 5 years old\",\n", + " bucket=BUCKET_SENSITIVE,\n", + " ),\n", + " DataSet(\n", + " input_field=\"OVER64PCT\",\n", + " renamed_field=\"Individuals over 64 years old\",\n", + " bucket=BUCKET_SENSITIVE,\n", + " ),\n", + " DataSet(\n", + " input_field=\"LINGISOPCT\",\n", + " renamed_field=\"Percent of households in linguistic isolation\",\n", + " bucket=BUCKET_SOCIOECONOMIC,\n", + " ),\n", + " DataSet(\n", + " input_field=\"LOWINCPCT\",\n", + " renamed_field=\"Poverty (Less than 200% of federal poverty line)\",\n", + " bucket=BUCKET_SOCIOECONOMIC,\n", + " ),\n", + " DataSet(\n", + " input_field=\"LESSHSPCT\",\n", + " renamed_field=\"Percent individuals age 25 or over with less than high school degree\",\n", + " bucket=BUCKET_SOCIOECONOMIC,\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e152a655", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename columns:\n", + "renaming_dict = {data_set.input_field: data_set.renamed_field for data_set in data_sets}\n", + "\n", + "df.rename(\n", + " columns=renaming_dict,\n", + " inplace=True,\n", + " errors=\"raise\",\n", + ")\n", + "\n", + "columns_to_keep = [data_set.renamed_field for data_set in data_sets]\n", + "df = df[columns_to_keep]\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27677132", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# calculate percentiles\n", + "for data_set in data_sets:\n", + " df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n", + " data_set.renamed_field\n", + " ].rank(pct=True)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f7b864f", + "metadata": { + "scrolled": true + }, + "outputs": [], + 
"source": [ + "# Calculate score \"A\" and score \"B\"\n", + "df[\"Score A\"] = df[\n", + " [\n", + " \"Poverty (Less than 200% of federal poverty line) (percentile)\",\n", + " \"Percent individuals age 25 or over with less than high school degree (percentile)\",\n", + " ]\n", + "].mean(axis=1)\n", + "df[\"Score B\"] = (\n", + " df[\"Poverty (Less than 200% of federal poverty line) (percentile)\"]\n", + " * df[\n", + " \"Percent individuals age 25 or over with less than high school degree (percentile)\"\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c107baf", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Calculate \"CalEnviroScreen for the US\" score\n", + "# Average all the percentile values in each bucket into a single score for each of the four buckets.\n", + "for bucket in BUCKETS:\n", + " fields_in_bucket = [\n", + " f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"\n", + " for data_set in data_sets\n", + " if data_set.bucket == bucket\n", + " ]\n", + " df[f\"{bucket}\"] = df[fields_in_bucket].mean(axis=1)\n", + "\n", + "# Combine the score from the two Exposures and Environmental Effects buckets into a single score called \"Pollution Burden\". 
The math for this score is: (1.0 * Exposures Score + 0.5 * Environmental Effects score) / 1.5.\n", + "df[AGGREGATION_POLLUTION] = (\n", + " 1.0 * df[f\"{BUCKET_EXPOSURES}\"] + 0.5 * df[f\"{BUCKET_ENVIRONMENTAL}\"]\n", + ") / 1.5\n", + "\n", + "# Average the score from the two Sensitive populations and Socioeconomic factors buckets into a single score called \"Population Characteristics\".\n", + "df[AGGREGATION_POPULATION] = df[\n", + " [f\"{BUCKET_SENSITIVE}\", f\"{BUCKET_SOCIOECONOMIC}\"]\n", + "].mean(axis=1)\n", + "\n", + "# Multiply the \"Pollution Burden\" score and the \"Population Characteristics\" score together to produce the cumulative impact score.\n", + "df[\"Score C\"] = df[AGGREGATION_POLLUTION] * df[AGGREGATION_POPULATION]\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "729aed12", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create percentiles for the scores\n", + "for score_field in [\"Score A\", \"Score B\", \"Score C\"]:\n", + " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[score_field].rank(pct=True)\n", + " df[f\"{score_field} (top 25th percentile)\"] = (\n", + " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] >= 0.75\n", + " )\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3a65af4", + "metadata": {}, + "outputs": [], + "source": [ + "# write nationwide csv\n", + "df.to_csv(score_csv_path / f\"usa.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58ddd8b3", + "metadata": {}, + "outputs": [], + "source": [ + "# write per state csvs\n", + "with open(fips_csv_path) as csv_file:\n", + " csv_reader = csv.reader(csv_file, delimiter=\",\")\n", + " line_count = 0\n", + "\n", + " for row in csv_reader:\n", + " if line_count == 0:\n", + " line_count += 1\n", + " else:\n", + " states_fips = row[0].strip()\n", + " print(f\"Generating data{states_fips} csv\")\n", + " df1 = 
df[df[\"GEOID10\"].str[:2] == states_fips]\n", + " # we need to name the file data01.csv for ogr2ogr csv merge to work\n", + " df1.to_csv(score_csv_path / f\"data{states_fips}.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/ipython/score_calc_0.1.ipynb b/score/ipython/score_calc_0.1.ipynb deleted file mode 100644 index 01cd19b2..00000000 --- a/score/ipython/score_calc_0.1.ipynb +++ /dev/null @@ -1,424 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "a664f981", - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "import pandas as pd\n", - "import csv\n", - "\n", - "data_path = Path.cwd().parent / \"data\"\n", - "fips_csv_path = data_path / \"fips_states_2010.csv\"\n", - "csv_path = data_path / \"score\" / \"csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7df430cb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDACSTOTPOPLESSHSPCTLOWINCPCT
00100102010016360.2081340.385220
101001020100212870.0406780.163170
20100102020018100.1355630.501247
301001020200212180.1920000.393701
401001020300126410.1254730.308217
\n", - "
" - ], - "text/plain": [ - " ID ACSTOTPOP LESSHSPCT LOWINCPCT\n", - "0 010010201001 636 0.208134 0.385220\n", - "1 010010201002 1287 0.040678 0.163170\n", - "2 010010202001 810 0.135563 0.501247\n", - "3 010010202002 1218 0.192000 0.393701\n", - "4 010010203001 2641 0.125473 0.308217" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# EJSCreen csv Load\n", - "ejscreen_csv = data_path / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n", - "df = pd.read_csv(ejscreen_csv, dtype={'ID': 'string'}, low_memory=False)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "27677132", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# calculate percentiles\n", - "df['lesshs_percentile'] = df.LESSHSPCT.rank(pct = True)\n", - "df['lowin_percentile'] = df.LOWINCPCT.rank(pct = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1f7b864f", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDACSTOTPOPLESSHSPCTLOWINCPCTlesshs_percentilelowin_percentilescore_ascore_bscore_a_percentilescore_b_percentilescore_a_top_percentile_25score_b_top_percentile_25
00100102010016360.2081340.3852200.7932920.6250150.7091540.4958200.7395400.743311FalseFalse
101001020100212870.0406780.1631700.2385500.2467220.2426360.0588560.2068050.249590FalseFalse
20100102020018100.1355630.5012470.6343900.7720020.7031960.4897500.7330090.738859FalseFalse
301001020200212180.1920000.3937010.7651260.6371580.7011420.4875060.7308480.737357FalseFalse
401001020300126410.1254730.3082170.6038410.5049770.5544090.3049250.5685710.586058FalseFalse
\n", - "
" - ], - "text/plain": [ - " ID ACSTOTPOP LESSHSPCT LOWINCPCT lesshs_percentile \\\n", - "0 010010201001 636 0.208134 0.385220 0.793292 \n", - "1 010010201002 1287 0.040678 0.163170 0.238550 \n", - "2 010010202001 810 0.135563 0.501247 0.634390 \n", - "3 010010202002 1218 0.192000 0.393701 0.765126 \n", - "4 010010203001 2641 0.125473 0.308217 0.603841 \n", - "\n", - " lowin_percentile score_a score_b score_a_percentile \\\n", - "0 0.625015 0.709154 0.495820 0.739540 \n", - "1 0.246722 0.242636 0.058856 0.206805 \n", - "2 0.772002 0.703196 0.489750 0.733009 \n", - "3 0.637158 0.701142 0.487506 0.730848 \n", - "4 0.504977 0.554409 0.304925 0.568571 \n", - "\n", - " score_b_percentile score_a_top_percentile_25 score_b_top_percentile_25 \n", - "0 0.743311 False False \n", - "1 0.249590 False False \n", - "2 0.738859 False False \n", - "3 0.737357 False False \n", - "4 0.586058 False False " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# calculate scores\n", - "df[\"score_a\"] = df[[\"lesshs_percentile\", \"lowin_percentile\"]].mean(axis=1)\n", - "df[\"score_b\"] = df.lesshs_percentile * df.lowin_percentile\n", - "\n", - "# Create percentiles for the scores \n", - "df[\"score_a_percentile\"] = df.score_a.rank(pct = True)\n", - "df[\"score_b_percentile\"] = df.score_b.rank(pct = True)\n", - "df[\"score_a_top_percentile_25\"] = df[\"score_a_percentile\"] >= 0.75\n", - "df[\"score_b_top_percentile_25\"] = df[\"score_b_percentile\"] >= 0.75\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "91755bcf", - "metadata": {}, - "outputs": [], - "source": [ - "# strip calculations\n", - "df = df[[\"ID\", \"ACSTOTPOP\", \"score_a\",\"score_b\", \"score_a_percentile\", \"score_b_percentile\",\"score_a_top_percentile_25\",\"score_b_top_percentile_25\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b3a65af4", - "metadata": {}, - "outputs": [], - "source": [ - 
"# write nationwide csv\n", - "df.to_csv(csv_path / f\"usa.csv\", index = False)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "58ddd8b3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generating data01 csv\n", - "Generating data02 csv\n", - "Generating data04 csv\n", - "Generating data05 csv\n", - "Generating data06 csv\n", - "Generating data08 csv\n", - "Generating data09 csv\n", - "Generating data10 csv\n", - "Generating data11 csv\n", - "Generating data12 csv\n", - "Generating data13 csv\n", - "Generating data15 csv\n", - "Generating data16 csv\n", - "Generating data17 csv\n", - "Generating data18 csv\n", - "Generating data19 csv\n", - "Generating data20 csv\n", - "Generating data21 csv\n", - "Generating data22 csv\n", - "Generating data23 csv\n", - "Generating data24 csv\n", - "Generating data25 csv\n", - "Generating data26 csv\n", - "Generating data27 csv\n", - "Generating data28 csv\n", - "Generating data29 csv\n", - "Generating data30 csv\n", - "Generating data31 csv\n", - "Generating data32 csv\n", - "Generating data33 csv\n", - "Generating data34 csv\n", - "Generating data35 csv\n", - "Generating data36 csv\n", - "Generating data37 csv\n", - "Generating data38 csv\n", - "Generating data39 csv\n", - "Generating data40 csv\n", - "Generating data41 csv\n", - "Generating data42 csv\n", - "Generating data44 csv\n", - "Generating data45 csv\n", - "Generating data46 csv\n", - "Generating data47 csv\n", - "Generating data48 csv\n", - "Generating data49 csv\n", - "Generating data50 csv\n", - "Generating data51 csv\n", - "Generating data53 csv\n", - "Generating data54 csv\n", - "Generating data55 csv\n", - "Generating data56 csv\n" - ] - } - ], - "source": [ - "# write per state csvs\n", - "with open(fips_csv_path) as csv_file:\n", - " csv_reader = csv.reader(csv_file, delimiter=\",\")\n", - " line_count = 0\n", - "\n", - " for row in csv_reader:\n", - " if line_count == 0:\n", - " 
line_count += 1\n", - " else:\n", - " fips = row[0].strip()\n", - " print(f\"Generating data{fips} csv\")\n", - " df1 = df[df.ID.str[:2] == fips]\n", - " # we need to name the file data01.csv for ogr2ogr csv merge to work\n", - " df1.to_csv(csv_path / f\"data{fips}.csv\", index = False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e545623b", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb index 1b6b401c..6bd965c2 100644 --- a/score/ipython/scoring_comparison.ipynb +++ b/score/ipython/scoring_comparison.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "54615cef", "metadata": {}, "outputs": [], @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "49a63129", "metadata": {}, "outputs": [], @@ -56,148 +56,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "2b26dccf", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
census_block_group_idcensus_block_group_populationcejst_scorescore_bcejst_percentilescore_b_percentilescore_a_top_percentile_25score_b_top_percentile_25cejst_priority_communitycensus_tract_id
102976001400100131150.140.020.100.14FalseFalseFalse6001400100
102986001400200110370.090.010.050.07FalseFalseFalse6001400200
10299600140020029880.150.020.110.12FalseFalseFalse6001400200
103006001400300111370.030.000.010.02FalseFalseFalse6001400300
103016001400300214040.340.090.310.31FalseFalseFalse6001400300
\n", - "
" - ], - "text/plain": [ - " census_block_group_id census_block_group_population cejst_score \\\n", - "10297 60014001001 3115 0.14 \n", - "10298 60014002001 1037 0.09 \n", - "10299 60014002002 988 0.15 \n", - "10300 60014003001 1137 0.03 \n", - "10301 60014003002 1404 0.34 \n", - "\n", - " score_b cejst_percentile score_b_percentile \\\n", - "10297 0.02 0.10 0.14 \n", - "10298 0.01 0.05 0.07 \n", - "10299 0.02 0.11 0.12 \n", - "10300 0.00 0.01 0.02 \n", - "10301 0.09 0.31 0.31 \n", - "\n", - " score_a_top_percentile_25 score_b_top_percentile_25 \\\n", - "10297 False False \n", - "10298 False False \n", - "10299 False False \n", - "10300 False False \n", - "10301 False False \n", - "\n", - " cejst_priority_community census_tract_id \n", - "10297 False 6001400100 \n", - "10298 False 6001400200 \n", - "10299 False 6001400200 \n", - "10300 False 6001400300 \n", - "10301 False 6001400300 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load CEJST score data\n", "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"usa.csv\"\n", @@ -240,19 +102,10 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "ec6b27e3", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\opt\\justice40-tool\\score\\venv\\lib\\site-packages\\urllib3\\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'justice40-data.s3.amazonaws.com'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:\n", "# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip\n", @@ -267,18 +120,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "bdf08971", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "C:\\opt\\justice40-tool\\score\\data\\tmp\n" - ] - } - ], + "outputs": [], "source": [ "# Extract zip\n", "print(zip_file_path)\n", @@ -290,231 +135,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "29c14b29", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
census_tract_idTotal PopulationCalifornia CountyZIPNearby City \\r\\n(to help approximate location only)LongitudeLatitudecalenviroscreen_scorecalenviroscreen_percentileDRAFT CES 4.0\\r\\nPercentile Range...PovertyPoverty PctlUnemploymentUnemployment PctlHousing BurdenHousing Burden PctlPop. Char.Pop. Char. ScorePop. Char. Pctlcalenviroscreen_priority_community
060190011002760Fresno93706Fresno-119.7836.7194.61100.0095-100% (highest scores)...76.6098.4316.2097.1530.7090.6193.739.7299.87True
160770007004177San Joaquin95206Stockton-121.2937.9490.8399.9995-100% (highest scores)...70.6096.4318.5098.4535.2095.6193.409.6899.84True
260770001004055San Joaquin95202Stockton-121.2937.9585.7599.9795-100% (highest scores)...81.8099.5017.9098.1736.4096.5195.719.9299.97True
360710016005527San Bernardino91761Ontario-117.6234.0683.5699.9695-100% (highest scores)...67.1094.826.7057.2032.1092.6580.598.3693.06True
460372049202639Los Angeles90023Los Angeles-118.2034.0282.9099.9595-100% (highest scores)...64.9093.515.6043.8125.0077.9583.958.7095.78True
\n", - "

5 rows × 59 columns

\n", - "
" - ], - "text/plain": [ - " census_tract_id Total Population California County ZIP \\\n", - "0 6019001100 2760 Fresno 93706 \n", - "1 6077000700 4177 San Joaquin 95206 \n", - "2 6077000100 4055 San Joaquin 95202 \n", - "3 6071001600 5527 San Bernardino 91761 \n", - "4 6037204920 2639 Los Angeles 90023 \n", - "\n", - " Nearby City \\r\\n(to help approximate location only) Longitude Latitude \\\n", - "0 Fresno -119.78 36.71 \n", - "1 Stockton -121.29 37.94 \n", - "2 Stockton -121.29 37.95 \n", - "3 Ontario -117.62 34.06 \n", - "4 Los Angeles -118.20 34.02 \n", - "\n", - " calenviroscreen_score calenviroscreen_percentile \\\n", - "0 94.61 100.00 \n", - "1 90.83 99.99 \n", - "2 85.75 99.97 \n", - "3 83.56 99.96 \n", - "4 82.90 99.95 \n", - "\n", - " DRAFT CES 4.0\\r\\nPercentile Range ... Poverty Poverty Pctl Unemployment \\\n", - "0 95-100% (highest scores) ... 76.60 98.43 16.20 \n", - "1 95-100% (highest scores) ... 70.60 96.43 18.50 \n", - "2 95-100% (highest scores) ... 81.80 99.50 17.90 \n", - "3 95-100% (highest scores) ... 67.10 94.82 6.70 \n", - "4 95-100% (highest scores) ... 64.90 93.51 5.60 \n", - "\n", - " Unemployment Pctl Housing Burden Housing Burden Pctl Pop. Char. \\\n", - "0 97.15 30.70 90.61 93.73 \n", - "1 98.45 35.20 95.61 93.40 \n", - "2 98.17 36.40 96.51 95.71 \n", - "3 57.20 32.10 92.65 80.59 \n", - "4 43.81 25.00 77.95 83.95 \n", - "\n", - " Pop. Char. Score Pop. Char. Pctl calenviroscreen_priority_community \n", - "0 9.72 99.87 True \n", - "1 9.68 99.84 True \n", - "2 9.92 99.97 True \n", - "3 8.36 93.06 True \n", - "4 8.70 95.78 True \n", - "\n", - "[5 rows x 59 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load comparison index (CalEnviroScreen 4)\n", "\n", @@ -541,142 +165,10 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "813e5656", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
census_block_group_idcensus_tract_idcensus_block_group_populationcejst_scorecejst_percentilecejst_priority_communitycalenviroscreen_scorecalenviroscreen_percentilecalenviroscreen_priority_community
060014001001600140010031150.140.10False4.402.38False
160014002001600140020010370.090.05False5.053.48False
26001400200260014002009880.150.11False5.053.48False
360014003001600140030011370.030.01False9.9213.44False
460014003002600140030014040.340.31False9.9213.44False
\n", - "
" - ], - "text/plain": [ - " census_block_group_id census_tract_id census_block_group_population \\\n", - "0 60014001001 6001400100 3115 \n", - "1 60014002001 6001400200 1037 \n", - "2 60014002002 6001400200 988 \n", - "3 60014003001 6001400300 1137 \n", - "4 60014003002 6001400300 1404 \n", - "\n", - " cejst_score cejst_percentile cejst_priority_community \\\n", - "0 0.14 0.10 False \n", - "1 0.09 0.05 False \n", - "2 0.15 0.11 False \n", - "3 0.03 0.01 False \n", - "4 0.34 0.31 False \n", - "\n", - " calenviroscreen_score calenviroscreen_percentile \\\n", - "0 4.40 2.38 \n", - "1 5.05 3.48 \n", - "2 5.05 3.48 \n", - "3 9.92 13.44 \n", - "4 9.92 13.44 \n", - "\n", - " calenviroscreen_priority_community \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Join CalEnviroScreen and CEJST data.\n", "# Note: we're joining on the census *tract*, so there will be multiple CBG entries joined to the same census tract row from CES,\n", @@ -716,58 +208,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "939baea4", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " census_tract_id calenviroscreen_score \\\n", - "census_tract_id \n", - "6019001100 6019001100 94.61 \n", - "6077000700 6077000700 90.83 \n", - "6077000100 6077000100 85.75 \n", - "6071001600 6071001600 83.56 \n", - "6037204920 6037204920 82.90 \n", - "\n", - " calenviroscreen_percentile \\\n", - "census_tract_id \n", - "6019001100 100.00 \n", - "6077000700 99.99 \n", - "6077000100 99.97 \n", - "6071001600 99.96 \n", - "6037204920 99.95 \n", - "\n", - " calenviroscreen_priority_community \\\n", - "census_tract_id \n", - "6019001100 True \n", - "6077000700 True \n", - "6077000100 True \n", - "6071001600 True \n", - "6037204920 True \n", - "\n", - " CES Tract has 
at least one CEJST CBG? \\\n", - "census_tract_id \n", - "6019001100 True \n", - "6077000700 True \n", - "6077000100 True \n", - "6071001600 True \n", - "6037204920 True \n", - "\n", - " CES Tract has 100% CEJST CBGs? \n", - "census_tract_id \n", - "6019001100 True \n", - "6077000700 True \n", - "6077000100 True \n", - "6071001600 False \n", - "6037204920 True \n" - ] - } - ], + "outputs": [], "source": [ "# Create analysis\n", "def calculate_comparison(frame):\n", @@ -826,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "85709225", "metadata": { "scrolled": true