From caf9fe0e00c73557386f8115e0dae92e777fa6ca Mon Sep 17 00:00:00 2001 From: Lucas Merrill Brown Date: Tue, 29 Jun 2021 08:20:23 -0700 Subject: [PATCH] Scores D & E (#266) * running black throughout * adding housing * hud housing etl working * got score d and e working * updating scoring comparison * minor fixes * small changes * small comments --- score/ipython/census_etl.ipynb | 83 +++++- score/ipython/ejscreen_etl.ipynb | 24 +- .../housing_and_transportation_etl.ipynb | 14 +- score/ipython/hud_housing_etl.ipynb | 274 ++++++++++++++++++ score/ipython/score_calc.ipynb | 197 +++++++++++-- score/ipython/scoring_comparison.ipynb | 107 ++++++- 6 files changed, 625 insertions(+), 74 deletions(-) create mode 100644 score/ipython/hud_housing_etl.ipynb diff --git a/score/ipython/census_etl.ipynb b/score/ipython/census_etl.ipynb index ac4a51ef..8f2ab4b6 100644 --- a/score/ipython/census_etl.ipynb +++ b/score/ipython/census_etl.ipynb @@ -14,7 +14,7 @@ "import os\n", "import sys\n", "\n", - "module_path = os.path.abspath(os.path.join('..'))\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", @@ -26,13 +26,36 @@ "OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", "\n", "GEOID_FIELD_NAME = \"GEOID10\"\n", - "UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n", + "UNEMPLOYED_FIELD_NAME = \"Unemployed civilians (percent)\"\n", + "LINGUISTIC_ISOLATION_FIELD_NAME = \"Linguistic isolation (percent)\"\n", + "LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = \"Linguistic isolation (total)\"\n", + "\n", + "LINGUISTIC_ISOLATION_FIELDS = [\n", + " \"C16002_001E\",\n", + " \"C16002_004E\",\n", + " \"C16002_007E\",\n", + " \"C16002_010E\",\n", + " \"C16002_013E\",\n", + "]\n", "\n", "# Some display settings to make pandas outputs more readable.\n", "pd.set_option(\"display.expand_frame_repr\", False)\n", "pd.set_option(\"display.precision\", 2)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "64df0b63", + "metadata": {}, + "outputs": [], + "source": [ + "# For variable discovery, if necessary.\n", + "# censusdata.search(\n", + "# \"acs5\", 2019, \"label\", \"Limited English speaking\"\n", + "# )" + ] + }, { "cell_type": "code", "execution_count": null, @@ -44,7 +67,8 @@ "source": [ "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", - "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))" + "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))\n", + "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"C16002\"))" ] }, { @@ -73,10 +97,16 @@ " geo=censusdata.censusgeo(\n", " [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n", " ),\n", - " var=[\"B23025_005E\", \"B23025_003E\"],\n", + " var=[\n", + " # Emploment fields\n", + " \"B23025_005E\",\n", + " \"B23025_003E\",\n", + " ]\n", + " + LINGUISTIC_ISOLATION_FIELDS,\n", " )\n", " )\n", "\n", + "\n", "df = pd.concat(dfs)\n", "\n", "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n", @@ -97,7 +127,34 @@ "# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.\n", "df[UNEMPLOYED_FIELD_NAME] = df.B23025_005E / df.B23025_003E\n", "\n", - "df.head()" + "df[UNEMPLOYED_FIELD_NAME].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e475472c", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Calculate linguistic isolation.\n", + "individual_limited_english_fields = [\n", + " \"C16002_004E\",\n", + " \"C16002_007E\",\n", + " \"C16002_010E\",\n", + " \"C16002_013E\",\n", + "]\n", + "\n", + "df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[individual_limited_english_fields].sum(\n", + " axis=1, skipna=True\n", + ")\n", + "df[LINGUISTIC_ISOLATION_FIELD_NAME] = (\n", + " df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float) / df[\"C16002_001E\"]\n", + ")\n", + "\n", + "df[LINGUISTIC_ISOLATION_FIELD_NAME].describe()" ] }, { @@ -112,18 +169,14 @@ "# mkdir census\n", "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n", "\n", - "columns_to_include = [GEOID_FIELD_NAME, UNEMPLOYED_FIELD_NAME]\n", + "columns_to_include = [\n", + " GEOID_FIELD_NAME,\n", + " UNEMPLOYED_FIELD_NAME,\n", + " LINGUISTIC_ISOLATION_FIELD_NAME,\n", + "]\n", "\n", "df[columns_to_include].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91932af5", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -142,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.7.1" } }, "nbformat": 4, diff --git a/score/ipython/ejscreen_etl.ipynb b/score/ipython/ejscreen_etl.ipynb index 05b882d0..37d81d6c 100644 --- a/score/ipython/ejscreen_etl.ipynb +++ b/score/ipython/ejscreen_etl.ipynb @@ -14,7 +14,7 @@ "import sys\n", "import os\n", "\n", - "module_path = os.path.abspath(os.path.join('..'))\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", @@ -23,7 +23,9 @@ "\n", "DATA_PATH = Path.cwd().parent / \"data\"\n", "TMP_PATH = DATA_PATH / \"tmp\"\n", - "EJSCREEN_FTP_URL = \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\"\n", + "EJSCREEN_FTP_URL = (\n", + " \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\"\n", + ")\n", "EJSCREEN_CSV = TMP_PATH / \"EJSCREEN_2020_StatePctile.csv\"\n", "CSV_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2020\"\n", "print(DATA_PATH)" @@ -49,7 +51,13 @@ }, "outputs": [], "source": [ - "df = pd.read_csv(EJSCREEN_CSV, dtype={\"ID\": \"string\"}, low_memory=False)" + "df = pd.read_csv(\n", + " EJSCREEN_CSV,\n", + " dtype={\"ID\": \"string\"},\n", + " # EJSCREEN writes the word \"None\" for NA data.\n", + " na_values=[\"None\"],\n", + " low_memory=False,\n", + ")" ] }, { @@ -89,14 +97,6 @@ "# cleanup\n", "remove_all_from_dir(TMP_PATH)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d4f74d7", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -115,7 +115,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.7.1" } }, "nbformat": 4, diff --git a/score/ipython/housing_and_transportation_etl.ipynb b/score/ipython/housing_and_transportation_etl.ipynb index 96eb5d27..ed3f888a 100644 --- a/score/ipython/housing_and_transportation_etl.ipynb +++ b/score/ipython/housing_and_transportation_etl.ipynb @@ -14,7 +14,7 @@ "import os\n", "import sys\n", "\n", - "module_path = os.path.abspath(os.path.join('..'))\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", @@ -44,7 +44,7 @@ "for fips in get_state_fips_codes(DATA_PATH):\n", " print(f\"Downloading housing data for state/territory with FIPS code {fips}\")\n", " unzip_file_from_url(f\"{HOUSING_FTP_URL}{fips}\", TMP_PATH, zip_file_dir)\n", - " \n", + "\n", " # New file name:\n", " tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n", " tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n", @@ -90,14 +90,6 @@ "# cleanup\n", "remove_all_from_dir(TMP_PATH)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9269e497", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -116,7 +108,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.7.1" } }, "nbformat": 4, diff --git a/score/ipython/hud_housing_etl.ipynb b/score/ipython/hud_housing_etl.ipynb new file mode 100644 index 00000000..c8647b5c --- /dev/null +++ b/score/ipython/hud_housing_etl.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c21b63a3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import censusdata\n", + "import csv\n", + "from pathlib import Path\n", + "import os\n", + "import re\n", + "import sys\n", + "\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from etl.sources.census.etl_utils import get_state_fips_codes\n", + "from utils import unzip_file_from_url, remove_all_from_dir\n", + "\n", + "DATA_PATH = Path.cwd().parent / \"data\"\n", + "TMP_PATH = DATA_PATH / \"tmp\"\n", + "OUTPUT_PATH = DATA_PATH / \"dataset\" / \"hud_housing\"\n", + "\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", + "\n", + "# We measure households earning less than 80% of HUD Area Median Family Income by county\n", + "# and paying greater than 30% of their income to housing costs.\n", + "HOUSING_BURDEN_FIELD_NAME = \"Housing burden (percent)\"\n", + "HOUSING_BURDEN_NUMERATOR_FIELD_NAME = \"HOUSING_BURDEN_NUMERATOR\"\n", + "HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = \"HOUSING_BURDEN_DENOMINATOR\"\n", + "\n", + "# Note: some variable definitions.\n", + "# HUD-adjusted median family income (HAMFI).\n", + "# The four housing problems are: incomplete kitchen facilities, incomplete plumbing facilities, more than 1 person per room, and cost burden greater than 30%.\n", + "# Table 8 is the desired table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6696bc66", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# Download the data.\n", + "dfs = []\n", + "zip_file_dir = TMP_PATH / \"hud_housing\"\n", + "\n", + "print(f\"Downloading 225MB housing data\")\n", + "unzip_file_from_url(\n", + " \"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip\",\n", + " TMP_PATH,\n", + " zip_file_dir,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e954589", + "metadata": {}, + "outputs": [], + "source": [ + "# New file name:\n", + "tmp_csv_file_path = (\n", + " zip_file_dir\n", + " / \"2012thru2016-140-csv\"\n", + " / \"2012thru2016-140-csv\"\n", + " / \"140\"\n", + " / \"Table8.csv\"\n", + ")\n", + "df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "244e0d03", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename and reformat block group ID\n", + "df.rename(columns={\"geoid\": GEOID_TRACT_FIELD_NAME}, inplace=True)\n", + "\n", + "# The CHAS data has census tract ids such as `14000US01001020100`\n", + "# Whereas the rest of our data uses, for the same tract, `01001020100`.\n", + "# the characters before `US`:\n", + "df[GEOID_TRACT_FIELD_NAME] = df[GEOID_TRACT_FIELD_NAME].str.replace(\n", + " r\"^.*?US\", \"\", regex=True\n", + ")\n", + "\n", + "df[GEOID_TRACT_FIELD_NAME].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03250026", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate housing burden\n", + "# This is quite a number of steps. It does not appear to be accessible nationally in a simpler format, though.\n", + "# See \"CHAS data dictionary 12-16.xlsx\"\n", + "\n", + "# Owner occupied numerator fields\n", + "OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n", + " # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n", + " # T8_est7\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n", + " \"T8_est7\",\n", + " # T8_est10\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tgreater than 50%\tAll\n", + " \"T8_est10\",\n", + " # T8_est20\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n", + " \"T8_est20\",\n", + " # T8_est23\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 50%\tAll\n", + " \"T8_est23\",\n", + " # T8_est33\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n", + " \"T8_est33\",\n", + " # T8_est36\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 50%\tAll\n", + " \"T8_est36\",\n", + "]\n", + "\n", + "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n", + "OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n", + " # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n", + " # T8_est13\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est13\",\n", + " # T8_est26\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est26\",\n", + " # T8_est39\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est39\",\n", + " # T8_est52\tSubtotal\tOwner occupied\tgreater than 80% but less than or equal to 100% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est52\",\n", + " # T8_est65\tSubtotal\tOwner occupied\tgreater than 100% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est65\",\n", + "]\n", + "\n", + "# T8_est2\tSubtotal\tOwner occupied\tAll\tAll\tAll\n", + "OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n", + "\n", + "# Renter occupied numerator fields\n", + "RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n", + " # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n", + " # T8_est73\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n", + " \"T8_est73\",\n", + " # T8_est76\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tgreater than 50%\tAll\n", + " \"T8_est76\",\n", + " # T8_est86\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n", + " \"T8_est86\",\n", + " # T8_est89\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 50%\tAll\n", + " \"T8_est89\",\n", + " # T8_est99\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n", + " \"T8_est99\",\n", + " # T8_est102\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 50%\tAll\n", + " \"T8_est102\",\n", + "]\n", + "\n", + "# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n", + "RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n", + " # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n", + " # T8_est79\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est79\",\n", + " # T8_est92\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est92\",\n", + " # T8_est105\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est105\",\n", + " # T8_est118\tSubtotal\tRenter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est118\",\n", + " # T8_est131\tSubtotal\tRenter occupied\tgreater than 100% of HAMFI\tnot computed (no/negative income)\tAll\n", + " \"T8_est131\",\n", + "]\n", + "\n", + "\n", + "# T8_est68\tSubtotal\tRenter occupied\tAll\tAll\tAll\n", + "RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\"\n", + "\n", + "\n", + "# Math:\n", + "# (\n", + "# # of Owner Occupied Units Meeting Criteria\n", + "# + # of Renter Occupied Units Meeting Criteria\n", + "# )\n", + "# divided by\n", + "# (\n", + "# Total # of Owner Occupied Units\n", + "# + Total # of Renter Occupied Units\n", + "# - # of Owner Occupied Units with HAMFI Not Computed\n", + "# - # of Renter Occupied Units with HAMFI Not Computed\n", + "# )\n", + "\n", + "df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME] = df[OWNER_OCCUPIED_NUMERATOR_FIELDS].sum(\n", + " axis=1\n", + ") + df[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)\n", + "\n", + "df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME] = (\n", + " df[OWNER_OCCUPIED_POPULATION_FIELD]\n", + " + df[RENTER_OCCUPIED_POPULATION_FIELD]\n", + " - df[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n", + " - df[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n", + ")\n", + "\n", + "# TODO: add small sample size checks\n", + "df[HOUSING_BURDEN_FIELD_NAME] = df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME].astype(\n", + " float\n", + ") / df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(float)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8275c1ef", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Drop unnecessary fields\n", + "df[\n", + " [\n", + " GEOID_TRACT_FIELD_NAME,\n", + " HOUSING_BURDEN_NUMERATOR_FIELD_NAME,\n", + " HOUSING_BURDEN_DENOMINATOR_FIELD_NAME,\n", + " HOUSING_BURDEN_FIELD_NAME,\n", + " ]\n", + "].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef5bb862", + "metadata": {}, + "outputs": [], + "source": [ + "# cleanup\n", + "remove_all_from_dir(TMP_PATH)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/score/ipython/score_calc.ipynb b/score/ipython/score_calc.ipynb index bb5f0a64..39424812 100644 --- a/score/ipython/score_calc.ipynb +++ b/score/ipython/score_calc.ipynb @@ -11,6 +11,7 @@ "# 1. `ejscreen_etl.ipynb`\n", "# 2. `census_etl.ipynb`\n", "# 3. `housing_and_transportation_etl.ipynb`\n", + "# 4. `hud_housing_etl.ipynb`\n", "\n", "import collections\n", "import functools\n", @@ -20,7 +21,7 @@ "import os\n", "import sys\n", "\n", - "module_path = os.path.abspath(os.path.join('..'))\n", + "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", @@ -28,6 +29,7 @@ "\n", "# Define some global parameters\n", "GEOID_FIELD_NAME = \"GEOID10\"\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", "BUCKET_SOCIOECONOMIC = \"Socioeconomic Factors\"\n", "BUCKET_SENSITIVE = \"Sensitive populations\"\n", "BUCKET_ENVIRONMENTAL = \"Environmental effects\"\n", @@ -39,11 +41,22 @@ " BUCKET_EXPOSURES,\n", "]\n", "\n", + "# A few specific field names\n", + "# TODO: clean this up, I name some fields but not others.\n", + "UNEMPLOYED_FIELD_NAME = \"Unemployed civilians (percent)\"\n", + "LINGUISTIC_ISOLATION_FIELD_NAME = \"Linguistic isolation (percent)\"\n", + "HOUSING_BURDEN_FIELD_NAME = \"Housing burden (percent)\"\n", + "POVERTY_FIELD_NAME = \"Poverty (Less than 200% of federal poverty line)\"\n", + "HIGH_SCHOOL_FIELD_NAME = (\n", + " \"Percent individuals age 25 or over with less than high school degree\"\n", + ")\n", + "\n", "# There's another aggregation level (a second level of \"buckets\").\n", "AGGREGATION_POLLUTION = \"Pollution Burden\"\n", "AGGREGATION_POPULATION = \"Population Characteristics\"\n", "\n", "PERCENTILE_FIELD_SUFFIX = \" (percentile)\"\n", + "MIN_MAX_FIELD_SUFFIX = \" (min-max normalized)\"\n", "\n", "DATA_PATH = Path.cwd().parent / \"data\"\n", "SCORE_CSV_PATH = DATA_PATH / \"score\" / \"csv\"\n", @@ -102,6 +115,23 @@ "housing_and_transportation_df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9202e5d", + "metadata": {}, + "outputs": [], + "source": [ + "# Load HUD housing data\n", + "hud_housing_csv = DATA_PATH / \"dataset\" / \"hud_housing\" / \"usa.csv\"\n", + "hud_housing_df = pd.read_csv(\n", + " hud_housing_csv,\n", + " dtype={GEOID_TRACT_FIELD_NAME: \"string\"},\n", + " low_memory=False,\n", + ")\n", + "hud_housing_df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -110,15 +140,59 @@ "outputs": [], "source": [ "# Join all the data sources that use census block groups\n", - "dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n", + "census_block_group_dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n", "\n", - "df = functools.reduce(\n", + "census_block_group_df = functools.reduce(\n", " lambda left, right: pd.merge(\n", " left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n", " ),\n", - " dfs,\n", + " census_block_group_dfs,\n", ")\n", "\n", + "\n", + "if len(census_block_group_df) > 220333:\n", + " raise ValueError(\"Too many rows in the join.\")\n", + "\n", + "census_block_group_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e79ec27a", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check the join.\n", + "if len(census_block_group_df[GEOID_FIELD_NAME].str.len().unique()) != 1:\n", + " raise ValueError(\n", + " f\"One of the input CSVs uses {GEOID_FIELD_NAME} with a different length.\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d0d2915", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Join all the data sources that use census tracts\n", + "# TODO: when there's more than one data source using census tract, reduce/merge them here.\n", + "census_tract_df = hud_housing_df\n", + "\n", + "# Calculate the tract for the CBG data.\n", + "census_block_group_df[GEOID_TRACT_FIELD_NAME] = census_block_group_df[\n", + " GEOID_FIELD_NAME\n", + "].str[0:11]\n", + "\n", + "df = census_block_group_df.merge(census_tract_df, on=GEOID_TRACT_FIELD_NAME)\n", + "\n", + "if len(census_block_group_df) > 220333:\n", + " raise ValueError(\"Too many rows in the join.\")\n", + "\n", "df.head()" ] }, @@ -135,13 +209,18 @@ ")\n", "\n", "data_sets = [\n", - " # The following data sets have `bucket=None`, because it's not used in the score.\n", + " # The following data sets have `bucket=None`, because it's not used in the bucket based score (\"Score C\").\n", " DataSet(\n", " input_field=GEOID_FIELD_NAME,\n", " # Use the name `GEOID10` to enable geoplatform.gov's workflow.\n", " renamed_field=GEOID_FIELD_NAME,\n", " bucket=None,\n", " ),\n", + " DataSet(\n", + " input_field=HOUSING_BURDEN_FIELD_NAME,\n", + " renamed_field=HOUSING_BURDEN_FIELD_NAME,\n", + " bucket=None,\n", + " ),\n", " DataSet(input_field=\"ACSTOTPOP\", renamed_field=\"Total population\", bucket=None),\n", " # The following data sets have buckets, because they're used in the score\n", " DataSet(\n", @@ -206,24 +285,28 @@ " bucket=BUCKET_SENSITIVE,\n", " ),\n", " DataSet(\n", + " input_field=LINGUISTIC_ISOLATION_FIELD_NAME,\n", + " renamed_field=LINGUISTIC_ISOLATION_FIELD_NAME,\n", + " bucket=BUCKET_SENSITIVE,\n", + " ),\n", + " DataSet(\n", " input_field=\"LINGISOPCT\",\n", " renamed_field=\"Percent of households in linguistic isolation\",\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", " input_field=\"LOWINCPCT\",\n", - " renamed_field=\"Poverty (Less than 200% of federal poverty line)\",\n", + " renamed_field=POVERTY_FIELD_NAME,\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", " input_field=\"LESSHSPCT\",\n", - " renamed_field=\"Percent individuals age 25 or over with less than high school degree\",\n", + " renamed_field=HIGH_SCHOOL_FIELD_NAME,\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", - " input_field=\"Unemployed Civilians (fraction)\",\n", - " # Following EJSCREEN conventions, where fractional data is named as a percent.\n", - " renamed_field=\"Unemployed Civilians (percent)\",\n", + " input_field=UNEMPLOYED_FIELD_NAME,\n", + " renamed_field=UNEMPLOYED_FIELD_NAME,\n", " bucket=BUCKET_SOCIOECONOMIC,\n", " ),\n", " DataSet(\n", @@ -256,6 +339,21 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1280cbd4", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert all columns to numeric.\n", + "for data_set in data_sets:\n", + " # Skip GEOID_FIELD_NAME, because it's a string.\n", + " if data_set.renamed_field == GEOID_FIELD_NAME:\n", + " continue\n", + " df[f\"{data_set.renamed_field}\"] = pd.to_numeric(df[data_set.renamed_field])" + ] + }, { "cell_type": "code", "execution_count": null, @@ -274,6 +372,44 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2088013", + "metadata": {}, + "outputs": [], + "source": [ + "# calculate min max\n", + "# Math:\n", + "# (\n", + "# Observed value\n", + "# - minimum of all values\n", + "# )\n", + "# divided by\n", + "# (\n", + "# Maximum of all values\n", + "# - minimum of all values\n", + "# )\n", + "for data_set in data_sets:\n", + " # Skip GEOID_FIELD_NAME, because it's a string.\n", + " if data_set.renamed_field == GEOID_FIELD_NAME:\n", + " continue\n", + "\n", + " min_value = df[data_set.renamed_field].min(skipna=True)\n", + "\n", + " max_value = df[data_set.renamed_field].max(skipna=True)\n", + "\n", + " print(\n", + " f\"For data set {data_set.renamed_field}, the min value is {min_value} and the max value is {max_value}.\"\n", + " )\n", + "\n", + " df[f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"] = (\n", + " df[data_set.renamed_field] - min_value\n", + " ) / (max_value - min_value)\n", + "\n", + "df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -333,6 +469,35 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f70106f5", + "metadata": {}, + "outputs": [], + "source": [ + "fields_to_use_in_score = [\n", + " UNEMPLOYED_FIELD_NAME,\n", + " LINGUISTIC_ISOLATION_FIELD_NAME,\n", + " HOUSING_BURDEN_FIELD_NAME,\n", + " POVERTY_FIELD_NAME,\n", + " HIGH_SCHOOL_FIELD_NAME,\n", + "]\n", + "\n", + "fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n", + "fields_percentile = [\n", + " f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n", + "]\n", + "\n", + "# Calculate \"Score D\", which uses min-max normalization\n", + "# and calculate \"Score E\", which uses percentile normalization for the same fields\n", + "df[\"Score D\"] = df[fields_min_max].mean(axis=1)\n", + "df[\"Score E\"] = df[fields_percentile].mean(axis=1)\n", + "\n", + "print(df[\"Score D\"].describe())\n", + "print(df[\"Score E\"].describe())" + ] + }, { "cell_type": "code", "execution_count": null, @@ -343,7 +508,7 @@ "outputs": [], "source": [ "# Create percentiles for the scores\n", - "for score_field in [\"Score A\", \"Score B\", \"Score C\"]:\n", + "for score_field in [\"Score A\", \"Score B\", \"Score C\", \"Score D\", \"Score E\"]:\n", " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[score_field].rank(pct=True)\n", " df[f\"{score_field} (top 25th percentile)\"] = (\n", " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] >= 0.75\n", @@ -376,14 +541,6 @@ " # we need to name the file data01.csv for ogr2ogr csv merge to work\n", " df1.to_csv(SCORE_CSV_PATH / f\"data{states_fips}.csv\", index=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "167ebba3", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -402,7 +559,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.7.1" } }, "nbformat": 4, diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb index 49e66835..fc315009 100644 --- a/score/ipython/scoring_comparison.ipynb +++ b/score/ipython/scoring_comparison.ipynb @@ -15,7 +15,12 @@ "import pandas as pd\n", "from pathlib import Path\n", "import requests\n", - "import zipfile" + "import zipfile\n", + "from datetime import datetime\n", + "from tqdm.notebook import tqdm_notebook\n", + "\n", + "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", + "tqdm_notebook.pandas()" ] }, { @@ -50,8 +55,11 @@ "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", "\n", "# Comparison field names\n", + "any_tract_has_at_least_one_cbg = \"Tract has at least one CEJST CBG?\"\n", "tract_has_at_least_one_cbg = \"CES Tract has at least one CEJST CBG?\"\n", - "tract_has_100_percent_cbg = \"CES Tract has 100% CEJST CBGs?\"" + "tract_has_100_percent_cbg = \"CES Tract has 100% CEJST CBGs?\"\n", + "non_ces_tract_has_at_least_one_cbg = \"Non-CES Tract has at least one CEJST CBG?\"\n", + "non_ces_tract_has_100_percent_cbg = \"Non-CES Tract has 100% CEJST CBGs?\"" ] }, { @@ -69,12 +77,15 @@ "cejst_df.head()\n", "\n", "# Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n", + "\n", + "score_used = \"Score A\"\n", + "\n", "cejst_df.rename(\n", " columns={\n", " \"GEOID10\": CENSUS_BLOCK_GROUP_ID_FIELD,\n", " \"Total population\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", - " \"Score C\": CEJST_SCORE_FIELD,\n", - " \"Score C (percentile)\": CEJST_PERCENTILE_FIELD,\n", + " score_used: CEJST_SCORE_FIELD,\n", + " f\"{score_used} (percentile)\": CEJST_PERCENTILE_FIELD,\n", " },\n", " inplace=True,\n", " errors=\"raise\",\n", @@ -239,18 +250,42 @@ " # Recall that NaN values are not falsy, so we need to check if `is_a_ces_priority_tract` is True.\n", " is_a_ces_priority_tract = is_a_ces_priority_tract is True\n", "\n", + " # Calculate whether the tract (whether or not it is a CES priority tract) includes CBGs that are priority\n", + " # according to the current CEJST score.\n", + " df[any_tract_has_at_least_one_cbg] = (\n", + " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", + " )\n", + "\n", " # Calculate comparison\n", + " # A CES priority tract has at least one CEJST priority CBG.\n", " df[tract_has_at_least_one_cbg] = (\n", " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", " if is_a_ces_priority_tract\n", " else None\n", " )\n", + "\n", + " # A CES priority tract has all of its contained CBGs as CEJST priority CBGs.\n", " df[tract_has_100_percent_cbg] = (\n", " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n", " if is_a_ces_priority_tract\n", " else None\n", " )\n", "\n", + " # Calculate the inverse\n", + " # A tract that is _not_ a CES priority has at least one CEJST priority CBG.\n", + " df[non_ces_tract_has_at_least_one_cbg] = (\n", + " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", + " if not is_a_ces_priority_tract\n", + " else None\n", + " )\n", + "\n", + " # A tract that is _not_ a CES priority has all of its contained CBGs as CEJST priority CBGs.\n", + " df[non_ces_tract_has_100_percent_cbg] = (\n", + " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n", + " if not is_a_ces_priority_tract\n", + " else None\n", + " )\n", + "\n", " return df\n", "\n", "\n", @@ -258,7 +293,7 @@ "grouped_df = merged_df.groupby(CENSUS_TRACT_ID_FIELD)\n", "\n", "# Run the comparison function on the groups.\n", - "comparison_df = grouped_df.apply(calculate_comparison)\n", + "comparison_df = grouped_df.progress_apply(calculate_comparison)\n", "\n", "# Sort descending by highest CES Score for convenience when viewing output file\n", "comparison_df.sort_values(\n", @@ -283,15 +318,37 @@ "outputs": [], "source": [ "# Prepare some constants for use in the following Markdown cell.\n", - "\n", + "total_cbgs_ca_only = len(cejst_df)\n", "cejst_cbgs_ca_only = cejst_df.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum()\n", + "cejst_cbgs_ca_only_percent = f\"{cejst_cbgs_ca_only / total_cbgs_ca_only:.0%}\"\n", + "\n", + "total_tracts_count = len(comparison_df)\n", "ces_tracts_count = comparison_df.loc[:, CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].sum()\n", + "ces_tracts_count_percent = f\"{ces_tracts_count / total_tracts_count:.0%}\"\n", + "non_ces_tracts_count = total_tracts_count - ces_tracts_count\n", + "\n", + "total_tracts_count = len(comparison_df[CENSUS_TRACT_ID_FIELD])\n", + "cejst_tracts_count = comparison_df.loc[:, any_tract_has_at_least_one_cbg].sum()\n", + "cejst_tracts_count_percent = f\"{cejst_tracts_count / total_tracts_count:.0%}\"\n", + "\n", + "# CES stats\n", "at_least_one_sum = comparison_df.loc[:, tract_has_at_least_one_cbg].sum()\n", "at_least_one_sum_percent = f\"{at_least_one_sum / ces_tracts_count:.0%}\"\n", "\n", "all_100_sum = comparison_df.loc[:, tract_has_100_percent_cbg].sum()\n", "all_100_sum_percent = f\"{all_100_sum / ces_tracts_count:.0%}\"\n", "\n", + "# Non-CES stats:\n", + "non_ces_at_least_one_sum = comparison_df.loc[\n", + " :, non_ces_tract_has_at_least_one_cbg\n", + "].sum()\n", + "non_ces_at_least_one_sum_percent = (\n", + " f\"{non_ces_at_least_one_sum / non_ces_tracts_count:.0%}\"\n", + ")\n", + "\n", + "non_ces_all_100_sum = comparison_df.loc[:, non_ces_tract_has_100_percent_cbg].sum()\n", + "non_ces_all_100_sum_percent = f\"{non_ces_all_100_sum / non_ces_tracts_count:.0%}\"\n", + "\n", "# Note, for the following Markdown cell to render the variables properly, follow the steps at\n", "# \"Activating variable-enabled Markdown for Jupyter notebooks\" within `score/README.md`." ] @@ -301,26 +358,44 @@ "id": "0c534966", "metadata": { "variables": { - "all_100_sum": "1373", - "all_100_sum_percent": "69%", - "at_least_one_sum": "1866", - "at_least_one_sum_percent": "94%", - "cejst_cbgs_ca_only": "10849", - "ces_tracts_count": "1983" + " total_tracts_count": "8057", + "all_100_sum": "1168", + "all_100_sum_percent": "59%", + "at_least_one_sum": "1817", + "at_least_one_sum_percent": "92%", + "cejst_cbgs_ca_only": "6987", + "cejst_cbgs_ca_only_percent": "30%", + "cejst_tracts_count": "3516", + "cejst_tracts_count_percent": "44%", + "ces_tracts_count": "1983", + "ces_tracts_count_percent": "25%", + "datetime.today().strftime('%Y-%m-%d')": "2021-06-28", + "non_ces_all_100_sum": "438", + "non_ces_all_100_sum_percent": "7%", + "non_ces_at_least_one_sum": "1699", + "non_ces_at_least_one_sum_percent": "28%", + "score_used": "Score A", + "total_cbgs_ca_only": "23212" } }, "source": [ - "# Summary of findings\n", + "# Summary of findings for {{score_used}}\n", + "\n", + "(Calculated on {{datetime.today().strftime('%Y-%m-%d')}})\n", "\n", "Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n", "\n", - "There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0. \n", + "There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0, out of {{total_tracts_count}} total tracts ({{ces_tracts_count_percent}}). \n", "\n", - "Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis.\n", + "Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis, out of {{total_cbgs_ca_only}} CBGs in the state ({{cejst_cbgs_ca_only_percent}}). They occupy {{cejst_tracts_count}} ({{cejst_tracts_count_percent}}) of all the census tracts in California.\n", "\n", "Out of every CalEnviroScreen Disadvantaged Community census tract, {{at_least_one_sum}} ({{at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", "\n", - "Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score." + "Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score.\n", + "\n", + "Out of every census tract in California that is __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_at_least_one_sum}} ({{non_ces_at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", + "\n", + "Out of every census tract in California that is __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_all_100_sum}} ({{non_ces_all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score." ] } ],