Issue 1141: Definition M (#1151)

2025-07-28 07:21:18 -07:00 · 2022-01-18 14:56:55 -05:00 · 2022-01-18 14:56:55 -05:00 · 18f299c5f8
commit 18f299c5f8
parent a07bf752b0
21 changed files with 1000 additions and 143 deletions
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@ -21,6 +21,7 @@
    "import requests\n",
    "import string\n",
    "import sys\n",
+    "import time\n",
    "import typing\n",
    "import us\n",
    "import zipfile\n",
@ -61,7 +62,10 @@
    "# Set some global parameters\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
-    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
+    "\n",
+    "time_str = time.strftime(\"%Y%m%d-%H%M%S\")\n",
+    "\n",
+    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\" / time_str\n",
    "\n",
    "# Make the dirs if they don't exist\n",
    "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
@ -109,7 +113,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "a251a0fb",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Load EJSCREEN Areas of Concern data.\n",
@ -143,7 +149,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "e43a9e23",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Merge EJSCREEN AoCs into CEJST data.\n",
@ -173,10 +181,13 @@
   "source": [
    "# Analyze one field at a time (useful for setting thresholds)\n",
    "\n",
-    "quantile = 0.9\n",
+    "quantile = 0.95\n",
    "\n",
    "for field in [\n",
-    "    field_names.MEDIAN_HOUSE_VALUE_FIELD,\n",
+    "    field_names.COLLEGE_ATTENDANCE_FIELD,\n",
+    "    field_names.HIGH_SCHOOL_ED_FIELD,\n",
+    "    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,\n",
+    "    field_names.POVERTY_LESS_THAN_200_FPL_FIELD,\n",
    "]:\n",
    "    print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
    "    print(cejst_df[field].describe())\n",
@ -223,7 +234,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "d8ec43dc",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Load persistent poverty data\n",
@ -256,7 +269,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "81826d29",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Load mapping inequality data\n",
@ -314,7 +329,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "605af1ff",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Load alternative energy-related definition\n",
@ -333,7 +350,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "fe4a2939",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Load Michigan EJSCREEN\n",
@ -356,15 +375,13 @@
   "outputs": [],
   "source": [
    "# Load EPA RSEI EJSCREEN\n",
-    "epa_rsei_aggregate_data_path = (\n",
-    "    DATA_DIR / \"dataset\" / \"epa_rsei_aggregated\" / \"usa.csv\"\n",
-    ")\n",
-    "epa_rsei_aggregate_df = pd.read_csv(\n",
-    "    epa_rsei_aggregate_data_path,\n",
+    "epa_rsei_data_path = DATA_DIR / \"dataset\" / \"epa_rsei\" / \"usa.csv\"\n",
+    "epa_rsei_df = pd.read_csv(\n",
+    "    epa_rsei_data_path,\n",
    "    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
    ")\n",
    "\n",
-    "epa_rsei_aggregate_df.head()"
+    "epa_rsei_df.head()"
   ]
  },
  {
@ -382,7 +399,7 @@
    "    calenviroscreen_df,\n",
    "    persistent_poverty_df,\n",
    "    mapping_inequality_df,\n",
-    "    epa_rsei_aggregate_df,\n",
+    "    epa_rsei_df,\n",
    "    maryland_ejscreen_df,\n",
    "    energy_definition_alternative_draft_df,\n",
    "    michigan_ejscreen_df,\n",
@ -416,7 +433,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "2de78f71",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Special handling for HOLC.\n",
@ -461,13 +480,41 @@
    "    field_names.L_NON_WORKFORCE,\n",
    "]\n",
    "\n",
+    "definition_m_factors = [\n",
+    "    field_names.M_CLIMATE,\n",
+    "    field_names.M_ENERGY,\n",
+    "    field_names.M_TRANSPORTATION,\n",
+    "    field_names.M_HOUSING,\n",
+    "    field_names.M_POLLUTION,\n",
+    "    field_names.M_WATER,\n",
+    "    field_names.M_HEALTH,\n",
+    "    field_names.M_WORKFORCE,\n",
+    "    # Also include a combined factor for all the non-workforce elements.\n",
+    "    field_names.M_NON_WORKFORCE,\n",
+    "]\n",
+    "\n",
    "census_tract_indices = (\n",
    "    [\n",
    "        Index(\n",
+    "            method_name=\"Definition M\",\n",
+    "            priority_communities_field=field_names.SCORE_M_COMMUNITIES,\n",
+    "        ),\n",
+    "    ]\n",
+    "    + [\n",
+    "        Index(\n",
    "            method_name=\"Definition L\",\n",
    "            priority_communities_field=field_names.SCORE_L_COMMUNITIES,\n",
    "        ),\n",
    "    ]\n",
+    "    # Insert indices for each of the factors from Definition M.\n",
+    "    # Note: since these involve no renaming, we write them using list comprehension.\n",
+    "    + [\n",
+    "        Index(\n",
+    "            method_name=factor,\n",
+    "            priority_communities_field=factor,\n",
+    "        )\n",
+    "        for factor in definition_m_factors\n",
+    "    ]\n",
    "    # Insert indices for each of the factors from Definition L.\n",
    "    # Note: since these involve no renaming, we write them using list comprehension.\n",
    "    + [\n",
@ -575,6 +622,7 @@
    "comparison_fields = [\n",
    "    field_names.POVERTY_LESS_THAN_100_FPL_FIELD,\n",
    "    field_names.POVERTY_LESS_THAN_200_FPL_FIELD,\n",
+    "    field_names.COLLEGE_ATTENDANCE_FIELD,\n",
    "    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,\n",
    "    field_names.LINGUISTIC_ISO_FIELD,\n",
    "    field_names.UNEMPLOYMENT_FIELD,\n",
@ -584,6 +632,8 @@
    "    field_names.LIFE_EXPECTANCY_FIELD,\n",
    "    field_names.HEALTH_INSURANCE_FIELD,\n",
    "    field_names.PHYS_HEALTH_NOT_GOOD_FIELD,\n",
+    "    field_names.DIABETES_FIELD,\n",
+    "    field_names.LOW_READING_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,\n",
    "]"
   ]
  },
@ -874,7 +924,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "2bcbcabf",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "directory = COMPARISON_OUTPUTS_DIR / \"tracts_basic_stats\"\n",
@ -1001,24 +1053,28 @@
    "    E.g., it might show that tracts prioritized by A but not B have a higher average income,\n",
    "    or that tracts prioritized by B but not A have a lower percent of unemployed people.\n",
    "    \"\"\"\n",
-    "    df_subset = df[\n",
-    "        [\n",
-    "            method_a_priority_census_tracts_field,\n",
-    "            method_b_priority_census_tracts_field,\n",
-    "        ]\n",
-    "        + comparison_fields\n",
+    "    fields_to_group_by = [\n",
+    "        method_a_priority_census_tracts_field,\n",
+    "        method_b_priority_census_tracts_field,\n",
    "    ]\n",
    "\n",
+    "    df_subset = df[fields_to_group_by + comparison_fields]\n",
+    "\n",
    "    grouped_df = df_subset.groupby(\n",
-    "        [\n",
-    "            method_a_priority_census_tracts_field,\n",
-    "            method_b_priority_census_tracts_field,\n",
-    "        ],\n",
+    "        fields_to_group_by,\n",
    "        dropna=False,\n",
    "    )\n",
    "\n",
-    "    # Run the comparison function on the groups.\n",
-    "    comparison_df = grouped_df.mean().reset_index()\n",
+    "    # Take the mean of all fields.\n",
+    "    comparison_df = grouped_df.mean()\n",
+    "\n",
+    "    # Also add in the count of census tracts.\n",
+    "    count_field_name = \"Count of census tracts\"\n",
+    "    comparison_df[count_field_name] = grouped_df.size().to_frame(\n",
+    "        count_field_name\n",
+    "    )\n",
+    "\n",
+    "    comparison_df = comparison_df.reset_index()\n",
    "\n",
    "    criteria_description_field_name = \"Description of criteria\"\n",
    "    comparison_df[criteria_description_field_name] = comparison_df.apply(\n",
@ -1030,10 +1086,13 @@
    "    )\n",
    "\n",
    "    # Put criteria description column first.\n",
-    "    new_column_order = [criteria_description_field_name] + [\n",
-    "        col\n",
-    "        for col in comparison_df.columns\n",
-    "        if col != criteria_description_field_name\n",
+    "    columns_to_put_first = (\n",
+    "        [criteria_description_field_name]\n",
+    "        + fields_to_group_by\n",
+    "        + [count_field_name]\n",
+    "    )\n",
+    "    new_column_order = columns_to_put_first + [\n",
+    "        col for col in comparison_df.columns if col not in columns_to_put_first\n",
    "    ]\n",
    "\n",
    "    comparison_df = comparison_df[new_column_order]\n",
@ -1356,7 +1415,9 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "7d095ebd",
-   "metadata": {},
+   "metadata": {
+    "scrolled": false
+   },
   "outputs": [],
   "source": [
    "# Note: this is helpful because this file is long-running, so it alerts the user when the\n",
@ -1369,7 +1430,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -1383,7 +1444,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,