From 495b03149e73d813ebb0f2e0aedc73cb6816e31d Mon Sep 17 00:00:00 2001
From: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Date: Thu, 23 Sep 2021 16:11:07 -0500
Subject: [PATCH] Add life expectancy and health fields to scoring comparison

---
 .../ipython/scoring_comparison.ipynb          | 173 +++++++-----------
 1 file changed, 68 insertions(+), 105 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index bda9aa3d..0b9721b0 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -72,6 +72,11 @@
     "COUNTRY_FIELD_NAME = \"Country\"\n",
     "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
     "URBAN_HEURISTIC_FIELD = \"Urban Heuristic Flag\"\n",
+    "LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n",
+    "HEALTH_INSURANCE_FIELD = (\n",
+    "    \"Current lack of health insurance among adults aged 18-64 years\"\n",
+    ")\n",
+    "BAD_HEALTH_FIELD = \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
     "\n",
     "CEJST_SCORE_FIELD = \"cejst_score\"\n",
     "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
@@ -118,12 +123,11 @@
    "outputs": [],
    "source": [
     "# Analyze one field at a time (useful for setting thresholds)\n",
-    "\n",
     "quantile = 0.8\n",
     "\n",
     "for field in [\n",
     "    \"Percent of individuals < 200% Federal Poverty Line\",\n",
-    "    \"Life expectancy (years)\",\n",
+    "    LIFE_EXPECTANCY_FIELD,\n",
     "    \"Energy burden\",\n",
     "    URBAN_HEURISTIC_FIELD,\n",
     "]:\n",
@@ -152,17 +156,15 @@
     "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
     "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
     "\n",
-    "calenviroscreen_data_path = (\n",
-    "    DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
-    ")\n",
+    "calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
     "calenviroscreen_df = pd.read_csv(\n",
     "    calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
     ")\n",
     "\n",
     "# Convert priority community field to a bool.\n",
-    "calenviroscreen_df[\n",
+    "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
     "    CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
-    "] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
+    "].astype(bool)\n",
     "\n",
     "calenviroscreen_df.head()"
    ]
@@ -175,9 +177,7 @@
    "outputs": [],
    "source": [
     "# Load persistent poverty data\n",
-    "persistent_poverty_path = (\n",
-    "    DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
-    ")\n",
+    "persistent_poverty_path = DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
     "persistent_poverty_df = pd.read_csv(\n",
     "    persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
     ")\n",
@@ -189,9 +189,7 @@
     "PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
     "\n",
     "persistent_poverty_df.rename(\n",
-    "    columns={\n",
-    "        PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
-    "    },\n",
+    "    columns={PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD},\n",
     "    inplace=True,\n",
     "    errors=\"raise\",\n",
     ")\n",
@@ -282,6 +280,21 @@
     "# (`census_tract_indices`).\n",
     "census_block_group_indices = [\n",
     "    Index(\n",
+    "        method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
+    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
+    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
+    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
     "        method_name=\"Score G\",\n",
     "        priority_communities_field=\"Score G (communities)\",\n",
     "        other_census_tract_fields_to_keep=[],\n",
@@ -336,21 +349,6 @@
     "        priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
-    "    Index(\n",
-    "        method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
-    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
-    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
-    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
     "]\n",
     "\n",
     "census_tract_indices = [\n",
@@ -388,8 +386,7 @@
     "    for priority_communities_field in priority_communities_fields:\n",
     "        # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n",
     "        df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
-    "            df[priority_communities_field]\n",
-    "            * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
+    "            df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
     "        )\n",
     "\n",
     "    def calculate_state_comparison(\n",
@@ -428,9 +425,7 @@
     "            summary_dict[\"Geography name\"] = division_id\n",
     "\n",
     "        total_cbgs_in_geography = len(frame)\n",
-    "        total_population_in_geography = frame[\n",
-    "            CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
-    "        ].sum()\n",
+    "        total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
     "\n",
     "        if geography_field == URBAN_HEURISTIC_FIELD:\n",
     "            urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",
@@ -438,9 +433,9 @@
     "            summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
     "\n",
     "        for priority_communities_field in priority_communities_fields:\n",
-    "            summary_dict[\n",
+    "            summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
     "                f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
-    "            ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
+    "            ].sum()\n",
     "\n",
     "            summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n",
     "                f\"{priority_communities_field}\"\n",
@@ -452,9 +447,7 @@
     "                / total_cbgs_in_geography\n",
     "            )\n",
     "\n",
-    "            summary_dict[\n",
-    "                f\"{priority_communities_field} (percent population)\"\n",
-    "            ] = (\n",
+    "            summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
     "                summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
     "                / total_population_in_geography\n",
     "            )\n",
@@ -500,9 +493,7 @@
     "\n",
     "    # Run the comparison function on the groups.\n",
     "    region_distribution_df = region_grouped_df.progress_apply(\n",
-    "        lambda frame: calculate_state_comparison(\n",
-    "            frame, geography_field=\"region\"\n",
-    "        )\n",
+    "        lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
     "    )\n",
     "\n",
     "    # Next, run the comparison by division\n",
@@ -510,9 +501,7 @@
     "\n",
     "    # Run the comparison function on the groups.\n",
     "    division_distribution_df = division_grouped_df.progress_apply(\n",
-    "        lambda frame: calculate_state_comparison(\n",
-    "            frame, geography_field=\"division\"\n",
-    "        )\n",
+    "        lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
     "    )\n",
     "\n",
     "    # Next, run the comparison by urban/rural\n",
@@ -567,9 +556,7 @@
     "        column_character = get_excel_column_name(column_index)\n",
     "\n",
     "        # Set all columns to larger width\n",
-    "        worksheet.set_column(\n",
-    "            f\"{column_character}:{column_character}\", column_width\n",
-    "        )\n",
+    "        worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
     "\n",
     "        # Special formatting for all percent columns\n",
     "        # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
@@ -584,7 +571,9 @@
     "\n",
     "        # Special formatting for columns that capture the percent of population considered priority.\n",
     "        if \"(percent population)\" in column:\n",
-    "            column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
+    "            column_ranges = (\n",
+    "                f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
+    "            )\n",
     "\n",
     "            # Add green to red conditional formatting.\n",
     "            worksheet.conditional_format(\n",
@@ -616,7 +605,7 @@
     "]\n",
     "\n",
     "# Convert all indices to boolean\n",
-    "for field_to_analyze in fields_to_analyze: \n",
+    "for field_to_analyze in fields_to_analyze:\n",
     "    if \"Areas of Concern\" in field_to_analyze:\n",
     "        print(f\"Converting {field_to_analyze} to boolean.\")\n",
     "\n",
@@ -705,9 +694,7 @@
     "\n",
     "    # Put criteria description column first.\n",
     "    new_column_order = [criteria_description_field_name] + [\n",
-    "        col\n",
-    "        for col in comparison_df.columns\n",
-    "        if col != criteria_description_field_name\n",
+    "        col for col in comparison_df.columns if col != criteria_description_field_name\n",
     "    ]\n",
     "\n",
     "    comparison_df = comparison_df[new_column_order]\n",
@@ -753,12 +740,12 @@
     "        column_character = get_excel_column_name(column_index)\n",
     "\n",
     "        # Set all columns to larger width\n",
-    "        worksheet.set_column(\n",
-    "            f\"{column_character}:{column_character}\", column_width\n",
-    "        )\n",
+    "        worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
     "\n",
     "        # Add green to red conditional formatting.\n",
-    "        column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
+    "        column_ranges = (\n",
+    "            f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
+    "        )\n",
     "        worksheet.conditional_format(\n",
     "            column_ranges,\n",
     "            # Min: green, max: red.\n",
@@ -771,11 +758,7 @@
     "\n",
     "        # Special formatting for all percent columns\n",
     "        # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
-    "        if (\n",
-    "            \"percent \" in column\n",
-    "            or \"(percent)\" in column\n",
-    "            or \"Percent \" in column\n",
-    "        ):\n",
+    "        if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
     "            # Make these columns percentages.\n",
     "            percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
     "            worksheet.set_column(\n",
@@ -813,7 +796,9 @@
     "    )\n",
     "\n",
     "    # Write secondary comparison to CSV.\n",
-    "    file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
+    "    file_name_part = (\n",
+    "        f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
+    "    )\n",
     "    output_dir.mkdir(parents=True, exist_ok=True)\n",
     "    file_path = output_dir / (file_name_part + \".csv\")\n",
     "    file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n",
@@ -836,10 +821,12 @@
     "    \"Median household income (% of AMI)\",\n",
     "    \"Percent of households in linguistic isolation\",\n",
     "    \"Percent individuals age 25 or over with less than high school degree\",\n",
-    "    \"Linguistic isolation (percent)\",\n",
     "    \"Unemployed civilians (percent)\",\n",
     "    \"Median household income in the past 12 months\",\n",
     "    URBAN_HEURISTIC_FIELD,\n",
+    "    LIFE_EXPECTANCY_FIELD,\n",
+    "    HEALTH_INSURANCE_FIELD,\n",
+    "    BAD_HEALTH_FIELD,\n",
     "]\n",
     "\n",
     "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",
@@ -897,9 +884,7 @@
     "\n",
     "    # List of all states/territories in their FIPS codes:\n",
     "    state_ids = sorted(df[state_field].unique())\n",
-    "    state_names = \", \".join(\n",
-    "        [us.states.lookup(state_id).name for state_id in state_ids]\n",
-    "    )\n",
+    "    state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
     "\n",
     "    # Create markdown content for comparisons.\n",
     "    markdown_content = f\"\"\"\n",
@@ -913,9 +898,7 @@
     "\n",
     "\"\"\"\n",
     "\n",
-    "    for (index1, index2) in itertools.combinations(\n",
-    "        census_block_group_indices, 2\n",
-    "    ):\n",
+    "    for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n",
     "        # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n",
     "        count_df = (\n",
     "            df.groupby(\n",
@@ -954,24 +937,16 @@
     "\n",
     "        # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n",
     "        true_true_cbgs = (\n",
-    "            true_true_cbgs_series.iloc[0]\n",
-    "            if len(true_true_cbgs_series) > 0\n",
-    "            else 0\n",
+    "            true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n",
     "        )\n",
     "        true_false_cbgs = (\n",
-    "            true_false_cbgs_series.iloc[0]\n",
-    "            if len(true_false_cbgs_series) > 0\n",
-    "            else 0\n",
+    "            true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n",
     "        )\n",
     "        false_true_cbgs = (\n",
-    "            false_true_cbgs_series.iloc[0]\n",
-    "            if len(false_true_cbgs_series) > 0\n",
-    "            else 0\n",
+    "            false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n",
     "        )\n",
     "        false_false_cbgs = (\n",
-    "            false_false_cbgs_series.iloc[0]\n",
-    "            if len(false_false_cbgs_series) > 0\n",
-    "            else 0\n",
+    "            false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n",
     "        )\n",
     "\n",
     "        markdown_content += (\n",
@@ -1163,20 +1138,15 @@
     "\n",
     "        # Calculate comparison\n",
     "        # A comparison priority tract has at least one CBG that is a priority CBG.\n",
-    "        df[\n",
-    "            comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n",
-    "        ] = (\n",
+    "        df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n",
     "            frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
     "            if is_a_method_b_priority_tract\n",
     "            else None\n",
     "        )\n",
     "\n",
     "        # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n",
-    "        df[\n",
-    "            comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n",
-    "        ] = (\n",
-    "            frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
-    "            == 1\n",
+    "        df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n",
+    "            frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
     "            if is_a_method_b_priority_tract\n",
     "            else None\n",
     "        )\n",
@@ -1195,8 +1165,7 @@
     "        df[\n",
     "            comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n",
     "        ] = (\n",
-    "            frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
-    "            == 1\n",
+    "            frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
     "            if not is_a_method_b_priority_tract\n",
     "            else None\n",
     "        )\n",
@@ -1208,6 +1177,9 @@
     "            \"Percent of households in linguistic isolation\",\n",
     "            \"Percent individuals age 25 or over with less than high school degree\",\n",
     "            \"Unemployed civilians (percent)\",\n",
+    "            LIFE_EXPECTANCY_FIELD,\n",
+    "            HEALTH_INSURANCE_FIELD,\n",
+    "            BAD_HEALTH_FIELD,\n",
     "        ]:\n",
     "            df[f\"{field} (average of CBGs)\"] = frame.loc[:, field].mean()\n",
     "\n",
@@ -1237,20 +1209,14 @@
     "\n",
     "    # List of all states/territories in their FIPS codes:\n",
     "    state_ids = sorted(original_df[state_field].unique())\n",
-    "    state_names = \", \".join(\n",
-    "        [us.states.lookup(state_id).name for state_id in state_ids]\n",
-    "    )\n",
+    "    state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
     "\n",
     "    # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n",
     "    # TODO: investigate why sums are sometimes series and sometimes scalar.\n",
     "    method_a_priority_cbgs = (\n",
-    "        original_df.loc[:, method_a_priority_census_block_groups_field]\n",
-    "        .sum()\n",
-    "        .squeeze()\n",
-    "    )\n",
-    "    method_a_priority_cbgs_percent = (\n",
-    "        f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
+    "        original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n",
     "    )\n",
+    "    method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
     "\n",
     "    total_tracts_count = len(comparison_df)\n",
     "\n",
@@ -1272,9 +1238,7 @@
     "        .sum()\n",
     "        .squeeze()\n",
     "    )\n",
-    "    method_a_tracts_count_percent = (\n",
-    "        f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
-    "    )\n",
+    "    method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
     "\n",
     "    # Method A priority community stats\n",
     "    method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
@@ -1405,8 +1369,7 @@
     "\n",
     "    # Write comparison to CSV.\n",
     "    file_path = (\n",
-    "        output_dir\n",
-    "        / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
+    "        output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
     "    )\n",
     "    comparison_df.to_csv(\n",
     "        path_or_buf=file_path,\n",