From 495b03149e73d813ebb0f2e0aedc73cb6816e31d Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Thu, 23 Sep 2021 16:11:07 -0500 Subject: [PATCH] adding new fields to comparison --- .../ipython/scoring_comparison.ipynb | 173 +++++++----------- 1 file changed, 68 insertions(+), 105 deletions(-) diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb index bda9aa3d..0b9721b0 100644 --- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb +++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb @@ -72,6 +72,11 @@ "COUNTRY_FIELD_NAME = \"Country\"\n", "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", "URBAN_HEURISTIC_FIELD = \"Urban Heuristic Flag\"\n", + "LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n", + "HEALTH_INSURANCE_FIELD = (\n", + " \"Current lack of health insurance among adults aged 18-64 years\"\n", + ")\n", + "BAD_HEALTH_FIELD = \"Physical health not good for >=14 days among adults aged >=18 years\"\n", "\n", "CEJST_SCORE_FIELD = \"cejst_score\"\n", "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", @@ -118,12 +123,11 @@ "outputs": [], "source": [ "# Analyze one field at a time (useful for setting thresholds)\n", - "\n", "quantile = 0.8\n", "\n", "for field in [\n", " \"Percent of individuals < 200% Federal Poverty Line\",\n", - " \"Life expectancy (years)\",\n", + " LIFE_EXPECTANCY_FIELD,\n", " \"Energy burden\",\n", " URBAN_HEURISTIC_FIELD,\n", "]:\n", @@ -152,17 +156,15 @@ "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n", "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n", "\n", - "calenviroscreen_data_path = (\n", - " DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n", - ")\n", + "calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n", "calenviroscreen_df = pd.read_csv(\n", " calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", ")\n", "\n", "# Convert priority community field to a bool.\n", - "calenviroscreen_df[\n", + "calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n", " CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n", - "] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n", + "].astype(bool)\n", "\n", "calenviroscreen_df.head()" ] @@ -175,9 +177,7 @@ "outputs": [], "source": [ "# Load persistent poverty data\n", - "persistent_poverty_path = (\n", - " DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n", - ")\n", + "persistent_poverty_path = DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n", "persistent_poverty_df = pd.read_csv(\n", " persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n", ")\n", @@ -189,9 +189,7 @@ "PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n", "\n", "persistent_poverty_df.rename(\n", - " columns={\n", - " PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n", - " },\n", + " columns={PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD},\n", " inplace=True,\n", " errors=\"raise\",\n", ")\n", @@ -282,6 +280,21 @@ "# (`census_tract_indices`).\n", "census_block_group_indices = [\n", " Index(\n", + " method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n", + " priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n", + " priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n", + " priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", " method_name=\"Score G\",\n", " priority_communities_field=\"Score G (communities)\",\n", " other_census_tract_fields_to_keep=[],\n", @@ -336,21 +349,6 @@ " priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n", " other_census_tract_fields_to_keep=[],\n", " ),\n", - " Index(\n", - " method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n", - " priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n", - " priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n", - " priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", "]\n", "\n", "census_tract_indices = [\n", @@ -388,8 +386,7 @@ " for priority_communities_field in priority_communities_fields:\n", " # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n", " df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n", - " df[priority_communities_field]\n", - " * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n", + " df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n", " )\n", "\n", " def calculate_state_comparison(\n", @@ -428,9 +425,7 @@ " summary_dict[\"Geography name\"] = division_id\n", "\n", " total_cbgs_in_geography = len(frame)\n", - " total_population_in_geography = frame[\n", - " CENSUS_BLOCK_GROUP_POPULATION_FIELD\n", - " ].sum()\n", + " total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n", "\n", " if geography_field == URBAN_HEURISTIC_FIELD:\n", " urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n", @@ -438,9 +433,9 @@ " summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n", "\n", " for priority_communities_field in priority_communities_fields:\n", - " summary_dict[\n", + " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n", " f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n", - " ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n", + " ].sum()\n", "\n", " summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n", " f\"{priority_communities_field}\"\n", @@ -452,9 +447,7 @@ " / total_cbgs_in_geography\n", " )\n", "\n", - " summary_dict[\n", - " f\"{priority_communities_field} (percent population)\"\n", - " ] = (\n", + " summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n", " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n", " / total_population_in_geography\n", " )\n", @@ -500,9 +493,7 @@ "\n", " # Run the comparison function on the groups.\n", " region_distribution_df = region_grouped_df.progress_apply(\n", - " lambda frame: calculate_state_comparison(\n", - " frame, geography_field=\"region\"\n", - " )\n", + " lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n", " )\n", "\n", " # Next, run the comparison by division\n", @@ -510,9 +501,7 @@ "\n", " # Run the comparison function on the groups.\n", " division_distribution_df = division_grouped_df.progress_apply(\n", - " lambda frame: calculate_state_comparison(\n", - " frame, geography_field=\"division\"\n", - " )\n", + " lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n", " )\n", "\n", " # Next, run the comparison by urban/rural\n", @@ -567,9 +556,7 @@ " column_character = get_excel_column_name(column_index)\n", "\n", " # Set all columns to larger width\n", - " worksheet.set_column(\n", - " f\"{column_character}:{column_character}\", column_width\n", - " )\n", + " worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n", "\n", " # Special formatting for all percent columns\n", " # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n", @@ -584,7 +571,9 @@ "\n", " # Special formatting for columns that capture the percent of population considered priority.\n", " if \"(percent population)\" in column:\n", - " column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n", + " column_ranges = (\n", + " f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n", + " )\n", "\n", " # Add green to red conditional formatting.\n", " worksheet.conditional_format(\n", @@ -616,7 +605,7 @@ "]\n", "\n", "# Convert all indices to boolean\n", - "for field_to_analyze in fields_to_analyze: \n", + "for field_to_analyze in fields_to_analyze:\n", " if \"Areas of Concern\" in field_to_analyze:\n", " print(f\"Converting {field_to_analyze} to boolean.\")\n", "\n", @@ -705,9 +694,7 @@ "\n", " # Put criteria description column first.\n", " new_column_order = [criteria_description_field_name] + [\n", - " col\n", - " for col in comparison_df.columns\n", - " if col != criteria_description_field_name\n", + " col for col in comparison_df.columns if col != criteria_description_field_name\n", " ]\n", "\n", " comparison_df = comparison_df[new_column_order]\n", @@ -753,12 +740,12 @@ " column_character = get_excel_column_name(column_index)\n", "\n", " # Set all columns to larger width\n", - " worksheet.set_column(\n", - " f\"{column_character}:{column_character}\", column_width\n", - " )\n", + " worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n", "\n", " # Add green to red conditional formatting.\n", - " column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n", + " column_ranges = (\n", + " f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n", + " )\n", " worksheet.conditional_format(\n", " column_ranges,\n", " # Min: green, max: red.\n", @@ -771,11 +758,7 @@ "\n", " # Special formatting for all percent columns\n", " # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n", - " if (\n", - " \"percent \" in column\n", - " or \"(percent)\" in column\n", - " or \"Percent \" in column\n", - " ):\n", + " if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n", " # Make these columns percentages.\n", " percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n", " worksheet.set_column(\n", @@ -813,7 +796,9 @@ " )\n", "\n", " # Write secondary comparison to CSV.\n", - " file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n", + " file_name_part = (\n", + " f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n", + " )\n", " output_dir.mkdir(parents=True, exist_ok=True)\n", " file_path = output_dir / (file_name_part + \".csv\")\n", " file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n", @@ -836,10 +821,12 @@ " \"Median household income (% of AMI)\",\n", " \"Percent of households in linguistic isolation\",\n", " \"Percent individuals age 25 or over with less than high school degree\",\n", - " \"Linguistic isolation (percent)\",\n", " \"Unemployed civilians (percent)\",\n", " \"Median household income in the past 12 months\",\n", " URBAN_HEURISTIC_FIELD,\n", + " LIFE_EXPECTANCY_FIELD,\n", + " HEALTH_INSURANCE_FIELD,\n", + " BAD_HEALTH_FIELD,\n", "]\n", "\n", "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n", @@ -897,9 +884,7 @@ "\n", " # List of all states/territories in their FIPS codes:\n", " state_ids = sorted(df[state_field].unique())\n", - " state_names = \", \".join(\n", - " [us.states.lookup(state_id).name for state_id in state_ids]\n", - " )\n", + " state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n", "\n", " # Create markdown content for comparisons.\n", " markdown_content = f\"\"\"\n", @@ -913,9 +898,7 @@ "\n", "\"\"\"\n", "\n", - " for (index1, index2) in itertools.combinations(\n", - " census_block_group_indices, 2\n", - " ):\n", + " for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n", " # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n", " count_df = (\n", " df.groupby(\n", @@ -954,24 +937,16 @@ "\n", " # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n", " true_true_cbgs = (\n", - " true_true_cbgs_series.iloc[0]\n", - " if len(true_true_cbgs_series) > 0\n", - " else 0\n", + " true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n", " )\n", " true_false_cbgs = (\n", - " true_false_cbgs_series.iloc[0]\n", - " if len(true_false_cbgs_series) > 0\n", - " else 0\n", + " true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n", " )\n", " false_true_cbgs = (\n", - " false_true_cbgs_series.iloc[0]\n", - " if len(false_true_cbgs_series) > 0\n", - " else 0\n", + " false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n", " )\n", " false_false_cbgs = (\n", - " false_false_cbgs_series.iloc[0]\n", - " if len(false_false_cbgs_series) > 0\n", - " else 0\n", + " false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n", " )\n", "\n", " markdown_content += (\n", @@ -1163,20 +1138,15 @@ "\n", " # Calculate comparison\n", " # A comparison priority tract has at least one CBG that is a priority CBG.\n", - " df[\n", - " comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n", - " ] = (\n", + " df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n", " frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n", " if is_a_method_b_priority_tract\n", " else None\n", " )\n", "\n", " # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n", - " df[\n", - " comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n", - " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].mean()\n", - " == 1\n", + " df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n", + " frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n", " if is_a_method_b_priority_tract\n", " else None\n", " )\n", @@ -1195,8 +1165,7 @@ " df[\n", " comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n", " ] = (\n", - " frame.loc[:, method_a_priority_census_block_groups_field].mean()\n", - " == 1\n", + " frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n", " if not is_a_method_b_priority_tract\n", " else None\n", " )\n", @@ -1208,6 +1177,9 @@ " \"Percent of households in linguistic isolation\",\n", " \"Percent individuals age 25 or over with less than high school degree\",\n", " \"Unemployed civilians (percent)\",\n", + " LIFE_EXPECTANCY_FIELD,\n", + " HEALTH_INSURANCE_FIELD,\n", + " BAD_HEALTH_FIELD,\n", " ]:\n", " df[f\"{field} (average of CBGs)\"] = frame.loc[:, field].mean()\n", "\n", @@ -1237,20 +1209,14 @@ "\n", " # List of all states/territories in their FIPS codes:\n", " state_ids = sorted(original_df[state_field].unique())\n", - " state_names = \", \".join(\n", - " [us.states.lookup(state_id).name for state_id in state_ids]\n", - " )\n", + " state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n", "\n", " # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n", " # TODO: investigate why sums are sometimes series and sometimes scalar.\n", " method_a_priority_cbgs = (\n", - " original_df.loc[:, method_a_priority_census_block_groups_field]\n", - " .sum()\n", - " .squeeze()\n", - " )\n", - " method_a_priority_cbgs_percent = (\n", - " f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n", + " original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n", " )\n", + " method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n", "\n", " total_tracts_count = len(comparison_df)\n", "\n", @@ -1272,9 +1238,7 @@ " .sum()\n", " .squeeze()\n", " )\n", - " method_a_tracts_count_percent = (\n", - " f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n", - " )\n", + " method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n", "\n", " # Method A priority community stats\n", " method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n", @@ -1405,8 +1369,7 @@ "\n", " # Write comparison to CSV.\n", " file_path = (\n", - " output_dir\n", - " / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", + " output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n", " )\n", " comparison_df.to_csv(\n", " path_or_buf=file_path,\n",