Adding VA and CO ETL from mapping for environmental justice (#1177)

Adding the mapping for environmental justice data, which contains information about VA and CO, to the ETL pipeline.
This commit is contained in:
Emma Nechamkin 2022-02-04 10:00:41 -05:00 committed by GitHub
commit 6a00b29f5d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 209 additions and 115 deletions

View file

@@ -43,6 +43,7 @@
"\n",
"from data_pipeline.score import field_names\n",
"\n",
"%load_ext lab_black\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()"
]
@@ -101,9 +102,7 @@
"# Create the state ID by taking the first two digits of the FIPS CODE of the tract.\n",
"# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n",
"cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n",
" cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]\n",
" .astype(str)\n",
" .str[0:2]\n",
" cejst_df.loc[:, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].astype(str).str[0:2]\n",
")\n",
"\n",
"cejst_df.head()"
@@ -113,9 +112,7 @@
"cell_type": "code",
"execution_count": null,
"id": "a251a0fb",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Load EJSCREEN Areas of Concern data.\n",
@@ -149,9 +146,7 @@
"cell_type": "code",
"execution_count": null,
"id": "e43a9e23",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Merge EJSCREEN AoCs into CEJST data.\n",
@@ -174,9 +169,7 @@
"cell_type": "code",
"execution_count": null,
"id": "38c0dc2f",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Analyze one field at a time (useful for setting thresholds)\n",
@@ -214,35 +207,71 @@
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
"\n",
"calenviroscreen_data_path = (\n",
" DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
")\n",
"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
"calenviroscreen_df = pd.read_csv(\n",
" calenviroscreen_data_path,\n",
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
")\n",
"\n",
"# Convert priority community field to a bool.\n",
"calenviroscreen_df[\n",
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
"] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
"].astype(bool)\n",
"\n",
"calenviroscreen_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1ac2854-80c8-42a8-85e8-84c5684bbe43",
"metadata": {},
"outputs": [],
"source": [
"# Mapping for EJ\n",
"mapping_for_ej_path = DATA_DIR / \"dataset\" / \"mapping_for_ej\" / \"co_va.csv\"\n",
"\n",
"mapping_for_ej_df = pd.read_csv(\n",
" mapping_for_ej_path,\n",
" dtype={\n",
" ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\",\n",
" field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD: \"bool\",\n",
" },\n",
")\n",
"\n",
"mapping_for_ej_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8ec43dc",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Load persistent poverty data\n",
"persistent_poverty_path = (\n",
" DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
")\n",
"persistent_poverty_path = DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
"persistent_poverty_df = pd.read_csv(\n",
" persistent_poverty_path,\n",
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
@@ -255,9 +284,7 @@
"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
"\n",
"persistent_poverty_df.rename(\n",
" columns={\n",
" PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
" },\n",
" columns={PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD},\n",
" inplace=True,\n",
" errors=\"raise\",\n",
")\n",
@@ -269,9 +296,7 @@
"cell_type": "code",
"execution_count": null,
"id": "81826d29",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Load mapping inequality data\n",
@@ -280,9 +305,7 @@
" field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n",
" field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n",
"]\n",
"mapping_inequality_path = (\n",
" DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
")\n",
"mapping_inequality_path = DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
"mapping_inequality_df = pd.read_csv(\n",
" mapping_inequality_path,\n",
" dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
@@ -329,9 +352,7 @@
"cell_type": "code",
"execution_count": null,
"id": "605af1ff",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Load alternative energy-related definition\n",
@@ -350,9 +371,7 @@
"cell_type": "code",
"execution_count": null,
"id": "fe4a2939",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Load Michigan EJSCREEN\n",
@@ -404,6 +423,7 @@
" energy_definition_alternative_draft_df,\n",
" michigan_ejscreen_df,\n",
" cdc_svi_index_df,\n",
" mapping_for_ej_df,\n",
"]\n",
"\n",
"merged_df = functools.reduce(\n",
@@ -416,9 +436,7 @@
" census_tract_dfs,\n",
")\n",
"\n",
"tract_values = (\n",
" merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
")\n",
"tract_values = merged_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
"if any(tract_values != [11]):\n",
" print(tract_values)\n",
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
@@ -433,9 +451,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2de78f71",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Special handling for HOLC.\n",
@@ -555,6 +571,10 @@
" priority_communities_field=\"calenviroscreen_priority_community\",\n",
" ),\n",
" Index(\n",
" method_name=\"Mapping for EJ\",\n",
" priority_communities_field=field_names.MAPPING_FOR_EJ_PRIORITY_COMMUNITY_FIELD,\n",
" ),\n",
" Index(\n",
" method_name=\"EPA RSEI Aggregate Microdata\",\n",
" priority_communities_field=field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD,\n",
" ),\n",
@@ -708,13 +728,13 @@
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
"\n",
" for priority_communities_field in priority_communities_fields:\n",
" summary_dict[\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
" ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
" ].sum()\n",
"\n",
" summary_dict[\n",
" f\"{priority_communities_field} (total tracts)\"\n",
" ] = frame[f\"{priority_communities_field}\"].sum()\n",
" summary_dict[f\"{priority_communities_field} (total tracts)\"] = frame[\n",
" f\"{priority_communities_field}\"\n",
" ].sum()\n",
"\n",
" # Calculate some combinations of other variables.\n",
" summary_dict[f\"{priority_communities_field} (percent tracts)\"] = (\n",
@@ -722,9 +742,7 @@
" / total_tracts_in_geography\n",
" )\n",
"\n",
" summary_dict[\n",
" f\"{priority_communities_field} (percent population)\"\n",
" ] = (\n",
" summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
" / total_population_in_geography\n",
" )\n",
@@ -770,9 +788,7 @@
"\n",
" # Run the comparison function on the groups.\n",
" region_distribution_df = region_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=\"region\"\n",
" )\n",
" lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
" )\n",
"\n",
" # Next, run the comparison by division\n",
@@ -780,9 +796,7 @@
"\n",
" # Run the comparison function on the groups.\n",
" division_distribution_df = division_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=\"division\"\n",
" )\n",
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
" )\n",
"\n",
" # Next, run the comparison by urban/rural\n",
@@ -837,9 +851,7 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
@@ -854,7 +866,9 @@
"\n",
" # Special formatting for columns that capture the percent of population considered priority.\n",
" if \"(percent population)\" in column:\n",
" column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
" column_ranges = (\n",
" f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
" )\n",
"\n",
" # Add green to red conditional formatting.\n",
" worksheet.conditional_format(\n",
@@ -880,18 +894,14 @@
" writer.save()\n",
"\n",
"\n",
"fields_to_analyze = [\n",
" index.priority_communities_field for index in census_tract_indices\n",
"]\n",
"fields_to_analyze = [index.priority_communities_field for index in census_tract_indices]\n",
"\n",
"# Convert all indices to boolean\n",
"for field_to_analyze in fields_to_analyze:\n",
" if \"Areas of Concern\" in field_to_analyze:\n",
" print(f\"Converting {field_to_analyze} to boolean.\")\n",
"\n",
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n",
" value=0\n",
" )\n",
" merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(value=0)\n",
" merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
"\n",
"\n",
@@ -924,9 +934,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2bcbcabf",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"directory = COMPARISON_OUTPUTS_DIR / \"tracts_basic_stats\"\n",
@@ -960,14 +968,10 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
"\n",
" # Add green to red conditional formatting.\n",
" column_ranges = (\n",
" f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n",
" )\n",
" column_ranges = f\"{column_character}2:{column_character}{len(basic_stats_df)+1}\"\n",
" worksheet.conditional_format(\n",
" column_ranges,\n",
" # Min: green, max: red.\n",
@@ -980,11 +984,7 @@
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
" if (\n",
" \"percent \" in column\n",
" or \"(percent)\" in column\n",
" or \"Percent \" in column\n",
" ):\n",
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
" # Make these columns percentages.\n",
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
" worksheet.set_column(\n",
@@ -1013,15 +1013,9 @@
" temp_df[index.priority_communities_field] == True\n",
" )\n",
"\n",
" grouped_df = (\n",
" temp_df.groupby(index.priority_communities_field).mean().reset_index()\n",
" )\n",
" result_df = grouped_df[\n",
" [index.priority_communities_field] + comparison_fields\n",
" ]\n",
" result_df.to_csv(\n",
" directory / f\"{index.method_name} Basic Stats.csv\", index=False\n",
" )\n",
" grouped_df = temp_df.groupby(index.priority_communities_field).mean().reset_index()\n",
" result_df = grouped_df[[index.priority_communities_field] + comparison_fields]\n",
" result_df.to_csv(directory / f\"{index.method_name} Basic Stats.csv\", index=False)\n",
" write_basic_stats_excel(\n",
" basic_stats_df=result_df,\n",
" file_path=directory / f\"{index.method_name} Basic Stats.xlsx\",\n",
@@ -1070,9 +1064,7 @@
"\n",
" # Also add in the count of census tracts.\n",
" count_field_name = \"Count of census tracts\"\n",
" comparison_df[count_field_name] = grouped_df.size().to_frame(\n",
" count_field_name\n",
" )\n",
" comparison_df[count_field_name] = grouped_df.size().to_frame(count_field_name)\n",
"\n",
" comparison_df = comparison_df.reset_index()\n",
"\n",
@@ -1087,9 +1079,7 @@
"\n",
" # Put criteria description column first.\n",
" columns_to_put_first = (\n",
" [criteria_description_field_name]\n",
" + fields_to_group_by\n",
" + [count_field_name]\n",
" [criteria_description_field_name] + fields_to_group_by + [count_field_name]\n",
" )\n",
" new_column_order = columns_to_put_first + [\n",
" col for col in comparison_df.columns if col not in columns_to_put_first\n",
@@ -1120,9 +1110,7 @@
"\n",
" # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n",
" # index column at the left of the output dataframe.\n",
" census_tracts_score_comparison_df.to_excel(\n",
" writer, sheet_name=\"Sheet1\", index=False\n",
" )\n",
" census_tracts_score_comparison_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n",
"\n",
" # Get the xlsxwriter workbook and worksheet objects.\n",
" workbook = writer.book\n",
@@ -1144,9 +1132,7 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
"\n",
" # Add green to red conditional formatting.\n",
" column_ranges = f\"{column_character}2:{column_character}{len(census_tracts_score_comparison_df)+1}\"\n",
@ -1162,11 +1148,7 @@
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
" if (\n",
" \"percent \" in column\n",
" or \"(percent)\" in column\n",
" or \"Percent \" in column\n",
" ):\n",
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
" # Make these columns percentages.\n",
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
" worksheet.set_column(\n",
@@ -1182,9 +1164,7 @@
" # Overwrite both the value and the format of each header cell\n",
" # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n",
" # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n",
" for col_num, value in enumerate(\n",
" census_tracts_score_comparison_df.columns.values\n",
" ):\n",
" for col_num, value in enumerate(census_tracts_score_comparison_df.columns.values):\n",
" worksheet.write(0, col_num, value, header_format)\n",
"\n",
" writer.save()\n",
@@ -1415,9 +1395,7 @@
"cell_type": "code",
"execution_count": null,
"id": "7d095ebd",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"# Note: this is helpful because this file is long-running, so it alerts the user when the\n",
@@ -1444,7 +1422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.9.10"
}
},
"nbformat": 4,