Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-07-25 08:20:16 -07:00)
Ticket 355: Adding map to Urban vs Rural Census Tracts (#696)
* Adding urban vs rural notebook
* Adding new code
* Adding settings
* Adding usa.csv
* Adding etl
* Adding to etl_score
* Quick changes to notebook
* Ensuring notebook can run
* Adding urban to comparison tool
* Renaming file
* Adding urban rural to more comp tool outputs
* Updating requirements and poetry
* Adding EJScreen notebook
* Removing EJScreen notebook since it's in justice40-tool-iss-719

Co-authored-by: La <ryy0@cdc.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
parent aaf304fc89
commit 7709836a12
10 changed files with 563 additions and 142 deletions
@@ -59,6 +59,11 @@ DATASET_LIST = [
        "module_dir": "doe_energy_burden",
        "class_name": "DOEEnergyBurden",
    },
    {
        "name": "geocorr",
        "module_dir": "geocorr",
        "class_name": "GeoCorrETL",
    },
]
CENSUS_INFO = {
    "name": "census",
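For orientation: the new DATASET_LIST entry is what lets the pipeline pick up the GeoCorr source alongside the other datasets. Below is a minimal sketch of how such a registry entry could be dispatched; the `run_dataset` helper and the dynamic-import pattern are illustrative assumptions rather than the project's actual runner code (only the module path `data_pipeline.etl.sources.geocorr.etl` and the class name come from this diff).

import importlib

def run_dataset(dataset: dict) -> None:
    # Hypothetical dispatcher: import the source's etl module named by
    # "module_dir" and instantiate the class named by "class_name".
    module = importlib.import_module(
        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
    )
    etl_class = getattr(module, dataset["class_name"])
    etl = etl_class()
    etl.extract()
    etl.transform()
    etl.load()

# Example: run only the new GeoCorr urban/rural source.
run_dataset({"name": "geocorr", "module_dir": "geocorr", "class_name": "GeoCorrETL"})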
@@ -80,6 +80,9 @@ class ScoreETL(ExtractTransformLoad):
        self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"

        # Urban Rural Map
        self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"

        # dataframes
        self.df: pd.DataFrame
        self.ejscreen_df: pd.DataFrame
@@ -91,6 +94,7 @@ class ScoreETL(ExtractTransformLoad):
        self.cdc_life_expectancy_df: pd.DataFrame
        self.doe_energy_burden_df: pd.DataFrame
        self.national_risk_index_df: pd.DataFrame
        self.geocorr_urban_rural_df: pd.DataFrame

    def data_sets(self) -> list:
        # Define a named tuple that will be used for each data set input.
@@ -197,6 +201,11 @@ class ScoreETL(ExtractTransformLoad):
                renamed_field=self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
                bucket=None,
            ),
            DataSet(
                input_field=self.URBAN_HERUISTIC_FIELD_NAME,
                renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
                bucket=None,
            ),
            # The following data sets have buckets, because they're used in Score C
            DataSet(
                input_field="CANCER",
@@ -386,6 +395,16 @@ class ScoreETL(ExtractTransformLoad):
            low_memory=False,
        )

        # Load GeoCorr Urban Rural Map
        geocorr_urban_rural_csv = (
            self.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
        )
        self.geocorr_urban_rural_df = pd.read_csv(
            geocorr_urban_rural_csv,
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )

    def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
        logger.info("Joining Census Block Group dataframes")
        census_block_group_df = functools.reduce(
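A note on the `dtype={self.GEOID_TRACT_FIELD_NAME: "string"}` argument above: tract GEOIDs can begin with a zero, and pandas' default type inference would read them as integers and drop that leading digit, which breaks later joins. A small standalone illustration (the inline CSV is made up for the demo):

import io

import pandas as pd

csv_text = "GEOID10_TRACT,urban_heuristic_flag\n01001020200,1\n"

# Default inference parses the GEOID as an integer: the leading zero is lost.
inferred = pd.read_csv(io.StringIO(csv_text))
print(inferred["GEOID10_TRACT"].iloc[0])  # 1001020200

# Forcing a string dtype keeps the full 11-character tract ID intact.
as_string = pd.read_csv(io.StringIO(csv_text), dtype={"GEOID10_TRACT": "string"})
print(as_string["GEOID10_TRACT"].iloc[0])  # 01001020200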
@@ -619,6 +638,15 @@ class ScoreETL(ExtractTransformLoad):
        df["Score G"] = df["Score G (communities)"].astype(int)
        df["Score G (percentile)"] = df["Score G"]

        df["Score H (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        ) | (
            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.40)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        )
        df["Score H"] = df["Score H (communities)"].astype(int)

        df["Score I (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
@@ -629,20 +657,10 @@ class ScoreETL(ExtractTransformLoad):
        df["Score I"] = df["Score I (communities)"].astype(int)
        df["Score I (percentile)"] = df["Score I"]

        df["Score H (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        ) | (
            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.40)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        )
        df["Score H"] = df["Score H (communities)"].astype(int)

        df["NMTC (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
        ) | (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)

        df["Score K (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
@@ -673,6 +691,7 @@ class ScoreETL(ExtractTransformLoad):
            self.cdc_places_df,
            self.cdc_life_expectancy_df,
            self.doe_energy_burden_df,
            self.geocorr_urban_rural_df,
        ]
        census_tract_df = self._join_tract_dfs(census_tract_dfs)
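Adding `self.geocorr_urban_rural_df` to `census_tract_dfs` means the urban/rural columns ride along when the tract-level frames are merged. `_join_tract_dfs` itself is not shown in this diff; the sketch below is only an illustrative re-implementation of that kind of reduce-style merge on the tract GEOID, mirroring the `functools.reduce` pattern visible in `_join_cbg_dfs` above (the `how="outer"` choice is an assumption).

import functools

import pandas as pd

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

def join_tract_dfs(census_tract_dfs: list) -> pd.DataFrame:
    # Fold the list of tract-level frames into one frame by repeatedly
    # merging pairs on the shared tract GEOID column.
    return functools.reduce(
        lambda df_left, df_right: pd.merge(
            df_left, df_right, on=GEOID_TRACT_FIELD_NAME, how="outer"
        ),
        census_tract_dfs,
    )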
data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py (new file, 70 lines)
@@ -0,0 +1,70 @@
import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import (
    get_module_logger,
    unzip_file_from_url,
)

logger = get_module_logger(__name__)


class GeoCorrETL(ExtractTransformLoad):
    def __init__(self):
        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"

        # Need to change hyperlink to S3
        self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
        self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
        self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"

        self.df: pd.DataFrame

    def extract(self) -> None:
        logger.info(
            "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file."
        )
        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/geocorr_urban_rural.csv.zip",
            download_path=self.TMP_PATH,
            unzipped_file_path=self.TMP_PATH / "geocorr",
        )

        self.df = pd.read_csv(
            filepath_or_buffer=self.TMP_PATH
            / "geocorr"
            / "geocorr_urban_rural.csv",
            dtype={
                self.GEOCORR_GEOID_FIELD_NAME: "string",
            },
            low_memory=False,
        )

    def transform(self) -> None:
        logger.info("Starting GeoCorr Urban Rural Map transform")

        self.df.rename(
            columns={
                "urban_heuristic_flag": self.URBAN_HERUISTIC_FIELD_NAME,
            },
            inplace=True,
        )

        # TODO: port the remaining transform logic from the Jupyter notebook
        # once the source hyperlink is switched over to GeoCorr.

    def load(self) -> None:
        logger.info("Saving GeoCorr Urban Rural Map Data")

        # Create the output directory if it does not already exist.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating GeoCorr Urban Rural Map Data")

        pass
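For reference, the new class follows the same extract/transform/load shape as the other sources, so it can be exercised by hand. A minimal sketch (running it for real downloads the roughly 2MB zip and writes `usa.csv` under the data directory):

from data_pipeline.etl.sources.geocorr.etl import GeoCorrETL

etl = GeoCorrETL()
etl.extract()    # download and unzip geocorr_urban_rural.csv into TMP_PATH
etl.transform()  # rename urban_heuristic_flag to "Urban Heuristic Flag"
etl.load()       # write dataset/geocorr/usa.csv, which ScoreETL reads
etl.validate()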
@@ -75,7 +75,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        # Reduce columns.
        # Note: normally we wait until writing to CSV for this step, but since the file is so huge,
        # move this up here for performance reasons.
        df_nri = df_nri[  # pylint: disable=unsubscriptable-object
            [self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, TRACT_COL]
        ]
@@ -71,6 +71,7 @@
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "URBAN_HEURISTIC_FIELD = \"Urban Heuristic Flag\"\n",
    "\n",
    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
@@ -124,6 +125,7 @@
    " \"Percent of individuals < 200% Federal Poverty Line\",\n",
    " \"Life expectancy (years)\",\n",
    " \"Energy burden\",\n",
    " URBAN_HEURISTIC_FIELD,\n",
    "]:\n",
    " print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
    " print(cejst_df[field].describe())\n",
@@ -230,7 +232,7 @@
    ")\n",
    "\n",
    "\n",
    "if len(merged_df) > 220335:\n",
    "if len(merged_df) > 220405:\n",
    " raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
    "\n",
    "merged_df.head()\n",
@@ -273,21 +275,16 @@
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score I\",\n",
    " priority_communities_field=\"Score I (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"NMTC\",\n",
    " priority_communities_field=\"NMTC (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"NMTC modified\",\n",
    " priority_communities_field=\"NMTC modified (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score F\",\n",
    " priority_communities_field=\"Score F (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score A\",\n",
    " priority_communities_field=\"Score A (top 25th percentile)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
@@ -308,6 +305,11 @@
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score F\",\n",
    " priority_communities_field=\"Score F (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Poverty\",\n",
    " priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
@@ -365,6 +367,8 @@
    " summary_dict = {}\n",
    " summary_dict[COUNTRY_FIELD_NAME] = frame[COUNTRY_FIELD_NAME].unique()[0]\n",
    "\n",
    " summary_dict[\"Analysis grouped by\"] = geography_field\n",
    "\n",
    " if geography_field == COUNTRY_FIELD_NAME:\n",
    " summary_dict[GEOID_STATE_FIELD_NAME] = \"00\"\n",
    " summary_dict[\"Geography name\"] = \"(Entire USA)\"\n",
@@ -389,9 +393,12 @@
    " summary_dict[\"Geography name\"] = division_id\n",
    "\n",
    " total_cbgs_in_geography = len(frame)\n",
    " total_population_in_geography = frame[\n",
    " CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
    " ].sum()\n",
    " total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
    "\n",
    " if geography_field == URBAN_HEURISTIC_FIELD:\n",
    " urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",
    " summary_dict[\"Urban vs Rural\"] = \"Urban\" if urban_flag else \"Rural\"\n",
    " summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
    "\n",
    " for priority_communities_field in priority_communities_fields:\n",
    " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
@@ -465,13 +472,24 @@
    " lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
    " )\n",
    "\n",
    " # Combine the three\n",
    " # Next, run the comparison by urban/rural\n",
    " urban_grouped_df = df.groupby(URBAN_HEURISTIC_FIELD)\n",
    "\n",
    " # Run the comparison function on the groups.\n",
    " urban_grouped_df = urban_grouped_df.progress_apply(\n",
    " lambda frame: calculate_state_comparison(\n",
    " frame, geography_field=URBAN_HEURISTIC_FIELD\n",
    " )\n",
    " )\n",
    "\n",
    " # Combine the five\n",
    " combined_df = pd.concat(\n",
    " [\n",
    " usa_distribution_df,\n",
    " state_distribution_df,\n",
    " region_distribution_df,\n",
    " division_distribution_df,\n",
    " urban_grouped_df,\n",
    " ]\n",
    " )\n",
    "\n",
@@ -565,15 +583,17 @@
    " priority_communities_fields=fields_to_analyze,\n",
    ")\n",
    "\n",
    "file_prefix = \"Priority CBGs – Different geographic groupings\"\n",
    "\n",
    "state_distribution_df.to_csv(\n",
    " path_or_buf=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.csv\",\n",
    " path_or_buf=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.csv\",\n",
    " na_rep=\"\",\n",
    " index=False,\n",
    ")\n",
    "\n",
    "write_state_distribution_excel(\n",
    " state_distribution_df=state_distribution_df,\n",
    " file_path=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.xlsx\",\n",
    " file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n",
    ")\n",
    "\n",
    "state_distribution_df.head()"
@@ -625,10 +645,10 @@
    "\n",
    " criteria_description_field_name = \"Description of criteria\"\n",
    " comparison_df[criteria_description_field_name] = comparison_df.apply(\n",
    " func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \" + \n",
    " f\"prioritized by {method_a_priority_census_block_groups_field} \" + \n",
    " f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \" + \n",
    " f\"prioritized by {method_b_priority_census_block_groups_field}\",\n",
    " func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \"\n",
    " + f\"prioritized by {method_a_priority_census_block_groups_field} \"\n",
    " + f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \"\n",
    " + f\"prioritized by {method_b_priority_census_block_groups_field}\",\n",
    " axis=1,\n",
    " )\n",
    "\n",
@@ -636,7 +656,7 @@
    " new_column_order = [criteria_description_field_name] + [\n",
    " col for col in comparison_df.columns if col != criteria_description_field_name\n",
    " ]\n",
    " \n",
    "\n",
    " comparison_df = comparison_df[new_column_order]\n",
    "\n",
    " # Rename fields to reflect the mean aggregation\n",
@@ -763,6 +783,7 @@
    " \"Linguistic isolation (percent)\",\n",
    " \"Unemployed civilians (percent)\",\n",
    " \"Median household income in the past 12 months\",\n",
    " URBAN_HEURISTIC_FIELD,\n",
    "]\n",
    "\n",
    "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",
data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb (new file, 311 lines)
@@ -0,0 +1,311 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51412a14",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "from datetime import datetime\n",
    "import functools\n",
    "import itertools\n",
    "import os\n",
    "import pathlib\n",
    "import requests\n",
    "import string\n",
    "import sys\n",
    "import typing\n",
    "import zipfile\n",
    "\n",
    "import IPython\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pypandoc\n",
    "\n",
    "from tqdm.notebook import tqdm_notebook\n",
    "\n",
    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
    "if module_path not in sys.path:\n",
    " sys.path.append(module_path)\n",
    "\n",
    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
    "\n",
    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
    "tqdm_notebook.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3234c61",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
    "pd.options.display.float_format = \"{:.2f}\".format\n",
    "\n",
    "# Set some global parameters\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
    "\n",
    "## I (Vincent) created this manually locally. Will need to change potentially when putting into official ETL scripts\n",
    "GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
    "\n",
    "# Make the dirs if they don't exist\n",
    "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
    "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
    "\n",
    "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
    "# and introducing the risk of misspelling the field name.)\n",
    "\n",
    "GEOID_FIELD_NAME = \"GEOID10\"\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "\n",
    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
    "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
    "\n",
    "# Define some suffixes\n",
    "POPULATION_SUFFIX = \" (priority population)\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "376f5b2e",
   "metadata": {},
   "source": [
    "## Mapping Census Block Group to Urban and Rural Indicators using Geocorr Data\n",
    "\n",
    "The end result is a dataframe `urban_rural_map`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4147c081",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map = pd.read_csv(\n",
    " os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
    " encoding = \"ISO-8859-1\",\n",
    " skiprows=[1],\n",
    " dtype='str',\n",
    ")\n",
    "\n",
    "geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
    "geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
    "\n",
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78276a83",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2890779",
   "metadata": {},
   "source": [
    "We want to see that the length of the derived Census Block Group is always 12 digits. Census Tracts are always 11 digits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd89f6c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
    " GEOID_TRACT_FIELD_NAME,\n",
    " 'ur',\n",
    " 'ua',\n",
    " 'cntyname',\n",
    " 'uaname',\n",
    " 'pop10',\n",
    " 'afact'\n",
    "]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e597d7e2",
   "metadata": {},
   "source": [
    "Checking Primary Key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29929046",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e4c0c3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d52761e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
    "total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
    "total_geo_population.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38225b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
    "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
    "geocorr_urban_rural_with_total_pop_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41b9448a",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb4ddb9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e03d1e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d976cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
    "urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
    "urban_rural_map.reset_index(inplace=True)\n",
    "urban_rural_map['urban_heuristic_flag'] = 0\n",
    "mask = urban_rural_map['afact_U'] >= 0.5\n",
    "urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f3a0993",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.rename(\n",
    " columns={\n",
    " 'pop10_R': 'population_in_rural_areas',\n",
    " 'pop10_U': 'population_in_urban_areas',\n",
    " 'afact_R': 'perc_population_in_rural_areas',\n",
    " 'afact_U': 'perc_population_in_urban_areas',\n",
    " }, \n",
    " inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba10f07c",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56098d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.to_csv(\n",
    " path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
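The core of the notebook above is the urban heuristic: a tract counts as urban when at least half of its 2010 population falls in GeoCorr's urban ('U') rows. Below is a self-contained sketch of that same calculation on a couple of made-up rows; the populations are invented for illustration, and only the column names and the 0.5 threshold come from the notebook.

import pandas as pd

# Toy GeoCorr-style rows: tract ID, urban/rural code, 2010 population (illustrative values).
rows = pd.DataFrame(
    {
        "GEOID10_TRACT": ["01001020200", "01001020200", "36117020302"],
        "ur": ["U", "R", "R"],
        "pop10": [1500, 500, 900],
    }
)

# Population per tract and urban/rural code, plus the tract total.
by_tract_ur = rows.groupby(["GEOID10_TRACT", "ur"], as_index=False)["pop10"].sum()
totals = (
    rows.groupby("GEOID10_TRACT", as_index=False)["pop10"]
    .sum()
    .rename(columns={"pop10": "total_population"})
)
merged = by_tract_ur.merge(totals, on="GEOID10_TRACT")
merged["afact"] = merged["pop10"] / merged["total_population"]

# Pivot to one row per tract and flag tracts whose urban population share is at least 50%.
urban_rural_map = merged.pivot(index="GEOID10_TRACT", columns="ur", values=["pop10", "afact"])
urban_rural_map.columns = ["_".join(col) for col in urban_rural_map.columns]
urban_rural_map = urban_rural_map.reset_index()
urban_rural_map["urban_heuristic_flag"] = (
    urban_rural_map["afact_U"].fillna(0) >= 0.5
).astype(int)

print(urban_rural_map[["GEOID10_TRACT", "urban_heuristic_flag"]])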