Ticket 355: Adding map to Urban vs Rural Census Tracts (#696)

* Adding urban vs rural notebook

* Adding new code

* Adding settings

* Adding usa.csv

* Adding etl

* Adding etl

* Adding to etl_score

* quick changes to notebook

* Ensuring notebook can run

* Adding urban vs rural notebook

* Adding new code

* Adding settings

* Adding usa.csv

* Adding etl

* Adding etl

* Adding to etl_score

* quick changes to notebook

* Ensuring notebook can run

* adding urban to comparison tool

* renaming file

* adding urban rural to more comp tool outputs

* updating requirements and poetry

* Adding ej screen notebook

* removing ej screen notebook since it's in justice40-tool-iss-719

Co-authored-by: La <ryy0@cdc.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Vincent La 2021-09-22 12:31:03 -04:00 committed by GitHub
commit 7709836a12
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 563 additions and 142 deletions

View file

@ -59,6 +59,11 @@ DATASET_LIST = [
"module_dir": "doe_energy_burden",
"class_name": "DOEEnergyBurden",
},
{
"name": "geocorr",
"module_dir": "geocorr",
"class_name": "GeoCorrETL",
},
]
CENSUS_INFO = {
"name": "census",

View file

@ -80,6 +80,9 @@ class ScoreETL(ExtractTransformLoad):
self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"
# Urban Rural Map
self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
# dataframes
self.df: pd.DataFrame
self.ejscreen_df: pd.DataFrame
@ -91,6 +94,7 @@ class ScoreETL(ExtractTransformLoad):
self.cdc_life_expectancy_df: pd.DataFrame
self.doe_energy_burden_df: pd.DataFrame
self.national_risk_index_df: pd.DataFrame
self.geocorr_urban_rural_df: pd.DataFrame
def data_sets(self) -> list:
# Define a named tuple that will be used for each data set input.
@ -197,6 +201,11 @@ class ScoreETL(ExtractTransformLoad):
renamed_field=self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.URBAN_HERUISTIC_FIELD_NAME,
renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
bucket=None,
),
# The following data sets have buckets, because they're used in Score C
DataSet(
input_field="CANCER",
@ -386,6 +395,16 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load GeoCorr Urban Rural Map
geocorr_urban_rural_csv = (
self.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
)
self.geocorr_urban_rural_df = pd.read_csv(
geocorr_urban_rural_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Block Group dataframes")
census_block_group_df = functools.reduce(
@ -619,6 +638,15 @@ class ScoreETL(ExtractTransformLoad):
df["Score G"] = df["Score G (communities)"].astype(int)
df["Score G (percentile)"] = df["Score G"]
df["Score H (communities)"] = (
(df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
& (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
) | (
(df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.40)
& (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
)
df["Score H"] = df["Score H (communities)"].astype(int)
df["Score I (communities)"] = (
(df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
& (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
@ -629,20 +657,10 @@ class ScoreETL(ExtractTransformLoad):
df["Score I"] = df["Score I (communities)"].astype(int)
df["Score I (percentile)"] = df["Score I"]
df["Score H (communities)"] = (
(df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
& (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
) | (
(df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.40)
& (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
)
df["Score H"] = df["Score H (communities)"].astype(int)
df["NMTC (communities)"] = (
(df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
) | (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
df["Score K (communities)"] = (
(df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
& (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
@ -673,6 +691,7 @@ class ScoreETL(ExtractTransformLoad):
self.cdc_places_df,
self.cdc_life_expectancy_df,
self.doe_energy_burden_df,
self.geocorr_urban_rural_df,
]
census_tract_df = self._join_tract_dfs(census_tract_dfs)

View file

@ -0,0 +1,70 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import (
get_module_logger,
unzip_file_from_url,
)
logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad):
    """ETL for the GeoCorr urban vs. rural census tract map.

    Downloads a pre-computed CSV mapping each census tract to an
    urban/rural heuristic flag, renames the flag column to its
    human-readable name, and writes the result to
    ``data/dataset/geocorr/usa.csv`` for consumption by the score ETL.
    """

    def __init__(self):
        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"

        # Zip of the pre-computed urban/rural flag per census tract,
        # hosted alongside the other Justice40 data sources on S3.
        # (Defined once here so extract() does not duplicate the URL.)
        self.GEOCORR_PLACES_URL = (
            settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/geocorr_urban_rural.csv.zip"
        )
        self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
        # NOTE(review): "HERUISTIC" is a misspelling of "HEURISTIC", but the
        # same misspelled attribute name is mirrored in etl_score.py — keep
        # it unchanged here so both files stay consistent until they can be
        # renamed together.
        self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"

        # Populated by extract(); annotated here for clarity.
        self.df: pd.DataFrame

    def extract(self) -> None:
        """Download and unzip the GeoCorr map, then read it into ``self.df``."""
        logger.info(
            "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file."
        )
        unzip_file_from_url(
            file_url=self.GEOCORR_PLACES_URL,
            download_path=self.TMP_PATH,
            unzipped_file_path=self.TMP_PATH / "geocorr",
        )

        self.df = pd.read_csv(
            filepath_or_buffer=self.TMP_PATH
            / "geocorr"
            / "geocorr_urban_rural.csv",
            # Read the tract GEOID as a string to preserve leading zeros.
            dtype={
                self.GEOCORR_GEOID_FIELD_NAME: "string",
            },
            low_memory=False,
        )

    def transform(self) -> None:
        """Rename the raw flag column to its human-readable field name."""
        logger.info("Starting GeoCorr Urban Rural Map transform")

        self.df.rename(
            columns={
                "urban_heuristic_flag": self.URBAN_HERUISTIC_FIELD_NAME,
            },
            inplace=True,
        )
        # TODO: Put in logic from Jupyter Notebook transform when we switch
        # in the hyperlink to Geocorr.

    def load(self) -> None:
        """Write the transformed dataframe to ``OUTPUT_PATH / usa.csv``."""
        logger.info("Saving GeoCorr Urban Rural Map Data")

        # Ensure the output directory exists before writing.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        # Intentionally a no-op for now; kept to satisfy the ETL interface.
        logger.info("Validating GeoCorr Urban Rural Map Data")

View file

@ -75,7 +75,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
# Reduce columns.
# Note: normally we wait until writing to CSV for this step, but since the file is so huge,
# move this up here for performance reasons.
df_nri = df_nri[ # pylint: disable=unsubscriptable-object
df_nri = df_nri[ # pylint: disable=unsubscriptable-object
[self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, TRACT_COL]
]

View file

@ -71,6 +71,7 @@
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
"URBAN_HEURISTIC_FIELD = \"Urban Heuristic Flag\"\n",
"\n",
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
@ -124,6 +125,7 @@
" \"Percent of individuals < 200% Federal Poverty Line\",\n",
" \"Life expectancy (years)\",\n",
" \"Energy burden\",\n",
" URBAN_HEURISTIC_FIELD,\n",
"]:\n",
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
" print(cejst_df[field].describe())\n",
@ -230,7 +232,7 @@
")\n",
"\n",
"\n",
"if len(merged_df) > 220335:\n",
"if len(merged_df) > 220405:\n",
" raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
"\n",
"merged_df.head()\n",
@ -273,21 +275,16 @@
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score I\",\n",
" priority_communities_field=\"Score I (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"NMTC\",\n",
" priority_communities_field=\"NMTC (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"NMTC modified\",\n",
" priority_communities_field=\"NMTC modified (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F\",\n",
" priority_communities_field=\"Score F (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score A\",\n",
" priority_communities_field=\"Score A (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
@ -308,6 +305,11 @@
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F\",\n",
" priority_communities_field=\"Score F (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Poverty\",\n",
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
@ -365,6 +367,8 @@
" summary_dict = {}\n",
" summary_dict[COUNTRY_FIELD_NAME] = frame[COUNTRY_FIELD_NAME].unique()[0]\n",
"\n",
" summary_dict[\"Analysis grouped by\"] = geography_field\n",
"\n",
" if geography_field == COUNTRY_FIELD_NAME:\n",
" summary_dict[GEOID_STATE_FIELD_NAME] = \"00\"\n",
" summary_dict[\"Geography name\"] = \"(Entire USA)\"\n",
@ -389,9 +393,12 @@
" summary_dict[\"Geography name\"] = division_id\n",
"\n",
" total_cbgs_in_geography = len(frame)\n",
" total_population_in_geography = frame[\n",
" CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
" ].sum()\n",
" total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
"\n",
" if geography_field == URBAN_HEURISTIC_FIELD:\n",
" urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",
" summary_dict[\"Urban vs Rural\"] = \"Urban\" if urban_flag else \"Rural\"\n",
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
"\n",
" for priority_communities_field in priority_communities_fields:\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
@ -465,13 +472,24 @@
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
" )\n",
"\n",
" # Combine the three\n",
" # Next, run the comparison by urban/rural\n",
" urban_grouped_df = df.groupby(URBAN_HEURISTIC_FIELD)\n",
"\n",
" # Run the comparison function on the groups.\n",
" urban_grouped_df = urban_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=URBAN_HEURISTIC_FIELD\n",
" )\n",
" )\n",
"\n",
" # Combine the five\n",
" combined_df = pd.concat(\n",
" [\n",
" usa_distribution_df,\n",
" state_distribution_df,\n",
" region_distribution_df,\n",
" division_distribution_df,\n",
" urban_grouped_df,\n",
" ]\n",
" )\n",
"\n",
@ -565,15 +583,17 @@
" priority_communities_fields=fields_to_analyze,\n",
")\n",
"\n",
"file_prefix = \"Priority CBGs Different geographic groupings\"\n",
"\n",
"state_distribution_df.to_csv(\n",
" path_or_buf=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.csv\",\n",
" path_or_buf=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.csv\",\n",
" na_rep=\"\",\n",
" index=False,\n",
")\n",
"\n",
"write_state_distribution_excel(\n",
" state_distribution_df=state_distribution_df,\n",
" file_path=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.xlsx\",\n",
" file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n",
")\n",
"\n",
"state_distribution_df.head()"
@ -625,10 +645,10 @@
"\n",
" criteria_description_field_name = \"Description of criteria\"\n",
" comparison_df[criteria_description_field_name] = comparison_df.apply(\n",
" func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \" + \n",
" f\"prioritized by {method_a_priority_census_block_groups_field} \" + \n",
" f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \" + \n",
" f\"prioritized by {method_b_priority_census_block_groups_field}\",\n",
" func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \"\n",
" + f\"prioritized by {method_a_priority_census_block_groups_field} \"\n",
" + f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \"\n",
" + f\"prioritized by {method_b_priority_census_block_groups_field}\",\n",
" axis=1,\n",
" )\n",
"\n",
@ -636,7 +656,7 @@
" new_column_order = [criteria_description_field_name] + [\n",
" col for col in comparison_df.columns if col != criteria_description_field_name\n",
" ]\n",
" \n",
"\n",
" comparison_df = comparison_df[new_column_order]\n",
"\n",
" # Rename fields to reflect the mean aggregation\n",
@ -763,6 +783,7 @@
" \"Linguistic isolation (percent)\",\n",
" \"Unemployed civilians (percent)\",\n",
" \"Median household income in the past 12 months\",\n",
" URBAN_HEURISTIC_FIELD,\n",
"]\n",
"\n",
"for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",

View file

@ -0,0 +1,311 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "51412a14",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import collections\n",
"from datetime import datetime\n",
"import functools\n",
"import itertools\n",
"import os\n",
"import pathlib\n",
"import requests\n",
"import string\n",
"import sys\n",
"import typing\n",
"import zipfile\n",
"\n",
"import IPython\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pypandoc\n",
"\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3234c61",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
"pd.options.display.float_format = \"{:.2f}\".format\n",
"\n",
"# Set some global parameters\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
"\n",
"## I (Vincent) created this manually locally. Will need to change potentially when putting into official ETL scripts\n",
"GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
"\n",
"# Make the dirs if they don't exist\n",
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
"COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
"\n",
"# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
"# and introducing the risk of misspelling the field name.)\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
"GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
"\n",
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
"\n",
"# Define some suffixes\n",
"POPULATION_SUFFIX = \" (priority population)\""
]
},
{
"cell_type": "markdown",
"id": "376f5b2e",
"metadata": {},
"source": [
"## Mapping Census Block Group to Urban and Rural Indicators using Geocorr Data\n",
"\n",
"The end result is a dataframe `urban_rural_map`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4147c081",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map = pd.read_csv(\n",
" os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
" encoding = \"ISO-8859-1\",\n",
" skiprows=[1],\n",
" dtype='str',\n",
")\n",
"\n",
"geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
"geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
"\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78276a83",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
]
},
{
"cell_type": "markdown",
"id": "f2890779",
"metadata": {},
"source": [
We want to see that the length of the derived Census Tract ID is always 11 digits (Census Tracts are always 11 digits; a Census Block Group would be 12, but the block-group suffix is not appended above)
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd89f6c8",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
" GEOID_TRACT_FIELD_NAME,\n",
" 'ur',\n",
" 'ua',\n",
" 'cntyname',\n",
" 'uaname',\n",
" 'pop10',\n",
" 'afact'\n",
"]]"
]
},
{
"cell_type": "markdown",
"id": "e597d7e2",
"metadata": {},
"source": [
"Checking Primary Key"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29929046",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e4c0c3f",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d52761e8",
"metadata": {},
"outputs": [],
"source": [
"total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
"total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
"total_geo_population.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38225b78",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
"geocorr_urban_rural_with_total_pop_map.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41b9448a",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb4ddb9b",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e03d1e6",
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d976cb5",
"metadata": {},
"outputs": [],
"source": [
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
"urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
"urban_rural_map.reset_index(inplace=True)\n",
"urban_rural_map['urban_heuristic_flag'] = 0\n",
"mask = urban_rural_map['afact_U'] >= 0.5\n",
"urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f3a0993",
"metadata": {},
"outputs": [],
"source": [
"urban_rural_map.rename(\n",
" columns={\n",
" 'pop10_R': 'population_in_rural_areas',\n",
" 'pop10_U': 'population_in_urban_areas',\n",
" 'afact_R': 'perc_population_in_rural_areas',\n",
" 'afact_U': 'perc_population_in_urban_areas',\n",
" }, \n",
" inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba10f07c",
"metadata": {},
"outputs": [],
"source": [
"urban_rural_map.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56098d7b",
"metadata": {},
"outputs": [],
"source": [
"urban_rural_map.to_csv(\n",
" path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}