Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-07-25 08:20:16 -07:00)
Ticket 355: Adding map to Urban vs Rural Census Tracts (#696)
* Adding urban vs rural notebook
* Adding new code
* Adding settings
* Adding usa.csv
* Adding etl
* Adding to etl_score
* Quick changes to notebook
* Ensuring notebook can run
* Adding urban to comparison tool
* Renaming file
* Adding urban rural to more comp tool outputs
* Updating requirements and poetry
* Adding EJScreen notebook
* Removing EJScreen notebook since it's in justice40-tool-iss-719

Co-authored-by: La <ryy0@cdc.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
parent aaf304fc89
commit 7709836a12
10 changed files with 563 additions and 142 deletions
@@ -59,6 +59,11 @@ DATASET_LIST = [
        "module_dir": "doe_energy_burden",
        "class_name": "DOEEnergyBurden",
    },
    {
        "name": "geocorr",
        "module_dir": "geocorr",
        "class_name": "GeoCorrETL",
    },
]
CENSUS_INFO = {
    "name": "census",
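For orientation: the new DATASET_LIST entry is what lets the pipeline pick up the GeoCorr source alongside the other datasets. Below is a minimal sketch of how such a registry entry could be dispatched; the `run_dataset` helper and the dynamic-import pattern are illustrative assumptions rather than the project's actual runner code (only the module path `data_pipeline.etl.sources.geocorr.etl` and the class name come from this diff).

import importlib

def run_dataset(dataset: dict) -> None:
    # Hypothetical dispatcher: import the source's etl module named by
    # "module_dir" and instantiate the class named by "class_name".
    module = importlib.import_module(
        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
    )
    etl_class = getattr(module, dataset["class_name"])
    etl = etl_class()
    etl.extract()
    etl.transform()
    etl.load()

# Example: run only the new GeoCorr urban/rural source.
run_dataset({"name": "geocorr", "module_dir": "geocorr", "class_name": "GeoCorrETL"})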
@@ -80,6 +80,9 @@ class ScoreETL(ExtractTransformLoad):
        self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"

        # Urban Rural Map
        self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"

        # dataframes
        self.df: pd.DataFrame
        self.ejscreen_df: pd.DataFrame
@@ -91,6 +94,7 @@ class ScoreETL(ExtractTransformLoad):
        self.cdc_life_expectancy_df: pd.DataFrame
        self.doe_energy_burden_df: pd.DataFrame
        self.national_risk_index_df: pd.DataFrame
        self.geocorr_urban_rural_df: pd.DataFrame

    def data_sets(self) -> list:
        # Define a named tuple that will be used for each data set input.
@@ -197,6 +201,11 @@ class ScoreETL(ExtractTransformLoad):
                renamed_field=self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
                bucket=None,
            ),
            DataSet(
                input_field=self.URBAN_HERUISTIC_FIELD_NAME,
                renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
                bucket=None,
            ),
            # The following data sets have buckets, because they're used in Score C
            DataSet(
                input_field="CANCER",
@@ -386,6 +395,16 @@ class ScoreETL(ExtractTransformLoad):
            low_memory=False,
        )

        # Load GeoCorr Urban Rural Map
        geocorr_urban_rural_csv = (
            self.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
        )
        self.geocorr_urban_rural_df = pd.read_csv(
            geocorr_urban_rural_csv,
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )

    def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
        logger.info("Joining Census Block Group dataframes")
        census_block_group_df = functools.reduce(
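A note on the `dtype={self.GEOID_TRACT_FIELD_NAME: "string"}` argument above: tract GEOIDs can begin with a zero, and pandas' default type inference would read them as integers and drop that leading digit, which breaks later joins. A small standalone illustration (the inline CSV is made up for the demo):

import io

import pandas as pd

csv_text = "GEOID10_TRACT,urban_heuristic_flag\n01001020200,1\n"

# Default inference parses the GEOID as an integer: the leading zero is lost.
inferred = pd.read_csv(io.StringIO(csv_text))
print(inferred["GEOID10_TRACT"].iloc[0])  # 1001020200

# Forcing a string dtype keeps the full 11-character tract ID intact.
as_string = pd.read_csv(io.StringIO(csv_text), dtype={"GEOID10_TRACT": "string"})
print(as_string["GEOID10_TRACT"].iloc[0])  # 01001020200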
@@ -619,6 +638,15 @@ class ScoreETL(ExtractTransformLoad):
        df["Score G"] = df["Score G (communities)"].astype(int)
        df["Score G (percentile)"] = df["Score G"]

        df["Score H (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        ) | (
            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.40)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        )
        df["Score H"] = df["Score H (communities)"].astype(int)

        df["Score I (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
@@ -629,20 +657,10 @@ class ScoreETL(ExtractTransformLoad):
        df["Score I"] = df["Score I (communities)"].astype(int)
        df["Score I (percentile)"] = df["Score I"]

        df["Score H (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        ) | (
            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.40)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
        )
        df["Score H"] = df["Score H (communities)"].astype(int)

        df["NMTC (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
        ) | (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)

        df["Score K (communities)"] = (
            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold_2)
@@ -673,6 +691,7 @@ class ScoreETL(ExtractTransformLoad):
            self.cdc_places_df,
            self.cdc_life_expectancy_df,
            self.doe_energy_burden_df,
            self.geocorr_urban_rural_df,
        ]
        census_tract_df = self._join_tract_dfs(census_tract_dfs)
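Adding `self.geocorr_urban_rural_df` to `census_tract_dfs` means the urban/rural columns ride along when the tract-level frames are merged. `_join_tract_dfs` itself is not shown in this diff; the sketch below is only an illustrative re-implementation of that kind of reduce-style merge on the tract GEOID, mirroring the `functools.reduce` pattern visible in `_join_cbg_dfs` above (the `how="outer"` choice is an assumption).

import functools

import pandas as pd

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

def join_tract_dfs(census_tract_dfs: list) -> pd.DataFrame:
    # Fold the list of tract-level frames into one frame by repeatedly
    # merging pairs on the shared tract GEOID column.
    return functools.reduce(
        lambda df_left, df_right: pd.merge(
            df_left, df_right, on=GEOID_TRACT_FIELD_NAME, how="outer"
        ),
        census_tract_dfs,
    )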
data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py (new file, 70 lines)
@@ -0,0 +1,70 @@
import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import (
    get_module_logger,
    unzip_file_from_url,
)

logger = get_module_logger(__name__)


class GeoCorrETL(ExtractTransformLoad):
    def __init__(self):
        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"

        # Need to change hyperlink to S3
        self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
        self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
        self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"

        self.df: pd.DataFrame

    def extract(self) -> None:
        logger.info(
            "Starting to download 2MB GeoCorr Urban Rural Census Tract Map file."
        )
        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/geocorr_urban_rural.csv.zip",
            download_path=self.TMP_PATH,
            unzipped_file_path=self.TMP_PATH / "geocorr",
        )

        self.df = pd.read_csv(
            filepath_or_buffer=self.TMP_PATH
            / "geocorr"
            / "geocorr_urban_rural.csv",
            dtype={
                self.GEOCORR_GEOID_FIELD_NAME: "string",
            },
            low_memory=False,
        )

    def transform(self) -> None:
        logger.info("Starting GeoCorr Urban Rural Map transform")

        self.df.rename(
            columns={
                "urban_heuristic_flag": self.URBAN_HERUISTIC_FIELD_NAME,
            },
            inplace=True,
        )

        # TODO: port the remaining transform logic from the Jupyter notebook
        # once the source hyperlink is switched over to GeoCorr.

    def load(self) -> None:
        logger.info("Saving GeoCorr Urban Rural Map Data")

        # Create the output directory if it does not already exist.
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)

    def validate(self) -> None:
        logger.info("Validating GeoCorr Urban Rural Map Data")

        pass
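For reference, the new class follows the same extract/transform/load shape as the other sources, so it can be exercised by hand. A minimal sketch (running it for real downloads the roughly 2MB zip and writes `usa.csv` under the data directory):

from data_pipeline.etl.sources.geocorr.etl import GeoCorrETL

etl = GeoCorrETL()
etl.extract()    # download and unzip geocorr_urban_rural.csv into TMP_PATH
etl.transform()  # rename urban_heuristic_flag to "Urban Heuristic Flag"
etl.load()       # write dataset/geocorr/usa.csv, which ScoreETL reads
etl.validate()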
@@ -75,7 +75,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        # Reduce columns.
        # Note: normally we wait until writing to CSV for this step, but since the file is so huge,
        # move this up here for performance reasons.
        df_nri = df_nri[  # pylint: disable=unsubscriptable-object
            [self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, TRACT_COL]
        ]
@@ -71,6 +71,7 @@
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "URBAN_HEURISTIC_FIELD = \"Urban Heuristic Flag\"\n",
    "\n",
    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
@@ -124,6 +125,7 @@
    " \"Percent of individuals < 200% Federal Poverty Line\",\n",
    " \"Life expectancy (years)\",\n",
    " \"Energy burden\",\n",
    " URBAN_HEURISTIC_FIELD,\n",
    "]:\n",
    " print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
    " print(cejst_df[field].describe())\n",
@@ -230,7 +232,7 @@
    ")\n",
    "\n",
    "\n",
    "if len(merged_df) > 220335:\n",
    "if len(merged_df) > 220405:\n",
    " raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
    "\n",
    "merged_df.head()\n",
@@ -273,21 +275,16 @@
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score I\",\n",
    " priority_communities_field=\"Score I (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"NMTC\",\n",
    " priority_communities_field=\"NMTC (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"NMTC modified\",\n",
    " priority_communities_field=\"NMTC modified (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score F\",\n",
    " priority_communities_field=\"Score F (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score A\",\n",
    " priority_communities_field=\"Score A (top 25th percentile)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
@@ -308,6 +305,11 @@
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Score F\",\n",
    " priority_communities_field=\"Score F (communities)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
    " ),\n",
    " Index(\n",
    " method_name=\"Poverty\",\n",
    " priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
    " other_census_tract_fields_to_keep=[],\n",
@@ -365,6 +367,8 @@
    " summary_dict = {}\n",
    " summary_dict[COUNTRY_FIELD_NAME] = frame[COUNTRY_FIELD_NAME].unique()[0]\n",
    "\n",
    " summary_dict[\"Analysis grouped by\"] = geography_field\n",
    "\n",
    " if geography_field == COUNTRY_FIELD_NAME:\n",
    " summary_dict[GEOID_STATE_FIELD_NAME] = \"00\"\n",
    " summary_dict[\"Geography name\"] = \"(Entire USA)\"\n",
@@ -389,9 +393,12 @@
    " summary_dict[\"Geography name\"] = division_id\n",
    "\n",
    " total_cbgs_in_geography = len(frame)\n",
    " total_population_in_geography = frame[\n",
    " CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
    " ].sum()\n",
    " total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
    "\n",
    " if geography_field == URBAN_HEURISTIC_FIELD:\n",
    " urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",
    " summary_dict[\"Urban vs Rural\"] = \"Urban\" if urban_flag else \"Rural\"\n",
    " summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
    "\n",
    " for priority_communities_field in priority_communities_fields:\n",
    " summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
@@ -465,13 +472,24 @@
    " lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
    " )\n",
    "\n",
    " # Combine the three\n",
    " # Next, run the comparison by urban/rural\n",
    " urban_grouped_df = df.groupby(URBAN_HEURISTIC_FIELD)\n",
    "\n",
    " # Run the comparison function on the groups.\n",
    " urban_grouped_df = urban_grouped_df.progress_apply(\n",
    " lambda frame: calculate_state_comparison(\n",
    " frame, geography_field=URBAN_HEURISTIC_FIELD\n",
    " )\n",
    " )\n",
    "\n",
    " # Combine the five\n",
    " combined_df = pd.concat(\n",
    " [\n",
    " usa_distribution_df,\n",
    " state_distribution_df,\n",
    " region_distribution_df,\n",
    " division_distribution_df,\n",
    " urban_grouped_df,\n",
    " ]\n",
    " )\n",
    "\n",
@@ -565,15 +583,17 @@
    " priority_communities_fields=fields_to_analyze,\n",
    ")\n",
    "\n",
    "file_prefix = \"Priority CBGs – Different geographic groupings\"\n",
    "\n",
    "state_distribution_df.to_csv(\n",
    " path_or_buf=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.csv\",\n",
    " path_or_buf=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.csv\",\n",
    " na_rep=\"\",\n",
    " index=False,\n",
    ")\n",
    "\n",
    "write_state_distribution_excel(\n",
    " state_distribution_df=state_distribution_df,\n",
    " file_path=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.xlsx\",\n",
    " file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n",
    ")\n",
    "\n",
    "state_distribution_df.head()"
@@ -625,10 +645,10 @@
    "\n",
    " criteria_description_field_name = \"Description of criteria\"\n",
    " comparison_df[criteria_description_field_name] = comparison_df.apply(\n",
    " func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \" + \n",
    " f\"prioritized by {method_a_priority_census_block_groups_field} \" + \n",
    " f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \" + \n",
    " f\"prioritized by {method_b_priority_census_block_groups_field}\",\n",
    " func=lambda row: f\"CBGs that are {'not' if row[method_a_priority_census_block_groups_field] is False else ''} \"\n",
    " + f\"prioritized by {method_a_priority_census_block_groups_field} \"\n",
    " + f\"and are {'not' if row[method_b_priority_census_block_groups_field] is False else ''} \"\n",
    " + f\"prioritized by {method_b_priority_census_block_groups_field}\",\n",
    " axis=1,\n",
    " )\n",
    "\n",
@@ -636,7 +656,7 @@
    " new_column_order = [criteria_description_field_name] + [\n",
    " col for col in comparison_df.columns if col != criteria_description_field_name\n",
    " ]\n",
    " \n",
    "\n",
    " comparison_df = comparison_df[new_column_order]\n",
    "\n",
    " # Rename fields to reflect the mean aggregation\n",
@@ -763,6 +783,7 @@
    " \"Linguistic isolation (percent)\",\n",
    " \"Unemployed civilians (percent)\",\n",
    " \"Median household income in the past 12 months\",\n",
    " URBAN_HEURISTIC_FIELD,\n",
    "]\n",
    "\n",
    "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",
data/data-pipeline/data_pipeline/ipython/urban_vs_rural.ipynb (new file, 311 lines)
@@ -0,0 +1,311 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51412a14",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "from datetime import datetime\n",
    "import functools\n",
    "import itertools\n",
    "import os\n",
    "import pathlib\n",
    "import requests\n",
    "import string\n",
    "import sys\n",
    "import typing\n",
    "import zipfile\n",
    "\n",
    "import IPython\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pypandoc\n",
    "\n",
    "from tqdm.notebook import tqdm_notebook\n",
    "\n",
    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
    "if module_path not in sys.path:\n",
    " sys.path.append(module_path)\n",
    "\n",
    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
    "\n",
    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
    "tqdm_notebook.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3234c61",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
    "pd.options.display.float_format = \"{:.2f}\".format\n",
    "\n",
    "# Set some global parameters\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
    "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
    "\n",
    "## I (Vincent) created this manually locally. Will need to change potentially when putting into official ETL scripts\n",
    "GEOCORR_DATA_DIR = DATA_DIR / \"geocorr\"\n",
    "\n",
    "# Make the dirs if they don't exist\n",
    "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
    "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
    "\n",
    "# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
    "# and introducing the risk of misspelling the field name.)\n",
    "\n",
    "GEOID_FIELD_NAME = \"GEOID10\"\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "GEOID_CBG_FIELD_NAME = \"GEOID10_CBG\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
    "\n",
    "CEJST_SCORE_FIELD = \"cejst_score\"\n",
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
    "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
    "\n",
    "# Define some suffixes\n",
    "POPULATION_SUFFIX = \" (priority population)\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "376f5b2e",
   "metadata": {},
   "source": [
    "## Mapping Census Block Group to Urban and Rural Indicators using Geocorr Data\n",
    "\n",
    "The end result is a dataframe `urban_rural_map`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4147c081",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map = pd.read_csv(\n",
    " os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
    " encoding = \"ISO-8859-1\",\n",
    " skiprows=[1],\n",
    " dtype='str',\n",
    ")\n",
    "\n",
    "geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
    "geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
    "\n",
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78276a83",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.len().value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2890779",
   "metadata": {},
   "source": [
    "We want to see that the length of the derived Census Block Group is always 12 digits. Census Tracts are always 11 digits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd89f6c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
    " GEOID_TRACT_FIELD_NAME,\n",
    " 'ur',\n",
    " 'ua',\n",
    " 'cntyname',\n",
    " 'uaname',\n",
    " 'pop10',\n",
    " 'afact'\n",
    "]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e597d7e2",
   "metadata": {},
   "source": [
    "Checking Primary Key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29929046",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e4c0c3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d52761e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
    "total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
    "total_geo_population.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38225b78",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
    "geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
    "geocorr_urban_rural_with_total_pop_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41b9448a",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb4ddb9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e03d1e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d976cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
    "urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
    "urban_rural_map.reset_index(inplace=True)\n",
    "urban_rural_map['urban_heuristic_flag'] = 0\n",
    "mask = urban_rural_map['afact_U'] >= 0.5\n",
    "urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f3a0993",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.rename(\n",
    " columns={\n",
    " 'pop10_R': 'population_in_rural_areas',\n",
    " 'pop10_U': 'population_in_urban_areas',\n",
    " 'afact_R': 'perc_population_in_rural_areas',\n",
    " 'afact_U': 'perc_population_in_urban_areas',\n",
    " }, \n",
    " inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba10f07c",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56098d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "urban_rural_map.to_csv(\n",
    " path_or_buf=GEOCORR_DATA_DIR / \"urban_rural_map.csv\", na_rep=\"\", index=False\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
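The core of the notebook above is the urban heuristic: a tract counts as urban when at least half of its 2010 population falls in GeoCorr's urban ('U') rows. Below is a self-contained sketch of that same calculation on a couple of made-up rows; the populations are invented for illustration, and only the column names and the 0.5 threshold come from the notebook.

import pandas as pd

# Toy GeoCorr-style rows: tract ID, urban/rural code, 2010 population (illustrative values).
rows = pd.DataFrame(
    {
        "GEOID10_TRACT": ["01001020200", "01001020200", "36117020302"],
        "ur": ["U", "R", "R"],
        "pop10": [1500, 500, 900],
    }
)

# Population per tract and urban/rural code, plus the tract total.
by_tract_ur = rows.groupby(["GEOID10_TRACT", "ur"], as_index=False)["pop10"].sum()
totals = (
    rows.groupby("GEOID10_TRACT", as_index=False)["pop10"]
    .sum()
    .rename(columns={"pop10": "total_population"})
)
merged = by_tract_ur.merge(totals, on="GEOID10_TRACT")
merged["afact"] = merged["pop10"] / merged["total_population"]

# Pivot to one row per tract and flag tracts whose urban population share is at least 50%.
urban_rural_map = merged.pivot(index="GEOID10_TRACT", columns="ur", values=["pop10", "afact"])
urban_rural_map.columns = ["_".join(col) for col in urban_rural_map.columns]
urban_rural_map = urban_rural_map.reset_index()
urban_rural_map["urban_heuristic_flag"] = (
    urban_rural_map["afact_U"].fillna(0) >= 0.5
).astype(int)

print(urban_rural_map[["GEOID10_TRACT", "urban_heuristic_flag"]])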