Adding persistent poverty tracts (#738)

* persistent poverty working

* fixing left-padding

* running black and adding persistent poverty to comp tool

* fixing bug

* running black and fixing linter

* fixing linter

* fixing linter error
Lucas Merrill Brown 2021-09-22 16:57:08 -05:00 committed by GitHub
parent d1ced6d584
commit b1a4d26be8
15 changed files with 518 additions and 201 deletions


@ -22,8 +22,9 @@ class ExtractTransformLoad:
FILES_PATH: Path = settings.APP_ROOT / "files"
GEOID_FIELD_NAME: str = "GEOID10"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
# TODO: investigate. Census says there are only 217,740 CBGs in the US.
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
EXPECTED_MAX_CENSUS_TRACTS: int = 73076
def get_yaml_config(self) -> None:
"""Reads the YAML configuration file for the dataset and stores


@ -64,6 +64,11 @@ DATASET_LIST = [
"module_dir": "geocorr",
"class_name": "GeoCorrETL",
},
{
"name": "persistent_poverty",
"module_dir": "persistent_poverty",
"class_name": "PersistentPovertyETL",
},
]
CENSUS_INFO = {
"name": "census",


@ -83,6 +83,9 @@ class ScoreETL(ExtractTransformLoad):
# Urban Rural Map
self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
# Persistent poverty
self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
# dataframes
self.df: pd.DataFrame
self.ejscreen_df: pd.DataFrame
@ -95,6 +98,7 @@ class ScoreETL(ExtractTransformLoad):
self.doe_energy_burden_df: pd.DataFrame
self.national_risk_index_df: pd.DataFrame
self.geocorr_urban_rural_df: pd.DataFrame
self.persistent_poverty_df: pd.DataFrame
def data_sets(self) -> list:
# Define a named tuple that will be used for each data set input.
@ -206,6 +210,11 @@ class ScoreETL(ExtractTransformLoad):
renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.PERSISTENT_POVERTY_FIELD,
renamed_field=self.PERSISTENT_POVERTY_FIELD,
bucket=None,
),
# The following data sets have buckets, because they're used in Score C
DataSet(
input_field="CANCER",
@ -405,6 +414,16 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load persistent poverty
persistent_poverty_csv = (
self.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv"
)
self.persistent_poverty_df = pd.read_csv(
persistent_poverty_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Block Group dataframes")
census_block_group_df = functools.reduce(
@ -692,6 +711,7 @@ class ScoreETL(ExtractTransformLoad):
self.cdc_life_expectancy_df,
self.doe_energy_burden_df,
self.geocorr_urban_rural_df,
self.persistent_poverty_df,
]
census_tract_df = self._join_tract_dfs(census_tract_dfs)
@ -743,7 +763,11 @@ class ScoreETL(ExtractTransformLoad):
# TODO do this at the same time as calculating percentiles in future refactor
for data_set in data_sets:
# Skip GEOID_FIELD_NAME, because it's a string.
if data_set.renamed_field == self.GEOID_FIELD_NAME:
# Skip `PERSISTENT_POVERTY_FIELD` because it's a straight pass-through.
if data_set.renamed_field in (
self.GEOID_FIELD_NAME,
self.PERSISTENT_POVERTY_FIELD,
):
continue
df[data_set.renamed_field] = pd.to_numeric(


@ -0,0 +1,174 @@
import functools
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import (
get_module_logger,
unzip_file_from_url,
)
logger = get_module_logger(__name__)
class PersistentPovertyETL(ExtractTransformLoad):
"""Persistent poverty data.
Loaded from `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTDB.htm`.
Codebook: `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTBDDload/Dfiles/codebooks.pdf`.
"""
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
# Need to change hyperlink to S3
# self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/persistent_poverty_urban_rural.csv.zip"
self.GEOID_TRACT_INPUT_FIELD_NAME_1 = "TRTID10"
self.GEOID_TRACT_INPUT_FIELD_NAME_2 = "tractid"
# self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.POVERTY_PREFIX = "Individuals in Poverty (percent)"
self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
f"{self.POVERTY_PREFIX} (1990)",
f"{self.POVERTY_PREFIX} (2000)",
f"{self.POVERTY_PREFIX} (2010)",
self.PERSISTENT_POVERTY_FIELD,
]
self.df: pd.DataFrame
def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
df = functools.reduce(
lambda df_a, df_b: pd.merge(
left=df_a,
right=df_b,
# All data frames will now have this field for tract.
on=self.GEOID_TRACT_FIELD_NAME,
how="outer",
),
dfs,
)
# Left-pad the tracts with 0s
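# For example, a tract ID parsed as the number 1001020200 has lost Alabama's leading zero; padding restores "01001020200".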
expected_length_of_census_tract_field = 11
df[self.GEOID_TRACT_FIELD_NAME] = (
df[self.GEOID_TRACT_FIELD_NAME]
.astype(str)
.apply(lambda x: x.zfill(expected_length_of_census_tract_field))
)
# Sanity check the join.
if len(df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()) != 1:
raise ValueError(
f"One of the input CSVs uses {self.GEOID_TRACT_FIELD_NAME} with a different length."
)
if len(df) > self.EXPECTED_MAX_CENSUS_TRACTS:
raise ValueError(f"Too many rows in the join: {len(df)}")
return df
def extract(self) -> None:
logger.info("Starting to download 86MB persistent poverty file.")
unzipped_file_path = self.TMP_PATH / "persistent_poverty"
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/LTDB_Std_All_Sample.zip",
download_path=self.TMP_PATH,
unzipped_file_path=unzipped_file_path,
)
file_names = [
"ltdb_std_1990_sample.csv",
"ltdb_std_2000_sample.csv",
"ltdb_std_2010_sample.csv",
]
temporary_input_dfs = []
for file_name in file_names:
logger.info(f"Reading {file_name}")
temporary_input_df = pd.read_csv(
filepath_or_buffer=unzipped_file_path
/ f"ltdb_std_all_sample/{file_name}",
dtype={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",
},
low_memory=False,
encoding="latin1",
)
# Some CSVs have self.GEOID_TRACT_INPUT_FIELD_NAME_1 as the name of the tract field,
# and some have self.GEOID_TRACT_INPUT_FIELD_NAME_2. Rename them both to the same tract name.
temporary_input_df.rename(
columns={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: self.GEOID_TRACT_FIELD_NAME,
self.GEOID_TRACT_INPUT_FIELD_NAME_2: self.GEOID_TRACT_FIELD_NAME,
},
inplace=True,
# Ignore errors b/c of the different field names in different CSVs.
errors="ignore",
)
temporary_input_dfs.append(temporary_input_df)
self.df = self._join_input_dfs(temporary_input_dfs)
def transform(self) -> None:
logger.info("Starting persistent poverty transform")
transformed_df = self.df
# Note: the fields are defined as following.
# dpovXX Description: persons for whom poverty status is determined
# npovXX Description: persons in poverty
transformed_df[f"{self.POVERTY_PREFIX} (1990)"] = (
transformed_df["NPOV90"] / transformed_df["DPOV90"]
)
transformed_df[f"{self.POVERTY_PREFIX} (2000)"] = (
transformed_df["NPOV00"] / transformed_df["DPOV00"]
)
# Note: for 2010, they use ACS data ending in 2012 that has 2010 as its midpoint year.
transformed_df[f"{self.POVERTY_PREFIX} (2010)"] = (
transformed_df["npov12"] / transformed_df["dpov12"]
)
poverty_threshold = 0.2
transformed_df[self.PERSISTENT_POVERTY_FIELD] = (
(
transformed_df[f"{self.POVERTY_PREFIX} (1990)"]
>= poverty_threshold
)
& (
transformed_df[f"{self.POVERTY_PREFIX} (2000)"]
>= poverty_threshold
)
& (
transformed_df[f"{self.POVERTY_PREFIX} (2010)"]
>= poverty_threshold
)
)
self.df = transformed_df
def load(self) -> None:
logger.info("Saving persistent poverty data.")
# Create the output directory if it doesn't exist.
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)
def validate(self) -> None:
logger.info("Validating persistent poverty data.")
pass
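In sum, a tract is flagged as persistently poor when its poverty rate is at or above the 20% threshold in 1990, in 2000, and in the ACS sample ending in 2012 (midpoint 2010). A toy illustration of the flag logic, using the column names above with made-up rates:

    import pandas as pd

    PREFIX = "Individuals in Poverty (percent)"
    toy_df = pd.DataFrame(
        {
            f"{PREFIX} (1990)": [0.25, 0.30, 0.15],
            f"{PREFIX} (2000)": [0.22, 0.19, 0.40],
            f"{PREFIX} (2010)": [0.21, 0.35, 0.50],
        }
    )
    threshold = 0.2
    toy_df["Persistent Poverty Census Tract"] = (
        (toy_df[f"{PREFIX} (1990)"] >= threshold)
        & (toy_df[f"{PREFIX} (2000)"] >= threshold)
        & (toy_df[f"{PREFIX} (2010)"] >= threshold)
    )
    # Only the first row qualifies: the second dips to 19% in 2000,
    # and the third sat at 15% in 1990.
    print(toy_df["Persistent Poverty Census Tract"].tolist())  # [True, False, False]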


@ -36,12 +36,8 @@
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"ACS_YEAR = \"2019\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
"OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
@ -52,12 +48,12 @@
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
@ -163,10 +159,10 @@
"outputs": [],
"source": [
"acs_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
@ -292,9 +288,7 @@
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" acs_df, on=\"GEOID10\", how=\"left\"\n",
" )"
"merged_df = cbg_usa_df.merge(acs_df, on=\"GEOID10\", how=\"left\")"
]
},
{


@ -35,12 +35,8 @@
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
@ -51,12 +47,12 @@
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
@ -162,10 +158,10 @@
"outputs": [],
"source": [
"ejscreen_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"ID\": \"string\"},\n",
" low_memory=False,\n",
" )"
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"ID\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
@ -176,9 +172,9 @@
"outputs": [],
"source": [
"ejscreen_df.rename(\n",
" columns={\"ID\": \"GEOID10\"},\n",
" inplace=True,\n",
" )"
" columns={\"ID\": \"GEOID10\"},\n",
" inplace=True,\n",
")"
]
},
{
@ -458,9 +454,7 @@
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" ejscreen_df, on=\"GEOID10\", how=\"left\"\n",
" )"
"merged_df = cbg_usa_df.merge(ejscreen_df, on=\"GEOID10\", how=\"left\")"
]
},
{
@ -1092,9 +1086,7 @@
"id": "d1a7b71d",
"metadata": {},
"outputs": [],
"source": [
"\n"
]
"source": []
}
],
"metadata": {


@ -35,12 +35,8 @@
"source": [
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = (\n",
" DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
" )\n",
"CENSUS_USA_CSV = (\n",
" DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
" )"
"OUTPUT_PATH = DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
]
},
{
@ -51,12 +47,12 @@
"outputs": [],
"source": [
"cbg_usa_df = pd.read_csv(\n",
" CENSUS_USA_CSV,\n",
" names=['GEOID10'],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None\n",
" )"
" CENSUS_USA_CSV,\n",
" names=[\"GEOID10\"],\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" header=None,\n",
")"
]
},
{
@ -162,10 +158,10 @@
"outputs": [],
"source": [
"score_df = pd.read_csv(\n",
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
" )"
" OUTPUT_PATH / \"usa.csv\",\n",
" dtype={\"GEOID10\": \"string\"},\n",
" low_memory=False,\n",
")"
]
},
{
@ -381,9 +377,7 @@
"metadata": {},
"outputs": [],
"source": [
"merged_df = cbg_usa_df.merge(\n",
" score_df, on=\"GEOID10\", how=\"left\"\n",
" )"
"merged_df = cbg_usa_df.merge(score_df, on=\"GEOID10\", how=\"left\")"
]
},
{


@ -33,7 +33,9 @@
"source": [
"def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
" state_gdf = gpd.read_file(file_name)\n",
" state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n",
" state_repr = state_gdf.to_crs(\n",
" \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\"\n",
" )\n",
" state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n",
" state_merged_simplified = state_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
@ -67,9 +69,9 @@
"\n",
"def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
" # dissolve tracts by bucket\n",
" state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n",
" drop=True\n",
" )\n",
" state_attr = state_tracts[\n",
" [\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]\n",
" ].reset_index(drop=True)\n",
" state_dissolve = state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
" return state_dissolve\n",
"\n",
@ -91,10 +93,12 @@
" gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )\n",
" gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n",
" gdf_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\"\n",
" )\n",
"\n",
"\n",
"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets: int):\n",
" print(f\"Processing file {file_name}...\")\n",
" state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n",
" state_tracts = aggregate_to_tracts(state_merged_simplified)\n",
@ -115,7 +119,9 @@
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
"CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
"score_df = pd.read_csv(\n",
" CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False\n",
")"
]
},
{
@ -185,9 +191,9 @@
},
"outputs": [],
"source": [
"for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
"for file_name in CENSUS_GEOJSON_DIR.rglob(\"*.json\"):\n",
" state_gdf = gpd.read_file(file_name)\n",
" master_df = master_df.append(state_gdf)"
]
},
{
@ -672,7 +678,9 @@
},
"outputs": [],
"source": [
"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
"usa_merged_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\"\n",
")"
]
},
{
@ -684,8 +692,8 @@
"outputs": [],
"source": [
"usa_simplified = usa_merged[\n",
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
" ].reset_index(drop=True)"
" [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
"].reset_index(drop=True)"
]
},
{
@ -696,9 +704,7 @@
},
"outputs": [],
"source": [
"usa_simplified.rename(\n",
" columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
" )"
"usa_simplified.rename(columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True)"
]
},
{
@ -714,8 +720,8 @@
"outputs": [],
"source": [
"usa_cbg_compressed = gpd.GeoDataFrame(\n",
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
" usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
")"
]
},
{
@ -726,7 +732,9 @@
},
"outputs": [],
"source": [
"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
"usa_cbg_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\"\n",
")"
]
},
{
@ -764,8 +772,8 @@
"outputs": [],
"source": [
"tracts_compressed = gpd.GeoDataFrame(\n",
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
" usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
")"
]
},
{
@ -776,7 +784,9 @@
},
"outputs": [],
"source": [
"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
"tracts_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\"\n",
")"
]
},
{
@ -877,8 +887,8 @@
"outputs": [],
"source": [
"gdf_compressed = gpd.GeoDataFrame(\n",
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
" )"
" compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
")"
]
},
{
@ -917,7 +927,9 @@
},
"outputs": [],
"source": [
"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")"
"gdf_compressed.to_file(\n",
" CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\"\n",
")"
]
}
],


@ -39,7 +39,9 @@
"source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
"censusdata.printtable(\n",
" censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n",
")"
],
"outputs": [],
"metadata": {
@ -65,8 +67,8 @@
" year=ACS_YEAR,\n",
" geo=censusdata.censusgeo(\n",
" [\n",
" (\"state\", fips) \n",
" #, (\"county\", \"*\"), (\"block group\", \"*\")\n",
" (\"state\", fips)\n",
" # , (\"county\", \"*\"), (\"block group\", \"*\")\n",
" ]\n",
" ),\n",
" var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n",
@ -75,7 +77,9 @@
"\n",
"df = pd.concat(dfs)\n",
"\n",
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(\n",
" func=fips_from_censusdata_censusgeo\n",
")\n",
"\n",
"df.head()"
],
@ -90,7 +94,13 @@
"source": [
"columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
"\n",
"df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
"df.rename(\n",
" columns={\n",
" \"GEOID10\": \"GEOID2\",\n",
" \"B19013_001E\": \"Median household income (State)\",\n",
" },\n",
" inplace=True,\n",
")\n",
"\n",
"# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
],


@ -20,7 +20,7 @@
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
" \n",
"\n",
"from data_pipeline.utils import unzip_file_from_url\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes"
],
@ -57,9 +57,16 @@
"cell_type": "code",
"execution_count": null,
"source": [
"counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
"counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
"counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
"counties_df = pd.read_csv(\n",
" CENSUS_COUNTIES_TXT,\n",
" sep=\"\\t\",\n",
" dtype={\"GEOID\": \"string\", \"USPS\": \"string\"},\n",
" low_memory=False,\n",
")\n",
"counties_df = counties_df[[\"USPS\", \"GEOID\", \"NAME\"]]\n",
"counties_df.rename(\n",
" columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True\n",
")\n",
"counties_df.head()"
],
"outputs": [],
@ -69,8 +76,17 @@
"cell_type": "code",
"execution_count": null,
"source": [
"states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
"states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
"states_df = pd.read_csv(\n",
" STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"}\n",
")\n",
"states_df.rename(\n",
" columns={\n",
" \"fips\": \"State Code\",\n",
" \"state_name\": \"State Name\",\n",
" \"state_abbreviation\": \"State Abbreviation\",\n",
" },\n",
" inplace=True,\n",
")\n",
"states_df.head()"
],
"outputs": [],
@ -80,7 +96,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n",
"county_state_merged = counties_df.join(states_df, rsuffix=\" Other\")\n",
"del county_state_merged[\"State Abbreviation Other\"]\n",
"county_state_merged.head()"
],
@ -102,7 +118,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n",
"score_county_state_merged = score_df.join(county_state_merged, rsuffix=\"_OTHER\")\n",
"del score_county_state_merged[\"GEOID_OTHER\"]\n",
"score_county_state_merged.head()"
],


@ -35,7 +35,7 @@
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()\n"
"tqdm_notebook.pandas()"
]
},
{
@ -89,14 +89,12 @@
" \"Poverty (Less than 200% of federal poverty line)\",\n",
" \"Percent individuals age 25 or over with less than high school degree\",\n",
" \"Unemployed civilians (percent)\",\n",
" \"Linguistic isolation (percent)\"\n",
" \"Linguistic isolation (percent)\",\n",
"]\n",
"\n",
"column_to_plot = columns_to_plot[0]\n",
"print(f\"Plotting {column_to_plot}\")\n",
"print(cejst_df[\n",
" column_to_plot\n",
"].hist())"
"print(cejst_df[column_to_plot].hist())"
]
},
{


@ -152,15 +152,17 @@
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
"\n",
"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
"calenviroscreen_data_path = (\n",
" DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
")\n",
"calenviroscreen_df = pd.read_csv(\n",
" calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
")\n",
"\n",
"# Convert priority community field to a bool.\n",
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
"calenviroscreen_df[\n",
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
"].astype(bool)\n",
"] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
"\n",
"calenviroscreen_df.head()"
]
@ -168,19 +170,33 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1bf54af1",
"metadata": {
"scrolled": true
},
"id": "df458f08",
"metadata": {},
"outputs": [],
"source": [
"# Load HUD data\n",
"hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n",
"hud_recap_df = pd.read_csv(\n",
" hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
"# Load persistent poverty data\n",
"persistent_poverty_path = (\n",
" DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
")\n",
"persistent_poverty_df = pd.read_csv(\n",
" persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
")\n",
"\n",
"hud_recap_df.head()"
"# Since \"Persistent Poverty Census Tract\" is labeled in both the score file (at the CBG level) and this tract file,\n",
"# rename this field so it's easy to access the tract-level scores directly.\n",
"\n",
"PERSISTENT_POVERTY_TRACT_LEVEL_FIELD = \"Persistent Poverty, Tract Level\"\n",
"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
"\n",
"persistent_poverty_df.rename(\n",
" columns={\n",
" PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
" },\n",
" inplace=True,\n",
" errors=\"raise\",\n",
")\n",
"\n",
"persistent_poverty_df"
]
},
{
@ -193,7 +209,7 @@
"outputs": [],
"source": [
"# Join all dataframes that use tracts\n",
"census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n",
"census_tract_dfs = [calenviroscreen_df, persistent_poverty_df]\n",
"\n",
"census_tract_df = functools.reduce(\n",
" lambda left, right: pd.merge(\n",
@ -231,7 +247,6 @@
" on=GEOID_TRACT_FIELD_NAME,\n",
")\n",
"\n",
"\n",
"if len(merged_df) > 220405:\n",
" raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
"\n",
@ -314,10 +329,20 @@
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Persistent Poverty (CBG)\",\n",
" priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"]\n",
"\n",
"census_tract_indices = [\n",
" Index(\n",
" method_name=\"Persistent Poverty\",\n",
" priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"CalEnviroScreen 4.0\",\n",
" priority_communities_field=\"calenviroscreen_priority_community\",\n",
" other_census_tract_fields_to_keep=[\n",
@ -325,11 +350,6 @@
" CALENVIROSCREEN_PERCENTILE_FIELD,\n",
" ],\n",
" ),\n",
" Index(\n",
" method_name=\"HUD RECAP\",\n",
" priority_communities_field=\"hud_recap_priority_community\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"]"
]
},
@ -354,7 +374,8 @@
"\n",
" # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n",
" df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
" df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
" df[priority_communities_field]\n",
" * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
" )\n",
"\n",
" def calculate_state_comparison(\n",
@ -393,7 +414,9 @@
" summary_dict[\"Geography name\"] = division_id\n",
"\n",
" total_cbgs_in_geography = len(frame)\n",
" total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
" total_population_in_geography = frame[\n",
" CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
" ].sum()\n",
"\n",
" if geography_field == URBAN_HEURISTIC_FIELD:\n",
" urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",
@ -401,9 +424,9 @@
" summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
"\n",
" for priority_communities_field in priority_communities_fields:\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
" summary_dict[\n",
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
" ].sum()\n",
" ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
"\n",
" summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n",
" f\"{priority_communities_field}\"\n",
@ -415,7 +438,9 @@
" / total_cbgs_in_geography\n",
" )\n",
"\n",
" summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
" summary_dict[\n",
" f\"{priority_communities_field} (percent population)\"\n",
" ] = (\n",
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
" / total_population_in_geography\n",
" )\n",
@ -461,7 +486,9 @@
"\n",
" # Run the comparison function on the groups.\n",
" region_distribution_df = region_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=\"region\"\n",
" )\n",
" )\n",
"\n",
" # Next, run the comparison by division\n",
@ -469,7 +496,9 @@
"\n",
" # Run the comparison function on the groups.\n",
" division_distribution_df = division_grouped_df.progress_apply(\n",
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
" lambda frame: calculate_state_comparison(\n",
" frame, geography_field=\"division\"\n",
" )\n",
" )\n",
"\n",
" # Next, run the comparison by urban/rural\n",
@ -524,7 +553,9 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
@ -539,9 +570,7 @@
"\n",
" # Special formatting for columns that capture the percent of population considered priority.\n",
" if \"(percent population)\" in column:\n",
" column_ranges = (\n",
" f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
" )\n",
" column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
"\n",
" # Add green to red conditional formatting.\n",
" worksheet.conditional_format(\n",
@ -654,7 +683,9 @@
"\n",
" # Put criteria description column first.\n",
" new_column_order = [criteria_description_field_name] + [\n",
" col for col in comparison_df.columns if col != criteria_description_field_name\n",
" col\n",
" for col in comparison_df.columns\n",
" if col != criteria_description_field_name\n",
" ]\n",
"\n",
" comparison_df = comparison_df[new_column_order]\n",
@ -700,12 +731,12 @@
" column_character = get_excel_column_name(column_index)\n",
"\n",
" # Set all columns to larger width\n",
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
" worksheet.set_column(\n",
" f\"{column_character}:{column_character}\", column_width\n",
" )\n",
"\n",
" # Add green to red conditional formatting.\n",
" column_ranges = (\n",
" f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
" )\n",
" column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
" worksheet.conditional_format(\n",
" column_ranges,\n",
" # Min: green, max: red.\n",
@ -718,7 +749,11 @@
"\n",
" # Special formatting for all percent columns\n",
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
" if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
" if (\n",
" \"percent \" in column\n",
" or \"(percent)\" in column\n",
" or \"Percent \" in column\n",
" ):\n",
" # Make these columns percentages.\n",
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
" worksheet.set_column(\n",
@ -756,9 +791,7 @@
" )\n",
"\n",
" # Write secondary comparison to CSV.\n",
" file_name_part = (\n",
" f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
" )\n",
" file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
" output_dir.mkdir(parents=True, exist_ok=True)\n",
" file_path = output_dir / (file_name_part + \".csv\")\n",
" file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n",
@ -770,7 +803,8 @@
" )\n",
"\n",
" write_cbg_score_comparison_excel(\n",
" cbg_score_comparison_df=cbg_score_comparison_df, file_path=file_path_xlsx\n",
" cbg_score_comparison_df=cbg_score_comparison_df,\n",
" file_path=file_path_xlsx,\n",
" )\n",
"\n",
"\n",
@ -801,11 +835,15 @@
"cell_type": "code",
"execution_count": null,
"id": "eeb9699d",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def write_markdown_and_docx_content(\n",
" markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n",
" markdown_content: str,\n",
" file_dir: pathlib.PosixPath,\n",
" file_name_without_extension: str,\n",
") -> pathlib.PosixPath:\n",
" \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n",
" # Set the file paths for both files.\n",
@ -837,7 +875,9 @@
"\n",
" # List of all states/territories in their FIPS codes:\n",
" state_ids = sorted(df[state_field].unique())\n",
" state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
" state_names = \", \".join(\n",
" [us.states.lookup(state_id).name for state_id in state_ids]\n",
" )\n",
"\n",
" # Create markdown content for comparisons.\n",
" markdown_content = f\"\"\"\n",
@ -851,11 +891,16 @@
"\n",
"\"\"\"\n",
"\n",
" for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n",
" for (index1, index2) in itertools.combinations(\n",
" census_block_group_indices, 2\n",
" ):\n",
" # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n",
" count_df = (\n",
" df.groupby(\n",
" [index1.priority_communities_field, index2.priority_communities_field]\n",
" [\n",
" index1.priority_communities_field,\n",
" index2.priority_communities_field,\n",
" ]\n",
" )[GEOID_FIELD_NAME]\n",
" .count()\n",
" .reset_index(name=count_field_name)\n",
@ -887,16 +932,24 @@
"\n",
" # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n",
" true_true_cbgs = (\n",
" true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n",
" true_true_cbgs_series.iloc[0]\n",
" if len(true_true_cbgs_series) > 0\n",
" else 0\n",
" )\n",
" true_false_cbgs = (\n",
" true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n",
" true_false_cbgs_series.iloc[0]\n",
" if len(true_false_cbgs_series) > 0\n",
" else 0\n",
" )\n",
" false_true_cbgs = (\n",
" false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n",
" false_true_cbgs_series.iloc[0]\n",
" if len(false_true_cbgs_series) > 0\n",
" else 0\n",
" )\n",
" false_false_cbgs = (\n",
" false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n",
" false_false_cbgs_series.iloc[0]\n",
" if len(false_false_cbgs_series) > 0\n",
" else 0\n",
" )\n",
"\n",
" markdown_content += (\n",
@ -1088,15 +1141,20 @@
"\n",
" # Calculate comparison\n",
" # A comparison priority tract has at least one CBG that is a priority CBG.\n",
" df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n",
" df[\n",
" comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n",
" ] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
" if is_a_method_b_priority_tract\n",
" else None\n",
" )\n",
"\n",
" # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n",
" df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
" df[\n",
" comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n",
" ] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
" == 1\n",
" if is_a_method_b_priority_tract\n",
" else None\n",
" )\n",
@ -1115,7 +1173,8 @@
" df[\n",
" comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n",
" ] = (\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
" frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
" == 1\n",
" if not is_a_method_b_priority_tract\n",
" else None\n",
" )\n",
@ -1156,14 +1215,20 @@
"\n",
" # List of all states/territories in their FIPS codes:\n",
" state_ids = sorted(original_df[state_field].unique())\n",
" state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
" state_names = \", \".join(\n",
" [us.states.lookup(state_id).name for state_id in state_ids]\n",
" )\n",
"\n",
" # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n",
" # TODO: investigate why sums are sometimes series and sometimes scalar.\n",
" method_a_priority_cbgs = (\n",
" original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n",
" original_df.loc[:, method_a_priority_census_block_groups_field]\n",
" .sum()\n",
" .squeeze()\n",
" )\n",
" method_a_priority_cbgs_percent = (\n",
" f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
" )\n",
" method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
"\n",
" total_tracts_count = len(comparison_df)\n",
"\n",
@ -1185,7 +1250,9 @@
" .sum()\n",
" .squeeze()\n",
" )\n",
" method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
" method_a_tracts_count_percent = (\n",
" f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
" )\n",
"\n",
" # Method A priority community stats\n",
" method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
@ -1316,7 +1383,8 @@
"\n",
" # Write comparison to CSV.\n",
" file_path = (\n",
" output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
" output_dir\n",
" / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
" )\n",
" comparison_df.to_csv(\n",
" path_or_buf=file_path,\n",


@ -101,17 +101,25 @@
"outputs": [],
"source": [
"geocorr_urban_rural_map = pd.read_csv(\n",
" os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
" encoding = \"ISO-8859-1\",\n",
" os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n",
" encoding=\"ISO-8859-1\",\n",
" skiprows=[1],\n",
" dtype='str',\n",
" dtype=\"str\",\n",
")\n",
"\n",
"geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
"geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
"geocorr_urban_rural_map[\"pop10\"] = pd.to_numeric(\n",
" geocorr_urban_rural_map[\"pop10\"]\n",
")\n",
"geocorr_urban_rural_map[\"afact\"] = pd.to_numeric(\n",
" geocorr_urban_rural_map[\"afact\"]\n",
")\n",
"\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n",
" geocorr_urban_rural_map[\"county\"] + geocorr_urban_rural_map[\"tract\"]\n",
") # + geocorr_urban_rural_map['bg']\n",
"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[\n",
" GEOID_TRACT_FIELD_NAME\n",
"].str.replace(\".\", \"\", regex=False)"
]
},
{
@ -139,15 +147,9 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
" GEOID_TRACT_FIELD_NAME,\n",
" 'ur',\n",
" 'ua',\n",
" 'cntyname',\n",
" 'uaname',\n",
" 'pop10',\n",
" 'afact'\n",
"]]"
"geocorr_urban_rural_map = geocorr_urban_rural_map[\n",
" [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\", \"cntyname\", \"uaname\", \"pop10\", \"afact\"]\n",
"]"
]
},
{
@ -165,7 +167,9 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
"geocorr_urban_rural_map.groupby(\n",
" [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\"], dropna=False\n",
").size().sort_values(ascending=False)"
]
},
{
@ -175,7 +179,9 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
"geocorr_urban_rural_map.loc[\n",
" geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == \"36117020302\"\n",
"]"
]
},
{
@ -185,8 +191,12 @@
"metadata": {},
"outputs": [],
"source": [
"total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
"total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
"total_geo_population = (\n",
" geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME)\n",
" .agg({\"pop10\": np.sum})\n",
" .reset_index()\n",
")\n",
"total_geo_population.rename(columns={\"pop10\": \"total_population\"}, inplace=True)\n",
"total_geo_population.head()"
]
},
@ -197,8 +207,16 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
"geocorr_urban_rural_with_total_pop_map = (\n",
" geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, \"ur\"])\n",
" .agg({\"pop10\": np.sum})\n",
" .reset_index()\n",
")\n",
"geocorr_urban_rural_with_total_pop_map = (\n",
" geocorr_urban_rural_with_total_pop_map.merge(\n",
" total_geo_population, how=\"inner\", on=GEOID_TRACT_FIELD_NAME\n",
" )\n",
")\n",
"geocorr_urban_rural_with_total_pop_map.head()"
]
},
@ -209,7 +227,10 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
"geocorr_urban_rural_with_total_pop_map[\"afact\"] = (\n",
" geocorr_urban_rural_with_total_pop_map[\"pop10\"]\n",
" / geocorr_urban_rural_with_total_pop_map[\"total_population\"]\n",
")"
]
},
{
@ -229,7 +250,10 @@
"metadata": {},
"outputs": [],
"source": [
"geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
"geocorr_urban_rural_with_total_pop_map.loc[\n",
" geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME]\n",
" == \"01001020200\"\n",
"]"
]
},
{
@ -239,12 +263,16 @@
"metadata": {},
"outputs": [],
"source": [
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
"urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n",
" index=GEOID_TRACT_FIELD_NAME, columns=\"ur\", values=[\"pop10\", \"afact\"]\n",
")\n",
"urban_rural_map.columns = [\n",
" \"_\".join(col).strip() for col in urban_rural_map.columns.values\n",
"]\n",
"urban_rural_map.reset_index(inplace=True)\n",
"urban_rural_map['urban_heuristic_flag'] = 0\n",
"mask = urban_rural_map['afact_U'] >= 0.5\n",
"urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
"urban_rural_map[\"urban_heuristic_flag\"] = 0\n",
"mask = urban_rural_map[\"afact_U\"] >= 0.5\n",
"urban_rural_map.loc[mask, \"urban_heuristic_flag\"] = 1"
]
},
{
@ -256,12 +284,13 @@
"source": [
"urban_rural_map.rename(\n",
" columns={\n",
" 'pop10_R': 'population_in_rural_areas',\n",
" 'pop10_U': 'population_in_urban_areas',\n",
" 'afact_R': 'perc_population_in_rural_areas',\n",
" 'afact_U': 'perc_population_in_urban_areas',\n",
" }, \n",
" inplace=True)"
" \"pop10_R\": \"population_in_rural_areas\",\n",
" \"pop10_U\": \"population_in_urban_areas\",\n",
" \"afact_R\": \"perc_population_in_rural_areas\",\n",
" \"afact_U\": \"perc_population_in_urban_areas\",\n",
" },\n",
" inplace=True,\n",
")"
]
},
{