Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-02-22 09:41:26 -08:00
Adding persistent poverty tracts (#738)
* persistent poverty working
* fixing left-padding
* running black and adding persistent poverty to comp tool
* fixing bug
* running black and fixing linter
* fixing linter
* fixing linter error
This commit is contained in:
parent d1ced6d584
commit b1a4d26be8

15 changed files with 518 additions and 201 deletions
@@ -22,8 +22,9 @@ class ExtractTransformLoad:
     FILES_PATH: Path = settings.APP_ROOT / "files"
     GEOID_FIELD_NAME: str = "GEOID10"
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
-    # TODO: investigate. Census says there are only 217,740 CBGs in the US.
+    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might be from CBGs at different time periods.
     EXPECTED_MAX_CENSUS_BLOCK_GROUPS: int = 220405
+    EXPECTED_MAX_CENSUS_TRACTS: int = 73076
 
     def get_yaml_config(self) -> None:
         """Reads the YAML configuration file for the dataset and stores
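The new EXPECTED_MAX_CENSUS_TRACTS constant plays the same role as EXPECTED_MAX_CENSUS_BLOCK_GROUPS: an upper bound for sanity-checking joins. A minimal standalone sketch of that usage (the identical check appears in the new PersistentPovertyETL further down in this diff):

import pandas as pd

EXPECTED_MAX_CENSUS_TRACTS = 73076

def check_tract_count(df: pd.DataFrame) -> None:
    # Fail loudly if an outer join fanned out past the number of US tracts.
    if len(df) > EXPECTED_MAX_CENSUS_TRACTS:
        raise ValueError(f"Too many rows in the join: {len(df)}")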
@@ -64,6 +64,11 @@ DATASET_LIST = [
         "module_dir": "geocorr",
         "class_name": "GeoCorrETL",
     },
+    {
+        "name": "persistent_poverty",
+        "module_dir": "persistent_poverty",
+        "class_name": "PersistentPovertyETL",
+    },
 ]
 CENSUS_INFO = {
     "name": "census",
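Registering the new source in DATASET_LIST is what lets the pipeline discover and run it: each entry names a module directory and the ETL class inside it. A hedged sketch of that dispatch (the real runner lives elsewhere in data_pipeline and may differ; the module path below is an assumption):

import importlib

entry = {
    "name": "persistent_poverty",
    "module_dir": "persistent_poverty",
    "class_name": "PersistentPovertyETL",
}

# Assumed layout: data_pipeline/etl/sources/<module_dir>/etl.py
module = importlib.import_module(
    f"data_pipeline.etl.sources.{entry['module_dir']}.etl"
)
etl_class = getattr(module, entry["class_name"])
etl = etl_class()
etl.extract()
etl.transform()
etl.load()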
@@ -83,6 +83,9 @@ class ScoreETL(ExtractTransformLoad):
         # Urban Rural Map
         self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
 
+        # Persistent poverty
+        self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
+
         # dataframes
         self.df: pd.DataFrame
         self.ejscreen_df: pd.DataFrame

@@ -95,6 +98,7 @@ class ScoreETL(ExtractTransformLoad):
         self.doe_energy_burden_df: pd.DataFrame
         self.national_risk_index_df: pd.DataFrame
         self.geocorr_urban_rural_df: pd.DataFrame
+        self.persistent_poverty_df: pd.DataFrame
 
     def data_sets(self) -> list:
         # Define a named tuple that will be used for each data set input.

@@ -206,6 +210,11 @@ class ScoreETL(ExtractTransformLoad):
                 renamed_field=self.URBAN_HERUISTIC_FIELD_NAME,
                 bucket=None,
             ),
+            DataSet(
+                input_field=self.PERSISTENT_POVERTY_FIELD,
+                renamed_field=self.PERSISTENT_POVERTY_FIELD,
+                bucket=None,
+            ),
             # The following data sets have buckets, because they're used in Score C
             DataSet(
                 input_field="CANCER",

@@ -405,6 +414,16 @@ class ScoreETL(ExtractTransformLoad):
             low_memory=False,
         )
 
+        # Load persistent poverty
+        persistent_poverty_csv = (
+            self.DATA_PATH / "dataset" / "persistent_poverty" / "usa.csv"
+        )
+        self.persistent_poverty_df = pd.read_csv(
+            persistent_poverty_csv,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            low_memory=False,
+        )
+
     def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
         logger.info("Joining Census Block Group dataframes")
         census_block_group_df = functools.reduce(

@@ -692,6 +711,7 @@ class ScoreETL(ExtractTransformLoad):
             self.cdc_life_expectancy_df,
             self.doe_energy_burden_df,
             self.geocorr_urban_rural_df,
+            self.persistent_poverty_df,
         ]
         census_tract_df = self._join_tract_dfs(census_tract_dfs)
 

@@ -743,7 +763,11 @@ class ScoreETL(ExtractTransformLoad):
         # TODO do this at the same time as calculating percentiles in future refactor
         for data_set in data_sets:
             # Skip GEOID_FIELD_NAME, because it's a string.
-            if data_set.renamed_field == self.GEOID_FIELD_NAME:
+            # Skip `PERSISTENT_POVERTY_FIELD` because it's a straight pass-through.
+            if data_set.renamed_field in (
+                self.GEOID_FIELD_NAME,
+                self.PERSISTENT_POVERTY_FIELD,
+            ):
                 continue
 
             df[data_set.renamed_field] = pd.to_numeric(
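The widened skip condition matters because pd.to_numeric would otherwise coerce the boolean pass-through column. A toy illustration (made-up data) of the loop above:

import pandas as pd

df = pd.DataFrame(
    {
        "GEOID10": ["010010201001"],
        "Persistent Poverty Census Tract": [True],
        "CANCER": ["42.1"],
    }
)

# Fields that pass through untouched: the GEOID is a string and the
# persistent poverty flag is already a bool.
skip_fields = {"GEOID10", "Persistent Poverty Census Tract"}

for column in df.columns:
    if column in skip_fields:
        continue
    df[column] = pd.to_numeric(df[column], errors="coerce")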
@@ -0,0 +1,174 @@
+import functools
+import pandas as pd
+
+from data_pipeline.config import settings
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.utils import (
+    get_module_logger,
+    unzip_file_from_url,
+)
+
+logger = get_module_logger(__name__)
+
+
+class PersistentPovertyETL(ExtractTransformLoad):
+    """Persistent poverty data.
+
+    Loaded from `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTDB.htm`.
+
+    Codebook: `https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTBDDload/Dfiles/codebooks.pdf`.
+    """
+
+    def __init__(self):
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
+
+        # Need to change hyperlink to S3
+        # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/persistent_poverty_urban_rural.csv.zip"
+        self.GEOID_TRACT_INPUT_FIELD_NAME_1 = "TRTID10"
+        self.GEOID_TRACT_INPUT_FIELD_NAME_2 = "tractid"
+        # self.URBAN_HERUISTIC_FIELD_NAME = "Urban Heuristic Flag"
+
+        self.POVERTY_PREFIX = "Individuals in Poverty (percent)"
+        self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
+
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            f"{self.POVERTY_PREFIX} (1990)",
+            f"{self.POVERTY_PREFIX} (2000)",
+            f"{self.POVERTY_PREFIX} (2010)",
+            self.PERSISTENT_POVERTY_FIELD,
+        ]
+
+        self.df: pd.DataFrame
+
+    def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
+        df = functools.reduce(
+            lambda df_a, df_b: pd.merge(
+                left=df_a,
+                right=df_b,
+                # All data frames will now have this field for tract.
+                on=self.GEOID_TRACT_FIELD_NAME,
+                how="outer",
+            ),
+            dfs,
+        )
+
+        # Left-pad the tracts with 0s
+        expected_length_of_census_tract_field = 11
+        df[self.GEOID_TRACT_FIELD_NAME] = (
+            df[self.GEOID_TRACT_FIELD_NAME]
+            .astype(str)
+            .apply(lambda x: x.zfill(expected_length_of_census_tract_field))
+        )
+
+        # Sanity check the join.
+        if len(df[self.GEOID_TRACT_FIELD_NAME].str.len().unique()) != 1:
+            raise ValueError(
+                f"One of the input CSVs uses {self.GEOID_TRACT_FIELD_NAME} with a different length."
+            )
+
+        if len(df) > self.EXPECTED_MAX_CENSUS_TRACTS:
+            raise ValueError(f"Too many rows in the join: {len(df)}")
+
+        return df
+
+    def extract(self) -> None:
+        logger.info("Starting to download 86MB persistent poverty file.")
+
+        unzipped_file_path = self.TMP_PATH / "persistent_poverty"
+
+        unzip_file_from_url(
+            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/LTDB_Std_All_Sample.zip",
+            download_path=self.TMP_PATH,
+            unzipped_file_path=unzipped_file_path,
+        )
+
+        file_names = [
+            "ltdb_std_1990_sample.csv",
+            "ltdb_std_2000_sample.csv",
+            "ltdb_std_2010_sample.csv",
+        ]
+
+        temporary_input_dfs = []
+
+        for file_name in file_names:
+            print(file_name)
+            temporary_input_df = pd.read_csv(
+                filepath_or_buffer=unzipped_file_path
+                / f"ltdb_std_all_sample/{file_name}",
+                dtype={
+                    self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
+                    self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",
+                },
+                low_memory=False,
+                encoding="latin1",
+            )
+
+            # Some CSVs have self.GEOID_TRACT_INPUT_FIELD_NAME_1 as the name of the tract field,
+            # and some have self.GEOID_TRACT_INPUT_FIELD_NAME_2. Rename them both to the same tract name.
+            temporary_input_df.rename(
+                columns={
+                    self.GEOID_TRACT_INPUT_FIELD_NAME_1: self.GEOID_TRACT_FIELD_NAME,
+                    self.GEOID_TRACT_INPUT_FIELD_NAME_2: self.GEOID_TRACT_FIELD_NAME,
+                },
+                inplace=True,
+                # Ignore errors b/c of the different field names in different CSVs.
+                errors="ignore",
+            )
+
+            temporary_input_dfs.append(temporary_input_df)
+
+        self.df = self._join_input_dfs(temporary_input_dfs)
+
+    def transform(self) -> None:
+        logger.info("Starting persistent poverty transform")
+        transformed_df = self.df
+
+        # Note: the fields are defined as following.
+        # dpovXX Description: persons for whom poverty status is determined
+        # npovXX Description: persons in poverty
+        transformed_df[f"{self.POVERTY_PREFIX} (1990)"] = (
+            transformed_df["NPOV90"] / transformed_df["DPOV90"]
+        )
+        transformed_df[f"{self.POVERTY_PREFIX} (2000)"] = (
+            transformed_df["NPOV00"] / transformed_df["DPOV00"]
+        )
+        # Note: for 2010, they use ACS data ending in 2012 that has 2010 as its midpoint year.
+        transformed_df[f"{self.POVERTY_PREFIX} (2010)"] = (
+            transformed_df["npov12"] / transformed_df["dpov12"]
+        )
+
+        poverty_threshold = 0.2
+
+        transformed_df[self.PERSISTENT_POVERTY_FIELD] = (
+            (
+                transformed_df[f"{self.POVERTY_PREFIX} (1990)"]
+                >= poverty_threshold
+            )
+            & (
+                transformed_df[f"{self.POVERTY_PREFIX} (2000)"]
+                >= poverty_threshold
+            )
+            & (
+                transformed_df[f"{self.POVERTY_PREFIX} (2010)"]
+                >= poverty_threshold
+            )
+        )
+
+        self.df = transformed_df
+
+    def load(self) -> None:
+        logger.info("Saving persistent poverty data.")
+
+        # mkdir census
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
+        )
+
+    def validate(self) -> None:
+        logger.info("Validating persistent poverty data.")
+
+        pass
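The heart of the transform above is the threshold rule: a tract counts as persistently poor only when its poverty rate is at or above 20% in 1990, 2000, and 2010 (the 2010 figure coming from ACS data with 2010 as its midpoint year). A self-contained toy version of that flag computation, with made-up rates:

import pandas as pd

prefix = "Individuals in Poverty (percent)"
df = pd.DataFrame(
    {
        f"{prefix} (1990)": [0.25, 0.30, 0.15],
        f"{prefix} (2000)": [0.22, 0.18, 0.25],
        f"{prefix} (2010)": [0.21, 0.35, 0.40],
    }
)

poverty_threshold = 0.2
df["Persistent Poverty Census Tract"] = (
    (df[f"{prefix} (1990)"] >= poverty_threshold)
    & (df[f"{prefix} (2000)"] >= poverty_threshold)
    & (df[f"{prefix} (2010)"] >= poverty_threshold)
)
# Only the first row qualifies: the second dips below 20% in 2000, the third in 1990.
print(df["Persistent Poverty Census Tract"].tolist())  # [True, False, False]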
@@ -36,12 +36,8 @@
 "DATA_PATH = Path.cwd().parent / \"data\"\n",
 "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
 "ACS_YEAR = \"2019\"\n",
-"OUTPUT_PATH = (\n",
-"    DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
-"    )\n",
-"CENSUS_USA_CSV = (\n",
-"    DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
-"    )"
+"OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
+"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
 ]
 },
 {

@@ -52,12 +48,12 @@
 "outputs": [],
 "source": [
 "cbg_usa_df = pd.read_csv(\n",
-"    CENSUS_USA_CSV,\n",
-"    names=['GEOID10'],\n",
-"    dtype={\"GEOID10\": \"string\"},\n",
-"    low_memory=False,\n",
-"    header=None\n",
-"    )"
+"    CENSUS_USA_CSV,\n",
+"    names=[\"GEOID10\"],\n",
+"    dtype={\"GEOID10\": \"string\"},\n",
+"    low_memory=False,\n",
+"    header=None,\n",
+")"
 ]
 },
 {

@@ -163,10 +159,10 @@
 "outputs": [],
 "source": [
 "acs_df = pd.read_csv(\n",
-"    OUTPUT_PATH / \"usa.csv\",\n",
-"    dtype={\"GEOID10\": \"string\"},\n",
-"    low_memory=False,\n",
-"    )"
+"    OUTPUT_PATH / \"usa.csv\",\n",
+"    dtype={\"GEOID10\": \"string\"},\n",
+"    low_memory=False,\n",
+")"
 ]
 },
 {

@@ -292,9 +288,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"merged_df = cbg_usa_df.merge(\n",
-"    acs_df, on=\"GEOID10\", how=\"left\"\n",
-"    )"
+"merged_df = cbg_usa_df.merge(acs_df, on=\"GEOID10\", how=\"left\")"
 ]
 },
 {
@@ -35,12 +35,8 @@
 "source": [
 "DATA_PATH = Path.cwd().parent / \"data\"\n",
 "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
-"OUTPUT_PATH = (\n",
-"    DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
-"    )\n",
-"CENSUS_USA_CSV = (\n",
-"    DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
-"    )"
+"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2019\"\n",
+"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
 ]
 },
 {

@@ -51,12 +47,12 @@
 "outputs": [],
 "source": [
 "cbg_usa_df = pd.read_csv(\n",
-"    CENSUS_USA_CSV,\n",
-"    names=['GEOID10'],\n",
-"    dtype={\"GEOID10\": \"string\"},\n",
-"    low_memory=False,\n",
-"    header=None\n",
-"    )"
+"    CENSUS_USA_CSV,\n",
+"    names=[\"GEOID10\"],\n",
+"    dtype={\"GEOID10\": \"string\"},\n",
+"    low_memory=False,\n",
+"    header=None,\n",
+")"
 ]
 },
 {

@@ -162,10 +158,10 @@
 "outputs": [],
 "source": [
 "ejscreen_df = pd.read_csv(\n",
-"    OUTPUT_PATH / \"usa.csv\",\n",
-"    dtype={\"ID\": \"string\"},\n",
-"    low_memory=False,\n",
-"    )"
+"    OUTPUT_PATH / \"usa.csv\",\n",
+"    dtype={\"ID\": \"string\"},\n",
+"    low_memory=False,\n",
+")"
 ]
 },
 {

@@ -176,9 +172,9 @@
 "outputs": [],
 "source": [
 "ejscreen_df.rename(\n",
-"    columns={\"ID\": \"GEOID10\"},\n",
-"    inplace=True,\n",
-"    )"
+"    columns={\"ID\": \"GEOID10\"},\n",
+"    inplace=True,\n",
+")"
 ]
 },
 {

@@ -458,9 +454,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"merged_df = cbg_usa_df.merge(\n",
-"    ejscreen_df, on=\"GEOID10\", how=\"left\"\n",
-"    )"
+"merged_df = cbg_usa_df.merge(ejscreen_df, on=\"GEOID10\", how=\"left\")"
 ]
 },
 {

@@ -1092,9 +1086,7 @@
 "id": "d1a7b71d",
 "metadata": {},
 "outputs": [],
-"source": [
-"\n"
-]
+"source": []
 }
 ],
 "metadata": {
@@ -35,12 +35,8 @@
 "source": [
 "DATA_PATH = Path.cwd().parent / \"data\"\n",
 "TMP_PATH: Path = DATA_PATH / \"tmp\"\n",
-"OUTPUT_PATH = (\n",
-"    DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
-"    )\n",
-"CENSUS_USA_CSV = (\n",
-"    DATA_PATH / \"census\" / \"csv\" / \"us.csv\"\n",
-"    )"
+"OUTPUT_PATH = DATA_PATH / \"score\" / \"csv\" / \"tiles\"\n",
+"CENSUS_USA_CSV = DATA_PATH / \"census\" / \"csv\" / \"us.csv\""
 ]
 },
 {

@@ -51,12 +47,12 @@
 "outputs": [],
 "source": [
 "cbg_usa_df = pd.read_csv(\n",
-"    CENSUS_USA_CSV,\n",
-"    names=['GEOID10'],\n",
-"    dtype={\"GEOID10\": \"string\"},\n",
-"    low_memory=False,\n",
-"    header=None\n",
-"    )"
+"    CENSUS_USA_CSV,\n",
+"    names=[\"GEOID10\"],\n",
+"    dtype={\"GEOID10\": \"string\"},\n",
+"    low_memory=False,\n",
+"    header=None,\n",
+")"
 ]
 },
 {

@@ -162,10 +158,10 @@
 "outputs": [],
 "source": [
 "score_df = pd.read_csv(\n",
-"    OUTPUT_PATH / \"usa.csv\",\n",
-"    dtype={\"GEOID10\": \"string\"},\n",
-"    low_memory=False,\n",
-"    )"
+"    OUTPUT_PATH / \"usa.csv\",\n",
+"    dtype={\"GEOID10\": \"string\"},\n",
+"    low_memory=False,\n",
+")"
 ]
 },
 {

@@ -381,9 +377,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"merged_df = cbg_usa_df.merge(\n",
-"    score_df, on=\"GEOID10\", how=\"left\"\n",
-"    )"
+"merged_df = cbg_usa_df.merge(score_df, on=\"GEOID10\", how=\"left\")"
 ]
 },
 {
@@ -33,7 +33,9 @@
 "source": [
 "def merge_and_simplify_file(file_name: str, usa_df: pd.DataFrame):\n",
 "    state_gdf = gpd.read_file(file_name)\n",
-"    state_repr = state_gdf.to_crs(\"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\")\n",
+"    state_repr = state_gdf.to_crs(\n",
+"        \"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs\"\n",
+"    )\n",
 "    state_merged = state_repr.merge(usa_df, on=\"GEOID10\", how=\"left\")\n",
 "    state_merged_simplified = state_merged[\n",
 "        [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",

@@ -67,9 +69,9 @@
 "\n",
 "def aggregate_buckets(state_tracts: pd.DataFrame, agg_func: str):\n",
 "    # dissolve tracts by bucket\n",
-"    state_attr = state_tracts[[\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]].reset_index(\n",
-"        drop=True\n",
-"    )\n",
+"    state_attr = state_tracts[\n",
+"        [\"D_SCORE\", \"D_SCORE_bucket\", \"geometry\"]\n",
+"    ].reset_index(drop=True)\n",
 "    state_dissolve = state_attr.dissolve(by=\"D_SCORE_bucket\", aggfunc=agg_func)\n",
 "    return state_dissolve\n",
 "\n",

@@ -91,10 +93,12 @@
 "    gdf_compressed = gpd.GeoDataFrame(\n",
 "        compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
 "    )\n",
-"    gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\")\n",
+"    gdf_compressed.to_file(\n",
+"        CENSUS_GEOJSON_DIR / f\"{file_name}_low.geojson\", driver=\"GeoJSON\"\n",
+"    )\n",
 "\n",
 "\n",
-"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets:int):\n",
+"def process_file(file_name: str, usa_df: pd.DataFrame, num_buckets: int):\n",
 "    print(f\"Processing file {file_name}...\")\n",
 "    state_merged_simplified = merge_and_simplify_file(file_name, usa_df)\n",
 "    state_tracts = aggregate_to_tracts(state_merged_simplified)\n",

@@ -115,7 +119,9 @@
 "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
 "CENSUS_GEOJSON_DIR = DATA_DIR / \"census\" / \"geojson\"\n",
 "CEJST_DATA_PATH = DATA_DIR / \"score\" / \"csv\" / \"tiles\" / \"usa.csv\"\n",
-"score_df = pd.read_csv(CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False)"
+"score_df = pd.read_csv(\n",
+"    CEJST_DATA_PATH, dtype={\"GEOID10\": \"object\"}, low_memory=False\n",
+")"
 ]
 },
 {

@@ -185,9 +191,9 @@
 },
 "outputs": [],
 "source": [
-"for file_name in CENSUS_GEOJSON_DIR.rglob('*.json'):\n",
-"    state_gdf = gpd.read_file(file_name)\n",
-"    master_df = master_df.append(state_gdf)"
+"for file_name in CENSUS_GEOJSON_DIR.rglob(\"*.json\"):\n",
+"    state_gdf = gpd.read_file(file_name)\n",
+"    master_df = master_df.append(state_gdf)"
 ]
 },
 {

@@ -672,7 +678,9 @@
 },
 "outputs": [],
 "source": [
-"usa_merged_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\")"
+"usa_merged_compressed.to_file(\n",
+"    CENSUS_GEOJSON_DIR / \"usa_merged.geojson\", driver=\"GeoJSON\"\n",
+")"
 ]
 },
 {

@@ -684,8 +692,8 @@
 "outputs": [],
 "source": [
 "usa_simplified = usa_merged[\n",
-"    [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
-"    ].reset_index(drop=True)"
+"    [\"GEOID10\", \"Score D (percentile)\", \"geometry\"]\n",
+"].reset_index(drop=True)"
 ]
 },
 {

@@ -696,9 +704,7 @@
 },
 "outputs": [],
 "source": [
-"usa_simplified.rename(\n",
-"    columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True\n",
-"    )"
+"usa_simplified.rename(columns={\"Score D (percentile)\": \"D_SCORE\"}, inplace=True)"
 ]
 },
 {

@@ -714,8 +720,8 @@
 "outputs": [],
 "source": [
 "usa_cbg_compressed = gpd.GeoDataFrame(\n",
-"    usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
-"    )"
+"    usa_simplified, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
+")"
 ]
 },
 {

@@ -726,7 +732,9 @@
 },
 "outputs": [],
 "source": [
-"usa_cbg_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\")"
+"usa_cbg_compressed.to_file(\n",
+"    CENSUS_GEOJSON_DIR / \"usa_cbg_scoreD.geojson\", driver=\"GeoJSON\"\n",
+")"
 ]
 },
 {

@@ -764,8 +772,8 @@
 "outputs": [],
 "source": [
 "tracts_compressed = gpd.GeoDataFrame(\n",
-"    usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
-"    )"
+"    usa_tracts, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
+")"
 ]
 },
 {

@@ -776,7 +784,9 @@
 },
 "outputs": [],
 "source": [
-"tracts_compressed.to_file(CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\")"
+"tracts_compressed.to_file(\n",
+"    CENSUS_GEOJSON_DIR / \"usa_tracts_score.geojson\", driver=\"GeoJSON\"\n",
+")"
 ]
 },
 {

@@ -877,8 +887,8 @@
 "outputs": [],
 "source": [
 "gdf_compressed = gpd.GeoDataFrame(\n",
-"    compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
-"    )"
+"    compressed, columns=[\"D_SCORE\", \"geometry\"], crs=\"EPSG:4326\"\n",
+")"
 ]
 },
 {

@@ -917,7 +927,9 @@
 },
 "outputs": [],
 "source": [
-"gdf_compressed.to_file(CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\")"
+"gdf_compressed.to_file(\n",
+"    CENSUS_GEOJSON_DIR / f\"usa_low.geojson\", driver=\"GeoJSON\"\n",
+")"
 ]
 }
 ],
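The notebook's aggregate_buckets helper relies on GeoPandas dissolve, which merges all geometries sharing a bucket value and aggregates the score column. A minimal sketch with toy geometries (made-up scores and boxes, not the real tract shapes):

import geopandas as gpd
from shapely.geometry import box

# Three toy "tracts": two share bucket 9, one is in bucket 1.
state_tracts = gpd.GeoDataFrame(
    {"D_SCORE": [0.9, 0.8, 0.1], "D_SCORE_bucket": [9, 9, 1]},
    geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1), box(5, 5, 6, 6)],
    crs="EPSG:4326",
)

# dissolve merges geometries per bucket and aggregates D_SCORE with the mean.
state_dissolve = state_tracts.dissolve(by="D_SCORE_bucket", aggfunc="mean")
print(state_dissolve)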
@@ -39,7 +39,9 @@
 "source": [
 "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
 "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
-"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
+"censusdata.printtable(\n",
+"    censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n",
+")"
 ],
 "outputs": [],
 "metadata": {

@@ -65,8 +67,8 @@
 "    year=ACS_YEAR,\n",
 "    geo=censusdata.censusgeo(\n",
 "        [\n",
-"            (\"state\", fips) \n",
-"            #, (\"county\", \"*\"), (\"block group\", \"*\")\n",
+"            (\"state\", fips)\n",
+"            # , (\"county\", \"*\"), (\"block group\", \"*\")\n",
 "        ]\n",
 "    ),\n",
 "    var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n",

@@ -75,7 +77,9 @@
 "\n",
 "df = pd.concat(dfs)\n",
 "\n",
-"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
+"df[GEOID_FIELD_NAME] = df.index.to_series().apply(\n",
+"    func=fips_from_censusdata_censusgeo\n",
+")\n",
 "\n",
 "df.head()"
 ],

@@ -90,7 +94,13 @@
 "source": [
 "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
 "\n",
-"df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
+"df.rename(\n",
+"    columns={\n",
+"        \"GEOID10\": \"GEOID2\",\n",
+"        \"B19013_001E\": \"Median household income (State)\",\n",
+"    },\n",
+"    inplace=True,\n",
+")\n",
 "\n",
 "# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
 ],
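fips_from_censusdata_censusgeo is project code this diff doesn't show; a plausible minimal version (an assumption, based on how the censusdata package represents its index entries) concatenates the FIPS codes out of each censusgeo object:

import censusdata

def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:
    """Turn a censusgeo index entry into a concatenated FIPS string."""
    # censusgeo.params() yields pairs like (("state", "01"), ("county", "001")).
    return "".join(value for _, value in censusgeo.params())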
@@ -20,7 +20,7 @@
 "module_path = os.path.abspath(os.path.join(\"..\"))\n",
 "if module_path not in sys.path:\n",
 "    sys.path.append(module_path)\n",
-"    \n",
+"\n",
 "from data_pipeline.utils import unzip_file_from_url\n",
 "from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes"
 ],

@@ -57,9 +57,16 @@
 "cell_type": "code",
 "execution_count": null,
 "source": [
-"counties_df = pd.read_csv(CENSUS_COUNTIES_TXT, sep=\"\\t\", dtype={\"GEOID\": \"string\", \"USPS\": \"string\"}, low_memory=False)\n",
-"counties_df = counties_df[['USPS', 'GEOID', 'NAME']]\n",
-"counties_df.rename(columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True)\n",
+"counties_df = pd.read_csv(\n",
+"    CENSUS_COUNTIES_TXT,\n",
+"    sep=\"\\t\",\n",
+"    dtype={\"GEOID\": \"string\", \"USPS\": \"string\"},\n",
+"    low_memory=False,\n",
+")\n",
+"counties_df = counties_df[[\"USPS\", \"GEOID\", \"NAME\"]]\n",
+"counties_df.rename(\n",
+"    columns={\"USPS\": \"State Abbreviation\", \"NAME\": \"County Name\"}, inplace=True\n",
+")\n",
 "counties_df.head()"
 ],
 "outputs": [],

@@ -69,8 +76,17 @@
 "cell_type": "code",
 "execution_count": null,
 "source": [
-"states_df = pd.read_csv(STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"})\n",
-"states_df.rename(columns={\"fips\": \"State Code\", \"state_name\": \"State Name\", \"state_abbreviation\": \"State Abbreviation\"}, inplace=True)\n",
+"states_df = pd.read_csv(\n",
+"    STATE_CSV, dtype={\"fips\": \"string\", \"state_abbreviation\": \"string\"}\n",
+")\n",
+"states_df.rename(\n",
+"    columns={\n",
+"        \"fips\": \"State Code\",\n",
+"        \"state_name\": \"State Name\",\n",
+"        \"state_abbreviation\": \"State Abbreviation\",\n",
+"    },\n",
+"    inplace=True,\n",
+")\n",
 "states_df.head()"
 ],
 "outputs": [],

@@ -80,7 +96,7 @@
 "cell_type": "code",
 "execution_count": null,
 "source": [
-"county_state_merged = counties_df.join(states_df, rsuffix=' Other')\n",
+"county_state_merged = counties_df.join(states_df, rsuffix=\" Other\")\n",
 "del county_state_merged[\"State Abbreviation Other\"]\n",
 "county_state_merged.head()"
 ],

@@ -102,7 +118,7 @@
 "cell_type": "code",
 "execution_count": null,
 "source": [
-"score_county_state_merged = score_df.join(county_state_merged, rsuffix='_OTHER')\n",
+"score_county_state_merged = score_df.join(county_state_merged, rsuffix=\"_OTHER\")\n",
 "del score_county_state_merged[\"GEOID_OTHER\"]\n",
 "score_county_state_merged.head()"
 ],
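Both cells above use pandas join with rsuffix so an overlapping column name from the right frame survives with a suffix and can then be dropped explicitly. A toy illustration (made-up rows) of the pattern:

import pandas as pd

counties_df = pd.DataFrame(
    {"State Abbreviation": ["AL"], "GEOID": ["01001"], "County Name": ["Autauga County"]}
)
states_df = pd.DataFrame(
    {"State Code": ["01"], "State Name": ["Alabama"], "State Abbreviation": ["AL"]}
)

# The overlapping "State Abbreviation" column from the right frame gets the suffix...
county_state_merged = counties_df.join(states_df, rsuffix=" Other")
# ...and the duplicate can then be deleted.
del county_state_merged["State Abbreviation Other"]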
@@ -35,7 +35,7 @@
 "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
 "\n",
 "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
-"tqdm_notebook.pandas()\n"
+"tqdm_notebook.pandas()"
 ]
 },
 {

@@ -89,14 +89,12 @@
 "    \"Poverty (Less than 200% of federal poverty line)\",\n",
 "    \"Percent individuals age 25 or over with less than high school degree\",\n",
 "    \"Unemployed civilians (percent)\",\n",
-"    \"Linguistic isolation (percent)\"\n",
+"    \"Linguistic isolation (percent)\",\n",
 "]\n",
 "\n",
 "column_to_plot = columns_to_plot[0]\n",
 "print(f\"Plotting {column_to_plot}\")\n",
-"print(cejst_df[\n",
-"    column_to_plot\n",
-"].hist())"
+"print(cejst_df[column_to_plot].hist())"
 ]
 },
 {
@@ -152,15 +152,17 @@
 "CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
 "CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
 "\n",
-"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
+"calenviroscreen_data_path = (\n",
+"    DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
+")\n",
 "calenviroscreen_df = pd.read_csv(\n",
 "    calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
 ")\n",
 "\n",
 "# Convert priority community field to a bool.\n",
-"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
+"calenviroscreen_df[\n",
 "    CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
-"].astype(bool)\n",
+"] = calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].astype(bool)\n",
 "\n",
 "calenviroscreen_df.head()"
 ]

@@ -168,19 +170,33 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"id": "1bf54af1",
-"metadata": {
-"scrolled": true
-},
+"id": "df458f08",
+"metadata": {},
 "outputs": [],
 "source": [
-"# Load HUD data\n",
-"hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n",
-"hud_recap_df = pd.read_csv(\n",
-"    hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
+"# Load persistent poverty data\n",
+"persistent_poverty_path = (\n",
+"    DATA_DIR / \"dataset\" / \"persistent_poverty\" / \"usa.csv\"\n",
+")\n",
+"persistent_poverty_df = pd.read_csv(\n",
+"    persistent_poverty_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
 ")\n",
 "\n",
-"hud_recap_df.head()"
+"# Since \"Persistent Poverty Census Tract\" is labeled in both the score file (at the CBG level) and this tract file,\n",
+"# rename this field so it's easy to access the tract-level scores directly.\n",
+"\n",
+"PERSISTENT_POVERTY_TRACT_LEVEL_FIELD = \"Persistent Poverty, Tract Level\"\n",
+"PERSISTENT_POVERTY_CBG_LEVEL_FIELD = \"Persistent Poverty Census Tract\"\n",
+"\n",
+"persistent_poverty_df.rename(\n",
+"    columns={\n",
+"        PERSISTENT_POVERTY_CBG_LEVEL_FIELD: PERSISTENT_POVERTY_TRACT_LEVEL_FIELD\n",
+"    },\n",
+"    inplace=True,\n",
+"    errors=\"raise\",\n",
+")\n",
+"\n",
+"persistent_poverty_df"
 ]
 },
 {

@@ -193,7 +209,7 @@
 "outputs": [],
 "source": [
 "# Join all dataframes that use tracts\n",
-"census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n",
+"census_tract_dfs = [calenviroscreen_df, persistent_poverty_df]\n",
 "\n",
 "census_tract_df = functools.reduce(\n",
 "    lambda left, right: pd.merge(\n",

@@ -231,7 +247,6 @@
 "    on=GEOID_TRACT_FIELD_NAME,\n",
 ")\n",
 "\n",
-"\n",
 "if len(merged_df) > 220405:\n",
 "    raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
 "\n",

@@ -314,10 +329,20 @@
 "        priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
 "        other_census_tract_fields_to_keep=[],\n",
 "    ),\n",
+"    Index(\n",
+"        method_name=\"Persistent Poverty (CBG)\",\n",
+"        priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
+"        other_census_tract_fields_to_keep=[],\n",
+"    ),\n",
 "]\n",
 "\n",
 "census_tract_indices = [\n",
 "    Index(\n",
+"        method_name=\"Persistent Poverty\",\n",
+"        priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
+"        other_census_tract_fields_to_keep=[],\n",
+"    ),\n",
+"    Index(\n",
 "        method_name=\"CalEnviroScreen 4.0\",\n",
 "        priority_communities_field=\"calenviroscreen_priority_community\",\n",
 "        other_census_tract_fields_to_keep=[\n",

@@ -325,11 +350,6 @@
 "            CALENVIROSCREEN_PERCENTILE_FIELD,\n",
 "        ],\n",
 "    ),\n",
-"    Index(\n",
-"        method_name=\"HUD RECAP\",\n",
-"        priority_communities_field=\"hud_recap_priority_community\",\n",
-"        other_census_tract_fields_to_keep=[],\n",
-"    ),\n",
 "]"
 ]
 },

@@ -354,7 +374,8 @@
 "\n",
 "    # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n",
 "    df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
-"        df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
+"        df[priority_communities_field]\n",
+"        * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
 "    )\n",
 "\n",
 "    def calculate_state_comparison(\n",

@@ -393,7 +414,9 @@
 "            summary_dict[\"Geography name\"] = division_id\n",
 "\n",
 "        total_cbgs_in_geography = len(frame)\n",
-"        total_population_in_geography = frame[CENSUS_BLOCK_GROUP_POPULATION_FIELD].sum()\n",
+"        total_population_in_geography = frame[\n",
+"            CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
+"        ].sum()\n",
 "\n",
 "        if geography_field == URBAN_HEURISTIC_FIELD:\n",
 "            urban_flag = frame[URBAN_HEURISTIC_FIELD].unique()[0]\n",

@@ -401,9 +424,9 @@
 "            summary_dict[\"Geography name\"] = summary_dict[\"Urban vs Rural\"]\n",
 "\n",
 "        for priority_communities_field in priority_communities_fields:\n",
-"            summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
+"            summary_dict[\n",
 "                f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
-"            ].sum()\n",
+"            ] = frame[f\"{priority_communities_field}{POPULATION_SUFFIX}\"].sum()\n",
 "\n",
 "            summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n",
 "                f\"{priority_communities_field}\"\n",

@@ -415,7 +438,9 @@
 "                / total_cbgs_in_geography\n",
 "            )\n",
 "\n",
-"            summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
+"            summary_dict[\n",
+"                f\"{priority_communities_field} (percent population)\"\n",
+"            ] = (\n",
 "                summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
 "                / total_population_in_geography\n",
 "            )\n",

@@ -461,7 +486,9 @@
 "\n",
 "    # Run the comparison function on the groups.\n",
 "    region_distribution_df = region_grouped_df.progress_apply(\n",
-"        lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
+"        lambda frame: calculate_state_comparison(\n",
+"            frame, geography_field=\"region\"\n",
+"        )\n",
 "    )\n",
 "\n",
 "    # Next, run the comparison by division\n",

@@ -469,7 +496,9 @@
 "\n",
 "    # Run the comparison function on the groups.\n",
 "    division_distribution_df = division_grouped_df.progress_apply(\n",
-"        lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
+"        lambda frame: calculate_state_comparison(\n",
+"            frame, geography_field=\"division\"\n",
+"        )\n",
 "    )\n",
 "\n",
 "    # Next, run the comparison by urban/rural\n",

@@ -524,7 +553,9 @@
 "        column_character = get_excel_column_name(column_index)\n",
 "\n",
 "        # Set all columns to larger width\n",
-"        worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
+"        worksheet.set_column(\n",
+"            f\"{column_character}:{column_character}\", column_width\n",
+"        )\n",
 "\n",
 "        # Special formatting for all percent columns\n",
 "        # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",

@@ -539,9 +570,7 @@
 "\n",
 "        # Special formatting for columns that capture the percent of population considered priority.\n",
 "        if \"(percent population)\" in column:\n",
-"            column_ranges = (\n",
-"                f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
-"            )\n",
+"            column_ranges = f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
 "\n",
 "            # Add green to red conditional formatting.\n",
 "            worksheet.conditional_format(\n",

@@ -654,7 +683,9 @@
 "\n",
 "    # Put criteria description column first.\n",
 "    new_column_order = [criteria_description_field_name] + [\n",
-"        col for col in comparison_df.columns if col != criteria_description_field_name\n",
+"        col\n",
+"        for col in comparison_df.columns\n",
+"        if col != criteria_description_field_name\n",
 "    ]\n",
 "\n",
 "    comparison_df = comparison_df[new_column_order]\n",

@@ -700,12 +731,12 @@
 "        column_character = get_excel_column_name(column_index)\n",
 "\n",
 "        # Set all columns to larger width\n",
-"        worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
+"        worksheet.set_column(\n",
+"            f\"{column_character}:{column_character}\", column_width\n",
+"        )\n",
 "\n",
 "        # Add green to red conditional formatting.\n",
-"        column_ranges = (\n",
-"            f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
-"        )\n",
+"        column_ranges = f\"{column_character}2:{column_character}{len(cbg_score_comparison_df)+1}\"\n",
 "        worksheet.conditional_format(\n",
 "            column_ranges,\n",
 "            # Min: green, max: red.\n",

@@ -718,7 +749,11 @@
 "\n",
 "        # Special formatting for all percent columns\n",
 "        # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
-"        if \"percent \" in column or \"(percent)\" in column or \"Percent \" in column:\n",
+"        if (\n",
+"            \"percent \" in column\n",
+"            or \"(percent)\" in column\n",
+"            or \"Percent \" in column\n",
+"        ):\n",
 "            # Make these columns percentages.\n",
 "            percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
 "            worksheet.set_column(\n",

@@ -756,9 +791,7 @@
 "    )\n",
 "\n",
 "    # Write secondary comparison to CSV.\n",
-"    file_name_part = (\n",
-"        f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
-"    )\n",
+"    file_name_part = f\"CBG Comparison Output - {index_a.method_name} and {index_b.method_name}\"\n",
 "    output_dir.mkdir(parents=True, exist_ok=True)\n",
 "    file_path = output_dir / (file_name_part + \".csv\")\n",
 "    file_path_xlsx = output_dir / (file_name_part + \".xlsx\")\n",

@@ -770,7 +803,8 @@
 "    )\n",
 "\n",
 "    write_cbg_score_comparison_excel(\n",
-"        cbg_score_comparison_df=cbg_score_comparison_df, file_path=file_path_xlsx\n",
+"        cbg_score_comparison_df=cbg_score_comparison_df,\n",
+"        file_path=file_path_xlsx,\n",
 "    )\n",
 "\n",
 "\n",

@@ -801,11 +835,15 @@
 "cell_type": "code",
 "execution_count": null,
 "id": "eeb9699d",
-"metadata": {},
+"metadata": {
+"scrolled": true
+},
 "outputs": [],
 "source": [
 "def write_markdown_and_docx_content(\n",
-"    markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n",
+"    markdown_content: str,\n",
+"    file_dir: pathlib.PosixPath,\n",
+"    file_name_without_extension: str,\n",
 ") -> pathlib.PosixPath:\n",
 "    \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n",
 "    # Set the file paths for both files.\n",

@@ -837,7 +875,9 @@
 "\n",
 "    # List of all states/territories in their FIPS codes:\n",
 "    state_ids = sorted(df[state_field].unique())\n",
-"    state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
+"    state_names = \", \".join(\n",
+"        [us.states.lookup(state_id).name for state_id in state_ids]\n",
+"    )\n",
 "\n",
 "    # Create markdown content for comparisons.\n",
 "    markdown_content = f\"\"\"\n",

@@ -851,11 +891,16 @@
 "\n",
 "\"\"\"\n",
 "\n",
-"    for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n",
+"    for (index1, index2) in itertools.combinations(\n",
+"        census_block_group_indices, 2\n",
+"    ):\n",
 "        # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n",
 "        count_df = (\n",
 "            df.groupby(\n",
-"                [index1.priority_communities_field, index2.priority_communities_field]\n",
+"                [\n",
+"                    index1.priority_communities_field,\n",
+"                    index2.priority_communities_field,\n",
+"                ]\n",
 "            )[GEOID_FIELD_NAME]\n",
 "            .count()\n",
 "            .reset_index(name=count_field_name)\n",

@@ -887,16 +932,24 @@
 "\n",
 "        # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n",
 "        true_true_cbgs = (\n",
-"            true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n",
+"            true_true_cbgs_series.iloc[0]\n",
+"            if len(true_true_cbgs_series) > 0\n",
+"            else 0\n",
 "        )\n",
 "        true_false_cbgs = (\n",
-"            true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n",
+"            true_false_cbgs_series.iloc[0]\n",
+"            if len(true_false_cbgs_series) > 0\n",
+"            else 0\n",
 "        )\n",
 "        false_true_cbgs = (\n",
-"            false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n",
+"            false_true_cbgs_series.iloc[0]\n",
+"            if len(false_true_cbgs_series) > 0\n",
+"            else 0\n",
 "        )\n",
 "        false_false_cbgs = (\n",
-"            false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n",
+"            false_false_cbgs_series.iloc[0]\n",
+"            if len(false_false_cbgs_series) > 0\n",
+"            else 0\n",
 "        )\n",
 "\n",
 "        markdown_content += (\n",

@@ -1088,15 +1141,20 @@
 "\n",
 "    # Calculate comparison\n",
 "    # A comparison priority tract has at least one CBG that is a priority CBG.\n",
-"    df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n",
+"    df[\n",
+"        comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n",
+"    ] = (\n",
 "        frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
 "        if is_a_method_b_priority_tract\n",
 "        else None\n",
 "    )\n",
 "\n",
 "    # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n",
-"    df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n",
-"        frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
+"    df[\n",
+"        comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n",
+"    ] = (\n",
+"        frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
+"        == 1\n",
 "        if is_a_method_b_priority_tract\n",
 "        else None\n",
 "    )\n",

@@ -1115,7 +1173,8 @@
 "    df[\n",
 "        comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n",
 "    ] = (\n",
-"        frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
+"        frame.loc[:, method_a_priority_census_block_groups_field].mean()\n",
+"        == 1\n",
 "        if not is_a_method_b_priority_tract\n",
 "        else None\n",
 "    )\n",

@@ -1156,14 +1215,20 @@
 "\n",
 "    # List of all states/territories in their FIPS codes:\n",
 "    state_ids = sorted(original_df[state_field].unique())\n",
-"    state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
+"    state_names = \", \".join(\n",
+"        [us.states.lookup(state_id).name for state_id in state_ids]\n",
+"    )\n",
 "\n",
 "    # Note: using squeeze throughout do reduce result of `sum()` to a scalar.\n",
 "    # TODO: investigate why sums are sometimes series and sometimes scalar.\n",
 "    method_a_priority_cbgs = (\n",
-"        original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n",
+"        original_df.loc[:, method_a_priority_census_block_groups_field]\n",
+"        .sum()\n",
+"        .squeeze()\n",
 "    )\n",
-"    method_a_priority_cbgs_percent = (\n",
-"        f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
-"    )\n",
+"    method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
 "\n",
 "    total_tracts_count = len(comparison_df)\n",
 "\n",

@@ -1185,7 +1250,9 @@
 "        .sum()\n",
 "        .squeeze()\n",
 "    )\n",
-"    method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
+"    method_a_tracts_count_percent = (\n",
+"        f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
+"    )\n",
 "\n",
 "    # Method A priority community stats\n",
 "    method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",

@@ -1316,7 +1383,8 @@
 "\n",
 "    # Write comparison to CSV.\n",
 "    file_path = (\n",
-"        output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
+"        output_dir\n",
+"        / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
 "    )\n",
 "    comparison_df.to_csv(\n",
 "        path_or_buf=file_path,\n",
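The comparison tool's core move is grouping CBGs by every (flag A, flag B) combination to get confusion-matrix style counts, which the later true_true/true_false logic then unpacks. A self-contained toy version of that groupby step:

import pandas as pd

df = pd.DataFrame(
    {
        "GEOID10": ["a", "b", "c", "d"],
        "Method A": [True, True, False, False],
        "Method B": [True, False, True, False],
    }
)

count_df = (
    df.groupby(["Method A", "Method B"])["GEOID10"]
    .count()
    .reset_index(name="count of CBGs")
)
# Each (True/False, True/False) combination gets one row; e.g. the
# (True, True) row counts CBGs flagged as priority by both methods.
print(count_df)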
@@ -101,17 +101,25 @@
 "outputs": [],
 "source": [
 "geocorr_urban_rural_map = pd.read_csv(\n",
-"    os.path.join(GEOCORR_DATA_DIR, 'geocorr2014_2125804280.csv'),\n",
-"    encoding = \"ISO-8859-1\",\n",
+"    os.path.join(GEOCORR_DATA_DIR, \"geocorr2014_2125804280.csv\"),\n",
+"    encoding=\"ISO-8859-1\",\n",
 "    skiprows=[1],\n",
-"    dtype='str',\n",
+"    dtype=\"str\",\n",
 ")\n",
 "\n",
-"geocorr_urban_rural_map['pop10'] = pd.to_numeric(geocorr_urban_rural_map['pop10'])\n",
-"geocorr_urban_rural_map['afact'] = pd.to_numeric(geocorr_urban_rural_map['afact'])\n",
+"geocorr_urban_rural_map[\"pop10\"] = pd.to_numeric(\n",
+"    geocorr_urban_rural_map[\"pop10\"]\n",
+")\n",
+"geocorr_urban_rural_map[\"afact\"] = pd.to_numeric(\n",
+"    geocorr_urban_rural_map[\"afact\"]\n",
+")\n",
 "\n",
-"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map['county'] + geocorr_urban_rural_map['tract'] # + geocorr_urban_rural_map['bg']\n",
-"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME].str.replace('.', '', regex=False)"
+"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = (\n",
+"    geocorr_urban_rural_map[\"county\"] + geocorr_urban_rural_map[\"tract\"]\n",
+") # + geocorr_urban_rural_map['bg']\n",
+"geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] = geocorr_urban_rural_map[\n",
+"    GEOID_TRACT_FIELD_NAME\n",
+"].str.replace(\".\", \"\", regex=False)"
 ]
 },
 {

@@ -139,15 +147,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"geocorr_urban_rural_map = geocorr_urban_rural_map[[\n",
-"    GEOID_TRACT_FIELD_NAME,\n",
-"    'ur',\n",
-"    'ua',\n",
-"    'cntyname',\n",
-"    'uaname',\n",
-"    'pop10',\n",
-"    'afact'\n",
-"]]"
+"geocorr_urban_rural_map = geocorr_urban_rural_map[\n",
+"    [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\", \"cntyname\", \"uaname\", \"pop10\", \"afact\"]\n",
+"]"
 ]
 },
 {

@@ -165,7 +167,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur', 'ua'], dropna=False).size().sort_values(ascending=False)"
+"geocorr_urban_rural_map.groupby(\n",
+"    [GEOID_TRACT_FIELD_NAME, \"ur\", \"ua\"], dropna=False\n",
+").size().sort_values(ascending=False)"
 ]
 },
 {

@@ -175,7 +179,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"geocorr_urban_rural_map.loc[geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == '36117020302']"
+"geocorr_urban_rural_map.loc[\n",
+"    geocorr_urban_rural_map[GEOID_TRACT_FIELD_NAME] == \"36117020302\"\n",
+"]"
 ]
 },
 {

@@ -185,8 +191,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"total_geo_population = geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME).agg({'pop10': np.sum}).reset_index()\n",
-"total_geo_population.rename(columns={'pop10': 'total_population'}, inplace=True)\n",
+"total_geo_population = (\n",
+"    geocorr_urban_rural_map.groupby(GEOID_TRACT_FIELD_NAME)\n",
+"    .agg({\"pop10\": np.sum})\n",
+"    .reset_index()\n",
+")\n",
+"total_geo_population.rename(columns={\"pop10\": \"total_population\"}, inplace=True)\n",
 "total_geo_population.head()"
 ]
 },

@@ -197,8 +207,16 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, 'ur']).agg({'pop10': np.sum}).reset_index()\n",
-"geocorr_urban_rural_with_total_pop_map = geocorr_urban_rural_with_total_pop_map.merge(total_geo_population, how='inner', on=GEOID_TRACT_FIELD_NAME)\n",
+"geocorr_urban_rural_with_total_pop_map = (\n",
+"    geocorr_urban_rural_map.groupby([GEOID_TRACT_FIELD_NAME, \"ur\"])\n",
+"    .agg({\"pop10\": np.sum})\n",
+"    .reset_index()\n",
+")\n",
+"geocorr_urban_rural_with_total_pop_map = (\n",
+"    geocorr_urban_rural_with_total_pop_map.merge(\n",
+"        total_geo_population, how=\"inner\", on=GEOID_TRACT_FIELD_NAME\n",
+"    )\n",
+")\n",
 "geocorr_urban_rural_with_total_pop_map.head()"
 ]
 },

@@ -209,7 +227,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"geocorr_urban_rural_with_total_pop_map['afact'] = geocorr_urban_rural_with_total_pop_map['pop10'] / geocorr_urban_rural_with_total_pop_map['total_population']"
+"geocorr_urban_rural_with_total_pop_map[\"afact\"] = (\n",
+"    geocorr_urban_rural_with_total_pop_map[\"pop10\"]\n",
+"    / geocorr_urban_rural_with_total_pop_map[\"total_population\"]\n",
+")"
 ]
 },
 {

@@ -229,7 +250,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"geocorr_urban_rural_with_total_pop_map.loc[geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME] == '01001020200']"
+"geocorr_urban_rural_with_total_pop_map.loc[\n",
+"    geocorr_urban_rural_with_total_pop_map[GEOID_TRACT_FIELD_NAME]\n",
+"    == \"01001020200\"\n",
+"]"
 ]
 },
 {

@@ -239,12 +263,16 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(index=GEOID_TRACT_FIELD_NAME, columns='ur', values=['pop10', 'afact'])\n",
-"urban_rural_map.columns = ['_'.join(col).strip() for col in urban_rural_map.columns.values]\n",
+"urban_rural_map = geocorr_urban_rural_with_total_pop_map.pivot(\n",
+"    index=GEOID_TRACT_FIELD_NAME, columns=\"ur\", values=[\"pop10\", \"afact\"]\n",
+")\n",
+"urban_rural_map.columns = [\n",
+"    \"_\".join(col).strip() for col in urban_rural_map.columns.values\n",
+"]\n",
 "urban_rural_map.reset_index(inplace=True)\n",
-"urban_rural_map['urban_heuristic_flag'] = 0\n",
-"mask = urban_rural_map['afact_U'] >= 0.5\n",
-"urban_rural_map.loc[mask, 'urban_heuristic_flag'] = 1"
+"urban_rural_map[\"urban_heuristic_flag\"] = 0\n",
+"mask = urban_rural_map[\"afact_U\"] >= 0.5\n",
+"urban_rural_map.loc[mask, \"urban_heuristic_flag\"] = 1"
 ]
 },
 {

@@ -256,12 +284,13 @@
 "source": [
 "urban_rural_map.rename(\n",
 "    columns={\n",
-"        'pop10_R': 'population_in_rural_areas',\n",
-"        'pop10_U': 'population_in_urban_areas',\n",
-"        'afact_R': 'perc_population_in_rural_areas',\n",
-"        'afact_U': 'perc_population_in_urban_areas',\n",
-"    }, \n",
-"    inplace=True)"
+"        \"pop10_R\": \"population_in_rural_areas\",\n",
+"        \"pop10_U\": \"population_in_urban_areas\",\n",
+"        \"afact_R\": \"perc_population_in_rural_areas\",\n",
+"        \"afact_U\": \"perc_population_in_urban_areas\",\n",
+"    },\n",
+"    inplace=True,\n",
+")"
 ]
 },
 {
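The urban heuristic in this notebook flags a tract as urban when at least half of its population falls in urban areas, using afact_U, the urban allocation factor computed above. A toy recap of that final step with made-up factors:

import pandas as pd

urban_rural_map = pd.DataFrame(
    {
        "GEOID10_TRACT": ["01001020200", "36117020302"],
        "afact_U": [0.73, 0.12],  # share of each tract's population in urban areas
    }
)

urban_rural_map["urban_heuristic_flag"] = 0
mask = urban_rural_map["afact_U"] >= 0.5
urban_rural_map.loc[mask, "urban_heuristic_flag"] = 1
# Only the first tract is majority-urban, so only it gets flag 1.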