CDC-Data-2025/attachments/05042023_OAW_GENETICS_ANALYSES.txt
2025-02-03 14:21:23 -08:00

790 lines
36 KiB
Text

---
title: "Measles Virus B3"
subtitle: "Phylogenetic Topology in Final Presentation with Clade Relationships, Operation Allies Welcome"
author: "Andrew Beck, NCIRD/DVD/VVPDB"
date: "May 21, 2021"
output:
html_document:
toc: true
toc_float: true
toc_depth: 5
number_sections: true
params:
TREEFILE_WGS: # Path to Maximum Likelihood tree
BEASTFILE_WGS: # Path to WGS Bayesian tree
BEASTFILE_N450: # Path to N450 Bayesian tree
METADATA_TABLE: # Path to metadata table
SUBSTITUTIONS_TABLE: # Path to substitutions table
WGS_ALIGNMENT: # Path to WGS sequence alignment
N450_ALIGNMENT: # Path to N450 Bayesian tree
---
```{r setup, include=FALSE, echo = FALSE, warning=FALSE, message=FALSE}
library(ggtree)
library(ggsci)
library(pastecs)
library(tidyverse)
library(tidytree)
library(ggplot2)
library(ggpattern)
library(knitr)
library(phytools)
library(RColorBrewer)
library(kableExtra)
library(treeio)
library(scales)
library(plotly)
library(aplot)
library(purrr)
library(Biostrings)
library(DiagrammeR)
library(lubridate)
library(tidyquant)
library(stringr)
library(tracerer)
library(ggrepel)
library(cowplot)
library(ggprism)
library(gtable)
library(grid)
library(ape)
library(ggthemes)
library(magick)
library(magrittr)
library(dplyr)
knitr::opts_chunk$set(echo = FALSE,
warning = FALSE,
global.par = TRUE,
message = FALSE,
cache = FALSE,
root.dir =' C:/Users/lpu5/R_Projects/PHYLOGENETICS_MF/12032020_NY_WGS_FINAL')
```
```{r margins, include=FALSE, echo = FALSE}
par(mar=c(0,0,0,0))
```
```{r adaptive_scale_process, echo=FALSE, cache=FALSE}
# This will handle the interpolated scales. Courtesy https://nanx.me/blog/post/ggplot2-color-interpolation/
# Courtesy https://jianfengwu2022.github.io/
# @param values Color values.
pal_ramp <- function(values) {
force(values)
function(n) {
if (n <= length(values)) {
values[seq_len(n)]
} else {
colorRampPalette(values, alpha = TRUE)(n)
}
}
}
# Adaptive color palette generator
#
# Adaptive color palette generator for ggsci color palettes using `pal_ramp()`.
#
# @param name Color palette name in ggsci
# @param palette Color palette type in ggsci
# @param alpha Transparency level, a real number in (0, 1].
#
# @details See `names(ggsci:::ggsci_db)` for all color palette names in ggsci.
# See `names(ggsci:::ggsci_db$"pal")` for available palette types under
# the palette `pal`.
pal_adaptive <- function(name, palette, alpha = 1) {
if (alpha > 1L | alpha <= 0L) stop("alpha must be in (0, 1]")
raw_cols <- ggsci:::ggsci_db[[name]][[palette]]
raw_cols_rgb <- col2rgb(raw_cols)
alpha_cols <- rgb(
raw_cols_rgb[1L, ], raw_cols_rgb[2L, ], raw_cols_rgb[3L, ],
alpha = alpha * 255L, names = names(raw_cols),
maxColorValue = 255L
)
pal_ramp(unname(alpha_cols))
}
#Adaptive color scales
#
# @inheritParams pal_adaptive
# @param ... additional parameters for [ggplot2::discrete_scale()].
scale_color_adaptive <- function(name, palette, alpha = 1, ...) {
ggplot2::discrete_scale("colour", name, pal_adaptive(name, palette, alpha), ...)
}
scale_fill_adaptive <- function(name, palette, alpha = 1, ...) {
ggplot2::discrete_scale("fill", name, pal_adaptive(name, palette, alpha), ...)
}
```
```{r metadata_ingest, echo=FALSE, cache=FALSE}
tipcategories = read.table(params$METADATA_TABLE, sep = "\t",
#col.names = c("TAXON","IS_OAW","WHO_NAME","COUNTRY","STATE", "EPI_WEEK","YEAR"),
header = TRUE,
comment.char = "",
stringsAsFactors = FALSE)
```
```{r tree_ingest, echo=FALSE, warning=FALSE}
# Ingest the maximum likelihood tree, WGS
TREE_WGS_ML <- read.iqtree(file = params$TREEFILE_WGS)
TREE_WGS_ML <- root(as.phylo(TREE_WGS_ML), outgroup = "MVs_Manchester.GBR_7.12_2_B3", resolve.root=TRUE)
TREE_WGS_ML <- drop.tip(TREE_WGS_ML, "MVs_NewJersey.USA_8.19")
TREE_WGS_ML <- ggtree(TREE_WGS_ML) %<+% tipcategories
# Ingest the BEAST tree, WGS
TREE_BEAST_WGS <- read.beast(file = params$BEASTFILE_WGS)
TREE_BEAST_WGS <- ggtree(TREE_BEAST_WGS, mrsd="2021-10-07") %<+% tipcategories
# Ingest the BEAST tree, N450
TREE_BEAST_N450 <- read.beast(file = params$BEASTFILE_N450)
TREE_BEAST_N450 <- ggtree(TREE_BEAST_N450, mrsd="2021-10-07") %<+% tipcategories
# Extract time of internal nodes from N450 BEAST tree, display selected node numbers with dates.
my_tibble_N450 <-TREE_BEAST_N450 %>% as.treedata() %>% as.tibble %>%
mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>%
mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>%
mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>%
mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>%
mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>%
filter(node %in% c(17,167,168)) %>%
select(node,height_corrected,height_95_corrected_first,height_95_corrected_last)
kable(my_tibble_N450)
# Extract time of internal nodes from WGS BEAST tree, display selected node numbers with dates.
my_tibble_WGS <-TREE_BEAST_WGS %>% as.treedata() %>% as.tibble %>%
mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>%
mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>%
mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>%
mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>%
mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>%
filter(node %in% c(15,145,146)) %>%
select(node,height_corrected,height_95_corrected_first,height_95_corrected_last)
kable(my_tibble_WGS)
```
## Metadata
### Purpose
Measles virus WGS was performed on 44 specimens from OAW evacuee cases. 41 were successfully sequenced, and these are analyzed phylogenetically alongside several others from previous B3 outbreaks. 119 in total.
### Metadata Characteristics and Validation
Number of samples in metadata table: **`r nrow(tipcategories)`**
Maximum year in the metadata table: **`r max(tipcategories$YEAR)`**
Maximum decimal year in the metadata table, branch reference: **`r decimal_date(ymd("2021-10-07"))`**
Minimum year in the metadata table: **`r min(tipcategories$YEAR)`**
```{r TABLE_METADATA_SOURCE}
kable(tipcategories, align = "c", caption = "All metadata attached to trees displayed below.")%>%
kable_styling("hover", full_width = F, fixed_thead = T) %>%
row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
scroll_box(width = "100%", height = "300px")
```
### Maximum Likelihood Trees, not used in study
## Tree used for stub-out of annotations, only.
```{r ML_WGS, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
# Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter)
TREE_WGS_ML +
# If it's from the OAW study, label red
geom_tiplab(aes(label=WHO_NAME,
subset = ISOAW == "TRUE"),
color = "red", size=4.5) +
# Everything else is black
geom_tiplab(aes(label=WHO_NAME,
subset = ISOAW != "TRUE"),
color = "black", size=4.5)+
geom_nodelab(aes(label=node), color="black")+
xlim(0, .03)
```
## Whole B3 Tree with clade labels
```{r ML_WGS_whole, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
# Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter)
p <-TREE_WGS_ML +
# If it's from the OAW study, label red
geom_tiplab(aes(label=WHO_NAME,
subset = ISOAW == "TRUE"),
color = "red", size=4.5) +
# Everything else is black
geom_tiplab(aes(label=WHO_NAME,
subset = ISOAW != "TRUE"),
color = "black", size=4.5)+
#Label reasonable clusters.
geom_nodepoint(aes(subset=(as.numeric(label) >= 50)), size = 3, color = "black", fill = "grey", alpha = .5)+
geom_cladelab(node=122,label="OAW Entire",offset=.006,barsize=2,fontsize=8)+
geom_cladelab(node=152,label="Italy 2017",offset=.0035,barsize=2,fontsize=8)+
geom_cladelab(node=172,label="California 2019",offset=.0035,barsize=2,fontsize=8)+
geom_cladelab(node=145,label="A",offset=.0035,barsize=2,fontsize=8)+
geom_cladelab(node=126,label="B",offset=.0040,barsize=2,fontsize=8)+
geom_cladelab(node=125,label="C",offset=.0045,barsize=2,fontsize=8)+
geom_cladelab(node=124,label="D",offset=.005,barsize=2,fontsize=8)+
xlim(0, .03)
p
```
## Partial Maximum Likelihood Clade
This is identical to the "full" B3 tree, only the subtree is extracted for the current study sequences.
```{r ML_WGS_whole_clade, fig.height=10,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
viewClade(p, 122) + geom_tiplab(aes(label=WHO_NAME), color = "black", size=4.5)
```
## Bayesian Phylogeny WGS
Tree was calculated in BEAST v.2.7. Chain was 4x50 million steps, with 10 percent burnin and 1:10000 sampling from chain (18000 trees in total sample). Tree is
annotated as maximum clade credibility with mean branch lengths (time).
1. Models were compared and tested using IQtree v.2. Best support was TIM+G4+F (empirical frequencies) for concatenated coding regions, and TIM3+G4+F (empirical base frequencies) for concatenated noncoding regions.
2. Substitution Models are unlinked, meaning that the model controlling NCR and CDS is free to vary independently.
3. Clock Models are unlinked, NCR = strict, CDS = strict (linear).
4. Tree model is linked between partitions, Bayesian Skyline.
### Bayesian WGS with node numbers referenced
Node numbers are assigned by the R package and are not scientifically meaningful.
```{r BEAST_Nodereference, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
p <- TREE_BEAST_WGS+
# If the sequence is from the OAW study, label as red.
geom_tiplab(aes(label=WHO_NAME,
subset = ISOAW == "TRUE"),
color = "red", size=3) +
# Everything else is black.
geom_tiplab(aes(label=WHO_NAME,
subset = ISOAW != "TRUE"),
color = "black", size=3)+
theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023)
p
```
### Bayesian WGS with posterior support
1. Bars show 95% HPD for the inferred **time in years** of the node.
2. Points show nodes with > 0.5 posterior support in the sample.
```{r BEAST_WHOLE, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
p <- TREE_BEAST_WGS+
# Everything else is black
#geom_tiplab(aes(label=WHO_NAME),
# color = "black", size=3)+
geom_hilight(node=145,fill = "pink", alpha = .5)+
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .5)), color = "black", size = 5, fill="grey",alpha=0.5)+
theme_tree2() + xlim_tree(2022)
p
```
### Partial tree, with majority nodes shown if posterior > 0.9
This is a blow-up of the Bayesian subtree used for data presentation.
```{r BEAST_PARTIAL, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
p <- TREE_BEAST_WGS+
# Everything else is black
geom_tiplab(aes(label=WHO_NAME),time_scale=TRUE,
color = "black", size=5)+
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .9)), color = "black",
size = 5)+
theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023)
viewClade(p, 145)
```
### Partial B3 OAW subtree tree, containing metadata annotations
Notes:
1. This is a **partial** tree, graphically extracted from the larger B3 set that was modeled alongside.
2. Red arrows are highly confident transmission pairs, grey are less confident or hypothesized.
3. Some of the less confident transmissions (35-24, 35-47) cover considerable evolutionary distance (sum up all connecting branch lengths in years). These are highly implausible.
4. Confidence in the groupings/tree topology is a posterior probability, with black dots for nodes > .9. This means **very** high confidence in the split position in the tree, or the presence of the group bearing that common ancestor.
```{r BEAST_PARTIAL_meta, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE, dpi=300}
insetTree <-TREE_BEAST_WGS+
geom_hilight(node=145,fill = "pink", alpha = .5)+
theme_void() + xlim_tree(2022)
p <- TREE_BEAST_WGS+
geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)),
color = "black",
time_scale=TRUE,
align = TRUE,
size=5)+
geom_range("height_0.95_HPD",
color='blue',
size=2, alpha=.5) +
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)),
color = "black",
size = 5)+
theme_tree2() + scale_x_continuous(breaks = seq(2014,2022,2),labels = seq(2014,2022,2))+xlim_tree(2030)+
geom_vline(xintercept=c(2014.157,2016.553),linetype='dashed')+
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
mainTreeWGS <-viewClade(p,145) +
#High confidence Case 1->4
geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4',
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1,
arrow=arrow(length=unit(0.02, "npc")))+
#High confidence Case 3->8
geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21',
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->1
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->24
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->47
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->25
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->31
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
theme(axis.text.x = element_text(size = 14, face = "bold"))
tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(WGS_EXISTS == TRUE)
baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(BASE)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Base"))
clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(CLUSTER)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Main Cluster"))
importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(IMPORT)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Importation Status"))
barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Barrack/Bay"))
flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE_WGS)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Shared Flight"))
familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Family Group"))
# order - Tree, base, cluster, subcluster, Family Group
familyTilePlot %>% insert_left(barrackPlot,) %>%
insert_left(importationTilePlot,) %>%
insert_left(flightPlot,) %>%
insert_left(baseTilePlot,) %>%
insert_left(clusterTilePlot,) %>%
insert_left(mainTreeWGS,5)
```
## Bayesian Phylogeny N450
### Bayesian N450 with posterior support for node date
Bars show 95% HPD for the inferred **time in years** of the OAW common ancestor node.
```{r BEAST_WHOLE_N450, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
p <- TREE_BEAST_N450+
geom_hilight(node=145,fill = "pink", alpha = .5)+
geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+
theme_tree2() + xlim_tree(2022)
p
```
### N450 Supplement Tree, Bayesian
```{r BEAST_PARTIAL_meta_sub_N450, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE}
p <- TREE_BEAST_N450+
# Everything else is black
geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)),
color = "black",
time_scale=TRUE,
align = TRUE,
size=5)+
geom_tippoint(aes(subset=(N450_MATCHES_WGS == "SANGER_ONLY")),
color = "purple",
fill = "purple",
size = 5,
alpha = .5,
shape = 24)+
geom_range("height_0.95_HPD",
color='blue',
size=2, alpha=.5) +
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)),
color = "black",
size = 5)+
geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+
theme_tree2() + scale_x_continuous( breaks = seq(2020,2022,1),labels = seq(2020,2022,1))+xlim_tree(2024)
mainTreeN450 <-viewClade(p,167) +
#High confidence Case 1->4
geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4',
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1,
arrow=arrow(length=unit(0.02, "npc")))+
#High confidence Case 3->8
geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21',
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->1
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->24
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->47
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->25
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
#Lower confidence Case 35->31
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21',
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
arrow=arrow(length=unit(0.02, "npc")))+
theme(axis.text.x = element_text(size = 14, face = "bold"))
tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(N450_EXISTS == TRUE)
baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(BASE)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Base"))
clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(CLUSTER)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Main Cluster"))
importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(IMPORT)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Importation Status"))
barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Barrack/Bay"))
flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Shared Flight"))
familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') +
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+
scale_fill_adaptive(name = "nejm", palette = "default")+
theme_tree2()+
theme(axis.text.x = element_text(size = 14, face = "bold"))+
guides(fill = guide_legend(title = "Family Group"))
# order - Tree, base, cluster, subcluster, Family Group
familyTilePlot %>% insert_left(barrackPlot,) %>%
insert_left(importationTilePlot,) %>%
insert_left(flightPlot,) %>%
insert_left(baseTilePlot,) %>%
insert_left(clusterTilePlot,) %>%
insert_left(mainTreeN450,4)
```
### Combined comparison tree of WGS and N450
```{r WGS_N450_COMBINED, fig.height=12,fig.width=14, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
plot_grid(mainTreeN450, mainTreeWGS, labels = c('A', 'B'), label_size = 20)
```
### Base Distance Counts
```{r HAMMING_MATRIX_SETUP, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
### These are the functions needed for assembly and join-up of the base distance calculation tables
alignment_WGS <- readDNAMultipleAlignment(params$WGS_ALIGNMENT, format = "fasta")
alignment_N450 <- readDNAMultipleAlignment(params$N450_ALIGNMENT, format = "fasta")
#TODO: narrow it to get the N450 from the inclusive range 1126-1575
counts_matrix_WGS <-stringDist(unmasked(alignment_WGS), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE)
counts_matrix_N450 <-stringDist(unmasked(alignment_N450), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE)
#select(TAXON, index of row that = min(CASE))
# Select the taxon and the
counts_matrix_WGS <-as.matrix(counts_matrix_WGS) %>% as.tibble(rownames = "CASE_NOMENCLATURE")
counts_matrix_N450 <-as.matrix(counts_matrix_N450) %>% as.tibble(rownames = "CASE_NOMENCLATURE")
## Coerce the tree metadata to a tibble, filter out only the OAW cases
transform_single_result <-function(filterexpression, count_matrix) {
metadata_OAW_ONLY <- as.tibble(tipcategories) %>% filter(ISOAW == TRUE) %>%
filter(eval(parse(text = filterexpression))) %>%
select(TAXON, CASE, FAMILY_GROUP) %>%
mutate(LOWEST_CASE = min(CASE)) %>%
mutate(FIRST_TAXON = .[CASE == min(CASE),"TAXON"]) %>%
select(TAXON, CASE, FIRST_TAXON) %>%
left_join(count_matrix, by = c("TAXON" = "CASE_NOMENCLATURE"))
FIRST_TAXON_SELECTOR <-metadata_OAW_ONLY %>% distinct(FIRST_TAXON) %>% pluck(1,1)
#Make fake tibble with all case numbers.
joiner_tibble_cases <- tibble(CASE = 1:47) %>%
mutate(CASE = str_c("CASE_", CASE))
finalselectrenamed <-metadata_OAW_ONLY %>% select(TAXON, CASE, FIRST_TAXON, all_of(FIRST_TAXON_SELECTOR)) %>%
dplyr::rename(SUBSTITUTIONS_FROM_FIRST=4) %>%
mutate(grouping1 = str_replace_all(filterexpression, "\"", "_")) %>%
mutate(grouping2 = str_replace_all(grouping1, "\"", "_")) %>%
mutate(grouping3 = str_replace_all(grouping2, " == ", "_")) %>%
mutate(GROUPING = str_replace_all(grouping3, "__", "_")) %>%
mutate(CASE = str_c("CASE_", CASE)) %>%
select(-c(grouping1, grouping2, grouping3))
finalselectjoined <-left_join(joiner_tibble_cases, finalselectrenamed, by = "CASE") %>%
pivot_wider(names_from = CASE, values_from = SUBSTITUTIONS_FROM_FIRST) %>%
filter (!is.na(GROUPING)) %>%
select(-c(TAXON, FIRST_TAXON)) %>%
group_by(GROUPING) %>%
summarize_all(., ~ ifelse(any(., !is.na(.)), sum(., na.rm=TRUE), NA)) %>% ungroup
return(finalselectjoined)
}
```
### The substitution distance for WGS
```{r HAMMING_MATRIX_WGS, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
list_exposure_groups_WGs <-list("FAMILY_GROUP == \"A\"",
"FAMILY_GROUP == \"B\"",
"FAMILY_GROUP == \"C\"",
"FLIGHTS_WITH_MULTIPLE == \"E\"",
"FLIGHTS_WITH_MULTIPLE == \"F\"",
"FLIGHTS_WITH_MULTIPLE == \"G\"",
"FLIGHTS_WITH_MULTIPLE == \"M\"",
"FLIGHTS_WITH_MULTIPLE == \"O\"",
"FLIGHTS_WITH_MULTIPLE == \"R\"",
"IMPORT == \"International_Importation\"",
"IMPORT == \"US-Acquired\"",
"BARRACK_BAY_MULTIPLE == \"1725\"",
"BARRACK_BAY_MULTIPLE == \"1740\"",
"BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"",
"CASE %in% c(1,4)",
"CASE %in% c(3,8)",
"CASE %in% c(1,35)",
"CASE %in% c(24,35)",
"CASE %in% c(25,35)",
"CASE %in% c(31,35)",
"CASE %in% c(47,35)"
)
tibble_list_WGS <-lapply(list_exposure_groups_WGs,
transform_single_result, count_matrix = counts_matrix_WGS)
rbound_tibble_list_WGS <-bind_rows(tibble_list_WGS)
rbound_tibble_list_WGS
#kable(rbound_tibble_list_WGS, align = "c", caption = "Row-Bound Substitutions WGS.")%>%
# kable_styling("hover", full_width = F, fixed_thead = T) %>%
#row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
# scroll_box(width = "100%", height = "300px")
write_tsv(rbound_tibble_list_WGS, "WGS_SUBSTITUTIONS.tsv")
```
### The substitution distance for N450
```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
list_exposure_groups_N450 <-list("FAMILY_GROUP == \"A\"",
"FAMILY_GROUP == \"B\"",
"FAMILY_GROUP == \"C\"",
"FLIGHTS_WITH_MULTIPLE == \"E\"",
"FLIGHTS_WITH_MULTIPLE == \"F\"",
"FLIGHTS_WITH_MULTIPLE == \"G\"",
"FLIGHTS_WITH_MULTIPLE == \"M\"",
"FLIGHTS_WITH_MULTIPLE == \"O\"",
"FLIGHTS_WITH_MULTIPLE == \"R\"",
"IMPORT == \"International_Importation\"",
"IMPORT == \"US-Acquired\"",
"BARRACK_BAY_MULTIPLE == \"1725\"",
"BARRACK_BAY_MULTIPLE == \"1740\"",
"BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"",
"CASE %in% c(1,4)",
"CASE %in% c(3,8)",
"CASE %in% c(30,28)",
"CASE %in% c(1,35)",
"CASE %in% c(24,35)",
"CASE %in% c(25,35)",
"CASE %in% c(30,35)",
"CASE %in% c(31,35)",
"CASE %in% c(47,35)"
)
counts_matrix_N450
tibble_list_N450 <-lapply(list_exposure_groups_N450,
transform_single_result, count_matrix = counts_matrix_N450)
rbound_tibble_list_N450 <-bind_rows(tibble_list_N450)
rbound_tibble_list_N450
#kable(rbound_tibble_list_N450, align = "c", caption = "Row-Bound Substitutions N450.")%>%
# kable_styling("hover", full_width = F, fixed_thead = T) %>%
#row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
# scroll_box(width = "100%", height = "300px")
write_tsv(rbound_tibble_list_N450, "N450_SUBSTITUTIONS.tsv")
```
### The substitution distances for N450 and WGS, for select reported cases.
```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
# Case 3-8
cat('# Base Count Differences Case 3->8 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Wisconsin.USA_37.21_2","MVs_Wisconsin.USA_39.21"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Wisconsin.USA_37.21_2", "MVs_Wisconsin.USA_39.21"]
# Case 1-4
cat('# Base Count Differences Case 1->4 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"]
# Case 30-28
# Not in the matrix.
# Case Case35 -> Case1
cat('# Base Count Differences Case 35->1 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"]
# Case Case35 -> Case24
cat('# Base Count Differences Case 35->24 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"]
# Case Case35 -> Case25
cat('# Base Count Differences Case 35->25 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"]
# Case Case35 -> Case31
cat('# Base Count Differences Case 35->31 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"]
# Case Case35 -> Case30
#Not in the matrix
# Case Case35 -> Case47
cat('# Base Count Differences Case 35->47 `', '`\n')
cat('# WGS (N-L Span) `', '`\n')
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"]
cat('# N450 `', '`\n')
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"]
```