790 lines
36 KiB
Text
790 lines
36 KiB
Text
---
|
|
title: "Measles Virus B3"
|
|
subtitle: "Phylogenetic Topology in Final Presentation with Clade Relationships, Operation Allies Welcome"
|
|
author: "Andrew Beck, NCIRD/DVD/VVPDB"
|
|
date: "May 21, 2021"
|
|
output:
|
|
html_document:
|
|
toc: true
|
|
toc_float: true
|
|
toc_depth: 5
|
|
number_sections: true
|
|
params:
|
|
TREEFILE_WGS: # Path to Maximum Likelihood tree
|
|
BEASTFILE_WGS: # Path to WGS Bayesian tree
|
|
BEASTFILE_N450: # Path to N450 Bayesian tree
|
|
METADATA_TABLE: # Path to metadata table
|
|
SUBSTITUTIONS_TABLE: # Path to substitutions table
|
|
WGS_ALIGNMENT: # Path to WGS sequence alignment
|
|
N450_ALIGNMENT: # Path to N450 Bayesian tree
|
|
|
|
---
|
|
|
|
|
|
```{r setup, include=FALSE, echo = FALSE, warning=FALSE, message=FALSE}
|
|
library(ggtree)
|
|
library(ggsci)
|
|
library(pastecs)
|
|
library(tidyverse)
|
|
library(tidytree)
|
|
library(ggplot2)
|
|
library(ggpattern)
|
|
library(knitr)
|
|
library(phytools)
|
|
library(RColorBrewer)
|
|
library(kableExtra)
|
|
library(treeio)
|
|
library(scales)
|
|
library(plotly)
|
|
library(aplot)
|
|
library(purrr)
|
|
library(Biostrings)
|
|
library(DiagrammeR)
|
|
library(lubridate)
|
|
library(tidyquant)
|
|
library(stringr)
|
|
library(tracerer)
|
|
library(ggrepel)
|
|
library(cowplot)
|
|
library(ggprism)
|
|
library(gtable)
|
|
library(grid)
|
|
library(ape)
|
|
library(ggthemes)
|
|
library(magick)
|
|
library(magrittr)
|
|
library(dplyr)
|
|
knitr::opts_chunk$set(echo = FALSE,
|
|
warning = FALSE,
|
|
global.par = TRUE,
|
|
message = FALSE,
|
|
cache = FALSE,
|
|
root.dir =' C:/Users/lpu5/R_Projects/PHYLOGENETICS_MF/12032020_NY_WGS_FINAL')
|
|
```
|
|
```{r margins, include=FALSE, echo = FALSE}
|
|
par(mar=c(0,0,0,0))
|
|
```
|
|
|
|
```{r adaptive_scale_process, echo=FALSE, cache=FALSE}
|
|
# This will handle the interpolated scales. Courtesy https://nanx.me/blog/post/ggplot2-color-interpolation/
|
|
# Courtesy https://jianfengwu2022.github.io/
|
|
|
|
# @param values Color values.
|
|
pal_ramp <- function(values) {
|
|
force(values)
|
|
function(n) {
|
|
if (n <= length(values)) {
|
|
values[seq_len(n)]
|
|
} else {
|
|
colorRampPalette(values, alpha = TRUE)(n)
|
|
}
|
|
}
|
|
}
|
|
|
|
# Adaptive color palette generator
|
|
#
|
|
# Adaptive color palette generator for ggsci color palettes using `pal_ramp()`.
|
|
#
|
|
# @param name Color palette name in ggsci
|
|
# @param palette Color palette type in ggsci
|
|
# @param alpha Transparency level, a real number in (0, 1].
|
|
#
|
|
# @details See `names(ggsci:::ggsci_db)` for all color palette names in ggsci.
|
|
# See `names(ggsci:::ggsci_db$"pal")` for available palette types under
|
|
# the palette `pal`.
|
|
pal_adaptive <- function(name, palette, alpha = 1) {
|
|
if (alpha > 1L | alpha <= 0L) stop("alpha must be in (0, 1]")
|
|
|
|
raw_cols <- ggsci:::ggsci_db[[name]][[palette]]
|
|
raw_cols_rgb <- col2rgb(raw_cols)
|
|
alpha_cols <- rgb(
|
|
raw_cols_rgb[1L, ], raw_cols_rgb[2L, ], raw_cols_rgb[3L, ],
|
|
alpha = alpha * 255L, names = names(raw_cols),
|
|
maxColorValue = 255L
|
|
)
|
|
|
|
pal_ramp(unname(alpha_cols))
|
|
}
|
|
|
|
|
|
#Adaptive color scales
|
|
#
|
|
# @inheritParams pal_adaptive
|
|
# @param ... additional parameters for [ggplot2::discrete_scale()].
|
|
scale_color_adaptive <- function(name, palette, alpha = 1, ...) {
|
|
ggplot2::discrete_scale("colour", name, pal_adaptive(name, palette, alpha), ...)
|
|
}
|
|
|
|
scale_fill_adaptive <- function(name, palette, alpha = 1, ...) {
|
|
ggplot2::discrete_scale("fill", name, pal_adaptive(name, palette, alpha), ...)
|
|
}
|
|
|
|
```
|
|
|
|
```{r metadata_ingest, echo=FALSE, cache=FALSE}
|
|
tipcategories = read.table(params$METADATA_TABLE, sep = "\t",
|
|
#col.names = c("TAXON","IS_OAW","WHO_NAME","COUNTRY","STATE", "EPI_WEEK","YEAR"),
|
|
header = TRUE,
|
|
comment.char = "",
|
|
stringsAsFactors = FALSE)
|
|
|
|
|
|
```
|
|
|
|
```{r tree_ingest, echo=FALSE, warning=FALSE}
|
|
|
|
# Ingest the maximum likelihood tree, WGS
|
|
TREE_WGS_ML <- read.iqtree(file = params$TREEFILE_WGS)
|
|
TREE_WGS_ML <- root(as.phylo(TREE_WGS_ML), outgroup = "MVs_Manchester.GBR_7.12_2_B3", resolve.root=TRUE)
|
|
TREE_WGS_ML <- drop.tip(TREE_WGS_ML, "MVs_NewJersey.USA_8.19")
|
|
TREE_WGS_ML <- ggtree(TREE_WGS_ML) %<+% tipcategories
|
|
|
|
# Ingest the BEAST tree, WGS
|
|
TREE_BEAST_WGS <- read.beast(file = params$BEASTFILE_WGS)
|
|
TREE_BEAST_WGS <- ggtree(TREE_BEAST_WGS, mrsd="2021-10-07") %<+% tipcategories
|
|
|
|
# Ingest the BEAST tree, N450
|
|
TREE_BEAST_N450 <- read.beast(file = params$BEASTFILE_N450)
|
|
TREE_BEAST_N450 <- ggtree(TREE_BEAST_N450, mrsd="2021-10-07") %<+% tipcategories
|
|
|
|
# Extract time of internal nodes from N450 BEAST tree, display selected node numbers with dates.
|
|
my_tibble_N450 <-TREE_BEAST_N450 %>% as.treedata() %>% as.tibble %>%
|
|
mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>%
|
|
mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>%
|
|
mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>%
|
|
mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>%
|
|
mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>%
|
|
filter(node %in% c(17,167,168)) %>%
|
|
select(node,height_corrected,height_95_corrected_first,height_95_corrected_last)
|
|
|
|
kable(my_tibble_N450)
|
|
|
|
# Extract time of internal nodes from WGS BEAST tree, display selected node numbers with dates.
|
|
my_tibble_WGS <-TREE_BEAST_WGS %>% as.treedata() %>% as.tibble %>%
|
|
mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>%
|
|
mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>%
|
|
mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>%
|
|
mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>%
|
|
mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>%
|
|
filter(node %in% c(15,145,146)) %>%
|
|
select(node,height_corrected,height_95_corrected_first,height_95_corrected_last)
|
|
|
|
kable(my_tibble_WGS)
|
|
```
|
|
## Metadata
|
|
|
|
### Purpose
|
|
|
|
Measles virus WGS was performed on 44 specimens from OAW evacuee cases. 41 were successfully sequenced, and these are analyzed phylogenetically alongside several others from previous B3 outbreaks. 119 in total.
|
|
|
|
### Metadata Characteristics and Validation
|
|
|
|
Number of samples in metadata table: **`r nrow(tipcategories)`**
|
|
|
|
Maximum year in the metadata table: **`r max(tipcategories$YEAR)`**
|
|
|
|
Maximum decimal year in the metadata table, branch reference: **`r decimal_date(ymd("2021-10-07"))`**
|
|
|
|
Minimum year in the metadata table: **`r min(tipcategories$YEAR)`**
|
|
|
|
```{r TABLE_METADATA_SOURCE}
|
|
|
|
kable(tipcategories, align = "c", caption = "All metadata attached to trees displayed below.")%>%
|
|
kable_styling("hover", full_width = F, fixed_thead = T) %>%
|
|
row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
|
|
scroll_box(width = "100%", height = "300px")
|
|
```
|
|
|
|
### Maximum Likelihood Trees, not used in study
|
|
|
|
## Tree used for stub-out of annotations, only.
|
|
|
|
```{r ML_WGS, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
# Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter)
|
|
TREE_WGS_ML +
|
|
# If it's from the OAW study, label red
|
|
geom_tiplab(aes(label=WHO_NAME,
|
|
subset = ISOAW == "TRUE"),
|
|
color = "red", size=4.5) +
|
|
# Everything else is black
|
|
geom_tiplab(aes(label=WHO_NAME,
|
|
subset = ISOAW != "TRUE"),
|
|
color = "black", size=4.5)+
|
|
geom_nodelab(aes(label=node), color="black")+
|
|
xlim(0, .03)
|
|
|
|
```
|
|
|
|
## Whole B3 Tree with clade labels
|
|
|
|
```{r ML_WGS_whole, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
# Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter)
|
|
p <-TREE_WGS_ML +
|
|
# If it's from the OAW study, label red
|
|
geom_tiplab(aes(label=WHO_NAME,
|
|
subset = ISOAW == "TRUE"),
|
|
color = "red", size=4.5) +
|
|
# Everything else is black
|
|
geom_tiplab(aes(label=WHO_NAME,
|
|
subset = ISOAW != "TRUE"),
|
|
color = "black", size=4.5)+
|
|
#Label reasonable clusters.
|
|
geom_nodepoint(aes(subset=(as.numeric(label) >= 50)), size = 3, color = "black", fill = "grey", alpha = .5)+
|
|
geom_cladelab(node=122,label="OAW Entire",offset=.006,barsize=2,fontsize=8)+
|
|
geom_cladelab(node=152,label="Italy 2017",offset=.0035,barsize=2,fontsize=8)+
|
|
geom_cladelab(node=172,label="California 2019",offset=.0035,barsize=2,fontsize=8)+
|
|
geom_cladelab(node=145,label="A",offset=.0035,barsize=2,fontsize=8)+
|
|
geom_cladelab(node=126,label="B",offset=.0040,barsize=2,fontsize=8)+
|
|
geom_cladelab(node=125,label="C",offset=.0045,barsize=2,fontsize=8)+
|
|
geom_cladelab(node=124,label="D",offset=.005,barsize=2,fontsize=8)+
|
|
xlim(0, .03)
|
|
|
|
p
|
|
|
|
```
|
|
|
|
## Partial Maximum Likelihood Clade
|
|
|
|
This is identical to the "full" B3 tree, only the subtree is extracted for the current study sequences.
|
|
|
|
```{r ML_WGS_whole_clade, fig.height=10,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
viewClade(p, 122) + geom_tiplab(aes(label=WHO_NAME), color = "black", size=4.5)
|
|
```
|
|
|
|
## Bayesian Phylogeny WGS
|
|
|
|
Tree was calculated in BEAST v.2.7. Chain was 4x50 million steps, with 10 percent burnin and 1:10000 sampling from chain (18000 trees in total sample). Tree is
|
|
annotated as maximum clade credibility with mean branch lengths (time).
|
|
|
|
1. Models were compared and tested using IQtree v.2. Best support was TIM+G4+F (empirical frequencies) for concatenated coding regions, and TIM3+G4+F (empirical base frequencies) for concatenated noncoding regions.
|
|
2. Substitution Models are unlinked, meaning that the model controlling NCR and CDS is free to vary independently.
|
|
3. Clock Models are unlinked, NCR = strict, CDS = strict (linear).
|
|
4. Tree model is linked between partitions, Bayesian Skyline.
|
|
|
|
### Bayesian WGS with node numbers referenced
|
|
|
|
Node numbers are assigned by the R package and are not scientifically meaningful.
|
|
|
|
```{r BEAST_Nodereference, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
p <- TREE_BEAST_WGS+
|
|
# If the sequence is from the OAW study, label as red.
|
|
geom_tiplab(aes(label=WHO_NAME,
|
|
subset = ISOAW == "TRUE"),
|
|
color = "red", size=3) +
|
|
# Everything else is black.
|
|
geom_tiplab(aes(label=WHO_NAME,
|
|
subset = ISOAW != "TRUE"),
|
|
color = "black", size=3)+
|
|
theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023)
|
|
p
|
|
```
|
|
|
|
### Bayesian WGS with posterior support
|
|
|
|
1. Bars show 95% HPD for the inferred **time in years** of the node.
|
|
2. Points show nodes with > 0.5 posterior support in the sample.
|
|
|
|
```{r BEAST_WHOLE, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
p <- TREE_BEAST_WGS+
|
|
# Everything else is black
|
|
#geom_tiplab(aes(label=WHO_NAME),
|
|
# color = "black", size=3)+
|
|
geom_hilight(node=145,fill = "pink", alpha = .5)+
|
|
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .5)), color = "black", size = 5, fill="grey",alpha=0.5)+
|
|
theme_tree2() + xlim_tree(2022)
|
|
|
|
p
|
|
```
|
|
|
|
### Partial tree, with majority nodes shown if posterior > 0.9
|
|
|
|
This is a blow-up of the Bayesian subtree used for data presentation.
|
|
|
|
```{r BEAST_PARTIAL, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
p <- TREE_BEAST_WGS+
|
|
# Everything else is black
|
|
geom_tiplab(aes(label=WHO_NAME),time_scale=TRUE,
|
|
color = "black", size=5)+
|
|
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .9)), color = "black",
|
|
size = 5)+
|
|
theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023)
|
|
viewClade(p, 145)
|
|
```
|
|
|
|
### Partial B3 OAW subtree tree, containing metadata annotations
|
|
|
|
Notes:
|
|
|
|
1. This is a **partial** tree, graphically extracted from the larger B3 set that was modeled alongside.
|
|
2. Red arrows are highly confident transmission pairs, grey are less confident or hypothesized.
|
|
3. Some of the less confident transmissions (35-24, 35-47) cover considerable evolutionary distance (sum up all connecting branch lengths in years). These are highly implausible.
|
|
4. Confidence in the groupings/tree topology is a posterior probability, with black dots for nodes > .9. This means **very** high confidence in the split position in the tree, or the presence of the group bearing that common ancestor.
|
|
|
|
```{r BEAST_PARTIAL_meta, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE, dpi=300}
|
|
insetTree <-TREE_BEAST_WGS+
|
|
geom_hilight(node=145,fill = "pink", alpha = .5)+
|
|
theme_void() + xlim_tree(2022)
|
|
|
|
|
|
p <- TREE_BEAST_WGS+
|
|
geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)),
|
|
color = "black",
|
|
time_scale=TRUE,
|
|
align = TRUE,
|
|
size=5)+
|
|
geom_range("height_0.95_HPD",
|
|
color='blue',
|
|
size=2, alpha=.5) +
|
|
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)),
|
|
color = "black",
|
|
size = 5)+
|
|
theme_tree2() + scale_x_continuous(breaks = seq(2014,2022,2),labels = seq(2014,2022,2))+xlim_tree(2030)+
|
|
geom_vline(xintercept=c(2014.157,2016.553),linetype='dashed')+
|
|
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
|
|
|
|
mainTreeWGS <-viewClade(p,145) +
|
|
#High confidence Case 1->4
|
|
geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4',
|
|
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#High confidence Case 3->8
|
|
geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21',
|
|
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->1
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->24
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->47
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->25
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->31
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))
|
|
|
|
|
|
tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(WGS_EXISTS == TRUE)
|
|
|
|
|
|
baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(BASE)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Base"))
|
|
|
|
clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(CLUSTER)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Main Cluster"))
|
|
|
|
importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(IMPORT)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Importation Status"))
|
|
|
|
barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Barrack/Bay"))
|
|
|
|
flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE_WGS)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Shared Flight"))
|
|
|
|
familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Family Group"))
|
|
|
|
|
|
# order - Tree, base, cluster, subcluster, Family Group
|
|
familyTilePlot %>% insert_left(barrackPlot,) %>%
|
|
insert_left(importationTilePlot,) %>%
|
|
insert_left(flightPlot,) %>%
|
|
insert_left(baseTilePlot,) %>%
|
|
insert_left(clusterTilePlot,) %>%
|
|
insert_left(mainTreeWGS,5)
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
## Bayesian Phylogeny N450
|
|
|
|
### Bayesian N450 with posterior support for node date
|
|
|
|
Bars show 95% HPD for the inferred **time in years** of the OAW common ancestor node.
|
|
|
|
```{r BEAST_WHOLE_N450, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
p <- TREE_BEAST_N450+
|
|
geom_hilight(node=145,fill = "pink", alpha = .5)+
|
|
geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+
|
|
theme_tree2() + xlim_tree(2022)
|
|
|
|
p
|
|
```
|
|
|
|
### N450 Supplement Tree, Bayesian
|
|
|
|
```{r BEAST_PARTIAL_meta_sub_N450, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE}
|
|
|
|
|
|
p <- TREE_BEAST_N450+
|
|
# Everything else is black
|
|
geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)),
|
|
color = "black",
|
|
time_scale=TRUE,
|
|
align = TRUE,
|
|
size=5)+
|
|
geom_tippoint(aes(subset=(N450_MATCHES_WGS == "SANGER_ONLY")),
|
|
color = "purple",
|
|
fill = "purple",
|
|
size = 5,
|
|
alpha = .5,
|
|
shape = 24)+
|
|
geom_range("height_0.95_HPD",
|
|
color='blue',
|
|
size=2, alpha=.5) +
|
|
geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)),
|
|
color = "black",
|
|
size = 5)+
|
|
geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+
|
|
theme_tree2() + scale_x_continuous( breaks = seq(2020,2022,1),labels = seq(2020,2022,1))+xlim_tree(2024)
|
|
|
|
|
|
mainTreeN450 <-viewClade(p,167) +
|
|
#High confidence Case 1->4
|
|
geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4',
|
|
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#High confidence Case 3->8
|
|
geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21',
|
|
color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->1
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->24
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->47
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
|
|
#Lower confidence Case 35->25
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
#Lower confidence Case 35->31
|
|
geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21',
|
|
color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
|
|
arrow=arrow(length=unit(0.02, "npc")))+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))
|
|
|
|
|
|
tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(N450_EXISTS == TRUE)
|
|
|
|
baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(BASE)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Base"))
|
|
|
|
clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(CLUSTER)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Main Cluster"))
|
|
|
|
importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(IMPORT)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Importation Status"))
|
|
|
|
barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Barrack/Bay"))
|
|
|
|
flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Shared Flight"))
|
|
|
|
familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
|
|
geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') +
|
|
scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+
|
|
scale_fill_adaptive(name = "nejm", palette = "default")+
|
|
theme_tree2()+
|
|
theme(axis.text.x = element_text(size = 14, face = "bold"))+
|
|
guides(fill = guide_legend(title = "Family Group"))
|
|
|
|
|
|
# order - Tree, base, cluster, subcluster, Family Group
|
|
familyTilePlot %>% insert_left(barrackPlot,) %>%
|
|
insert_left(importationTilePlot,) %>%
|
|
insert_left(flightPlot,) %>%
|
|
insert_left(baseTilePlot,) %>%
|
|
insert_left(clusterTilePlot,) %>%
|
|
insert_left(mainTreeN450,4)
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
### Combined comparison tree of WGS and N450
|
|
|
|
```{r WGS_N450_COMBINED, fig.height=12,fig.width=14, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
plot_grid(mainTreeN450, mainTreeWGS, labels = c('A', 'B'), label_size = 20)
|
|
```
|
|
|
|
### Base Distance Counts
|
|
|
|
```{r HAMMING_MATRIX_SETUP, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
### These are the functions needed for assembly and join-up of the base distance calculation tables
|
|
|
|
|
|
alignment_WGS <- readDNAMultipleAlignment(params$WGS_ALIGNMENT, format = "fasta")
|
|
alignment_N450 <- readDNAMultipleAlignment(params$N450_ALIGNMENT, format = "fasta")
|
|
|
|
#TODO: narrow it to get the N450 from the inclusive range 1126-1575
|
|
counts_matrix_WGS <-stringDist(unmasked(alignment_WGS), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE)
|
|
counts_matrix_N450 <-stringDist(unmasked(alignment_N450), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE)
|
|
|
|
#select(TAXON, index of row that = min(CASE))
|
|
# Select the taxon and the
|
|
counts_matrix_WGS <-as.matrix(counts_matrix_WGS) %>% as.tibble(rownames = "CASE_NOMENCLATURE")
|
|
counts_matrix_N450 <-as.matrix(counts_matrix_N450) %>% as.tibble(rownames = "CASE_NOMENCLATURE")
|
|
|
|
## Coerce the tree metadata to a tibble, filter out only the OAW cases
|
|
|
|
transform_single_result <-function(filterexpression, count_matrix) {
|
|
metadata_OAW_ONLY <- as.tibble(tipcategories) %>% filter(ISOAW == TRUE) %>%
|
|
filter(eval(parse(text = filterexpression))) %>%
|
|
select(TAXON, CASE, FAMILY_GROUP) %>%
|
|
mutate(LOWEST_CASE = min(CASE)) %>%
|
|
mutate(FIRST_TAXON = .[CASE == min(CASE),"TAXON"]) %>%
|
|
select(TAXON, CASE, FIRST_TAXON) %>%
|
|
left_join(count_matrix, by = c("TAXON" = "CASE_NOMENCLATURE"))
|
|
|
|
FIRST_TAXON_SELECTOR <-metadata_OAW_ONLY %>% distinct(FIRST_TAXON) %>% pluck(1,1)
|
|
|
|
|
|
#Make fake tibble with all case numbers.
|
|
joiner_tibble_cases <- tibble(CASE = 1:47) %>%
|
|
mutate(CASE = str_c("CASE_", CASE))
|
|
|
|
finalselectrenamed <-metadata_OAW_ONLY %>% select(TAXON, CASE, FIRST_TAXON, all_of(FIRST_TAXON_SELECTOR)) %>%
|
|
dplyr::rename(SUBSTITUTIONS_FROM_FIRST=4) %>%
|
|
mutate(grouping1 = str_replace_all(filterexpression, "\"", "_")) %>%
|
|
mutate(grouping2 = str_replace_all(grouping1, "\"", "_")) %>%
|
|
mutate(grouping3 = str_replace_all(grouping2, " == ", "_")) %>%
|
|
mutate(GROUPING = str_replace_all(grouping3, "__", "_")) %>%
|
|
mutate(CASE = str_c("CASE_", CASE)) %>%
|
|
select(-c(grouping1, grouping2, grouping3))
|
|
|
|
finalselectjoined <-left_join(joiner_tibble_cases, finalselectrenamed, by = "CASE") %>%
|
|
pivot_wider(names_from = CASE, values_from = SUBSTITUTIONS_FROM_FIRST) %>%
|
|
filter (!is.na(GROUPING)) %>%
|
|
select(-c(TAXON, FIRST_TAXON)) %>%
|
|
group_by(GROUPING) %>%
|
|
summarize_all(., ~ ifelse(any(., !is.na(.)), sum(., na.rm=TRUE), NA)) %>% ungroup
|
|
|
|
return(finalselectjoined)
|
|
}
|
|
|
|
|
|
|
|
```
|
|
|
|
### The substitution distance for WGS
|
|
|
|
```{r HAMMING_MATRIX_WGS, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
|
|
list_exposure_groups_WGs <-list("FAMILY_GROUP == \"A\"",
|
|
"FAMILY_GROUP == \"B\"",
|
|
"FAMILY_GROUP == \"C\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"E\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"F\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"G\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"M\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"O\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"R\"",
|
|
"IMPORT == \"International_Importation\"",
|
|
"IMPORT == \"US-Acquired\"",
|
|
"BARRACK_BAY_MULTIPLE == \"1725\"",
|
|
"BARRACK_BAY_MULTIPLE == \"1740\"",
|
|
"BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"",
|
|
"CASE %in% c(1,4)",
|
|
"CASE %in% c(3,8)",
|
|
"CASE %in% c(1,35)",
|
|
"CASE %in% c(24,35)",
|
|
"CASE %in% c(25,35)",
|
|
"CASE %in% c(31,35)",
|
|
"CASE %in% c(47,35)"
|
|
)
|
|
|
|
tibble_list_WGS <-lapply(list_exposure_groups_WGs,
|
|
transform_single_result, count_matrix = counts_matrix_WGS)
|
|
rbound_tibble_list_WGS <-bind_rows(tibble_list_WGS)
|
|
rbound_tibble_list_WGS
|
|
#kable(rbound_tibble_list_WGS, align = "c", caption = "Row-Bound Substitutions WGS.")%>%
|
|
# kable_styling("hover", full_width = F, fixed_thead = T) %>%
|
|
#row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
|
|
# scroll_box(width = "100%", height = "300px")
|
|
|
|
write_tsv(rbound_tibble_list_WGS, "WGS_SUBSTITUTIONS.tsv")
|
|
```
|
|
|
|
### The substitution distance for N450
|
|
|
|
```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
|
|
list_exposure_groups_N450 <-list("FAMILY_GROUP == \"A\"",
|
|
"FAMILY_GROUP == \"B\"",
|
|
"FAMILY_GROUP == \"C\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"E\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"F\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"G\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"M\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"O\"",
|
|
"FLIGHTS_WITH_MULTIPLE == \"R\"",
|
|
"IMPORT == \"International_Importation\"",
|
|
"IMPORT == \"US-Acquired\"",
|
|
"BARRACK_BAY_MULTIPLE == \"1725\"",
|
|
"BARRACK_BAY_MULTIPLE == \"1740\"",
|
|
"BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"",
|
|
"CASE %in% c(1,4)",
|
|
"CASE %in% c(3,8)",
|
|
"CASE %in% c(30,28)",
|
|
"CASE %in% c(1,35)",
|
|
"CASE %in% c(24,35)",
|
|
"CASE %in% c(25,35)",
|
|
"CASE %in% c(30,35)",
|
|
"CASE %in% c(31,35)",
|
|
"CASE %in% c(47,35)"
|
|
)
|
|
counts_matrix_N450
|
|
tibble_list_N450 <-lapply(list_exposure_groups_N450,
|
|
transform_single_result, count_matrix = counts_matrix_N450)
|
|
rbound_tibble_list_N450 <-bind_rows(tibble_list_N450)
|
|
rbound_tibble_list_N450
|
|
#kable(rbound_tibble_list_N450, align = "c", caption = "Row-Bound Substitutions N450.")%>%
|
|
# kable_styling("hover", full_width = F, fixed_thead = T) %>%
|
|
#row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
|
|
# scroll_box(width = "100%", height = "300px")
|
|
|
|
write_tsv(rbound_tibble_list_N450, "N450_SUBSTITUTIONS.tsv")
|
|
|
|
```
|
|
|
|
### The substitution distances for N450 and WGS, for select reported cases.
|
|
|
|
```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
|
|
# Case 3-8
|
|
cat('# Base Count Differences Case 3->8 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Wisconsin.USA_37.21_2","MVs_Wisconsin.USA_39.21"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Wisconsin.USA_37.21_2", "MVs_Wisconsin.USA_39.21"]
|
|
# Case 1-4
|
|
cat('# Base Count Differences Case 1->4 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"]
|
|
# Case 30-28
|
|
|
|
# Not in the matrix.
|
|
|
|
# Case Case35 -> Case1
|
|
cat('# Base Count Differences Case 35->1 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"]
|
|
|
|
# Case Case35 -> Case24
|
|
cat('# Base Count Differences Case 35->24 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"]
|
|
# Case Case35 -> Case25
|
|
cat('# Base Count Differences Case 35->25 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"]
|
|
# Case Case35 -> Case31
|
|
cat('# Base Count Differences Case 35->31 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"]
|
|
# Case Case35 -> Case30
|
|
#Not in the matrix
|
|
|
|
# Case Case35 -> Case47
|
|
cat('# Base Count Differences Case 35->47 `', '`\n')
|
|
cat('# WGS (N-L Span) `', '`\n')
|
|
counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"]
|
|
cat('# N450 `', '`\n')
|
|
counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"]
|
|
|
|
```
|