CDC-Data-2025/attachments/05042023_OAW_GENETICS_ANALYSES.txt

---
title: "Measles Virus B3"
subtitle: "Phylogenetic Topology in Final Presentation with Clade Relationships, Operation Allies Welcome"
author: "Andrew Beck, NCIRD/DVD/VVPDB"
date: "May 21, 2021"
output:
    html_document:
        toc: true
        toc_float: true
        toc_depth: 5
        number_sections: true
params:
    TREEFILE_WGS: # Path to Maximum Likelihood tree
    BEASTFILE_WGS: # Path to WGS Bayesian tree
    BEASTFILE_N450: # Path to N450 Bayesian tree
    METADATA_TABLE: # Path to metadata table
    SUBSTITUTIONS_TABLE: # Path to substitutions table
    WGS_ALIGNMENT: # Path to WGS sequence alignment
    N450_ALIGNMENT: # Path to N450 Bayesian tree

---


```{r setup, include=FALSE, echo = FALSE, warning=FALSE, message=FALSE}
library(ggtree)
library(ggsci)
library(pastecs)
library(tidyverse)
library(tidytree)
library(ggplot2)
library(ggpattern)
library(knitr)
library(phytools)
library(RColorBrewer)
library(kableExtra)
library(treeio)
library(scales)
library(plotly)
library(aplot)
library(purrr)
library(Biostrings)
library(DiagrammeR)
library(lubridate)
library(tidyquant)
library(stringr)
library(tracerer)
library(ggrepel)
library(cowplot)
library(ggprism)
library(gtable)
library(grid)
library(ape)
library(ggthemes)
library(magick)
library(magrittr)
library(dplyr)
knitr::opts_chunk$set(echo = FALSE,
                      warning = FALSE,
                      global.par = TRUE,
                      message = FALSE,
                      cache = FALSE,
                      root.dir =' C:/Users/lpu5/R_Projects/PHYLOGENETICS_MF/12032020_NY_WGS_FINAL')
```
```{r margins, include=FALSE, echo = FALSE}
par(mar=c(0,0,0,0))
```

```{r adaptive_scale_process, echo=FALSE, cache=FALSE}
# This will handle the interpolated scales. Courtesy https://nanx.me/blog/post/ggplot2-color-interpolation/
# Courtesy https://jianfengwu2022.github.io/

# @param values Color values.
pal_ramp <- function(values) {
  force(values)
  function(n) {
    if (n <= length(values)) {
      values[seq_len(n)]
    } else {
      colorRampPalette(values, alpha = TRUE)(n)
    }
  }
}

# Adaptive color palette generator
#
# Adaptive color palette generator for ggsci color palettes using `pal_ramp()`.
#
# @param name Color palette name in ggsci
# @param palette Color palette type in ggsci
# @param alpha Transparency level, a real number in (0, 1].
#
# @details See `names(ggsci:::ggsci_db)` for all color palette names in ggsci.
# See `names(ggsci:::ggsci_db$"pal")` for available palette types under
# the palette `pal`.
pal_adaptive <- function(name, palette, alpha = 1) {
  if (alpha > 1L | alpha <= 0L) stop("alpha must be in (0, 1]")

  raw_cols <- ggsci:::ggsci_db[[name]][[palette]]
  raw_cols_rgb <- col2rgb(raw_cols)
  alpha_cols <- rgb(
    raw_cols_rgb[1L, ], raw_cols_rgb[2L, ], raw_cols_rgb[3L, ],
    alpha = alpha * 255L, names = names(raw_cols),
    maxColorValue = 255L
  )

  pal_ramp(unname(alpha_cols))
}


#Adaptive color scales
#
# @inheritParams pal_adaptive
# @param ... additional parameters for [ggplot2::discrete_scale()].
scale_color_adaptive <- function(name, palette, alpha = 1, ...) {
  ggplot2::discrete_scale("colour", name, pal_adaptive(name, palette, alpha), ...)
}

scale_fill_adaptive <- function(name, palette, alpha = 1, ...) {
  ggplot2::discrete_scale("fill", name, pal_adaptive(name, palette, alpha), ...)
}

```

```{r metadata_ingest, echo=FALSE, cache=FALSE}
tipcategories = read.table(params$METADATA_TABLE, sep = "\t",
                          #col.names = c("TAXON","IS_OAW","WHO_NAME","COUNTRY","STATE", "EPI_WEEK","YEAR"),
                          header = TRUE,
                          comment.char = "",
                          stringsAsFactors = FALSE)


```

```{r tree_ingest, echo=FALSE, warning=FALSE}

# Ingest the maximum likelihood tree, WGS
TREE_WGS_ML <- read.iqtree(file = params$TREEFILE_WGS)
TREE_WGS_ML <- root(as.phylo(TREE_WGS_ML), outgroup = "MVs_Manchester.GBR_7.12_2_B3", resolve.root=TRUE)
TREE_WGS_ML <- drop.tip(TREE_WGS_ML, "MVs_NewJersey.USA_8.19")
TREE_WGS_ML <- ggtree(TREE_WGS_ML) %<+% tipcategories

# Ingest the BEAST tree, WGS
TREE_BEAST_WGS <- read.beast(file = params$BEASTFILE_WGS)
TREE_BEAST_WGS <- ggtree(TREE_BEAST_WGS, mrsd="2021-10-07") %<+% tipcategories

# Ingest the BEAST tree, N450
TREE_BEAST_N450 <- read.beast(file = params$BEASTFILE_N450)
TREE_BEAST_N450 <- ggtree(TREE_BEAST_N450, mrsd="2021-10-07") %<+% tipcategories

# Extract time of internal nodes from N450 BEAST tree, display selected node numbers with dates.
my_tibble_N450 <-TREE_BEAST_N450 %>% as.treedata() %>% as.tibble %>%
  mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>%
  mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>%
  mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>%
  mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>%
  mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>%
  filter(node %in% c(17,167,168)) %>%
  select(node,height_corrected,height_95_corrected_first,height_95_corrected_last)

kable(my_tibble_N450)

# Extract time of internal nodes from WGS BEAST tree, display selected node numbers with dates.
my_tibble_WGS <-TREE_BEAST_WGS %>% as.treedata() %>% as.tibble %>%
  mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>%
  mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>%
  mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>%
  mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>%
  mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>%
  filter(node %in% c(15,145,146)) %>%
  select(node,height_corrected,height_95_corrected_first,height_95_corrected_last)

kable(my_tibble_WGS)
```
## Metadata

### Purpose

Measles virus WGS was performed on 44 specimens from OAW evacuee cases. 41 were successfully sequenced, and these are analyzed phylogenetically alongside several others from previous B3 outbreaks. 119 in total.

### Metadata Characteristics and Validation

Number of samples in metadata table: **`r nrow(tipcategories)`**

Maximum year in the metadata table: **`r max(tipcategories$YEAR)`**

Maximum decimal year in the metadata table, branch reference: **`r decimal_date(ymd("2021-10-07"))`**

Minimum year in the metadata table: **`r min(tipcategories$YEAR)`**

```{r TABLE_METADATA_SOURCE}

kable(tipcategories, align = "c", caption = "All metadata attached to trees displayed below.")%>%
  kable_styling("hover", full_width = F, fixed_thead = T) %>%
  row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
  scroll_box(width = "100%", height = "300px")
```

### Maximum Likelihood Trees, not used in study

## Tree used for stub-out of annotations, only.

```{r ML_WGS, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

# Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter)
TREE_WGS_ML +
  # If it's from the OAW study, label red
      geom_tiplab(aes(label=WHO_NAME,
                      subset = ISOAW == "TRUE"),
                      color = "red", size=4.5) +
  # Everything else is black
      geom_tiplab(aes(label=WHO_NAME,
                      subset = ISOAW != "TRUE"),
                      color = "black", size=4.5)+
      geom_nodelab(aes(label=node), color="black")+
xlim(0, .03)

```

## Whole B3 Tree with clade labels

```{r ML_WGS_whole, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

# Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter)
p <-TREE_WGS_ML +
  # If it's from the OAW study, label red
      geom_tiplab(aes(label=WHO_NAME,
                      subset = ISOAW == "TRUE"),
                      color = "red", size=4.5) +
  # Everything else is black
      geom_tiplab(aes(label=WHO_NAME,
                      subset = ISOAW != "TRUE"),
                      color = "black", size=4.5)+
  #Label reasonable clusters.
      geom_nodepoint(aes(subset=(as.numeric(label) >= 50)), size = 3, color = "black", fill = "grey", alpha = .5)+
      geom_cladelab(node=122,label="OAW Entire",offset=.006,barsize=2,fontsize=8)+
      geom_cladelab(node=152,label="Italy 2017",offset=.0035,barsize=2,fontsize=8)+
      geom_cladelab(node=172,label="California 2019",offset=.0035,barsize=2,fontsize=8)+
      geom_cladelab(node=145,label="A",offset=.0035,barsize=2,fontsize=8)+
      geom_cladelab(node=126,label="B",offset=.0040,barsize=2,fontsize=8)+
      geom_cladelab(node=125,label="C",offset=.0045,barsize=2,fontsize=8)+
      geom_cladelab(node=124,label="D",offset=.005,barsize=2,fontsize=8)+
xlim(0, .03)

p

```

## Partial Maximum Likelihood Clade

This is identical to the "full" B3 tree, only the subtree is extracted for the current study sequences.

```{r ML_WGS_whole_clade, fig.height=10,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
viewClade(p, 122)  + geom_tiplab(aes(label=WHO_NAME), color = "black", size=4.5)
```

## Bayesian Phylogeny WGS

Tree was calculated in BEAST v.2.7. Chain was 4x50 million steps, with 10 percent burnin and 1:10000 sampling from chain (18000 trees in total sample). Tree is
annotated as maximum clade credibility with mean branch lengths (time).

1. Models were compared and tested using IQtree v.2. Best support was TIM+G4+F (empirical frequencies) for concatenated coding regions, and TIM3+G4+F (empirical base frequencies) for concatenated noncoding regions.
2. Substitution Models are unlinked, meaning that the model controlling NCR and CDS is free to vary independently.
3. Clock Models are unlinked, NCR = strict, CDS = strict (linear).
4. Tree model is linked between partitions, Bayesian Skyline.

### Bayesian WGS with node numbers referenced

Node numbers are assigned by the R package and are not scientifically meaningful.

```{r BEAST_Nodereference, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

p <- TREE_BEAST_WGS+
  # If the sequence is from the OAW study, label as red.
      geom_tiplab(aes(label=WHO_NAME,
                      subset = ISOAW == "TRUE"),
                      color = "red", size=3) +
  # Everything else is black.
      geom_tiplab(aes(label=WHO_NAME,
                      subset = ISOAW != "TRUE"),
                      color = "black", size=3)+
  theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023)
p
```

### Bayesian WGS with posterior support

1. Bars show 95% HPD for the inferred **time in years** of the node.
2. Points show nodes with > 0.5 posterior support in the sample.

```{r BEAST_WHOLE, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

p <- TREE_BEAST_WGS+
  # Everything else is black
      #geom_tiplab(aes(label=WHO_NAME),
      #                color = "black", size=3)+
      geom_hilight(node=145,fill = "pink", alpha = .5)+
      geom_nodepoint(aes(subset=(as.numeric(posterior) >= .5)), color = "black", size = 5, fill="grey",alpha=0.5)+
  theme_tree2() + xlim_tree(2022)

p
```

### Partial tree, with majority nodes shown if posterior > 0.9

This is a blow-up of the Bayesian subtree used for data presentation.

```{r BEAST_PARTIAL, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

p <- TREE_BEAST_WGS+
  # Everything else is black
      geom_tiplab(aes(label=WHO_NAME),time_scale=TRUE,
                      color = "black", size=5)+
      geom_nodepoint(aes(subset=(as.numeric(posterior) >= .9)), color = "black",
                    size = 5)+
  theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023)
viewClade(p, 145)
```

### Partial B3 OAW subtree tree, containing metadata annotations

Notes:

1. This is a **partial** tree, graphically extracted from the larger B3 set that was modeled alongside.
2. Red arrows are highly confident transmission pairs, grey are less confident or hypothesized.
3. Some of the less confident transmissions (35-24, 35-47) cover considerable evolutionary distance (sum up all connecting branch lengths in years). These are highly implausible.
4. Confidence in the groupings/tree topology is a posterior probability, with black dots for nodes > .9. This means **very** high confidence in the split position in the tree, or the presence of the group bearing that common ancestor.

```{r BEAST_PARTIAL_meta, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE, dpi=300}
insetTree <-TREE_BEAST_WGS+
      geom_hilight(node=145,fill = "pink", alpha = .5)+
      theme_void() + xlim_tree(2022)


p <- TREE_BEAST_WGS+
      geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)),
                      color = "black",
                      time_scale=TRUE,
                      align = TRUE,
                      size=5)+
      geom_range("height_0.95_HPD",
                    color='blue',
                    size=2, alpha=.5) +
      geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)),
                    color = "black",
                    size = 5)+
  theme_tree2() + scale_x_continuous(breaks = seq(2014,2022,2),labels = seq(2014,2022,2))+xlim_tree(2030)+
  geom_vline(xintercept=c(2014.157,2016.553),linetype='dashed')+
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))

mainTreeWGS <-viewClade(p,145) +
    #High confidence Case 1->4
    geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4',
                  color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #High confidence Case 3->8
    geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21',
                  color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->1
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->24
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->47
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->25
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->31
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    theme(axis.text.x = element_text(size = 14, face = "bold"))


tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(WGS_EXISTS == TRUE)


baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(BASE)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Base"))

clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(CLUSTER)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Main Cluster"))

importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(IMPORT)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Importation Status"))

barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Barrack/Bay"))

flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE_WGS)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Shared Flight"))

familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Family Group"))


 # order - Tree, base, cluster, subcluster, Family Group
familyTilePlot %>% insert_left(barrackPlot,) %>%
                insert_left(importationTilePlot,) %>%
                insert_left(flightPlot,) %>%
                insert_left(baseTilePlot,) %>%
                insert_left(clusterTilePlot,) %>%
                insert_left(mainTreeWGS,5)


```


## Bayesian Phylogeny N450

### Bayesian N450 with posterior support for node date

Bars show 95% HPD for the inferred **time in years** of the OAW common ancestor node.

```{r BEAST_WHOLE_N450, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

p <- TREE_BEAST_N450+
      geom_hilight(node=145,fill = "pink", alpha = .5)+
      geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+
      theme_tree2() + xlim_tree(2022)

p
```

### N450 Supplement Tree, Bayesian

```{r BEAST_PARTIAL_meta_sub_N450, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE}


p <- TREE_BEAST_N450+
  # Everything else is black
            geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)),
                      color = "black",
                      time_scale=TRUE,
                      align = TRUE,
                      size=5)+
            geom_tippoint(aes(subset=(N450_MATCHES_WGS == "SANGER_ONLY")),
                    color = "purple",
                    fill = "purple",
                    size = 5,
                    alpha = .5,
                    shape = 24)+
            geom_range("height_0.95_HPD",
                    color='blue',
                    size=2, alpha=.5) +
            geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)),
                    color = "black",
                    size = 5)+
            geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+
            theme_tree2() + scale_x_continuous( breaks = seq(2020,2022,1),labels = seq(2020,2022,1))+xlim_tree(2024)


mainTreeN450 <-viewClade(p,167) +
    #High confidence Case 1->4
    geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4',
                  color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #High confidence Case 3->8
    geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21',
                  color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->1
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->24
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->47
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+

    #Lower confidence Case 35->25
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
    #Lower confidence Case 35->31
    geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21',
                  color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5,
                  arrow=arrow(length=unit(0.02, "npc")))+
   theme(axis.text.x = element_text(size = 14, face = "bold"))


tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(N450_EXISTS == TRUE)

baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(BASE)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Base"))

clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(CLUSTER)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Main Cluster"))

importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(IMPORT)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Importation Status"))

barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Barrack/Bay"))

flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Shared Flight"))

 familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+
                   geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') +
                   scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+
                   scale_fill_adaptive(name = "nejm", palette = "default")+
                   theme_tree2()+
                   theme(axis.text.x = element_text(size = 14, face = "bold"))+
                   guides(fill = guide_legend(title = "Family Group"))


 # order - Tree, base, cluster, subcluster, Family Group
familyTilePlot %>% insert_left(barrackPlot,) %>%
                insert_left(importationTilePlot,) %>%
                insert_left(flightPlot,) %>%
                insert_left(baseTilePlot,) %>%
                insert_left(clusterTilePlot,) %>%
                insert_left(mainTreeN450,4)


```

### Combined comparison tree of WGS and N450

```{r WGS_N450_COMBINED, fig.height=12,fig.width=14, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

plot_grid(mainTreeN450, mainTreeWGS, labels = c('A', 'B'), label_size = 20)
```

### Base Distance Counts

```{r HAMMING_MATRIX_SETUP, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
### These are the functions needed for assembly and join-up of the base distance calculation tables


alignment_WGS <- readDNAMultipleAlignment(params$WGS_ALIGNMENT, format = "fasta")
alignment_N450 <- readDNAMultipleAlignment(params$N450_ALIGNMENT, format = "fasta")

#TODO: narrow it to get the N450 from the inclusive range 1126-1575
 counts_matrix_WGS <-stringDist(unmasked(alignment_WGS), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE)
 counts_matrix_N450 <-stringDist(unmasked(alignment_N450), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE)

#select(TAXON, index of row that  = min(CASE))
# Select the taxon and the
counts_matrix_WGS <-as.matrix(counts_matrix_WGS) %>% as.tibble(rownames = "CASE_NOMENCLATURE")
counts_matrix_N450 <-as.matrix(counts_matrix_N450) %>% as.tibble(rownames = "CASE_NOMENCLATURE")

## Coerce the tree metadata to a tibble, filter out only the OAW cases

transform_single_result <-function(filterexpression, count_matrix) {
metadata_OAW_ONLY <- as.tibble(tipcategories) %>% filter(ISOAW == TRUE) %>%
                     filter(eval(parse(text = filterexpression))) %>%
                     select(TAXON, CASE, FAMILY_GROUP) %>%
                     mutate(LOWEST_CASE = min(CASE)) %>%
                     mutate(FIRST_TAXON = .[CASE == min(CASE),"TAXON"]) %>%
                     select(TAXON, CASE, FIRST_TAXON) %>%
                     left_join(count_matrix, by = c("TAXON" = "CASE_NOMENCLATURE"))

FIRST_TAXON_SELECTOR <-metadata_OAW_ONLY %>% distinct(FIRST_TAXON) %>% pluck(1,1)


#Make fake tibble with all case numbers.
joiner_tibble_cases <- tibble(CASE = 1:47) %>%
                       mutate(CASE = str_c("CASE_", CASE))

finalselectrenamed <-metadata_OAW_ONLY %>% select(TAXON, CASE, FIRST_TAXON, all_of(FIRST_TAXON_SELECTOR)) %>%
                         dplyr::rename(SUBSTITUTIONS_FROM_FIRST=4) %>%
                         mutate(grouping1 = str_replace_all(filterexpression, "\"", "_")) %>%
                         mutate(grouping2 = str_replace_all(grouping1, "\"", "_")) %>%
                         mutate(grouping3 = str_replace_all(grouping2, " == ", "_")) %>%
                         mutate(GROUPING = str_replace_all(grouping3, "__", "_")) %>%
                         mutate(CASE = str_c("CASE_", CASE)) %>%
                         select(-c(grouping1, grouping2, grouping3))

finalselectjoined <-left_join(joiner_tibble_cases, finalselectrenamed, by = "CASE") %>%
                    pivot_wider(names_from = CASE, values_from = SUBSTITUTIONS_FROM_FIRST) %>%
                    filter (!is.na(GROUPING)) %>%
                    select(-c(TAXON, FIRST_TAXON)) %>%
                    group_by(GROUPING) %>%
                    summarize_all(., ~ ifelse(any(., !is.na(.)), sum(., na.rm=TRUE), NA)) %>%  ungroup

return(finalselectjoined)
}


```

### The substitution distance for WGS

```{r HAMMING_MATRIX_WGS, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}


list_exposure_groups_WGs <-list("FAMILY_GROUP == \"A\"",
                            "FAMILY_GROUP == \"B\"",
                            "FAMILY_GROUP == \"C\"",
                            "FLIGHTS_WITH_MULTIPLE == \"E\"",
                            "FLIGHTS_WITH_MULTIPLE == \"F\"",
                            "FLIGHTS_WITH_MULTIPLE == \"G\"",
                            "FLIGHTS_WITH_MULTIPLE == \"M\"",
                            "FLIGHTS_WITH_MULTIPLE == \"O\"",
                            "FLIGHTS_WITH_MULTIPLE == \"R\"",
                            "IMPORT == \"International_Importation\"",
                            "IMPORT == \"US-Acquired\"",
                            "BARRACK_BAY_MULTIPLE == \"1725\"",
                            "BARRACK_BAY_MULTIPLE == \"1740\"",
                            "BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"",
                            "CASE %in% c(1,4)",
                            "CASE %in% c(3,8)",
                            "CASE %in% c(1,35)",
                            "CASE %in% c(24,35)",
                            "CASE %in% c(25,35)",
                            "CASE %in% c(31,35)",
                            "CASE %in% c(47,35)"
)

tibble_list_WGS <-lapply(list_exposure_groups_WGs,
                     transform_single_result, count_matrix = counts_matrix_WGS)
rbound_tibble_list_WGS <-bind_rows(tibble_list_WGS)
rbound_tibble_list_WGS
#kable(rbound_tibble_list_WGS, align = "c", caption = "Row-Bound Substitutions WGS.")%>%
 # kable_styling("hover", full_width = F, fixed_thead = T) %>%
   #row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
 #  scroll_box(width = "100%", height = "300px")

write_tsv(rbound_tibble_list_WGS, "WGS_SUBSTITUTIONS.tsv")
```

### The substitution distance for N450

```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}

list_exposure_groups_N450 <-list("FAMILY_GROUP == \"A\"",
                             "FAMILY_GROUP == \"B\"",
                             "FAMILY_GROUP == \"C\"",
                             "FLIGHTS_WITH_MULTIPLE == \"E\"",
                             "FLIGHTS_WITH_MULTIPLE == \"F\"",
                             "FLIGHTS_WITH_MULTIPLE == \"G\"",
                             "FLIGHTS_WITH_MULTIPLE == \"M\"",
                             "FLIGHTS_WITH_MULTIPLE == \"O\"",
                             "FLIGHTS_WITH_MULTIPLE == \"R\"",
                             "IMPORT == \"International_Importation\"",
                             "IMPORT == \"US-Acquired\"",
                             "BARRACK_BAY_MULTIPLE == \"1725\"",
                             "BARRACK_BAY_MULTIPLE == \"1740\"",
                             "BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"",
                             "CASE %in% c(1,4)",
                             "CASE %in% c(3,8)",
                             "CASE %in% c(30,28)",
                             "CASE %in% c(1,35)",
                             "CASE %in% c(24,35)",
                             "CASE %in% c(25,35)",
                             "CASE %in% c(30,35)",
                             "CASE %in% c(31,35)",
                             "CASE %in% c(47,35)"
)
counts_matrix_N450
tibble_list_N450 <-lapply(list_exposure_groups_N450,
                     transform_single_result, count_matrix = counts_matrix_N450)
rbound_tibble_list_N450 <-bind_rows(tibble_list_N450)
rbound_tibble_list_N450
#kable(rbound_tibble_list_N450, align = "c", caption = "Row-Bound Substitutions N450.")%>%
 # kable_styling("hover", full_width = F, fixed_thead = T) %>%
   #row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>%
  # scroll_box(width = "100%", height = "300px")

write_tsv(rbound_tibble_list_N450, "N450_SUBSTITUTIONS.tsv")

```

### The substitution distances for N450 and WGS, for select reported cases.

```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE}
 # Case 3-8
 cat('# Base Count Differences Case 3->8 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Wisconsin.USA_37.21_2","MVs_Wisconsin.USA_39.21"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Wisconsin.USA_37.21_2", "MVs_Wisconsin.USA_39.21"]
 # Case 1-4
 cat('# Base Count Differences Case 1->4 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"]
 # Case 30-28

 # Not in the matrix.

 # Case Case35 -> Case1
 cat('# Base Count Differences Case 35->1 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"]

 # Case Case35 -> Case24
 cat('# Base Count Differences Case 35->24 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"]
 # Case Case35 -> Case25
 cat('# Base Count Differences Case 35->25 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"]
 # Case Case35 -> Case31
 cat('# Base Count Differences Case 35->31 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"]
 # Case Case35 -> Case30
 #Not in the matrix

 # Case Case35 -> Case47
 cat('# Base Count Differences Case 35->47 `', '`\n')
 cat('# WGS (N-L Span) `', '`\n')
 counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"]
 cat('# N450 `', '`\n')
 counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"]

```