2021-11-16 ggtree包绘制进化树

余大神的力作之一:Guangchuang Yu (https://yulab-smu.top)。绘制出的进化树还是很专业美观的!
——电子书: Data Integration, Manipulation and Visualization of Phylogenetic Trees
——书包括4个部分:1)数据输入、输出和操作。2)树数据的可视化和注释。3)ggtree延展。4)其它。

ggtree包的功能很多。这里摘抄几个例子,详见原作网站。

1.加载包

library(treedataverse)
##  Attaching packages  treedataverse 0.0.1 

##  ape         5.5         treeio      1.18.0
##  dplyr       1.0.7       ggtree      3.2.0 
##  ggplot2     3.3.5       ggtreeExtra 1.4.0 
##  tidytree    0.3.6

2.Aligning graph to the tree based on the tree structure

library(ggtree)
library(TDbook)

## load `tree_nwk`, `df_info`, `df_alleles`, and `df_bar_data` from 'TDbook'
tree <- tree_nwk
snps <- df_alleles
snps_strainCols <- snps[1,] 
snps<-snps[-1,] # drop strain names
colnames(snps) <- snps_strainCols

gapChar <- "?"
snp <- t(snps)
lsnp <- apply(snp, 1, function(x) {
        x != snp[1,] & x != gapChar & snp[1,] != gapChar
    })
lsnp <- as.data.frame(lsnp)
lsnp$pos <- as.numeric(rownames(lsnp))
lsnp <- tidyr::gather(lsnp, name, value, -pos)
snp_data <- lsnp[lsnp$value, c("name", "pos")]

## visualize the tree 
p <- ggtree(tree) 

## attach the sampling information data set 
## and add symbols colored by location
p <- p %<+% df_info + geom_tippoint(aes(color=location))

## visualize SNP and Trait data using dot and bar charts,
## and align them based on tree structure
p + geom_facet(panel = "SNP", data = snp_data, geom = geom_point, 
               mapping=aes(x = pos, color = location), shape = '|') +
    geom_facet(panel = "Trait", data = df_bar_data, geom = geom_col, 
                aes(x = dummy_bar_value, color = location, 
                fill = location), orientation = 'y', width = .6) +
    theme_tree2(legend.position=c(.05, .85))
image.png

3.Visualize a tree with associated matrix

beast_file <- system.file("examples/MCC_FluA_H3.tree", package="ggtree")
beast_tree <- read.beast(beast_file)

genotype_file <- system.file("examples/Genotype.txt", package="ggtree")
genotype <- read.table(genotype_file, sep="\t", stringsAsFactor=F)
colnames(genotype) <- sub("\\.$", "", colnames(genotype))
p <- ggtree(beast_tree, mrsd="2013-01-01") + 
    geom_treescale(x=2008, y=1, offset=2) + 
    geom_tiplab(size=2)
gheatmap(p, genotype, offset=5, width=0.5, font.size=3, 
        colnames_angle=-45, hjust=0) +
    scale_fill_manual(breaks=c("HuH3N2", "pdm", "trig"), 
        values=c("steelblue", "firebrick", "darkgreen"), name="genotype")

p <- ggtree(beast_tree, mrsd="2013-01-01") + 
    geom_tiplab(size=2, align=TRUE, linesize=.5) + 
    theme_tree2()
gheatmap(p, genotype, offset=8, width=0.6, 
        colnames=FALSE, legend_title="genotype") +
    scale_x_ggtree() + 
    scale_y_continuous(expand=c(0, 0.3))
image.png

4.Visualize a tree with multiple associated matrices

nwk <- system.file("extdata", "sample.nwk", package="treeio")

tree <- read.tree(nwk)
circ <- ggtree(tree, layout = "circular")

df <- data.frame(first=c("a", "b", "a", "c", "d", "d", "a", 
                        "b", "e", "e", "f", "c", "f"),
                 second= c("z", "z", "z", "z", "y", "y", 
                        "y", "y", "x", "x", "x", "a", "a"))
rownames(df) <- tree$tip.label

df2 <- as.data.frame(matrix(rnorm(39), ncol=3))
rownames(df2) <- tree$tip.label
colnames(df2) <- LETTERS[1:3]


p1 <- gheatmap(circ, df, offset=.8, width=.2,
               colnames_angle=95, colnames_offset_y = .25) +
    scale_fill_viridis_d(option="D", name="discrete\nvalue")


library(ggnewscale)
p2 <- p1 + new_scale_fill()
gheatmap(p2, df2, offset=15, width=.3,
         colnames_angle=90, colnames_offset_y = .25) +
    scale_fill_viridis_c(option="A", name="continuous\nvalue")
mgheatmap-1.gif

5.Visualize a tree with multiple sequence alignment

library(TDbook)

# load `tree_seq_nwk` and `AA_sequence` from 'TDbook'
p <- ggtree(tree_seq_nwk) + geom_tiplab(size=3)
msaplot(p, AA_sequence, offset=3, width=2)
p <- ggtree(tree_seq_nwk, layout='circular') + 
    geom_tiplab(offset=4, align=TRUE) + xlim(NA, 12)
msaplot(p, AA_sequence, window=c(120, 200))
image.png

6. Composite plots

library(ggplot2)
library(ggtree)

set.seed(2019-10-31)
tr <- rtree(10)

d1 <- data.frame(
    # only some labels match
    label = c(tr$tip.label[sample(5, 5)], "A"),
    value = sample(1:6, 6))

d2 <- data.frame(
    label = rep(tr$tip.label, 5),
    category = rep(LETTERS[1:5], each=10),
    value = rnorm(50, 0, 3)) 

g <- ggtree(tr) + geom_tiplab(align=TRUE)

p1 <- ggplot(d1, aes(label, value)) + geom_col(aes(fill=label)) + 
    geom_text(aes(label=label, y= value+.1)) +
    coord_flip() + theme_tree2() + theme(legend.position='none')
 
p2 <- ggplot(d2, aes(x=category, y=label)) + 
    geom_tile(aes(fill=value)) + scale_fill_viridis_c() + 
    theme_minimal() + xlab(NULL) + ylab(NULL)
composite-1.gif

7.The phylo4 and phylo4d objects

library(phylobase)
data(geospiza_raw)
g1 <- as(geospiza_raw$tree, "phylo4")
g2 <- phylo4d(g1, geospiza_raw$data, missing.data="warn")

d1 <- data.frame(x = seq(1.1, 2, length.out = 5),
                lab = names(geospiza_raw$data))

p1 <- ggtree(g2) + geom_tippoint(aes(size = wingL), x = d1$x[1], shape = 1) + 
    geom_tippoint(aes(size = tarsusL), x = d1$x[2], shape = 1) + 
    geom_tippoint(aes(size = culmenL), x = d1$x[3], shape = 1) + 
    geom_tippoint(aes(size = beakD),   x = d1$x[4], shape = 1) + 
    geom_tippoint(aes(size = gonysW),  x = d1$x[5], shape = 1) + 
    scale_size_continuous(range = c(3,12), name="") + 
    geom_text(aes(x = x, y = 0, label = lab), data = d1, angle = 45) +
    geom_tiplab(offset = 1.3) + xlim(0, 3) +
    theme(legend.position = c(.1, .75))  

## users can use `as.treedata(g2)` to convert `g2` to a `treedata` object
## and use `get_tree_data()` function to extract the associated data 

p2 <- gheatmap(ggtree(g1), data=geospiza_raw$data, colnames_angle=45) + 
  geom_tiplab(offset=1) + hexpand(.2) + theme(legend.position = c(.1, .75))

aplot::plot_list(p1, p2, ncol=2, tag_levels='A')    
image.png

8.ggtree for other tree-like structures

library(treeio)
library(ggplot2)
library(ggtree)

data("GNI2014", package="treemap")
n <- GNI2014[, c(3,1)]
n[,1] <- as.character(n[,1])
n[,1] <- gsub("\\s\\(.*\\)", "", n[,1])

w <- cbind("World", as.character(unique(n[,1])))

colnames(w) <- colnames(n)
edgelist <- rbind(n, w)

y <- as.phylo(edgelist)
ggtree(y, layout='circular') %<+% GNI2014 + 
    aes(color=continent) + geom_tippoint(aes(size=population), alpha=.6) + 
    geom_tiplab(aes(label=country), offset=.1) +
    theme(plot.margin=margin(60,60,60,60))
image.png

9.Visualizing pairwise nucleotide sequence distance with a phylogenetic tree

library(TDbook)
library(tibble)
library(tidyr)
library(Biostrings)
library(treeio)
library(ggplot2)
library(ggtree)

# loaded from TDbook package
tree <- tree_HPV58

clade <- c(A3 = 92, A1 = 94, A2 = 108, B1 = 156, 
            B2 = 159, C = 163, D1 = 173, D2 = 176)
tree <- groupClade(tree, clade)
cols <- c(A1 = "#EC762F", A2 = "#CA6629", A3 = "#894418", B1 = "#0923FA", 
         B2 = "#020D87", C = "#000000", D1 = "#9ACD32",D2 = "#08630A")

## visualize the tree with tip labels and tree scale
p <- ggtree(tree, aes(color = group), ladderize = FALSE) %>% 
    rotate(rootnode(tree)) + 
    geom_tiplab(aes(label = paste0("italic('", label, "')")), 
                parse = TRUE, size = 2.5) +
    geom_treescale(x = 0, y = 1, width = 0.002) + 
    scale_color_manual(values = c(cols, "black"), 
                na.value = "black", name = "Lineage",
                breaks = c("A1", "A2", "A3", "B1", "B2", "C", "D1", "D2")) +
    guides(color = guide_legend(override.aes = list(size = 5, shape = 15))) +
    theme_tree2(legend.position = c(.1, .88))
## Optional
## add labels for monophyletic (A, C and D) and paraphyletic (B) groups 
dat <- tibble(node = c(94, 108, 131, 92, 156, 159, 163, 173, 176,172),
              name = c("A1", "A2", "A3", "A", "B1", 
                        "B2", "C", "D1", "D2", "D"),
              offset = c(0.003, 0.003, 0.003, 0.00315, 0.003, 
                        0.003, 0.0031, 0.003, 0.003, 0.00315),
              offset.text = c(-.001, -.001, -.001, 0.0002, -.001, 
                        -.001, 0.0002, -.001, -.001, 0.0002),
              barsize = c(1.2, 1.2, 1.2, 2, 1.2, 1.2, 3.2, 1.2, 1.2, 2),
              extend = list(c(0, 0.5), 0.5, c(0.5, 0), 0, c(0, 0.5), 
                        c(0.5, 0), 0, c(0, 0.5), c(0.5, 0), 0)
            ) %>% 
       dplyr::group_split(barsize)

p <- p +
     geom_cladelab(
         data = dat[[1]],
         mapping = aes(
             node = node,
             label = name,
             color = group,
             offset = offset,
             offset.text = offset.text,
             extend = extend
         ),
         barsize = 1.2,
         fontface = 3,
         align = TRUE
     ) +
     geom_cladelab(
         data = dat[[2]],
         mapping = aes(
             node = node,
             label = name,
             offset = offset,
             offset.text =offset.text,
             extend = extend
         ),
         barcolor = "darkgrey",
         textcolor = "darkgrey",
         barsize = 2,
         fontsize = 5,
         fontface = 3,
         align = TRUE
     ) +
     geom_cladelab(
         data = dat[[3]],
         mapping = aes(
             node = node,
             label = name,
             offset = offset,
             offset.text = offset.text,
             extend = extend
         ),
         barcolor = "darkgrey",
         textcolor = "darkgrey",
         barsize = 3.2,
         fontsize = 5,
         fontface = 3,
         align = TRUE
     ) +
     geom_strip(65, 71, "italic(B)", color = "darkgrey", 
                offset = 0.00315, align = TRUE, offset.text = 0.0002, 
                barsize = 2, fontsize = 5, parse = TRUE)

## Optional
## display support values
p <- p + geom_nodelab(aes(subset = (node == 92), label = "*"), 
                    color = "black", nudge_x = -.001, nudge_y = 1) +
    geom_nodelab(aes(subset = (node == 155), label = "*"), 
                    color = "black", nudge_x = -.0003, nudge_y = -1) +
    geom_nodelab(aes(subset = (node == 158), label = "95/92/1.00"), 
                    color = "black", nudge_x = -0.0001, 
                    nudge_y = -1, hjust = 1) +
    geom_nodelab(aes(subset = (node == 162), label = "98/97/1.00"), 
                    color = "black", nudge_x = -0.0001, 
                    nudge_y = -1, hjust = 1) +
    geom_nodelab(aes(subset = (node == 172), label = "*"), 
                    color = "black", nudge_x = -.0003, nudge_y = -1) 
## extract accession numbers from tip labels
tl <- tree$tip.label
acc <- sub("\\w+\\|", "", tl)
names(tl) <- acc

## read sequences from GenBank directly into R
## and convert the object to DNAStringSet
tipseq <- ape::read.GenBank(acc) %>% as.character %>% 
    lapply(., paste0, collapse = "") %>% unlist %>% 
    DNAStringSet
## align the sequences using muscle
tipseq_aln <- muscle::muscle(tipseq)
tipseq_aln <- DNAStringSet(tipseq_aln)
## calculate pairwise hamming distances among sequences
tipseq_dist <- stringDist(tipseq_aln, method = "hamming")

## calculate the percentage of differences
tipseq_d <- as.matrix(tipseq_dist) / width(tipseq_aln[1]) * 100

## convert the matrix to a tidy data frame for facet_plot
dd <- as_tibble(tipseq_d)
dd$seq1 <- rownames(tipseq_d)
td <- gather(dd,seq2, dist, -seq1)
td$seq1 <- tl[td$seq1]
td$seq2 <- tl[td$seq2]

g <- p$data$group
names(g) <- p$data$label
td$clade <- g[td$seq2] 

## visualize the sequence differences using dot plot and line plot
## and align the sequence difference plot to the tree using facet_plot
p2 <- p + geom_facet(panel = "Sequence Distance", 
            data = td, geom = geom_point, alpha = .6, 
            mapping = aes(x = dist, color = clade, shape = clade)) +
    geom_facet(panel = "Sequence Distance", 
            data = td, geom = geom_path, alpha = .6, 
            mapping=aes(x = dist, group = seq2, color = clade)) + 
    scale_shape_manual(values = 1:8, guide = FALSE) 

print(p2)
image.png

10.Highlighting different groups.

library(TDbook)
mytree <- tree_treenwk_30.4.19

# Define nodes for coloring later on
tiplab <- mytree$tip.label
cls <- tiplab[grep("^ch", tiplab)] 
labeltree <- groupOTU(mytree, cls)

p <- ggtree(labeltree, aes(color=group, linetype=group), layout="circular") +
    scale_color_manual(values = c("#efad29", "#63bbd4")) +
    geom_nodepoint(color="black", size=0.1) +
    geom_tiplab(size=2, color="black")

p2 <- flip(p, 136, 110) %>% 
    flip(141, 145) %>% 
    rotate(141) %>% 
    rotate(142) %>% 
    rotate(160) %>% 
    rotate(164) %>% 
    rotate(131)

### Group V and II coloring 
dat <- data.frame(
           node = c(110, 88, 156,136),
           fill = c("#229f8a", "#229f8a", "#229f8a", "#f9311f")
       )
p3 <- p2 +
      geom_hilight(
          data = dat,
          mapping = aes(
              node = node,
              fill = I(fill)
          ),
          alpha = 0.2,
          extendto = 1.4
      )

### Putting on a label on the avian specific expansion 
p4 <- p3 +
      geom_cladelab(
          node = 113,
          label = "Avian-specific expansion",
          align = TRUE,
          angle = -35,
          offset.text = 0.05,
          hjust = "center",
          fontsize = 2,
          offset = .2,
          barsize = .2
      )

### Adding the bootstrap values with subset used to remove all bootstraps < 50  
p5 <- p4 +
      geom_nodelab(
          mapping = aes(
              x = branch,
              label = label,
              subset = !is.na(as.numeric(label)) & as.numeric(label) > 50
          ),
          size = 2,
          color = "black",
          nudge_y = 0.6
      )

### Putting labels on the subgroups 
p6 <- p5 +
      geom_cladelab(
          data = data.frame(
              node = c(114, 121),
              name = c("Subgroup A", "Subgroup B")
          ),
          mapping = aes(
              node = node,
              label = name
          ),
          align = TRUE,
          offset = .05,
          offset.text = .03,
          hjust = "center",
          barsize = .2,
          fontsize = 2,
          angle = "auto",
          horizontal = FALSE
      ) +
      theme(
          legend.position = "none",
          plot.margin = grid::unit(c(-15, -15, -15, -15), "mm")
      )
print(p6)
image.png

11.Phylogenetic tree with genome locus structure

library(dplyr)
library(ggplot2)
library(gggenes)
library(ggtree)

get_genes <- function(data, genome) {
    filter(data, molecule == genome) %>% pull(gene)
}

g <- unique(example_genes[,1])
n <- length(g)
d <- matrix(nrow = n, ncol = n)
rownames(d) <- colnames(d) <- g
genes <- lapply(g, get_genes, data = example_genes)

for (i in 1:n) {
    for (j in 1:i) {
        jaccard_sim <- length(intersect(genes[[i]], genes[[j]])) / 
                       length(union(genes[[i]], genes[[j]]))
        d[j, i] <- d[i, j] <- 1 - jaccard_sim
    }
}

tree <- ape::bionj(d) 

p <- ggtree(tree, branch.length='none') + 
    geom_tiplab() + xlim_tree(5.5) + 
    geom_facet(mapping = aes(xmin = start, xmax = end, fill = gene),
               data = example_genes, geom = geom_motif, panel = 'Alignment',
               on = 'genE', label = 'gene', align = 'left') +
    scale_fill_brewer(palette = "Set3") + 
    scale_x_continuous(expand=c(0,0)) +
    theme(strip.text=element_blank(),
        panel.spacing=unit(0, 'cm'))

facet_widths(p, widths=c(1,2))
image.png

你可能感兴趣的:(2021-11-16 ggtree包绘制进化树)