论文
Evolutionary origins of the SARS-CoV-2 sarbecovirus lineage responsible for the COVID-19 pandemic
本地文件 s41564-020-0771-4.pdf
代码和数据下载链接
https://github.com/plemey/SARSCoV2origins
今天的推文我们来重复一下论文中的 Figure5b
这个图是先用密码子的 RSCU 值做主成分分析,然后用散点图展示前两个主成分(PC1 和 PC2)的结果
加载需要用到的R包
library(tidyverse)
library(ggplot2)
library(ggrepel)
读取数据
# Read the RSCU table, keep columns 2 through 26 as a plain data frame,
# and use the `codon` column as its row names.
all_rscu <- read_csv(file = 'all_rscu_codonBiasReanalysis.csv')
all_df <- all_rscu %>%
  select(2:26) %>%
  as.data.frame()
rownames(all_df) <- all_rscu$codon
主成分分析
# Transpose so that the columns of all_df become the observations (rows)
# and the codons become the variables, then run PCA with prcomp() defaults.
df_pca <- all_df %>%
  t() %>%
  prcomp()
给主成分分析的结果添加分组信息
# Identifiers whose rows are labelled 'virus'; every other row is
# labelled 'vertebrate'.
subset_viruses <- c(
  'MG772934', 'MG772933', 'GU190215',
  'KP886808', 'KP886809', 'MN908947'
)

# PCA scores as a data frame, one row per observation.
df_out <- as.data.frame(df_pca$x)

# Group label derived from the row names, plus the row names themselves
# as a `species` column used for labelling points.
df_out[['group']] <- ifelse(
  rownames(df_out) %in% subset_viruses,
  'virus',
  'vertebrate'
)
df_out[['species']] <- rownames(df_out)
用论文中提供的代码作图
# PC1 vs PC2 scatter plot following the code supplied with the paper.
# Points and repelled labels are coloured by `group`; both legends are hidden.
ggplot(df_out, aes(x = PC1, y = PC2, color = group)) +
  geom_point(show.legend = FALSE) +
  # Colours are named so each group keeps its colour regardless of which
  # groups are present (unnamed values are assigned by level order).
  scale_color_manual(
    values = c(vertebrate = 'dodgerblue4', virus = 'firebrick')
  ) +
  geom_label_repel(aes(label = species), show.legend = FALSE) +
  theme_bw() +
  theme(aspect.ratio = 1, panel.grid = element_blank())
但是这个图和论文中最终呈现的图还是有些细节不太一样的,我们对代码进行修改,尽量重复原图
增加箭头
# PC1 vs PC2 scatter plot restyled towards the published figure: points are
# drawn without a colour mapping, labels are coloured by `group`, and two
# dashed grey arrows start at (1.5, 0).
ggplot(df_out, aes(x = PC1, y = PC2)) +
  geom_point(show.legend = FALSE) +
  geom_label_repel(
    aes(label = species, color = group),
    show.legend = FALSE
  ) +
  # Named colours keep the group-to-colour mapping explicit.
  scale_color_manual(values = c(vertebrate = 'black', virus = '#6a9a97')) +
  theme_bw() +
  theme(aspect.ratio = 1, panel.grid = element_blank()) +
  # The arrows get their own two-row data frame. Constants placed in aes()
  # while inheriting df_out would redraw each arrow once per row of df_out,
  # leaving stacked duplicates in the exported figure.
  geom_segment(
    data = data.frame(
      x = c(1.5, 1.5),
      y = c(0, 0),
      xend = c(0, 3),
      yend = c(1, -1)
    ),
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    arrow = arrow(type = "closed"),
    linetype = "dashed",
    color = "grey"
  )
增加注释的文字
# PC1 vs PC2 scatter plot with the two dashed arrows plus text annotations
# ("Eukaryotes", "Coronaviruses", "High GC", "Low GC") placed at fixed
# coordinates.
ggplot(df_out, aes(x = PC1, y = PC2)) +
  geom_point(show.legend = FALSE) +
  geom_label_repel(
    aes(label = species, color = group),
    show.legend = FALSE
  ) +
  # Named colours keep the group-to-colour mapping explicit.
  scale_color_manual(values = c(vertebrate = 'black', virus = '#6a9a97')) +
  theme_bw() +
  theme(aspect.ratio = 1, panel.grid = element_blank()) +
  # The arrows get their own two-row data frame. Constants placed in aes()
  # while inheriting df_out would redraw each arrow once per row of df_out,
  # leaving stacked duplicates in the exported figure.
  geom_segment(
    data = data.frame(
      x = c(1.5, 1.5),
      y = c(0, 0),
      xend = c(0, 3),
      yend = c(1, -1)
    ),
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    arrow = arrow(type = "closed"),
    linetype = "dashed",
    color = "grey"
  ) +
  # Group captions.
  annotate(
    geom = "text",
    x = -2, y = -1,
    label = "Eukaryotes",
    fontface = "bold"
  ) +
  annotate(
    geom = "text",
    x = 1.6, y = 0.5,
    label = "Coronaviruses",
    fontface = "bold",
    color = '#6a9a97'
  ) +
  # Captions placed just beyond the two arrow heads.
  annotate(geom = "text", x = 0, y = 1.2, label = "High GC") +
  annotate(geom = "text", x = 3, y = -1.2, label = "Low GC")
论文中最终呈现的图还修改了其中一个点的文字标签,这一处是在出图后借助其他软件修改的
欢迎大家关注我的公众号
小明的数据分析笔记本
小明的数据分析笔记本 公众号 主要分享:1、R语言和python做数据分析和数据可视化的简单小例子;2、园艺植物相关转录组学、基因组学、群体遗传学文献阅读笔记;3、生物信息学入门学习资料及自己的学习笔记!