导读
继续:基因BIN来源分析(一)cd-hit去冗余,salmon计算基因TPM
4 制作gene bin map文件
提取所有bin的prokka注释结果中的predict_id, bin_id, gene, annotation到同一个文件gene_bin_map.txt中。
# Build gene_bin_map.txt: one row per predicted gene over all bins, with the
# columns Predict, Bin, Gene, Annotation taken from the per-bin .tsv tables.
map_file=Bin_all/Bin_gene/gene_bin_map.txt
# Write the header with ">" (truncate) so that re-running this step starts from
# an empty file instead of appending a second copy of the table.
printf 'Predict\tBin\tGene\tAnnotation\n' > "$map_file"
for i in Bin_all/Bin_prokka/prokka_out_table/bin.*.tsv; do
    # When nothing matches, the glob stays a literal string; skip it.
    [ -e "$i" ] || continue
    file=${i##*/}
    base=${file%.tsv}
    # NR > 1 drops the table's header line; columns 1, 4 and 7 are kept and the
    # bin name (file name without .tsv) is inserted as the second column.
    awk -F"\t" -v tmp="$base" 'BEGIN{OFS="\t"} NR > 1 {print $1, tmp, $4, $7}' "$i" >> "$map_file"
    echo -e "\033[32m add $i gene bin to map file OK\033[0m"
done
5 合并样品tpm,添加bin gene(R语言)
合并所有样品文件夹中的基因TPM表,添加bin, gene, annotation注释信息
1 合并样品TPM
#!/usr/bin/env Rscript
# Merge the per-sample quant.sf tables into one gene x sample table and write
# it to gene_sample_tpm.txt.
# NOTE: the working directory is changed for the rest of the script; the steps
# that follow use paths relative to Bin_all/Bin_gene.
setwd("Bin_all/Bin_gene")

# list.files() expects a regular expression, not a shell glob: select the
# entries whose name ends in ".quant".
folds <- list.files(pattern = "\\.quant$")
if (length(folds) == 0) {
  stop("no *.quant folder found in ", getwd(), call. = FALSE)
}
# Sample name = folder name without the ".quant" suffix.
samples <- sub("\\.quant$", "", folds)

# Read one quant.sf file and keep columns 1 and 4 (the gene name and the value
# that this tutorial merges as TPM).
read_tpm <- function(fold) {
  tab <- read.table(file.path(fold, "quant.sf"), header = TRUE, sep = "\t")
  tab[, c(1, 4)]
}

result <- read_tpm(folds[1])
colnames(result)[2] <- samples[1]

# seq_along(folds)[-1] is empty when there is only one sample, whereas
# 2:length(folds) would count down (2, 1) and index a non-existent folder.
for (i in seq_along(folds)[-1]) {
  tmp <- read_tpm(folds[i])
  # Columns are bound by position, so every table must list the same genes in
  # the same order; fail loudly instead of silently mixing up rows.
  if (!identical(as.character(tmp[, 1]), as.character(result[, 1]))) {
    stop("gene names in ", folds[i], " do not match those in ", folds[1],
         call. = FALSE)
  }
  result <- cbind(result, tmp[, 2])
  colnames(result)[i + 1] <- samples[i]
  print(paste("R merge NO.", i, folds[i], sep = " "))
}

write.table(result, file = "gene_sample_tpm.txt", sep = "\t",
            row.names = FALSE, quote = FALSE)
2 添加注释
# Attach the Bin / Gene / Annotation columns of gene_bin_map.txt to the merged
# table `result` (built by the previous step) and write the annotated table.
# quote = "" and comment.char = "" make read.table treat quotes and '#' inside
# the free-text annotation column as ordinary characters; with the default
# comment.char = "#" a '#' would cut the line short.
map <- read.table("gene_bin_map.txt", header = TRUE, sep = "\t",
                  quote = "", comment.char = "")
# all.x = TRUE keeps every row of `result`; genes without an entry in the map
# get NA in the added columns.
result2 <- merge(result, map, by.x = "Name", by.y = "Predict", all.x = TRUE)
write.table(result2, file = "gene_sample_tpm_annotation.txt", sep = "\t",
            row.names = FALSE, quote = FALSE)
6 绘图准备文件
挑选target基因,同组样品取平均TPM
# task1
# Keep only the rows of result2 whose Gene is listed in the first column of
# task1/target.txt.
target_tab <- read.table("task1/target.txt", sep = "\t")
target <- as.character(target_tab[, 1])
is_target <- result2$Gene %in% target
target_df <- result2[is_target, ]

# Collapse sample columns into group columns: each group is three consecutive
# columns of target_df, the first group starting at column 2 and the last one
# at column 20.
df1 <- target_df[, c("Bin", "Gene", "Annotation")]
for (first_col in seq(2, 20, by = 3)) {
  replicate_cols <- c(first_col, first_col + 1, first_col + 2)
  # Row-wise sum of the three columns divided by 3, appended as a new last
  # column of df1.
  new_pos <- ncol(df1) + 1
  df1[, new_pos] <- rowSums(target_df[, replicate_cols]) / 3
  # The group name is the first two characters of the first column's name.
  colnames(df1)[new_pos] <- substr(colnames(target_df)[first_col], 1, 2)
}
7 绘制target基因-样品组散点图矩阵
1 数据准备
# Collapse the rows of df1 that share a gene name: for every target gene, sum
# each group column (column 4 onwards) over all rows carrying that gene.
group_cols <- 4:ncol(df1)
gene_sums <- lapply(target, function(g) {
  # drop = FALSE keeps a data frame even for a single row or column, so
  # colSums() always returns one named value per group column.
  colSums(df1[df1$Gene == g, group_cols, drop = FALSE])
})
names(gene_sums) <- target
# One row per target gene, one column per group. Building the table in a
# single rbind also works for a single target gene, where the former
# 2:length(target) loop would have run with i = 2 and i = 1.
df2 <- data.frame(do.call(rbind, gene_sums))
write.table(df2, file = "task1/gene_group.txt", row.names = TRUE,
            quote = FALSE, sep = "\t")
2 格式处理
# Scatter plot input: reshape the gene x group table into long format with the
# columns gene, variable (group) and value.
# NOTE(review): the table was written to task1/gene_group.txt but is read here
# as gene_group.txt -- presumably this part is run from inside task1/; confirm.
# The file was written with row names, so its header line has one field fewer
# than the data lines and read.table uses the first column as row names.
df2 <- read.table("gene_group.txt", header = TRUE, sep = "\t")
df2$gene <- rownames(df2)
# Base-R equivalent of melt(df2, id = "gene"); no package providing melt() is
# loaded by this script. Groups are stacked in column order and `variable` is
# a factor whose levels keep that order.
group_cols <- setdiff(colnames(df2), "gene")
input <- data.frame(
  gene = rep(df2$gene, times = length(group_cols)),
  variable = factor(rep(group_cols, each = nrow(df2)), levels = group_cols),
  value = unlist(df2[group_cols], use.names = FALSE)
)
3 ggplot绘图
为防止散点过大或过小,点的大小取TPM的平方根(sqrt(TPM))。
library("ggplot2")
# Gene x group matrix of points: point size is sqrt(value), point colour
# follows the group. The same plot is saved as PDF and PNG.
point <- ggplot(input, aes(x = variable, y = gene)) +
  labs(x = "", y = "", size = "sqrt(TPM)") +
  geom_point(pch = 19, aes(size = sqrt(value), color = variable)) +
  theme(
    panel.grid = element_blank(),
    panel.background = element_rect(color = "black", fill = "transparent")
  ) +
  # Hide the colour legend; "none" replaces the deprecated guides(color = FALSE).
  guides(color = "none") +
  theme(text = element_text(family = "serif"))
ggsave(filename = "gene_group.pdf", plot = point)
ggsave(filename = "gene_group.png", plot = point)
8 绘制target基因-select_bin-TPM饼图矩阵
1 确定select_bins
对每个bin中的每个基因,求其在各样品组中的丰度之和(sum);先按基因、再按sum降序排序,取每个基因sum最高的bin(top bin)作为select bin;保留属于这些select bin的所有行,得到新文件gene_group_select_bin.txt。
# 7 top bins
# Work on df1 without its third column: Bin, Gene and the group columns.
df3 <- df1[, c(1, 2, 4:ncol(df1))]
# Total over all group columns for each row.
df3$sum <- rowSums(df3[, 3:ncol(df3)])
# Sort by gene and total, both descending, so that the first row of every gene
# is the row with the largest total.
ranking <- order(df3$Gene, df3$sum, decreasing = TRUE)
df3 <- df3[ranking, ]
first_of_gene <- !duplicated(df3$Gene)
select_bin <- as.character(df3$Bin[first_of_gene])
# Keep every row that belongs to one of the selected bins.
in_selected_bin <- df3$Bin %in% select_bin
df3 <- df3[in_selected_bin, ]
# Report how many distinct bins and genes remain.
length(unique(as.character(df3$Bin)))
length(unique(as.character(df3$Gene)))
write.table(df3, file = "task1/gene_group_select_bin.txt", sep = "\t",
            quote = FALSE, row.names = FALSE)
2 par for循环绘制pie图矩阵
有的基因只在一个select bin中出现;若该基因在某组样品中各select bin的丰度之和为0,pie()会报错,故在for循环中加入else分支,此时改画一个黑色饼图。
# Draw a matrix of pie charts into gene_group_bin.pdf: one row per gene, one
# column per sample group; each pie shows how the selected bins share that
# gene's value in that group.
df3 <- read.table("gene_group_select_bin.txt", header = TRUE, sep = "\t")
df3 <- df3[order(df3$Gene, df3$Bin, decreasing = FALSE), ]
gene <- as.character(unique(df3$Gene))
bin <- as.character(unique(df3$Bin))
# Group columns are everything between the two leading columns (Bin, Gene) and
# the trailing "sum" column; for the original 7-group table this is 3:9.
sample <- colnames(df3)[3:(ncol(df3) - 1)]

# One colour per line in the list; comment.char = "" because the values may
# start with '#'.
color <- read.table("../group_color.list", sep = "\t", comment.char = "")
if (nrow(color) < length(bin)) {
  stop("../group_color.list has ", nrow(color), " colours but ", length(bin),
       " bins need one each", call. = FALSE)
}
color2 <- data.frame(bin = bin, color = color$V1[seq_along(bin)])
color2 <- color2[order(color2$bin, decreasing = FALSE), ]

pdf("gene_group_bin.pdf", height = 100, width = 50)
# tryCatch(..., finally =) closes the PDF device even if a plot call fails.
tryCatch({
  opar <- par(no.readonly = TRUE)
  # Grid sized from the data (17 x 7 for the original data set).
  par(mfrow = c(length(gene), length(sample)), col.main = "red",
      family = "serif")
  par(mai = c(0.2, 0.2, 0.2, 0.2))
  for (g in gene) {
    gene_rows <- df3$Gene == g
    tmp <- df3[gene_rows, sample, drop = FALSE]
    # Look each row's colour up by bin name so slices and colours stay paired
    # regardless of row order or repeated bins.
    gene_bins <- as.character(df3$Bin[gene_rows])
    color_tmp <- as.character(color2$color[match(gene_bins, color2$bin)])
    for (j in seq_along(sample)) {
      if (sum(tmp[, j]) != 0) {
        pie(tmp[, j], labels = "", border = "white", col = color_tmp)
      } else {
        # pie() cannot draw an all-zero vector; show a black disc instead.
        pie(1, labels = "", border = "white", col = "black")
      }
    }
  }
  par(opar)
}, finally = dev.off())
3 造图例
# Build a stand-alone colour legend for the pie matrix by drawing a dummy bar
# chart with one bar per bin, plus a black "none" entry, and saving it as
# legend.png.
# Every bar gets the same arbitrary height; only the fill legend matters.
color2$num <- 3
color2 <- rbind(color2, data.frame(bin = "none", color = "#000000", num = 3))
# Name the colours by bin so scale_fill_manual() matches them by name instead
# of by position; the pairing then does not depend on how the bin names sort
# relative to "none".
legend_colors <- setNames(as.character(color2$color), as.character(color2$bin))
legend <- ggplot(color2, aes(x = bin, y = num, fill = bin)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = legend_colors) +
  theme(text = element_text(family = "serif")) +
  labs(fill = "Bin") +
  theme(axis.text.x = element_text(angle = 90))
ggsave(filename = "legend.png", plot = legend)