目录
- 前言
- 方法简介
- 从vcf文件里提取有用信息
- tidy vcfR
- vcf可视化1
- vcf可视化2
- 测序深度覆盖度
- 窗口缩放
- 如何单独分离染色体
- 利用vcf信息判断物种染色体倍数
- CNV分析
vcf数据里除了位点的ATGC的对比,进行纯合/杂合判断的以外。还有一个重要的项目就是DP
,测序深度。测序深度不仅是看测序质量的重要参考,也是对染色体倍数体以及基因拷贝数进行评估的重要指标。
现重复一下之前的操作,读取数据,提取必要的数据。
提取矩阵数据
一般的VCF文件都很大,用手动提取里面的信息肯定不大现实。用vcfR
就可以轻松实现。
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.8.0.9000
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
vcf_file <- system.file("extdata", "pinf_sc50.vcf.gz", package = "pinfsc50")
vcf <- read.vcfR(vcf_file, verbose = FALSE)
查看一下R读取的数据。
vcf
## ***** Object of Class vcfR *****
## 18 samples
## 1 CHROMs
## 22,031 variants
## Object size: 22.4 Mb
## 7.929 percent missing data
## ***** ***** *****
head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.1"
## [1] "##source=\"GATK haplotype Caller, phased with beagle4\""
## [1] "##FILTER="
## [1] "##FORMAT="
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "Supercontig_1.50" "41" NA "AT" "A" "4784.43" NA
## [2,] "Supercontig_1.50" "136" NA "A" "C" "550.27" NA
## [3,] "Supercontig_1.50" "254" NA "T" "G" "774.44" NA
## [4,] "Supercontig_1.50" "275" NA "A" "G" "714.53" NA
## [5,] "Supercontig_1.50" "386" NA "T" "G" "876.55" NA
## [6,] "Supercontig_1.50" "462" NA "T" "G" "1301.07" NA
## [1]
## [1] "***** Genotype section *****"
## FORMAT BL2009P4_us23
## [1,] "GT:AD:DP:GQ:PL" "1|1:0,7:7:21:283,21,0"
## [2,] "GT:AD:DP:GQ:PL" "0|0:12,0:12:36:0,36,427"
## [3,] "GT:AD:DP:GQ:PL" "0|0:27,0:27:81:0,81,1117"
## [4,] "GT:AD:DP:GQ:PL" "0|0:29,0:29:87:0,87,1243"
## [5,] "GT:AD:DP:GQ:PL" "0|0:26,0:26:78:0,78,1034"
## [6,] "GT:AD:DP:GQ:PL" "0|0:23,0:23:69:0,69,958"
## DDR7602 IN2009T1_us22
## [1,] "1|1:0,6:6:18:243,18,0" "1|1:0,8:8:24:324,24,0"
## [2,] "0|0:20,0:20:60:0,60,819" "0|0:16,0:16:48:0,48,650"
## [3,] "0|0:26,0:26:78:0,78,1077" "0|0:23,0:23:69:0,69,946"
## [4,] "0|0:27,0:27:81:0,81,1158" "0|0:32,0:32:96:0,96,1299"
## [5,] "0|0:30,0:30:90:0,90,1242" "0|0:41,0:41:99:0,122,1613"
## [6,] "0|0:36,0:36:99:0,108,1556" "0|0:35,0:35:99:0,105,1467"
## LBUS5 NL07434
## [1,] "1|1:0,6:6:18:243,18,0" "1|1:0,12:12:36:486,36,0"
## [2,] "0|0:20,0:20:60:0,60,819" "0|0:28,0:28:84:0,84,948"
## [3,] "0|0:26,0:26:78:0,78,1077" "0|1:19,20:39:99:565,0,559"
## [4,] "0|0:27,0:27:81:0,81,1158" "0|1:19,19:38:99:523,0,535"
## [5,] "0|0:30,0:30:90:0,90,1242" "0|1:22,22:44:99:593,0,651"
## [6,] "0|0:36,0:36:99:0,108,1556" "0|1:29,25:54:99:723,0,876"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT:AD:DP:GQ:PL"
## [1]
选取我们需要的部分也就是Genotype Section里的DP
区域。
dp <- extract.gt(vcf, element='DP', as.numeric=TRUE)
测序深度箱状图
par(mar=c(8,4,1,1))
#boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Depth", log='y', las=2)
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Depth", las=2)
abline(h=seq(0,1e4, by=100), col="#C0C0C088")
众所周知箱状图的特点就是(boxplot)包含了所有的信息,包括异常值outlier。正因为这个原因,这张图很大程度上受到了这些异常值的影响,变得非常难懂。自己看看还可以,用来发表文章的话肯定不行。
测序深度小提琴图
经过log2转换,我们可以得到理想的效果。
library(reshape2)
library(ggplot2)
dpf <- melt(dp, varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE)
head(dpf)
dpf <- dpf[ dpf$Depth > 0,]
p <- ggplot(dpf, aes(x=Sample, y=Depth)) + geom_violin(fill="#C0C0C0", adjust=1.0,
scale = "count", trim=TRUE)
p <- p + theme_bw()
p <- p + theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1, size=12))
p <- p + scale_y_continuous(trans=scales::log2_trans(),
breaks=c(1, 10, 100, 800),
minor_breaks=c(1:10, 2:10*10, 2:8*100))
p <- p + theme(axis.title.y = element_text(size=12))
p <- p + theme( panel.grid.major.y=element_line(color = "#A9A9A9", size=0.6) )
p <- p + theme( panel.grid.minor.y=element_line(color = "#C0C0C0", size=0.2) )
p <- p + stat_summary(fun.y=median, geom="point", shape=23, size=2)
p
又或者不需要转换,而是通过过滤数据来改善箱图效果。举个例子,提取90%的信赖区间的数据来可视化。
sums <- apply(dp, MARGIN=2, quantile, probs=c(0.05, 0.95), na.rm=TRUE)
dp2 <- sweep(dp, MARGIN=2, FUN = "-", sums[1,])
dp[dp2 < 0] <- NA
dp2 <- sweep(dp, MARGIN=2, FUN = "-", sums[2,])
dp[dp2 > 0] <- NA
dp[dp < 4] <- NA
par(mar=c(8,4,1,1))
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Depth")
abline(h=seq(0,200, by=20), col="#C0C0C088")
这样也可以获得类似的结果。