植物重测序---变异位点分析(GATK篇)

基因组变异的类型有很多,譬如:单核苷酸多态性(single nucleotide polymorphism, SNP)、单核苷酸位点变异(single nucleotide variant, SNV)、插入/缺失(Insertion/Deletion, InDel)、基因组拷贝数变异(copy number variation, CNV)、基因组结构变异(structural variation, SV)、融合基因(Fusion gene)等等。
这里我们主要介绍GATK鉴定SNPs和InDels。

Step1:软件准备

sra-tools bwa trim_galore samtools gatk4 picard
使用conda解决一切软件
# Install every tool listed above in one go from the conda-forge / bioconda
# channels. trim_galore is included as well (its bioconda package is named
# "trim-galore"), because the alignment step consumes its trimmed output.
conda install -y -c conda-forge -c bioconda sra-tools bwa samtools gatk4 picard trim-galore

Step2:数据准备

使用的测试数据来源于NCBI-SRA
SRR2040561
SRR2052532
# Download both SRA runs into the current directory.
prefetch SRR2040561 -O ./
prefetch SRR2052532 -O ./
# Convert each run to paired FASTQ files and trim them with trim_galore.
# This produces the RUN_1_val_1.fq.gz / RUN_2_val_2.fq.gz files that the
# alignment step reads; without it those inputs do not exist.
for run in SRR2040561 SRR2052532; do
  fasterq-dump --split-files -O ./ "${run}"
  trim_galore --paired --gzip -o ./ "./${run}_1.fastq" "./${run}_2.fastq"
done

Step3:BWA比对--->bam文件

# Build the BWA index of the reference genome first.
# The job is started in the background under nohup, so make sure it has
# finished (the .bwt/.pac/.ann/.amb/.sa files all exist) before aligning.
 nohup bwa index -a is Oryza_sativa.fasta &
#参考基因组构建的索引结果
358M Nov 22 09:51 Oryza_sativa.fasta.bwt
90M Nov 22 09:51 Oryza_sativa.fasta.pac
509 Nov 22 09:51 Oryza_sativa.fasta.ann
14K Nov 22 09:51 Oryza_sativa.fasta.amb
179M Nov 22 09:53 Oryza_sativa.fasta.sa
377 Nov 22 09:53 nohup.out

# Align the trimmed paired-end reads with BWA-MEM and pipe the output straight
# into samtools sort, giving one sorted BAM per sample.
#   -t 10 / -@ 4   worker threads for bwa mem / samtools sort
#   -m 1G          memory per samtools sort thread
# NOTE: keep a space before every trailing "\". Without it the shell joins the
# next line onto the previous argument (reference path + first read file).
bwa mem -t 10 -M ./Oryza_sativa.fasta \
  ./SRR2040561_1_val_1.fq.gz \
  ./SRR2040561_2_val_2.fq.gz \
  | samtools sort -@ 4 -m 1G -o SRR2040561.sort.bam -
bwa mem -t 10 -M ./Oryza_sativa.fasta \
  ./SRR2052532_1_val_1.fq.gz \
  ./SRR2052532_2_val_2.fq.gz \
  | samtools sort -@ 4 -m 1G -o SRR2052532.sort.bam -
去除PCR重复
# Remove PCR duplicates from each sorted BAM with Picard MarkDuplicates.
# REMOVE_DUPLICATES=true is passed so duplicates are removed rather than kept;
# the file given to M= receives the metrics written by the tool.
for sample in SRR2040561 SRR2052532; do
  picard -Xmx4g MarkDuplicates \
    I="${sample}.sort.bam" \
    O="${sample}.sort.rmdup.bam" \
    REMOVE_DUPLICATES=true \
    M="${sample}.mark_dup_matrix"
done
# 此时要注意,这时的bam文件是没有read group标签的,如果直接进行GATK分析会报错,因此必须在此加上标签

# Add read-group tags with Picard, then index and summarise the tagged BAM.
# Each sample gets ID/LB/SM set to its run accession.
# The output carries a regular ".bam" extension and is named RUN.rm.bam,
# which is the file name the HaplotypeCaller step reads from $bamdir
# (run this in, or move the results to, that directory).
# Index and flagstat operate on exactly the file produced here.
for sample in SRR2040561 SRR2052532; do
  picard AddOrReplaceReadGroups \
    I="${sample}.sort.rmdup.bam" \
    O="${sample}.rm.bam" \
    SO=coordinate ID="${sample}" \
    LB="${sample}" PL=Illumina PU=run SM="${sample}"

  # BAM index
  samtools index "${sample}.rm.bam"

  # Alignment summary statistics
  samtools flagstat "${sample}.rm.bam" > "${sample}.rm.bam.flagstat"
done

Step4:GATK--call SNP InDel

# Besides the FASTA itself, the reference needs two companion index files:
# a .fai index (samtools faidx) and a .dict sequence dictionary (Picard).
samtools faidx Oryza_sativa.fasta
picard CreateSequenceDictionary R=Oryza_sativa.fasta O=Oryza_sativa.dict
# Per-sample variant calling with HaplotypeCaller in GVCF mode (-ERC GVCF).
# Input: the read-group-tagged, indexed BAM of each sample under $bamdir.
# --genotyping-mode is intentionally not passed: DISCOVERY is the default
# behaviour, and the option was removed from later GATK4 releases, where
# supplying it makes the command fail with an unrecognised-argument error.
bamdir=/home/GATK/bam
ref=./Oryza_sativa.fasta
for sample in SRR2052532 SRR2040561; do
  gatk HaplotypeCaller -R "$ref" \
    -I "$bamdir/${sample}.rm.bam" -ERC GVCF \
    -O "$bamdir/${sample}.g.vcf" \
    --pcr-indel-model CONSERVATIVE --sample-ploidy 2 \
    --min-base-quality-score 10 --kmer-size 10 --kmer-size 25
done

# Combine the per-sample GVCFs into one multi-sample GVCF.
# The result is written to $bamdir so that it is exactly the file the
# GenotypeGVCFs command below reads (it was previously written to the
# current directory while being read from $bamdir).
gatk CombineGVCFs -R "$ref" \
  -O "$bamdir/combined.g.vcf" \
  -V "$bamdir/SRR2052532.g.vcf" -V "$bamdir/SRR2040561.g.vcf"
# Joint genotyping of the combined GVCF into the raw variant call set.
gatk GenotypeGVCFs -R "$ref" \
  -O "$bamdir/variants.raw.vcf" \
  -V "$bamdir/combined.g.vcf"
# Filter SNPs (hard filtering).
# 1) Extract the SNP records from the raw call set.
gatk SelectVariants -R "$ref" \
  -V "$bamdir/variants.raw.vcf" \
  --select-type SNP -O "$bamdir/variants.raw.snp.vcf"

# 2) Flag failing sites. Each threshold is its own filter expression instead
#    of one "||"-chained expression: when an annotation such as MQRankSum or
#    ReadPosRankSum is absent from a record, a chained expression cannot be
#    evaluated and the site passes regardless of its other annotations.
#    Separate filters are evaluated independently and also record which
#    criterion failed in the FILTER column.
gatk VariantFiltration -R "$ref" \
  -V "$bamdir/variants.raw.snp.vcf" \
  --filter-expression "QD < 2.0" --filter-name "QD2" \
  --filter-expression "MQ < 40.0" --filter-name "MQ40" \
  --filter-expression "FS > 60.0" --filter-name "FS60" \
  --filter-expression "SOR > 3.0" --filter-name "SOR3" \
  --filter-expression "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
  --filter-expression "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
  -O "$bamdir/all.filter.snp.vcf"

# 3) Keep only the sites that passed every filter.
gatk SelectVariants -R "$ref" \
  -V "$bamdir/all.filter.snp.vcf" \
  --exclude-filtered -O "$bamdir/all.filtered.snp.vcf"
#用同样的办法(先用 SelectVariants --select-type INDEL 提取InDel,再用 VariantFiltration 过滤)处理InDel就可以啦!

你可能感兴趣的:(植物重测序---变异位点分析(GATK篇))