[Bioinformatics] Processing paired-end sequencing data with VSEARCH

        In most cases, the data we download from ENA are paired-end reads with the primers already removed. How do we get from those raw reads to an abundance table? Available tools include VSEARCH, USEARCH and QIIME. This post uses VSEARCH as the example and walks from the paired-end reads all the way to the final OTU table.

#!/bin/bash
### Process paired-end sequencing data with VSEARCH
### Goal: an OTU table plus taxonomic annotation

#############################################################
# Directory layout                                          #
# data/:     paired-end sequencing data, e.g. PRJEB9708     #
# database/: reference databases                            #
# temp/:     intermediate files                             #
# res/:      final results                                  #
# err/:      error messages                                 #
# log/:      log files                                      #
#############################################################

# Files
# SequenceDataProcess_byVSEARCH.sh : main analysis pipeline (this shell script)
# rdp_16s_v16.fa                   : 16S reference database
# data/*/*.fastq.gz                : compressed raw sequencing data
#############################################################

# Define paths (use a separate variable instead of overwriting $HOME)
BASE=$HOME/Lab_2
data=$BASE/data
refdb=$BASE/database
temp=$BASE/temp
res=$BASE/res
err=$BASE/err
log=$BASE/log
mkdir -p ${temp} ${res} ${err} ${log}
############################################################

## Define special file names ##########


echo "Start"
cd $BASE

echo "Step_1:Paired-end reads merging"
prefix='ERR7765'	# 双端测序数据前缀

#echo "Step_1.1;decompress paired-end reads"
#gunzip ${data}/*/*.fastq.gz

echo "Step_1.2:Merge paired-end reads"
## 55,69 表示样品序号从55到59;如ERR776555~ERR776569
for i in `seq 55 69`;do
vsearch --fastq_mergepairs ${data}/${prefix}${i}/${prefix}${i}_1.fastq.gz --reverse ${data}/${prefix}${i}/${prefix}${i}_2.fastq.gz --gzip_decompress --fastq_maxdiffs 5 --fastq_maxns 0 --threads 10 --fastaout ${temp}/${prefix}${i}_merged.fasta --fastqout ${temp}/${prefix}${i}_merged.fastq --log ${log}/${prefix}${i}_mergeing.log
done
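# Note (assumption): the per-sample OTU tables produced later rely on a sample
# identifier in each read header. If your reads do not already carry one, a common
# approach is to add it during merging, e.g. by passing --sample "${prefix}${i}"
# to vsearch --fastq_mergepairs, which appends ";sample=ERR7765xx" to every
# merged read header.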

cat ${temp}/${prefix}*_merged.fastq > ${res}/merged.fq
cat ${temp}/${prefix}*_merged.fasta > ${res}/merged.fa
echo "Inspect the merged fasta and fastq files:"
ls -lh ${res}/merged.*

echo "Step_1.3: Get sample names"
./usearch11 -fastx_get_sample_names ${res}/merged.fa -output ${res}/samples.txt -threads 30

# Quality control
## fastq filtering: keep reads with an expected error rate below 1%
## use --fastq_maxee_rate 0.01, or alternatively --fastq_maxee 1.0
echo "Step_2: Quality filtering"
### vsearch version
time vsearch --fastx_filter ${res}/merged.fq --fastq_maxee_rate 0.01 --fastq_maxns 0 \
    --fastaout ${temp}/filtered_v.fa --fastaout_discarded ${temp}/discarded.fa \
    --log ${log}/filter_v.log --threads 30
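# Optional (assumption: your vsearch build provides --fastq_eestats2): inspect
# expected-error statistics of the merged reads before settling on filtering
# thresholds, e.g.
#   vsearch --fastq_eestats2 ${res}/merged.fq --output ${log}/eestats2.txt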

echo "Step_3:Dereplication and cluster otus"
# 序列去冗余,推荐使用vsearch,并添加miniuniqusize为8,去除低丰度,增加计算速度
### --minuniquesize 2 保留最小丰度reads数,建议最小设置为2,去掉所有的单次出现序列
miniquesize=2
time vsearch --derep_fulllength ${temp}/filtered_v.fa --minuniquesize ${miniquesize} --output ${temp}/derep_v.fa --uc ${temp}/derep_v.uc --sizeout
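# Optional sanity check: count the unique sequences kept after dereplication
grep -c "^>" ${temp}/derep_v.fa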

echo "Step_3.1:Discard singletons"
./usearch11 -sortbysize ${temp}/derep_v.fa -fastaout ${temp}/sorted.fa -minsize 2

# Generate OTUs by clustering
echo "Step_4: UPARSE-OTU"
echo "Step_4.1: Make 97% OTUs and filter chimeras"
./usearch11 -cluster_otus ${temp}/sorted.fa -otus ${res}/otus_1.fa -relabel Otu
./usearch11 -cluster_otus ${temp}/derep_v.fa -otus ${res}/otus_2.fa # for comparison with otus_1.fa

echo "Step_4.2: Denoise"
./usearch11 -unoise3 ${temp}/sorted.fa -zotus ${temp}/zotus_1.fa
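# Note (assumption): the zOTUs (ASVs) from unoise3 can be used in place of the 97% OTUs.
# A zOTU table can be built analogously to the OTU table step below, e.g. with usearch:
#   ./usearch11 -otutab ${res}/merged.fq -zotus ${temp}/zotus_1.fa -otutabout ${res}/zotutab.txt
# (this also requires sample annotations in the read headers; see the note after Step_1.2)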

echo "使用vsearch 聚类OTU"
time vsearch --cluster_fast ${temp}/derep_v.fa --id 0.97 --biomout ${temp}/cluster.biom. --centroids ${temp}/otus_v.fa --mothur_shared_out ${temp}/cluster.mothur --otutabout ${temp}/cluster.txt --relabel OTU_ --uc ${temp}/cluster.uc
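# Note (assumption): since derep_v.fa is dereplicated and carries ;size= annotations,
# abundance-sorted clustering is often preferred over length-sorted --cluster_fast, e.g.:
#   vsearch --cluster_size ${temp}/derep_v.fa --id 0.97 --sizein --sizeout \
#       --centroids ${temp}/otus_v.fa --relabel OTU_ --uc ${temp}/cluster.uc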

echo "去嵌合体"
# 下载参考数据库RDP
#wget http://drive5.com/uchime/rdp_gold.fa
time vsearch --uchime_ref ${temp}/otus_v.fa -db ${refdb}/gold.fa --borderline ${temp}/chimeric.sequences.borderline --chimeras ${temp}/chimeric.sequence --nonchimeras ${res}/otus_v.fa
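# Note (assumption): if no reference database is available, de novo chimera detection
# is an alternative; it needs ;size= abundance annotations on the input (e.g. add
# --sizein --sizeout to the clustering step above):
#   vsearch --uchime_denovo ${temp}/otus_v.fa --nonchimeras ${res}/otus_v.fa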

echo "创建OTUs表"
time vsearch -usearch_global ${temp}/filtered_v.fa --db ${res}/otus_v.fa --id 0.97 --strand plus --alnout ${res}/shr.aln --biomout ${res}/Otutable.biom --mothur_shared_out ${res}/Otutable.mothur --otutabout ${res}/Otutable.txt --uc ${res}/Output.uc --threads 10
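# Quick look at the resulting table: header line plus the first few OTUs
head -n 5 ${res}/Otutable.txt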

echo "OTU QC: Tight clusters"
./usearch11 -cluster_fast ${temp}/otus_v.fa -id 0.97 -maxaccepts 4 -maxrejects 128 -top_hit_only -uc ${res}/hits.uc -centroids ${res}/new_otus_1.fa
echo "To check for hits and see the identities"
grep "^H" ${res}/hits.uc | cut -f4 | sort -g

echo " 物种注释"
## 下载注释参考数据库

#cd databases/
#wget https://www.drive5.com/sintax/rdp_16s_v16.fa.gz
#gunzip rdp_16s_v16.fa.gz
#cd ../
time vsearch --usearch_global ${res}/otus_v.fa --db ${refdb}/silva_16s_v123.fa --biomout ${res}/taxonomy_out.biom --mothur_shared_out ${res}/taxonomy_out.mothur --otutabout ${res}/taxonomy_out.txt --id 0.97 --threads 10
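# Alternative (assumption: using the rdp_16s_v16.fa downloaded above): vsearch also
# implements the SINTAX classifier, which reports per-rank confidence values, e.g.:
#   vsearch --sintax ${res}/otus_v.fa --db ${refdb}/rdp_16s_v16.fa \
#       --tabbedout ${res}/sintax_taxonomy.txt --sintax_cutoff 0.8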

echo ""
printf "最终的OTUs表:%s\n" ${res}/Otutable.txt
printf "最终的OTU序列:%s\n" ${res}/otus_v.fa
printf "最终的OTU注释:%s\n" ${res}/taxonomy_out.txt

echo "接下来:进化树,Alpha,Beta多样性分析以及后续操作"
# End
echo "End"

 
