【生信】用USEARCH 进行Alpha,Beta,进化树等分析操作

基于USEARCH 进行Alpha,Beta,进化树等分析操作

上回收到,使用USEARCH处理完双端测序数据,生成OTU丰度表及代表性序列。这里,继续使用USEARCH进行微生物的Alpha、Beta、进化树等分析操作。主要分为两个部分,第一部分为处理双端测序的脚本修正,第二部分为Alpha、Beta、进化树等分析操作分析脚本。

第一部分:

#!/bin/bash

### 使用 vsearch+usearch11+QIIME 处理双端测序数据
### 目标 OTU表和注释信息

## 目录
# temp/: 临时文件目录
# res/: 结果文件目录
# data/:双端测序数据集
# database/:参考数据库
# err/: 错误信息
# log/: 操作日志

## 文件
HOME=$HOME/Lab_2		### 操作空间
temp=${HOME}/temp2	   	### 临时文件存放目录
res=${HOME}/res2		### 结果文件存放目录
data=${HOME}/seq		### 测序数据存放位置
refdb=${HOME}/database		### 参考测序数据库
err=${HOME}/err2		### 错误信息
log=${HOME}/log2		### 日志信息


########################################################################################### 
# 实验准备                                                                                #
# 实验数据为已合并为fasta格式的双端测序序列                                               #
# 安装usearch11、vsearch和qiime                                                           #
# 下载Greengene最新数据库,320MB                                                          #
# wget -c ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz     #
# 解压数据包后大小3.4G                                                                    #
# tar xvzf gg_13_8_otus.tar.gz                                                            #
###########################################################################################

echo "实验开始"
cd ${HOME}

echo "双端测序数据合并"
#echo "Step_1.1:decompress paired reads sequences"
#gunzip ${data}/*/*.fastq.gz		### 每个样品存放于一个目录中

prefix='ERR7765'			### 样本序号前缀
### merged.fastq 合并后的FASTQ格式reads
### merged.fasta 合并后的FASTA格式reads
### 55 样本编号末尾起始值
### 69 样本编号末尾结束值

echo "Step_1.2:Merge paired reads"
for i in `seq 55 69`;do
./usearch11 -fastq_mergepairs ${data}/${prefix}${i}/${prefix}${i}_1.fastq -reverse ${data}/${prefix}${i}/${prefix}${i}_2.fastq -fastqout ${temp}/${i}.merged.fastq -fastaout ${temp}/${i}.merged.fasta
done

echo "Step_2:Quality filtering"
for i in `seq 55 69`;do
./usearch11 -fastq_filter ${temp}/${i}.merged.fastq -fastq_maxee 1.0 -fastaout ${temp}/${i}.filtered.fasta -threads 2
done
cat ${temp}/*.filtered.fasta > ${res}/merged.fasta
#cat ${temp}/*.filtered.fastq > ${res}/merged.fastq
echo ""
ls -lh ${res}/merged.fasta
echo ""

echo "Step_3:Dereplication"
## 方式1
./usearch11 -fastx_uniques ${res}/merged.fasta -fastaout ${res}/unique.fasta --minuniquesize 2 -threads 2 -sizeout
## 方式2
# vsearch --derep_fulllength ${res}/filtered_v.fasta --output ${res}/unique_v.fa --minuniquesize 2 --threads 10 --sizeout

echo "Step_4:Discard singletons"
./usearch11 -sortbysize ${res}/unique.fasta -fastaout ${res}/discard.fasta -minsize 2

### 聚类OTU ###
echo "Step_5:UPARSE-OTU"
## 采用步骤3中的方式1的执行结果
./usearch11 -cluster_otus ${res}/unique.fasta -otus ${res}/otus_u51.fasta -uparseout ${res}/uparse.1.txt -relabel Otu -threads 10
## 采用步骤3中的执行结果
./usearch11 -cluster_otus ${res}/discard.fasta -otus ${res}/otus_u52.fasta -uparseout ${res}/uparse.2.txt -relabel Otu -threads 10
## Denoise 降噪得到的OTU
./usearch11 -unoise3 ${res}/unique.fasta -zotus ${res}/zotus_51.fasta -threads 30

## 采用QIIME的方式
echo "Picking OTUs"
pick_otus.py -i ${res}/unique.fasta -o ${res} -m uclust -r ${refdb}/gg_13_8_otus/rep_set/97_otus.fasta --threads 30
echo "Picking a representative sequence set, one sequence from each OTU"
pick_rep_set.py -i ${res}/unique_otus.txt -f ${res}/unique.fasta -o ${res}/rep_set1.fasta



### 统计fasta文件中序列的数量 ###
### 查看OTU数量  (分别为三种方式聚类得到的OTU数量以及一个降噪后得到的OTU数量) ###
echo "Number of OTU:"
grep '>' -c ${res}/otus_u51.fasta
grep '>' -c ${res}/otus_u52.fasta
grep '>' -c ${res}/zotus_51.fasta

echo "Step_6:Construct OTU table"
### Input should be reads before quality filtering and before discarding low-abundance unique sequences
### Reads must have sample indenfiers in the labels.
### The search database is the OTU (or ZOTU) sequences, which must have valid OTU identifiers in the labels.
### suggest making OTU tables for both 97% OTUs and denoised sequences (ZOTUs).
./usearch11 -otutab ${res}/merged.fasta -otus ${res}/otus_u51.fasta -otutabout ${res}/otutab_u61.txt -mapout ${res}/map_61.txt -threads 15
./usearch11 -otutab ${res}/merged.fasta -zotus ${res}/zotus_51.fasta -otutabout ${res}/zotutab_61.txt -mapout ${res}/zmap_61.txt -threads 15

# 基于数据库去嵌合体
echo "Step_7:Chimera detection"
otusfile='otus_u52.fasta'	### 选择上文中产生的OTU table
reffile='gold.fa'		### 选择一个参考数据库
printf "OTU表名:%s ,参考数据库名:%s \n"${otusfile} ${reffile}
### 基于所选择的参考数据库比对去除已知序列的嵌合体 ###
./usearch11 -uchime2_ref ${res}/${otusfile} -db ${refdb}/${reffile} -chimeras ${res}/otus_chimeras.fasta -notmatched ${res}/otus_rdp.fasta -uchimeout ${res}/otus_rdp.uchime -strand plus -mode sensitive -threads 30

### 获取嵌合体的序列ID ###
echo "Get sequenceID of chimeric:"
grep '>' ${res}/otus_chimeras.fasta|sed 's/>//g' > ${res}/otus_chimeras.id
### 剔除嵌合体的序列 ###
filter_fasta.py -f ${res}/${otusfile} -o ${res}/otus_non_chimera.fasta -s ${res}/otus_chimeras.id -n

### 检查是否为预期的序列数量
echo "Check whether it is the expected number of sequences:"
grep '>' -c ${res}/otus_non_chimera.fasta

# 去除非细菌序列
echo "Step_8:Removal of non-bacterial sequences"
### 下载Greengene最新数据库,320MB
### wget -c ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz
### 解压数据包后大小3.4G
### tar xvzf gg_13_8_otus.tar.gz
### 将OTU与97%相似聚类的代表性序列多序列比对,大约8min
time align_seqs.py -i ${res}/otus_non_chimera.fasta -t ${refdb}/gg_13_8_otus/rep_set_aligned/97_otus.fasta -o ${res}/aligned/

# 无法比对细菌的数量
echo "Number of incomparable bacteria:"
grep -c '>' ${res}/aligned/otus_non_chimera_failures.fasta 

# 获得不像细菌的OTU ID
echo "Obtaining OTU IDs that are not like bacteria:"
grep '>' ${res}/aligned/otus_non_chimera_failures.fasta|cut -f 1 -d ' '|sed 's/>//g' > ${res}/aligned/otus_non_chimera_failures.id

# 过滤非细菌序列
filter_fasta.py -f ${res}/otus_non_chimera.fasta -o ${res}/otus_rdp_align.fasta -s ${res}/aligned/otus_non_chimera_failures.id -n

# 查看我们现在还有多少OTU
echo "How much is left of OTUs?:"
grep '>' -c ${res}/otus_rdp_align.fasta

# 产生代表性序列和OTU表
echo "Step_9:Generate representative sequences and OTU tables"
# 重命名OTU,这就是最终版的代表性序列,即Reference(可选)
echo "Step_9.1:Rename OTU"
awk 'BEGIN {n=1}; />/ {print ">OTU_" n; n++} !/>/ {print}' ${res}/otus_rdp_align.fasta > ${res}/rep_seqs.fasta

# 生成OTU表
echo "Step_9.2:Generate OTU tables"
./usearch11 -usearch_global ${res}/merged.fasta -db ${res}/rep_seqs.fasta -otutabout ${res}/otu_table.txt -biomout ${res}/otu_table.biom -strand plus -id 0.97 -threads 30
#less ${res}/otu_table.txt

# OTU物种注释,表操作
### 物种注释
### 必须添加rdp_classifier.jar的路径到PATH中,否则可能报错
### 如果没有rdp_classifier,可从如下地址下载,下载后进行解压
### https://jaist.dl.sourceforge.net/project/rdp-classifier/rdp-classifier/rdp_classifier_2.2.zip
echo "Step_10:Species annotation"
#RDP_Classifier_Path=$HOME/miniconda3/share/rdp_classifier-2.2-1/rdp_classifier.jar
#echo "export RDP_JAR_PATH=${RDP_Classifier_Path}" >> $HOME/.bashrc
echo "export RDP_JAR_PATH=$HOME/database/rdp_classifier_2.2/rdp_classifier-2.2.jar" >> $HOME/.bashrc
source $HOME/.bashrc
assign_taxonomy.py -i ${res}/rep_seqs.fasta -r ${refdb}/gg_13_8_otus/rep_set/97_otus.fasta -t ${refdb}/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt -m rdp -o ${res}
#echo "Align the representative sequence set"
#align_seqs.py  -i ${res}/rep_seqs.fasta -t ${refdb}/gg_13_8_otus/rep_set_aligned/core_set_aligned.fasta.imputed -o ${res}/pynast_aligned/
#echo "Filtering the alignment prior to tre building and removing positions which are all gaps, or not useful for phylogenetic inference"
#filter_alignment.py -i ${res}/pynast_aligned/rep_seqs_aligned.fasta -o ${res}/filtered_alignment/ --remove_outliers
#echo "Building a phylogenetic tree"
#make_phylogeny.py -i ${res}/filtered_alignment/rep_seqs_aligned_pfiltered.fasta -o ${res}/rep_phylo.tre
#echo "Building an OTU table"
#make_otu_table.py -i ${res}/unique_otus.txt -t ${refdb}/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt -o ${res}/otu_table_non_chimeric.biom

### OTU表统计、格式转换、添加信息
echo "Step_11:OTU Table Statistics, Format Conversion, Adding Information"
### 文本OTU表转换为BIOM:方便操作
biom convert -i ${res}/otu_table.txt -o ${res}/otu_table.biom --table-type="OTU table" --to-json
### 添加物种信息至OTU表最后一列,命名为taxonomy
biom add-metadata -i ${res}/otu_table.biom --observation-metadata-fp ${res}/rep_seqs_tax_assignments.txt -o ${res}/otu_table_tax.biom --sc-separated taxonomy --observation-header OTUID,taxonomy
### 转换biom为txt格式,带有物种注释:人类可读
biom convert -i ${res}/otu_table_tax.biom -o ${res}/otu_table_tax.txt --to-tsv --header-key taxonomy
### 查看OTU表的基本信息:样品,OUT数量统计
biom summarize-table -i ${res}/otu_table_tax.biom -o ${res}/otu_table_tax.sum
#less ${res}/otu_table_tax.sum

# OTU表筛选
echo "Step_12:Screen of OTU table"
# 按样品数据量过滤:选择counts>3000的样品
filter_samples_from_otu_table.py -i ${res}/otu_table_tax.biom -o ${res}/otu_table2.biom -n 3000
# 查看过滤后结果:只有15个样品,1129个OTU
echo "The result of screen step1:"
biom summarize-table -i ${res}/otu_table2.biom
# 按OTU丰度过滤:选择相对丰度均值大于万分之一的OTU
filter_otus_from_otu_table.py --min_count_fraction 0.0001 -i ${res}/otu_table2.biom -o ${res}/otu_table3.biom
# 查看过滤后结果:只有15个样品,444个OTU
echo "The result of screen step2:"
biom summarize-table -i ${res}/otu_table3.biom
# 转换最终biom格式OTU表为文本OTU表格
biom convert -i ${res}/otu_table3.biom -o ${res}/otu_table3.txt --table-type="OTU table" --to-tsv
# OTU表格式调整方便R读取
sed -i '/# Const/d;s/#OTU //g;s/ID.//g' ${res}/otu_table3.txt
# 筛选最终OTU表中对应的OTU序列
filter_fasta.py -f ${res}/rep_seqs.fasta -b ${res}/otu_table3.biom -o ${res}/rep_seqs4.fasta

printf "最终的OTU表:%s\t最终的OTU序列:%s\n"${res}/otu_table_tax.txt ${res}/rep_seqs.fasta

# End
echo "接下来:进化树,Alpha,Beta多样性分析以及后续操作"
echo "End"

第二部分:

#!/bin/bash
## 使用USEARCH 进行Alpha,Beta,进化树等分析操作
## 目录要求按照SeqDP_UQ_1.sh准备

## 实验前准备
cd $HOME/Lab_2

## 目录
res=$HOME/res
temp=$HOME/temp
err=$HOME/err
log=$HOME/log
refdb=$HOME/database

echo "Step_1:Alpha diversity"
#mkdir alpha/
alpha=$HOME/alpha

echo " calculate default metrics"
./usearch11 -alpha_div ${res}/otutab.txt -output ${alpha}/alpha.txt

echo " calculate Gini-Simpson index"
./usearch11 -alpha_div ${res}/otutable.txt -output ${alpha}/gini.txt -metrics gini_simpson

echo ” calculate Chao1 and Berger-Parker indexes“
./usearch11 -alpha_div ${res}/otutable.txt -output ${alpha}/alpha.txt -metrics chao1,berger_parker

echo " alpha_div_rare command"
./usearch11 -alpha_div_rare ${res}/otutab.txt -output ${alpha}/rare.txt
echo "Step_2:Make OTU tree"
./usearch11 -calc_distmx ${res}/otus.fa -tabbedout ${alpha}/distmx.txt
./usearch11 -cluster_aggd ${alpha}/distmx.txt -treeout ${alpha}/otus.tree

echo "Step_3:Beta diversity"
#mkdir beta/
beta=$HOME/beta

./usearch11 -beta_div ${res}/otutab.txt -tree ${alpha}/otus.tree -filename_prefix ${beta}/

echo " calculate all supported beta metrics and write results in current directory"
./usearch11 -beta_div ${res}/otutable.txt

echo " calculate all supported beta metrics and write output in directory /results/beta"
./usearch11 -beta_div ${res}/otutable.txt -filename_prefix ${beta}/

echo " calculate Jaccard only and write results in current directory"
./usearch11 -beta_div ${res}/otutable.txt -metrics jaccard

echo " calculate Jaccard and Bray-Curtis and write results in current directory"
./usearch11 -beta_div ${res}/otutable.txt -metrics jaccard,bray_curtis

echo "Step_4:Rarefaction"
./usearch11 -alpha_div_rare ${res}/otutab.txt -output ${alpha}/alpha_rare.txt

echo "Step_5:Predict taxonomy"
./usearch11 -sintax ${res}/otus.fa -db ${refdb}/rdp_its_v2.fa -strand both -tabbedout ${res}/sintax.txt -sintax_cutoff 0.8

echo "Step_6:Taxonomy summary reports"
./usearch11 -sintax_summary ${res}/sintax.txt -otutabin ${res}/otutab.txt -rank g -output ${res}/genus_summary.txt
./usearch11 -sintax_summary ${res}/sintax.txt -otutabin ${res}/otutab.txt -rank p -output ${res}/phylum_summary.txt

 

你可能感兴趣的:(数据挖掘与分析策略,学习历程记录)