使用Diamond比对NR数据库获取物种注释

 之前用Kraken2注释宏基因组的contig,发现只有30%左右可以被Kraken2注释

Kraken2+Bracken:宏基因组物种注释-CSDN博客

不信邪,再用NR库试试 

参考:

将NR数据库diamond比对结果做物种注释_diamond 物种注释-CSDN博客

NR下载

nohup wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz &
wget -c https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5
#检查下载的完整性(一样的话就是完整了)
md5sum nr.gz  ### 899ac219cac213c60fede9c3d9ef8f7b  nr.gz
cat nr.gz.md5 ### 899ac219cac213c60fede9c3d9ef8f7b  nr.gz
nohup gunzip  nr.gz  &
mv nr  nr.fa

###下载NR物种相关信息和taxid信息
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5
#检查完整性
md5sum prot.accession2taxid.gz  #32b8ca7ea712e161c72af69135fc938e
cat prot.accession2taxid.gz.md5 #32b8ca7ea712e161c72af69135fc938e
md5sum taxdump.tar.gz           #26558b800bc1b3795c25d1f0ead65412
cat taxdump.tar.gz.md5          #26558b800bc1b3795c25d1f0ead65412  
#解压
tar -zxvf taxdump.tar.gz
gunzip prot.accession2taxid.gz
#移动需要的文件
cd ~
mkdir .taxonkit
cp *.dmp ~/.taxonkit

 必要软件下载

conda create -n NR_database_search
conda activate NR_database_search
conda install -c bioconda taxonkit=0.15.0
conda install -c bioconda diamond=2.1.8
conda install -c bioconda csvtk=0.28.0

diamond建库 

nohup diamond makedb --threads 180 --in nr.fa --db NR_2023_07_23 &

diamond比对

diamond blastx --db NR_2023_07_23.dmnd --query nucleic_reads.fna\
-o nucleic_matches_fmt6.txt --threads 180 --evalue 0.00001 \
--max-target-seqs 5 --outfmt 6

diamond blastp --db NR_2023_07_23.dmnd --query protein_reads.fna\
-o protein_matches_fmt6.txt --threads 180 --evalue 0.00001 \
--max-target-seqs 5 --outfmt 6
## --outfmt 6 最好别改变

 这些参数可以调整diamond比对的速度or准确性

使用Diamond比对NR数据库获取物种注释_第1张图片

 这几个参数可以调整比对的coverage,identity,score

结果如下 !!!(这个表头后面python会加)

taxonkit获得物种分类信息表

感谢大佬:一文完成nt库序列快速下载及blast结果注释物种 (qq.com)

得到seqid注释之后,可以搜索注释

## 一些主要的物种编号
# 2     bacteria
# 2157  archaea
# 4751  fungi
# 10239 virus
# 2759 Eukaryota

#看taxnokit安好了么
taxonkit -h

#创建目录
cd /home/zhongpei/database/NR_2023_07_23
mkdir /home/zhongpei/database/NR_2023_07_23/NCBI_Main_tax
cd NCBI_Main_tax

#开始
taxonkit list -j 4 --ids 2,2157,4751,10239,2759 --indent "" > NCBI_Main.taxid.txt
# -j 是线程,软件说4个够了;--ids 是需要的物种编号,用逗号分隔
wc -l NCBI_Main.taxid.txt # 2708739 NCBI_Main.taxid.txt
head -n 5 NCBI_Main.taxid.txt #查看内容

# 提取taxid和taxonomy(界门纲目科属种)的对应信息到NCBI_Main.taxid.txt
less NCBI_Main.taxid.txt | taxonkit reformat -I 1 -r Unassigned -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}"| sed '1i\Taxid\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\tStrain' > NCBI_Main.taxid_new.txt
# -I 1 一个制表符分隔;-r 没有找到的用什么字符去填充,这里用的“Unassigned” 
# -f 输出的格式;1i表示在第行之前插入文本(sed用法,不太会)

完成! 

使用Diamond比对NR数据库获取物种注释_第2张图片

vim NCBI_Main.taxid_new.txt

把第一个Taxid改成小写

seqid和taxid的对应

还记不记得第一步下载过一个 "prot.accession2taxid" ,现在要派上用场了

其实python也能做,csvtk太不熟悉了,先来学习一下吧,感觉还挺方便(这一步比较慢)

cat prot.accession2taxid | csvtk -t grep -f taxid -P NCBI_Main.taxid.txt | csvtk -t cut -f accession.version,taxid > NCBI_seqid_taxid.txt
# "cat prot.accession2taxid |" 把 prot.accession2taxid 的内容到下面的 csvtk
# -t 输入内容是制表符分隔;grep 这是csvtk的1个子命令,用于在文件中搜索匹配的行
# -P 搜索那些"taxid"字段的值出现在"NCBI_Main.taxid.txt"文件中的行
# cut 这是csvtk的1个子命令,用于从输入中选择特定的字段

NCBI_seqid_taxid.txt 就是目标文件

diamond seqid和taxid对应,再和界门纲目科属种对应

 把diamond结果文件与NCBI_seqid_taxid.txt对应

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
# ##########################################################
# match diamond blast NR result.txt and NCBI_seqid_taxid.txt
# written by PeiZhong in IFR of CAAS

import os
import argparse

parser = argparse.ArgumentParser(description='match diamond blast NR result.txt and NCBI_seqid_taxid.txt. '
'!!! all the file should end will .txt !!!')
parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your diamond result txt')
parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
'your two-column diamond results file in this folder, e.g., clean, '
'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
parser.add_argument('NCBI_acc_taxid',help='full Path that contain NCBI_seqid_taxid.txt')
args = parser.parse_args()

result_file_folder_path = args.diamond_result_folder_path
NCBI_file = args.NCBI_acc_taxid
file_mark = args.result_files_mark

os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
print(files)

db={}

print("start db read")
f_table = open("%s" % (NCBI_file), 'r')
print("start db build")
for line in f_table.readlines():
    line=line.split('\t')
    acc_num = line[0].strip()
    tax_num = line[1].strip()
    db[acc_num] = tax_num
print("finish db build")

file_ls = []
for result_file in files:
    if file_mark in result_file:
        file_ls.append(result_file)
file_ls.sort()
print(file_ls)

header = "qseqid\taccession.version\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"

for result_file in file_ls:
    print(result_file)
    f_result = open("%s" % (result_file), 'r+')
    content = f_result.read()
    f_result.seek(0, 0)
    f_result.write(header.rstrip('\r\n') + '\n' + content)
    f_result.close()
    out_name = str(result_file).strip('txt')
    out_name = out_name+'_taxid.txt'
    f_result = open("%s" % (result_file), 'r')
    f_out = open("%s" % (out_name), 'a')
    for line in f_result.readlines():
        line = line.split('\t')
        query_num = line[0].strip()
        acc_q_num = line[1].strip()
        if acc_q_num in db:
            print(query_num,end="\t",file=f_out)
            print(acc_q_num, end="\t", file=f_out)
            print(db[acc_q_num], file=f_out)
    f_out.close()
    f_result.close()
f_table.close()

使用Diamond比对NR数据库获取物种注释_第3张图片

chmod +x diamond_NR_tax.py(完整地址)
diamond_NR_tax.py(完整地址) -h
diamond_NR_tax.py(完整地址) diamond_result_folder_path result_files_mark NCBI_acc_taxid
diamond_result_folder_path:你存放上面处理完的diamond比对文件,txt结尾的文件,的目录(完整地址)
result_files_mark:这个地址中,你的这些文件独有的标识字符串
#for example,the mark of result file 'SY10_clean.txt' and 'SY11_clean.txt' is 'clean'
NCBI_acc_taxid:你的NCBI_seqid_taxid.txt文件的完整地址

使用Diamond比对NR数据库获取物种注释_第4张图片

做完结果是这样的 

使用Diamond比对NR数据库获取物种注释_第5张图片

接下来再写个python代码根据taxid把这个文件和界门纲目科属种联系起来就行(不好意思只会python,我检讨。。。。但是python简单呀)

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
# ##########################################################
# match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt
# written by PeiZhong in IFR of CAAS

import os
import argparse

parser = argparse.ArgumentParser(description='match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt. '
'!!! all the file should end will .txt !!!')
parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your result txt')
parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
'your two-column diamond results file in this folder, e.g., clean, '
'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
parser.add_argument('NCBI_taxid_tax',help='full Path that contain NCBI_Main.taxid_new.txt')
args = parser.parse_args()

result_file_folder_path = args.diamond_result_folder_path
NCBI_file = args.NCBI_taxid_tax
file_mark = args.result_files_mark

os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
print(files)

db={}

print("start db read")
f_table = open("%s" % (NCBI_file), 'r')
print("start db build")
for line in f_table.readlines():
    line=line.split('\t')
    taxid = line[0].strip()
    tax_anno = line[1:8]
    db[taxid] = tax_anno
print("finish db build")

file_ls = []
for result_file in files:
    if file_mark in result_file:
        file_ls.append(result_file)
file_ls.sort()
print(file_ls)

for result_file in file_ls:
    print(result_file)
    out_name = str(result_file).strip('txt')
    out_name = out_name+'_tax.txt'
    f_result = open("%s" % (result_file), 'r')
    f_out = open("%s" % (out_name), 'a')
    for line in f_result.readlines():
        line = line.split('\t')
        query_num = line[0].strip()
        acc_q_num = line[1].strip()
        taxid_1 = line[2].strip()
        if taxid_1 in db:
            print(query_num,end="\t",file=f_out)
            print(acc_q_num, end="\t", file=f_out)
            print(taxid_1, end="\t", file=f_out)
            tax_in_db = db[taxid_1]
            str_ls = map(str, tax_in_db)
            tax = '\t'.join(str_ls)
            print(tax, file=f_out)
    f_out.close()
    f_result.close()
f_table.close()

使用Diamond比对NR数据库获取物种注释_第6张图片

和上面一样也需要给权限

使用Diamond比对NR数据库获取物种注释_第7张图片

好了,可以交差了 !我宣布python是我们初学者的yyds

使用Diamond比对NR数据库获取物种注释_第8张图片

再把多个样本的结果结合到一起成为表格 

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
# ##########################################################
# match diamond blast NR result.txt and NCBI_seqid_taxid.txt
# written by PeiZhong in IFR of CAAS
import argparse
import os
import pandas as pd
import csv

parser = argparse.ArgumentParser(description='tax files combine')
parser.add_argument('tax_result_folder_path',help='full Path of the folder that contain your tax result txt')
parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
'your two-column diamond results file in this folder, e.g., clean, '
'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
args = parser.parse_args()

result_file_folder_path = args.tax_result_folder_path
file_mark = args.result_files_mark

path = os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)

file_ls = []
for file in files:
    if file_mark in file:
        file_ls.append(file)
file_ls.sort()
print(file_ls)

for file in file_ls:
    df = pd.read_csv('%s' % (file),sep='\t')
    df.drop_duplicates(subset='qseqid', keep='first', inplace=True)
    outname = str(file).rsplit(".", 1)[0]
    df.to_csv(outname+'_only1.txt', index=False, sep='\t')

path = os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
file2_ls = []
for file2 in files:
    if "only1" in file2:
        file2_ls.append(file2)
file2_ls.sort()
print(file2_ls)

def tax_finder(file):
    tax_ls = []
    with open(file, 'r') as f:
        for line in f.readlines():
            if "qseqid" not in line:
                tax = ';'.join(line.split('\t')[3:9])
                tax_ls.append(tax)
    db = {}
    for i in tax_ls:
        if i in db:
            db[i] += 1
        if i not in db:
            db[i] = 1
    output = str(file).rsplit(".",1)[0]+"_count1.txt"
    with open(output,"a") as f3:
        for key, value in db.items():
            key = key.strip()
            print(key, end="\t", file=f3)
            print(value, file=f3)

for file in file2_ls:
    tax_finder(file)

path = os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
file3_ls = []
for file3 in files:
    if "count1" in file3:
        file3_ls.append(file3)
file3_ls.sort()
print(file3_ls)

all_tax = {}

for file in file3_ls:
    with open("%s" % (file),"r") as f4:
        for line in f4.readlines():
            tax = line.split('\t')[0]
            all_tax[tax] = 0
print(all_tax)

ls_db_result = []

for file in file3_ls:
    with open("%s" % (file),"r") as f5:
        db_name = str(file)
        db_name = all_tax.copy()
        for line in f5.readlines():
            tax = line.split('\t')[0]
            count = line.strip("\n").split('\t')[1]
            if tax in db_name:
                db_name[tax] = count
    ls_db_result.append(db_name)

header=[]
for key in all_tax.keys():
    header.append(key)

with open('merge_overview_2.csv', 'a', encoding='utf-8', newline='') as f7:
    dictWriter = csv.DictWriter(f7,header)
    dictWriter.writeheader()
    dictWriter.writerows(ls_db_result)
f7_rows=[]
with open('merge_overview_2.csv',"r") as f7:
    for line in f7.readlines():
        f7_rows.append(line)

with open('merge_overview_3.csv',"a") as f8:
    print("",end=",",file=f8)
    print(f7_rows[0].strip("\n"),file=f8)
    row_count=0
    for i in file3_ls:
        row_count += 1
        print(i,end=",",file=f8)
        print(f7_rows[row_count].strip("\n"),file=f8)

 使用Diamond比对NR数据库获取物种注释_第9张图片 

你可能感兴趣的:(数据库)