该脚本的目的与之前的 Blast结果处理 相同,只不过这次改用 Python 的正则表达式,定义了一个名为 parse_blast() 的函数来进行处理,感觉比之前的做法更简单一些。
所用的测试数据与之前相同,其部分内容如下所示:
# BLASTP 2.7.1+
# Query: Q9Q6P4
# Database: blastdatabase/ElmMotif.fasta
# 0 hits found
# BLASTP 2.7.1+
# Query: P08668
# Database: blastdatabase/ElmMotif.fasta
# 0 hits found
# BLASTP 2.7.1+
# Query: NP_056788.1
# Database: blastdatabase/ElmMotif.fasta
# Fields: query acc., subject acc., score, evalue, % identity, % query coverage per subject, % positives
# 2 hits found
NP_056788.1 PF01115-1023-1041-1666 38 2.4 57.143 2 71.43
NP_056788.1 PF01267-1023-1041-1665 38 2.4 57.143 2 71.43
# BLASTP 2.7.1+
# Query: L0N5F9
# Database: blastdatabase/ElmMotif.fasta
# Fields: query acc., subject acc., score, evalue, % identity, % query coverage per subject, % positives
# 19 hits found
L0N5F9 PF00069-19-31-1128 40 1.4 100.000 1 100.00
L0N5F9 PF01115-986-1003-1656 38 3.3 60.000 1 90.00
L0N5F9 PF01267-986-1003-1655 38 3.3 60.000 1 90.00
L0N5F9 PF01115-986-1003-1654 38 3.3 60.000 1 90.00
L0N5F9 PF01267-986-1003-1653 38 3.3 60.000 1 90.00
L0N5F9 PF01115-982-999-1652 38 3.3 60.000 1 90.00
L0N5F9 PF01267-982-999-1651 38 3.3 60.000 1 90.00
L0N5F9 PF01115-982-999-1650 38 3.3 60.000 1 90.00
L0N5F9 PF01267-982-999-1649 38 3.3 60.000 1 90.00
L0N5F9 PF01115-982-999-1648 38 3.3 60.000 1 90.00
L0N5F9 PF01267-982-999-1647 38 3.3 60.000 1 90.00
L0N5F9 PF01115-982-999-1646 38 3.3 60.000 1 90.00
L0N5F9 PF01267-982-999-1645 38 3.3 60.000 1 90.00
L0N5F9 PF01857-106-127-943 35 8.8 66.667 1 88.89
L0N5F9 PF02991-246-256-2047 34 9.9 75.000 1 87.50
L0N5F9 PF02991-246-256-2045 34 9.9 75.000 1 87.50
L0N5F9 PF02991-246-256-2044 34 9.9 75.000 1 87.50
L0N5F9 PF02991-246-256-2043 34 9.9 75.000 1 87.50
L0N5F9 PF02991-246-256-2031 34 9.9 75.000 1 87.50
# BLASTP 2.7.1+
# Query: YP_081514.1
# Database: blastdatabase/ElmMotif.fasta
# 0 hits found
# BLASTP 2.7.1+
# Query: P03188
# Database: blastdatabase/ElmMotif.fasta
# 0 hits found
# 用正则表达式处理 Blast 的结果。
# Blast 结果中每条 hit 的各列依次为: 'query acc., subject acc., score, evalue, % identity, % query coverage per subject, % positives'
import re
# Header of one BLAST tabular record that reports at least one hit.
# Records with "0 hits found" carry no "# Fields:" line, so they never match.
_BLAST_HEADER_RE = re.compile(
    r'^# (BLASTP.+)\n'          # BLAST program and version
    r'# Query:\s*(.+)\n'        # query as written in the report header
    r'# Database:\s*(.+)\n'     # database named when BLAST was run
    r'# Fields:\s*.+\n'
    r'# (\d+)\s*hits found\n',  # number of hits announced for this query
    re.MULTILINE,
)

# Columns of the table written by parse_blast().
_OUTPUT_COLUMNS = (
    'Blast version', 'Query', 'Database', 'Hits found', 'Subject', 'Score',
    'Evalue', 'Identity', 'Query coverage per subject', 'Positives',
)

# A hit line holds: query acc., subject acc., score, evalue, % identity,
# % query coverage per subject, % positives.
_HIT_FIELD_COUNT = 7


def parse_blast(blastResult, outfile):
    """Flatten a commented BLAST tabular report into a tab-separated table.

    The report is read as a sequence of records. Each record that has a
    "# Fields:" line (i.e. at least one hit) contributes one output row per
    hit line; records with "0 hits found" are skipped. The hit lines of a
    record are the lines between its "# N hits found" line and the next line
    starting with "#" (or the end of the file).

    Hit lines are split on whitespace instead of being matched with a regex
    built from the query text, so:

    * query names containing regex metacharacters (".", "(", "[") are safe;
    * a query whose name is a suffix of another query's name no longer
      picks up the other query's hits;
    * e-values in scientific notation (e.g. "2e-05") are accepted;
    * a final hit line without a trailing newline is still read.

    Args:
        blastResult: Path of the BLAST result file to read.
        outfile: Path of the tab-separated file to write (overwritten).

    Returns:
        None. A header row plus one row per hit is written to ``outfile``.
        If the number of hit lines found for a record differs from the count
        announced in its header, a message is printed and that record
        contributes no rows.
    """
    with open(blastResult) as in_f:
        content = in_f.read()

    with open(outfile, 'w') as out_f:
        out_f.write('\t'.join(_OUTPUT_COLUMNS) + '\n')

        for header in _BLAST_HEADER_RE.finditer(content):
            blast_version, blast_query, blast_database, blast_hits = (
                group.strip() for group in header.groups()
            )

            # The header match ends right after a newline, so searching for
            # "\n#" from that newline finds the next comment line even when
            # it follows the header immediately.
            body_start = header.end()
            next_comment = content.find('\n#', body_start - 1)
            body_end = len(content) if next_comment == -1 else next_comment + 1

            hit_rows = []
            for line in content[body_start:body_end].splitlines():
                fields = line.split()
                if len(fields) == _HIT_FIELD_COUNT:
                    hit_rows.append(fields)

            if int(blast_hits) != len(hit_rows):
                print('脚本出错,Hits Found 的数目和实际匹配上的个数不一致!')
                continue

            prefix = [blast_version, blast_query, blast_database, blast_hits]
            for fields in hit_rows:
                # fields[0] is the query accession, already covered by prefix.
                out_f.write('\t'.join(prefix + fields[1:]) + '\n')
调用方式:
infile = '' ## Path of the BLAST result file used as input (fill in before running).
outfile = '' ## Path of the output file that receives the parsed table.
parse_blast(blastResult=infile,outfile=outfile)