去除测序reads中的接头:adaptor

之前用c写过一个程序,查找reads中是否包含了adaptor,如果检测到的话就过滤掉含有adaptor的reads,这次在过滤完数据之后发现接头序列比较多,为了提升组装效果,又不能很大地影响数据量,需要对接头进行截断处理,并过滤过短的reads,用python写了一个简短的程序,指定超过3个错配以内的匹配都认为匹配到,并且长度小于50bp的reads过滤,在以下程序基础上添加传入参数,可以适用比较多的情况(单端、双端、含有single等):

 1 import sys
 2 import re
 3 from Bio import SeqIO
 4 
 5 def rmPE(read1,read2,adaptor1,adaptor2,min_length):
 6     res_1 = rmSE(read1,adaptor1,min_length)
 7     res_2 = rmSE(read2,adaptor2,min_length)
 8     if res_1 and res_2:
 9         return res_1,res_2
10     else:
11         return False
12 
13 def rmSE(read,adaptor,min_length):
14     seq = read.seq
15     seed_len = 6
16     a_len = len(adaptor)
17     seq_len = len(seq)
18     for i in range(a_len - seed_len):
19         seed = adaptor[i:i+seed_len]
20         pos = 0
21         while(pos < seq_len):
22             find_pos = seq.find(seed,pos)
23             if find_pos > 0:
24                 mistaken_count = 0
25                 _b = find_pos
26                 _e = find_pos + seed_len
27                 while(_b >= 0 and i >= find_pos - _b):
28                     if adaptor[i - find_pos + _b] != seq[_b]:
29                         mistaken_count += 1
30                     if mistaken_count > 3:
31                         break
32                     _b -= 1
33                 else :
34                     while(_e < seq_len and i - find_pos + _e < a_len):
35                         if adaptor[ i - find_pos + _e ] != seq[_e]:
36                             mistaken_count += 1
37                         if mistaken_count > 3:
38                             break
39                         _e += 1
40                     else:
41                         if find_pos - i > min_length:
42                             return  read[:find_pos-i]
43                         else :
44                             return False
45                 pos = find_pos + 1
46             else:
47                 break
48     return read
49 
50 def rmAdaptor(argv):
51     argv.pop(0)
52     type = argv.pop(0)
53     if type=='PE':
54         read1_file,read2_file,adaptor1,adaptor2,out_prefix,min_length = argv
55         read2_records = SeqIO.parse(open(read2_file),'fastq')
56         read1_out = open( '%s.1.fq'%out_prefix,'w' )
57         read2_out = open( '%s.2.fq'%out_prefix,'w' )
58         for read1 in SeqIO.parse(open(read1_file),'fastq'):
59             read2 = read2_records.next()
60             rmPE_res = rmPE(read1,read2,adaptor1,adaptor2,min_length)
61             if rmPE_res:
62                 read1_out.write(rmPE_res[0].format('fastq'))
63                 read2_out.write(rmPE_res[1].format('fastq'))
64     elif type=='SE':
65         reads_file,adaptor,out_prefix,min_length = argv
66         reads_out = open( '%s.single.fq'%out_prefix,'w' )
67         for reads in SeqIO.parse(open(reads_file),'fastq'):
68             rmSE_res = False
69             if re.search('[\s\/](\d)',reads.id).group(1) == '1':
70                 rmSE_res = rmSE(reads,adaptor1,min_length)
71             elif re.search('[\s\/](\d)',reads.id).group(1) == '2':
72                 rmSE_res = rmSE(reads,adaptor2,min_length)
73             if rmSE_res:
74                 reads_out.write(rmSE_res.format('fastq'))
75 
76 if __name__ == '__main__':
77     rmAdaptor(sys.argv)

 

你可能感兴趣的:(read)