建议在 Jupyter 中实践,Python 版本为 3.9
class Index(object):
    """Sorted k-mer index over a text, searched with binary search."""

    def __init__(self, t: str, k: int) -> None:
        """Build the index from every substring of ``t`` of length ``k``.

        Args:
            t: Text to index.
            k: Length of each indexed substring (k-mer).
        """
        # k-mer length; query() uses it to cut the probe out of a pattern.
        self.k = k
        # (k-mer, offset) pairs in sorted order, so equal k-mers are adjacent
        # and listed by increasing offset. When ``t`` is shorter than ``k``
        # the range is empty and so is the index.
        self.index = sorted((t[i:i + k], i) for i in range(len(t) - k + 1))

    def query(self, p: str) -> list[int]:
        """Return the offsets in the text where the first k-mer of ``p`` occurs.

        Args:
            p: Pattern; only its first ``self.k`` characters are looked up.

        Returns:
            Offsets in increasing order, or an empty list when the k-mer is
            not in the index (which includes any ``p`` shorter than ``k``).
        """
        # Imported here so the class works even when the surrounding file or
        # notebook cell has not imported bisect.
        import bisect

        kmer = p[:self.k]
        # Offsets are never negative, so (kmer, -1) sorts just before the
        # first entry for this k-mer and (kmer, inf) just after the last one.
        lo = bisect.bisect_left(self.index, (kmer, -1))
        hi = bisect.bisect_right(self.index, (kmer, float('inf')), lo)
        return [offset for _, offset in self.index[lo:hi]]
def queryIndex(p, t, index):
    """Return the offsets in ``t`` at which the whole pattern ``p`` occurs.

    ``index.query(p)`` supplies candidate offsets; a candidate is kept only
    when the part of ``p`` after its first ``index.k`` characters equals the
    corresponding slice of ``t``.

    Args:
        p: Pattern to search for.
        t: Text that ``index`` was built from.
        index: Object exposing ``k`` and ``query(p)``.

    Returns:
        List of matching offsets, in the order ``index.query`` yields them.
    """
    kmer_len = index.k
    pattern_len = len(p)
    # The tail is the same for every candidate, so slice it once.
    tail = p[kmer_len:]
    return [
        hit
        for hit in index.query(p)
        if t[hit + kmer_len:hit + pattern_len] == tail
    ]
# Test 1: look up an 11-base pattern using a 4-mer index.
test_seq = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
pattern_seq = 'GGTATTCGGGA'
index = Index(t=test_seq, k=4)
print(queryIndex(p=pattern_seq, t=test_seq, index=index))
# Expected output: [21, 68] -- the pattern occurs at two offsets.
# Test 2: look up a 5-base pattern using a 4-mer index.
test_seq = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
pattern_seq = 'ACTTG'
index = Index(t=test_seq, k=4)
print(queryIndex(p=pattern_seq, t=test_seq, index=index))
# Expected output: [0] -- the pattern occurs once, at the start of the sequence.
生信算法1 - DNA测序算法实践之序列操作
生信算法2 - DNA测序算法实践之序列统计