Trie后缀树Python简单实现

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on Dec 17, 2012


@author: honghe
'''


import pickle


class TrieNode(object):
    '''A single trie node: an occurrence counter plus its outgoing edges.
    '''

    def __init__(self):
        # Number of times the string represented by this node has occurred.
        # It starts at 1 because Trie.add creates a node on the first
        # occurrence and only increments it on later visits.
        self.count = 1
        # Maps the next element of a sequence to the child TrieNode.
        self.children = dict()
        
class Trie(object):
    '''Trie that records how often each inserted prefix has been seen.

    Every node reached while inserting a sequence stands for one prefix of
    that sequence; its ``count`` is the number of insertions that passed
    through it.
    '''

    def __init__(self):
        # The root stands for the empty prefix; add() never updates its count.
        self.root = TrieNode()

    def add(self, sequence):
        '''Insert ``sequence`` element by element.

        A missing child is created (a fresh node carries its own initial
        count); an existing child has its count incremented by one.
        '''
        current = self.root
        for symbol in sequence:
            child = current.children.get(symbol)
            if child is None:
                # First time this prefix is seen: create the node and
                # leave its initial count untouched.
                child = TrieNode()
                current.children[symbol] = child
            else:
                child.count += 1
            current = child

    def countSeq(self, sequence):
        '''Return how many times ``sequence`` has been seen as a prefix.

        Returns 0 as soon as an element has no matching child. For an empty
        ``sequence`` the loop does not run and the root node's count is
        returned.
        '''
        current = self.root
        for symbol in sequence:
            try:
                current = current.children[symbol]
            except KeyError:
                return 0
        return current.count
    
def gen_trie(input_file, output_file):
    '''Build a suffix trie from a text file and pickle it.

    Each line of ``input_file`` is stripped of surrounding whitespace,
    terminated with '$', and every suffix of the result is added to one
    shared Trie. The finished trie is written to ``output_file`` with
    pickle and also returned.

    The input is opened with open()'s defaults (no explicit encoding).

    NOTE(review): pickle serializes nested objects recursively, so very
    long lines (deep tries) may hit the interpreter's recursion limit
    when dumping; confirm against realistic input.
    '''
    suffix_trie = Trie()

    with open(input_file) as source:
        for raw_line in source:
            # The trailing '$' marks the end of the line, which makes it
            # possible to tell whether a path is a complete suffix.
            terminated = raw_line.strip() + '$'
            for start, _ in enumerate(terminated):
                suffix_trie.add(terminated[start:])

    with open(output_file, 'wb') as sink:
        pickle.dump(suffix_trie, sink)

    return suffix_trie
  
if __name__ == '__main__':
    # Script entry point: build the suffix trie from data.txt and store
    # the pickled result in data.pkl.
    txt, pkl = 'data.txt', 'data.pkl'
    t = gen_trie(txt, pkl)

你可能感兴趣的:(python)