双数组trie树

搜索树包括B-树、B+树、Trie树等以及它们的各种变形。用Trie树搜索一个关键码的时间与关键码本身及其长度有关:最快是O(1),即在第一层即可判断是否搜索到;最坏的情况是O(n),其中n为Trie树的层数。

Trie树的缺点是占内存,因为Trie树的大多数结点分支很少。为了减少空间浪费,可以用两个数组来表示Trie树,也就是双数组Trie树。

一、构建:

双数组trie树_第1张图片

首先对词表中所有出现的10个汉字进行编码:啊-1,阿-2,唉-3,根-4,胶-5,拉-6,及-7,廷-8,伯-9,人-10。对于每一个汉字,需要确定一个base值,使得所有以该汉字开头的词在双数组中都能放下。例如,现在要确定“阿”字的base值,假设以“阿”开头的词的第二个字的序列码依次为a1,a2,a3……an,我们必须找到一个值i,使得base[i+a1],check[i+a1],base[i+a2],check[i+a2]……base[i+an],check[i+an]均为0(即这些位置都还没有被占用)。一旦找到了这个i,“阿”的base值就确定为i。用这种方法构建双数组Trie(Double-Array Trie),经过四次遍历,将所有的词语放入双数组中,然后还要再遍历一遍词表,修改base值:因为我们用负的base值表示该位置对应一个完整的词语,所以如果状态i对应某一个词,而且base[i]=0,那么令base[i]=(-1)*i;如果base[i]的值不是0,那么令base[i]=(-1)*base[i]。

双数组trie树_第2张图片

双数组trie树_第3张图片

二、查询:

双数组trie树_第4张图片

 

# -*- coding:utf-8 -*-

import json
import sys
import codecs 

class Trie:
    """Prefix tree built from nested dicts.

    Every node is a dict that maps one character to the dict of the child
    node. A node at which an inserted word ends additionally carries the
    key 'is_word' with the value True.
    """

    def __init__(self):
        """Create an empty trie: the root node starts without children."""
        self.root = {}

    def _walk(self, prefix):
        """Follow prefix from the root.

        Returns the node reached after consuming every element of prefix,
        or None as soon as one element has no matching child.
        """
        node = self.root
        for s in prefix:
            if s not in node:
                return None
            node = node[s]
        return node

    def insert(self, word):
        """Insert word, creating any missing nodes along its path."""
        node = self.root
        for s in word:
            # setdefault returns the existing child or installs a new one.
            node = node.setdefault(s, {})
        node['is_word'] = True

    def search(self, word):
        """Return True if exactly word was inserted, False otherwise."""
        node = self._walk(word)
        return node is not None and 'is_word' in node

    def startsWith(self, prefix):
        """Return True if at least one inserted word starts with prefix."""
        return self._walk(prefix) is not None

    def findTips(self, prefix):
        """Return a list of every inserted word that starts with prefix.

        The list is empty when no inserted word has that prefix, so the
        result can always be iterated.
        """
        node = self._walk(prefix)
        if node is None:
            return []
        tips = []
        self.loopfindTips(node, prefix, tips)
        return tips

    def loopfindTips(self, node, prefix, tips):
        """Append to tips every word stored at or below node.

        prefix is the string spelled by the path from the root to node.
        """
        if node.get('is_word'):
            tips.append(prefix)
            # No early return: longer words may continue below this node.
        for s in node:
            if s == 'is_word':
                # The end-of-word marker maps to True, not to a child node.
                continue
            self.loopfindTips(node[s], prefix + s, tips)

# Your Trie object will be instantiated and called as such:
if __name__ == "__main__":
    # Small demo: store three place names, then print every stored word
    # that starts with the prefix u"北京".
    obj = Trie()
    obj.insert(u"北京长城")
    obj.insert(u"北京故宫")
    obj.insert(u"上海")
    for i in obj.findTips(u"北京"):
        # Call form with a single argument works on Python 2 and Python 3.
        print(i)
#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
  
using namespace std;  
  
// Number of child slots per node; slot i is reached through the letter 'a' + i.
const int branchNum = 26;

// One node of the trie: an end-of-word flag plus a fixed table of children.
struct TrieNode
{
    bool isStr;                 // true when an inserted word ends at this node
    TrieNode *next[branchNum];  // child per letter, NULL when that child is absent

    // A fresh node ends no word and has no children.
    TrieNode()
    {
        isStr = false;
        for (int slot = 0; slot < branchNum; ++slot) {
            next[slot] = NULL;
        }
    }
};
  
class Trie  
{  
public:  
    Trie();  
    void insert(const char *word);  
    bool search(const char *word);  
    void deleteTrie(TrieNode *root);  
    vector findTips(const char *prefix); //here  
    void findTips(TrieNode *root , string track , vector &tips); //here  
private:  
    TrieNode *root;  
};  
  
// Builds an empty trie. The first node (the root) is unused as a letter:
// words begin at its children.
Trie::Trie()
    : root(new TrieNode())
{
}
  
void Trie::insert(const char *word)  
{  
    TrieNode *location = root;  
    while(*word) {  
        if(location->next[*word - 'a'] == NULL) {  
            TrieNode *tmp = new TrieNode();  
            location->next[*word - 'a'] = tmp;  
        }  
        location = location->next[*word - 'a'];  
        word++;  
    }  
    location->isStr = true;  
}  
  
bool Trie::search(const char *word)  
{  
    TrieNode *location = root;  
    while(*word && location) {  
        location = location->next[*word - 'a'];  
        word ++;  
    }  
    return (location != NULL && location->isStr);  
}  
  
// Deletes the node passed in and, recursively, every node below it.
// A NULL argument is accepted and does nothing.
// No pointer that referred to the deleted nodes is reset here, so the
// caller must not use the subtree afterwards.
void Trie::deleteTrie(TrieNode *root)
{
    if (root == NULL) {
        return;
    }
    for (int i = 0; i < branchNum; i++) {
        // Absent children are NULL and return immediately from the guard above.
        deleteTrie(root->next[i]);
    }
    delete root;
}
  
vector Trie::findTips(const char *prefix)  
{  
    vector tips;  
    string track(prefix);  
    cout<< track<next[prefix[i] - 'a'];  
    }  
    if(node == NULL && i != strlen(prefix))  
      return tips; //empty  
      
  
    findTips(node, track , tips);  
    return tips;  
}  
  
void Trie::findTips(TrieNode *root,string track , vector &tips) //track长度最多为最长字符串长度,  
{  
  
    if(root == NULL)  
        return;  
  
    if(root->isStr)  
        tips.push_back(track);  
  
    for(int i = 0 ; i < branchNum ; i++) {  
        findTips(root->next[i] , track + (char)('a'+i) , tips);  
    }  
}  
  
// Demo: stores four words, checks one with search(), then prints how many
// stored words start with "ab" followed by the words themselves.
int main()
{
    Trie t;
    t.insert("a");
    t.insert("abandon");
    t.insert("abandoned");
    t.insert("abashed");
    if (t.search("abashed")) {
        std::cout << "true" << std::endl;
    }

    std::vector<std::string> tips = t.findTips("ab");
    std::cout << "ab" << " tips are :(" << tips.size() << ")" << std::endl;
    for (std::vector<std::string>::size_type i = 0; i < tips.size(); ++i) {
        std::cout << tips[i] << std::endl;
    }
    return 0;
}

参考:

https://wenku.baidu.com/view/2f1c8b18ed630b1c58eeb528.html

https://wenku.baidu.com/view/fcf1c56a561252d380eb6e1d.html

https://wenku.baidu.com/view/71e82437f111f18583d05a54.html?rec_flag=default&mark_pay_doc=2&mark_rec_page=1&mark_rec_position=5&mark_rec=view_r_1&clear_uda_param=1

你可能感兴趣的:(nlp)