/*
 * Text matching with a suffix tree
 *
 * A suffix tree is a special kind of trie with a very wide range of uses; one
 * of the main ones is text matching. Like KMP and similar algorithms, it is a
 * classic example of trading space for time. The main difference between
 * matching with a suffix tree and other pattern-matching algorithms such as KMP
 * and Boyer-Moore is that the suffix-tree algorithm preprocesses the text T,
 * whereas KMP preprocesses the pattern P. Suffix trees are therefore typically
 * used when the text is static and the pattern changes, while KMP-style
 * algorithms are used when the text changes and the pattern is static. Let n be
 * the length of T and m the length of P; usually m < n. Preprocessing costs
 * O(n) for a suffix tree (with a linear-time construction algorithm; the
 * brute-force construction used below is quadratic in the worst case) and O(m)
 * for KMP and Boyer-Moore. After preprocessing, however, KMP-style algorithms
 * need O(n) per match while suffix-tree matching needs only O(m), which is a
 * remarkable efficiency.
 *
 * The suffix tree in this article is built by brute force, much like building a
 * Patricia trie (a compressed prefix tree). The advantage of storing the suffix
 * tree as a Patricia trie is that the storage of a Patricia trie depends only
 * on the number of words (because of compression), whereas the storage of an
 * ordinary trie depends on the total length of the words (no compression). The
 * total length of all suffixes of a text is n + (n-1) + ... + 1 = n(n+1)/2, so
 * storing them in an ordinary trie takes O(n^2) space, while a Patricia trie
 * needs only O(n) nodes, where n is the number of suffixes. A suffix tree
 * stored without compression is called a suffix trie rather than a suffix tree.
 * In general, compressed storage is the most basic requirement for a suffix
 * tree.
 *
 * In the implementation below the suffix tree is built as a Patricia trie.
 * Besides the Patricia-trie key, each node stores minStartIndex, the smallest
 * index in the text at which that node's key starts, which makes it easy to
 * report the position of a successful match. For practical purposes the leaves
 * do not need to store a value. Apart from having no delete operation (the text
 * is static and is never modified), building the tree (insert) and matching
 * (find) differ little from a Patricia-trie implementation.
 *
 * Implementation:
 */
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/**
 * Suffix-Tree String Pattern Matching (Building tree using brute-force)
 *
 * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
 *
 * <p>Typical use: construct with the text, call {@link #buildSuffixTree()} once,
 * then call {@link #find(String)} for each pattern.
 *
 * @author ljs
 * 2011-06-27
 */
public class SuffixTree {

    /**
     * A node of the compressed trie. {@code key} is the label of the edge that
     * leads into the node (empty for the root).
     */
    private static class SuffixNode {
        private String key;

        // Kept sorted by the first character of each child's key. insert() only
        // adds a child when no existing child starts with the same character,
        // so first characters are unique among siblings.
        private List<SuffixNode> children = new ArrayList<SuffixNode>();

        // True when an inserted suffix ends exactly at this node (printed as "#").
        private boolean terminal;

        // Smallest text index at which this node's key starts, taken over all
        // suffixes inserted through this node; -1 for the root.
        private int minStartIndex;

        public SuffixNode() {
            this.key = "";
            this.minStartIndex = -1;
        }

        public SuffixNode(String key) {
            this.key = key;
        }

        @Override
        public String toString() {
            return this.key + "[" + this.minStartIndex + "]"
                    + (this.terminal ? "#" : "") + "(" + children.size() + ")";
        }
    }

    private SuffixNode root;
    private String text;

    /**
     * Creates a matcher for {@code text}. The tree itself is not built until
     * {@link #buildSuffixTree()} is called.
     *
     * @param text the text to search in
     */
    public SuffixTree(String text) {
        this.text = text;
    }

    /**
     * Looks up {@code pattern} among the inserted suffixes.
     *
     * @param pattern the substring to search for
     * @return the smallest recorded start index of a suffix that begins with
     *         {@code pattern}, or -1 if there is no match, if {@code pattern} is
     *         null or empty, or if the tree has not been built
     */
    public int find(String pattern) {
        if (pattern == null || pattern.length() == 0 || root == null) {
            return -1;
        }

        SuffixNode node = root;
        // Number of pattern characters matched so far; equals the total key
        // length on the path from the root to 'node'.
        int consumed = 0;

        while (true) {
            SuffixNode next = null;
            char want = pattern.charAt(consumed);

            for (SuffixNode child : node.children) {
                char edgeFirst = child.key.charAt(0);
                if (want < edgeFirst) {
                    // Children are sorted, so no later sibling can match either.
                    return -1;
                }
                if (want > edgeFirst) {
                    continue;
                }

                // First characters agree: this is the only candidate edge.
                int matched = commonPrefixLength(child.key, pattern, consumed);
                if (consumed + matched == pattern.length()) {
                    // Pattern ends on this edge (at its end or in the middle).
                    // minStartIndex is where this node's key starts, so step
                    // back over the characters matched on the way down.
                    return child.minStartIndex - consumed;
                }
                if (matched < child.key.length()) {
                    // Mismatch in the middle of the edge.
                    return -1;
                }
                next = child;
                break;
            }

            if (next == null) {
                return -1;
            }
            consumed += next.key.length();
            node = next;
        }
    }

    /**
     * Returns how many leading characters of {@code edge} equal the characters
     * of {@code s} starting at offset {@code from}.
     */
    private static int commonPrefixLength(String edge, String s, int from) {
        int limit = Math.min(edge.length(), s.length() - from);
        int n = 0;
        while (n < limit && edge.charAt(n) == s.charAt(from + n)) {
            n++;
        }
        return n;
    }

    /** Creates a terminal node with the given key and start index. */
    private static SuffixNode newLeaf(String key, int startIndex) {
        SuffixNode leaf = new SuffixNode(key);
        leaf.terminal = true;
        leaf.minStartIndex = startIndex;
        return leaf;
    }

    /**
     * Splits {@code node}'s key at offset {@code at}: the node keeps the first
     * {@code at} characters and becomes non-terminal, and a new single child
     * takes over the rest of the key together with the node's former children
     * and terminal flag.
     */
    private static void splitEdge(SuffixNode node, int at) {
        SuffixNode tail = new SuffixNode(node.key.substring(at));
        tail.terminal = node.terminal;
        tail.children = node.children;
        // The tail starts 'at' characters after the start of the original key.
        tail.minStartIndex = node.minStartIndex + at;

        node.key = node.key.substring(0, at);
        node.terminal = false;
        node.children = new ArrayList<SuffixNode>();
        node.children.add(tail);
    }

    /**
     * Inserts {@code key} below {@code currNode}, walking down the tree
     * iteratively.
     *
     * @param currNode   node to start from
     * @param key        non-empty key to insert
     * @param startIndex text index at which {@code key} starts
     * @throws IllegalStateException if {@code key} is already present as a
     *                               terminal key
     */
    private void insert(SuffixNode currNode, String key, int startIndex) {
        SuffixNode node = currNode;
        // Offset into 'key' of the part that still has to be placed.
        int from = 0;

        while (true) {
            List<SuffixNode> siblings = node.children;
            int remaining = key.length() - from;
            int pos = startIndex + from;
            char first = key.charAt(from);

            // Find the child sharing the first character, or the sorted slot
            // where a new child belongs.
            SuffixNode sameEdge = null;
            int slot = 0;
            for (; slot < siblings.size(); slot++) {
                SuffixNode child = siblings.get(slot);
                char edgeFirst = child.key.charAt(0);
                if (first == edgeFirst) {
                    sameEdge = child;
                    break;
                }
                if (first < edgeFirst) {
                    break;
                }
            }

            if (sameEdge == null) {
                // No child shares a prefix: add a new leaf at its sorted
                // position (slot == size appends at the end).
                siblings.add(slot, newLeaf(key.substring(from), pos));
                return;
            }

            int shared = commonPrefixLength(sameEdge.key, key, from);

            if (shared < sameEdge.key.length()) {
                // Only part of the edge matches: split it at the divergence.
                splitEdge(sameEdge, shared);
                sameEdge.minStartIndex = Math.min(sameEdge.minStartIndex, pos);
                if (shared == remaining) {
                    // The new key ends exactly at the split point.
                    sameEdge.terminal = true;
                } else {
                    // The new key continues with a different character: add it
                    // as a sibling of the split-off tail, keeping sort order.
                    SuffixNode tail = sameEdge.children.get(0);
                    SuffixNode leaf = newLeaf(key.substring(from + shared), pos + shared);
                    int where = leaf.key.charAt(0) < tail.key.charAt(0) ? 0 : 1;
                    sameEdge.children.add(where, leaf);
                }
                return;
            }

            // The whole edge matches.
            if (shared == remaining) {
                if (sameEdge.terminal) {
                    throw new IllegalStateException("Duplicate Key is found when insertion!");
                }
                sameEdge.terminal = true;
                sameEdge.minStartIndex = Math.min(sameEdge.minStartIndex, pos);
                return;
            }

            // The key is longer than the edge: record the index and descend.
            sameEdge.minStartIndex = Math.min(sameEdge.minStartIndex, pos);
            node = sameEdge;
            from += shared;
        }
    }

    /**
     * Inserts one suffix into the tree, creating the root on first use. A null
     * or empty suffix is ignored.
     *
     * @param suffix     the suffix to insert
     * @param startIndex index in the text at which {@code suffix} starts
     * @throws Exception declared for backward compatibility; inserting a suffix
     *                   that is already present raises an
     *                   {@link IllegalStateException}
     */
    public void insert(String suffix, int startIndex) throws Exception {
        if (suffix == null || suffix.length() == 0) {
            return;
        }
        if (root == null) {
            root = new SuffixNode();
        }
        insert(root, suffix, startIndex);
    }

    /**
     * Builds the suffix tree for the text given to the constructor by inserting
     * every suffix in turn. Any existing tree is discarded first, so the method
     * can be called more than once.
     *
     * @throws Exception declared for backward compatibility
     */
    public void buildSuffixTree() throws Exception {
        // Start from scratch; otherwise a second call would re-insert every
        // suffix and fail on the first duplicate.
        root = null;
        for (int i = 0; i < text.length(); i++) {
            this.insert(text.substring(i), i);
        }
    }

    /** Prints the tree to standard output; for test purposes only. */
    public void printTree() {
        if (this.root == null) {
            // Nothing has been inserted (e.g. empty text): nothing to print.
            return;
        }
        this.print(0, this.root);
    }

    /** Prints {@code node} at the given depth, followed by its subtree. */
    private void print(int level, SuffixNode node) {
        StringBuilder prefix = new StringBuilder();
        for (int i = 0; i < level; i++) {
            prefix.append(" ");
        }
        prefix.append("|");
        for (int i = 0; i < level; i++) {
            prefix.append("-");
        }
        System.out.format("%s%s[%s]%s%n",
                prefix, node.key, node.minStartIndex, node.terminal ? "#" : "");
        for (SuffixNode child : node.children) {
            print(level + 1, child);
        }
    }

    /** Runs {@link #find(String)} and prints the outcome to standard output. */
    public void testFind(String pattern) {
        int index = this.find(pattern);
        if (index != -1) {
            System.out.format("Found pattern \"%s\" at: %s%n", pattern, index);
        } else {
            System.out.format("Found no such pattern: \"%s\"%n", pattern);
        }
    }

    public static void main(String[] args) throws Exception {
        // test suffix-tree
        System.out.println("****************************");
        String text = "minimize";
        SuffixTree strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.printTree();

        System.out.println("****************************");
        text = "mississippi";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        strie.printTree();

        String pattern = "iss";
        strie.testFind(pattern);
        pattern = "ip";
        strie.testFind(pattern);
        pattern = "pi";
        strie.testFind(pattern);
        pattern = "miss";
        strie.testFind(pattern);
        pattern = "tt";
        strie.testFind(pattern);
        pattern = "si";
        strie.testFind(pattern);
        pattern = "ssi";
        strie.testFind(pattern);
        pattern = "sissippi";
        strie.testFind(pattern);
        pattern = "ssippi";
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "After a long text, here's a needle ZZZZZ";
        pattern = "ZZZZZ";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        //strie.printTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "The quick brown fox jumps over the lazy dog.";
        pattern = "lazy";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        //strie.printTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...";
        pattern = "tempor";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        //strie.printTree();
        strie.testFind(pattern);

        System.out.println("****************************");
        text = "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA";
        pattern = "GCAGAGAG";
        strie = new SuffixTree(text);
        strie.buildSuffixTree();
        //strie.printTree();
        strie.testFind(pattern);
    }
}

/*
 * Test output:
 *
 * ****************************
 * |[-1]
 *  |-e[7]#
 *  |-i[1]
 *   |--mize[4]#
 *   |--nimize[2]#
 *   |--ze[6]#
 *  |-mi[0]
 *   |--nimize[2]#
 *   |--ze[6]#
 *  |-nimize[2]#
 *  |-ze[6]#
 * ****************************
 * |[-1]
 *  |-i[1]#
 *   |--ppi[8]#
 *   |--ssi[2]
 *    |---ppi[8]#
 *    |---ssippi[5]#
 *  |-mississippi[0]#
 *  |-p[8]
 *   |--i[10]#
 *   |--pi[9]#
 *  |-s[2]
 *   |--i[4]
 *    |---ppi[8]#
 *    |---ssippi[5]#
 *   |--si[3]
 *    |---ppi[8]#
 *    |---ssippi[5]#
 * Found pattern "iss" at: 1
 * Found pattern "ip" at: 7
 * Found pattern "pi" at: 9
 * Found pattern "miss" at: 0
 * Found no such pattern: "tt"
 * Found pattern "si" at: 3
 * Found pattern "ssi" at: 2
 * Found pattern "sissippi" at: 3
 * Found pattern "ssippi" at: 5
 * ****************************
 * Found pattern "ZZZZZ" at: 35
 * ****************************
 * Found pattern "lazy" at: 35
 * ****************************
 * Found pattern "tempor" at: 73
 * ****************************
 * Found pattern "GCAGAGAG" at: 23
 */