mcc算法的核心思想是suffix link(后缀连接)和head/tail的概念。所谓结点X的suffix link指向的结点Y,指的是如果从根结点出发到X结点终止时的字符串等于xW(其中小写字母x表示单个字符,W表示一个字符串),那么从根结点出发到Y结点终止时的字符串等于W。head[i]指的是后缀树T中,Suffix[i]与所有后缀共享的前缀中最长的前缀。
mcc算法实现:
import java.util.LinkedList; import java.util.List; /** * * Build Suffix Tree using McCreight Algorithm * * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/) * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) * * @author ljs * 2011-07-03 * */ public class McCreightAlgorithm { private class SuffixNode { private String text; private List<SuffixNode> children = new LinkedList<SuffixNode>(); private SuffixNode link; private int start; private int end; private int pathlen; public SuffixNode(String text,int start,int end,int pathlen){ this.text = text; this.start = start; this.end = end; this.pathlen = pathlen; } public SuffixNode(String text){ this.text = text; this.start = -1; this.end = -1; this.pathlen = 0; } public int getLength(){ if(start == -1) return 0; else return end - start + 1; } public String getString(){ if(start != -1){ return this.text.substring(start,end+1); }else{ return ""; } } public boolean isRoot(){ return start == -1; } public String getCoordinate(){ return "[" + start+".." + end + "/" + this.pathlen + "]"; } public String toString(){ return getString() + "(" + getCoordinate() + ",link:" + ((this.link==null)?"N/A":this.link.getCoordinate()) + ",children:" + children.size() +")"; } } private class State{ private SuffixNode u; //parent(head) private SuffixNode w; //s(head[i-1]) private SuffixNode v; //head[i-1] private int j; //the global index of text starting from 0 to text.length() private boolean finished; //is this suffix insertion finished? } private SuffixNode root; private String text; public McCreightAlgorithm(String text){ this.text = text; } //build a suffix-tree for a string of text private void buildSuffixTree() throws Exception{ if(root==null){ root = new SuffixNode(text); root.link = root; //link to itself } SuffixNode u = root; SuffixNode v = root; State state = new State(); for(int i=0;i<text.length();i++){ //process each suffix SuffixNode s = u.link; int uvLen=v.pathlen - u.pathlen; if(u.isRoot() && !v.isRoot()){ uvLen--; } int j = s.pathlen + i; //init state state.u = s; state.w = s; //if uvLen = 0 state.v = s; state.j = j; state.finished = false; //execute fast scan if(uvLen > 0) { fastscan(state,s,uvLen,j); } //establish the suffix link with v SuffixNode w = state.w; v.link = w; //execute slow scan if(!state.finished){ j = state.j; state.u = w; //w must be an internal node when state.finished=false, then it must have a suffix link, so u can be updated. slowscan(state,w,j); } u = state.u; v = state.v; } } //slow scan until head(=state.v) is found private void slowscan(State state,SuffixNode currNode,int j){ boolean done = false; int keyLen = text.length() - j; for(int i=0;i<currNode.children.size();i++){ SuffixNode child = currNode.children.get(i); //use min(child.key.length, key.length) int childKeyLen = child.getLength(); int len = childKeyLen<keyLen?childKeyLen:keyLen; int delta = 0; for(;delta<len;delta++){ if(text.charAt(j+delta) != text.charAt(child.start+delta)){ break; } } if(delta==0){//this child doesn't match any character with the new key //order keys by lexi-order if(text.charAt(j) < text.charAt(child.start)){ //e.g. child="e" (currNode="abc") // abc abc // / \ =========> / | \ // e f insert "c^" c^ e f int pathlen = text.length() - j + currNode.pathlen; SuffixNode node = new SuffixNode(text,j,text.length()-1,pathlen); currNode.children.add(i,node); //state.u = currNode; //currNode is already registered as state.u, so commented out state.v = currNode; state.finished = true; done = true; break; }else{ //key.charAt(0)>child.key.charAt(0) //don't forget to add the largest new key after iterating all children continue; } }else{//current child's key partially matches with the new key if(delta==len){ if(keyLen>childKeyLen){ //suffix tree with ^ ending can't have other two cases //e.g. child="ab" // ab ab // / \ ==========> / | \ // e f insert "abc^" c^ e f //recursion state.u = child; j += childKeyLen; state.j = j; slowscan(state,child,j); } }else{//0<delta<len //e.g. child="abc" // abc ab // / \ ==========> / \ // e f insert "abd^" c d^ // / \ // e f //insert the new node: ab int nodepathlen = child.pathlen - (child.getLength()-delta); SuffixNode node = new SuffixNode(text, child.start,child.start + delta - 1,nodepathlen); node.children = new LinkedList<SuffixNode>(); int tailpathlen = (text.length() - (j + delta)) + nodepathlen; SuffixNode tail = new SuffixNode(text, j+delta,text.length()-1,tailpathlen); //update child node: c child.start += delta; if(text.charAt(j+delta)<text.charAt(child.start)){ node.children.add(tail); node.children.add(child); }else{ node.children.add(child); node.children.add(tail); } //update parent currNode.children.set(i, node); //state.u = currNode; //currNode is already registered as state.u, so commented out state.v = node; state.finished = true; } done = true; break; } } if(!done){ int pathlen = text.length() - j + currNode.pathlen; SuffixNode node = new SuffixNode(text,j,text.length()-1,pathlen); currNode.children.add(node); //state.u = currNode; //currNode is already registered as state.u, so commented out state.v = currNode; state.finished = true; } } //fast scan until w is found private void fastscan(State state,SuffixNode currNode,int uvLen,int j){ for(int i=0;i<currNode.children.size();i++){ SuffixNode child = currNode.children.get(i); if(text.charAt(child.start) == text.charAt(j)){ int len = child.getLength(); if(uvLen==len){ //then we find w //uvLen = 0; //need slow scan after this child state.u = child; state.w = child; state.j = j+len; }else if(uvLen<len){ //branching and cut child short //e.g. child="abc",uvLen = 2 // abc ab // / \ ================> / \ // e f suffix part: "abd^" c d^ // / \ // e f //insert the new node: ab; child is now c int nodepathlen = child.pathlen - (child.getLength()-uvLen); SuffixNode node = new SuffixNode(text, child.start,child.start + uvLen - 1,nodepathlen); node.children = new LinkedList<SuffixNode>(); int tailpathlen = (text.length() - (j + uvLen)) + nodepathlen; SuffixNode tail = new SuffixNode(text, j+uvLen,text.length()-1,tailpathlen); //update child node: c child.start += uvLen; if(text.charAt(j+uvLen)<text.charAt(child.start)){ node.children.add(tail); node.children.add(child); }else{ node.children.add(child); node.children.add(tail); } //update parent currNode.children.set(i, node); //uvLen = 0; //state.u = currNode; //currNode is already registered as state.u, so commented out state.w = node; state.finished = true; state.v = node; }else{//uvLen>len //e.g. child="abc", uvLen = 4 // abc // / \ ================> // e f suffix part: "abcdefg^" // // //jump to next node uvLen -= len; state.u = child; j += len; state.j = j; fastscan(state,child,uvLen,j); } break; } } } //for test purpose only public void printTree(){ System.out.format("The suffix tree for S = %s is: %n",this.text); this.print(0, this.root); } private void print(int level, SuffixNode node){ for (int i = 0; i < level; i++) { System.out.format(" "); } System.out.format("|"); for (int i = 0; i < level; i++) { System.out.format("-"); } //System.out.format("%s(%d..%d/%d)%n", node.getString(),node.start,node.end,node.pathlen); System.out.format("(%d,%d)%n", node.start,node.end); for (SuffixNode child : node.children) { print(level + 1, child); } } public static void main(String[] args) throws Exception { //test suffix-tree System.out.println("****************************"); String text = "xbxb^"; //the last char must be unique! McCreightAlgorithm stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); System.out.println("****************************"); text = "mississippi^"; stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); System.out.println("****************************"); text = "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA^"; stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); System.out.println("****************************"); text = "ABCDEFGHIJKLMNOPQRSTUVWXYZ^"; stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); System.out.println("****************************"); text = "AAAAAAAAAAAAAAAAAAAAAAAAAA^"; stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); System.out.println("****************************"); text = "minimize"; //the last char e is different from other chars, so it is ok. stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); System.out.println("****************************"); //the example from McCreight's: A Space-Economical Suffix Tree Construction Algorithm text = "bbbbbababbbaabbbbbc^"; stree = new McCreightAlgorithm(text); stree.buildSuffixTree(); stree.printTree(); } }
测试输出:
**************************** The suffix tree for S = xbxb^ is: |(-1,-1) |-(4,4) |-(1,1) |--(4,4) |--(2,4) |-(0,1) |--(4,4) |--(2,4) **************************** The suffix tree for S = mississippi^ is: |(-1,-1) |-(11,11) |-(1,1) |--(11,11) |--(8,11) |--(2,4) |---(8,11) |---(5,11) |-(0,11) |-(8,8) |--(10,11) |--(9,11) |-(2,2) |--(4,4) |---(8,11) |---(5,11) |--(3,4) |---(8,11) |---(5,11) **************************** The suffix tree for S = GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA^ is: |(-1,-1) |-(15,15) |--(16,16) |---(17,17) |----(18,18) |-----(35,35) |------(36,36) |-------(37,37) |--------(38,38) |---------(39,39) |----------(40,40) |-----------(41,41) |------------(42,42) |-------------(43,43) |--------------(44,44) |---------------(45,45) |----------------(46,46) |-----------------(47,47) |------------------(48,48) |-------------------(49,49) |--------------------(50,50) |---------------------(51,51) |----------------------(52,53) |----------------------(53,53) |---------------------(53,53) |--------------------(53,53) |-------------------(53,53) |------------------(53,53) |-----------------(53,53) |----------------(53,53) |---------------(53,53) |--------------(53,53) |-------------(53,53) |------------(53,53) |-----------(53,53) |----------(53,53) |---------(53,53) |--------(53,53) |-------(53,53) |------(53,53) |-----(19,53) |-----(53,53) |----(19,53) |----(53,53) |---(19,53) |---(53,53) |--(19,19) |---(27,27) |----(32,53) |----(28,29) |-----(32,53) |-----(30,53) |---(20,20) |----(25,53) |----(21,53) |--(53,53) |-(12,12) |--(15,15) |---(16,53) |---(26,53) |--(13,13) |---(22,53) |---(14,53) |-(0,0) |--(22,22) |---(32,53) |---(23,23) |----(29,29) |-----(32,53) |-----(30,53) |----(24,53) |--(12,12) |---(15,15) |----(16,53) |----(26,53) |---(13,13) |----(22,53) |----(14,53) |--(1,1) |---(12,53) |---(2,2) |----(12,53) |----(3,3) |-----(12,53) |-----(4,4) |------(12,53) |------(5,5) |-------(12,53) |-------(6,6) |--------(12,53) |--------(7,7) |---------(12,53) |---------(8,8) |----------(12,53) |----------(9,9) |-----------(12,53) |-----------(10,10) |------------(12,53) |------------(11,53) |-(53,53) **************************** The suffix tree for S = ABCDEFGHIJKLMNOPQRSTUVWXYZ^ is: |(-1,-1) |-(0,26) |-(1,26) |-(2,26) |-(3,26) |-(4,26) |-(5,26) |-(6,26) |-(7,26) |-(8,26) |-(9,26) |-(10,26) |-(11,26) |-(12,26) |-(13,26) |-(14,26) |-(15,26) |-(16,26) |-(17,26) |-(18,26) |-(19,26) |-(20,26) |-(21,26) |-(22,26) |-(23,26) |-(24,26) |-(25,26) |-(26,26) **************************** The suffix tree for S = AAAAAAAAAAAAAAAAAAAAAAAAAA^ is: |(-1,-1) |-(0,0) |--(1,1) |---(2,2) |----(3,3) |-----(4,4) |------(5,5) |-------(6,6) |--------(7,7) |---------(8,8) |----------(9,9) |-----------(10,10) |------------(11,11) |-------------(12,12) |--------------(13,13) |---------------(14,14) |----------------(15,15) |-----------------(16,16) |------------------(17,17) |-------------------(18,18) |--------------------(19,19) |---------------------(20,20) |----------------------(21,21) |-----------------------(22,22) |------------------------(23,23) |-------------------------(24,24) |--------------------------(25,26) |--------------------------(26,26) |-------------------------(26,26) |------------------------(26,26) |-----------------------(26,26) |----------------------(26,26) |---------------------(26,26) |--------------------(26,26) |-------------------(26,26) |------------------(26,26) |-----------------(26,26) |----------------(26,26) |---------------(26,26) |--------------(26,26) |-------------(26,26) |------------(26,26) |-----------(26,26) |----------(26,26) |---------(26,26) |--------(26,26) |-------(26,26) |------(26,26) |-----(26,26) |----(26,26) |---(26,26) |--(26,26) |-(26,26) **************************** The suffix tree for S = minimize is: |(-1,-1) |-(7,7) |-(1,1) |--(4,7) |--(2,7) |--(6,7) |-(0,1) |--(2,7) |--(6,7) |-(2,7) |-(6,7) **************************** The suffix tree for S = bbbbbababbbaabbbbbc^ is: |(-1,-1) |-(19,19) |-(5,5) |--(12,19) |--(6,6) |---(7,19) |---(9,10) |----(11,19) |----(16,19) |-(0,0) |--(5,5) |---(12,19) |---(6,6) |----(7,19) |----(9,19) |--(1,1) |---(5,5) |----(12,19) |----(6,19) |---(2,2) |----(5,5) |-----(12,19) |-----(6,19) |----(3,3) |-----(5,19) |-----(4,4) |------(5,19) |------(18,19) |-----(18,19) |----(18,19) |---(18,19) |--(18,19) |-(18,19)
参考资料:
EDWARD M. McCREIGHT, Journal of the Association for Computing Machinery, Vol 23, No. 2, April 1976, A Space-Economical Suffix Tree Construction Algorithm