Ukkonen算法构建后缀树的实现:
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
*
* Build Suffix Tree using Ukkonen Algorithm
*
* Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
* Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
*
* @author ljs
* 2011-07-10
*
*/
public class Ukkonen {
/**
 * A node of the suffix tree together with the label of the edge leading
 * into it. The label is the slice {@code sb[start..end]} (both ends
 * inclusive) of the shared text buffer. The root is the node whose
 * {@code start} is -1; its label is empty.
 */
private class SuffixNode {
    /** Shared text buffer that {@code start} and {@code end} index into. */
    private StringBuilder sb;
    /**
     * Child nodes. Declared with an explicit element type so that code
     * iterating the list as {@code SuffixNode} compiles without casts.
     */
    private List<SuffixNode> children = new LinkedList<SuffixNode>();
    /** Suffix link; stays null until the builder assigns one. */
    private SuffixNode link;
    /** Index in {@code sb} of the first label character, or -1 for the root. */
    private int start;
    /** Index in {@code sb} of the last label character (inclusive). */
    private int end;
    /** Path length recorded for this node; 0 for the root. */
    private int pathlen;

    /**
     * Creates a node labelled {@code sb[start..end]}.
     *
     * @param sb      shared text buffer
     * @param start   index of the first label character
     * @param end     index of the last label character (inclusive)
     * @param pathlen path length to record for the node
     */
    public SuffixNode(StringBuilder sb, int start, int end, int pathlen) {
        this.sb = sb;
        this.start = start;
        this.end = end;
        this.pathlen = pathlen;
    }

    /**
     * Creates a root node: coordinates (-1,-1) and path length 0.
     *
     * @param sb shared text buffer
     */
    public SuffixNode(StringBuilder sb) {
        this.sb = sb;
        this.start = -1;
        this.end = -1;
        this.pathlen = 0;
    }

    /** Returns the number of characters in the label; 0 for the root. */
    public int getLength() {
        if (start == -1) {
            return 0;
        }
        return end - start + 1;
    }

    /** Returns the label text, or the empty string for the root. */
    public String getString() {
        if (start != -1) {
            return this.sb.substring(start, end + 1);
        }
        return "";
    }

    /** Returns true when this node carries the root marker (start == -1). */
    public boolean isRoot() {
        return start == -1;
    }

    /** Returns the coordinates formatted as {@code [start..end/pathlen]}. */
    public String getCoordinate() {
        return "[" + start + ".." + end + "/" + this.pathlen + "]";
    }

    /**
     * Returns the label, the coordinates, the coordinates of the suffix
     * link target ("N/A" when there is no link) and the child count.
     */
    @Override
    public String toString() {
        return getString() + "(" + getCoordinate()
                + ",link:" + ((this.link == null) ? "N/A" : this.link.getCoordinate())
                + ",children:" + children.size() + ")";
    }
}
//Scratch record shared between buildSuffixTree and the scan routines:
//buildSuffixTree seeds both fields with the node a scan starts from, passes
//the record to slowscan/fastscan, and reads both fields back afterwards.
private class State{
private SuffixNode u; //parent(v)
//private SuffixNode w;
private SuffixNode v; //node reported back by the scan; seeded equal to u before each scan
//private int k; //the global index of text starting from 0 to text.length()
//private boolean finished;
}
//root of the tree; stays null until buildSuffixTree is called with a non-empty text
private SuffixNode root;
//text consumed so far; the start/end coordinates of every node index into this buffer
private StringBuilder sb = new StringBuilder();
//creates an empty builder; nothing is built until buildSuffixTree is called
public Ukkonen(){
}
/**
 * Builds a suffix tree for {@code text}, consuming it one character per
 * phase. An empty text is a no-op and leaves the tree untouched.
 *
 * <p>Each phase i appends {@code text.charAt(i)} to the shared buffer and then
 * <ol>
 *   <li>lengthens every leaf created so far by one character (the implicit
 *       extensions), and</li>
 *   <li>performs explicit extensions starting at {@code j_star + 1}. The first
 *       one resumes from the {@code u} remembered from the previous phase via
 *       {@code slowscan}; the following ones jump through {@code u.link} and
 *       use {@code fastscan}. The phase ends as soon as a scan returns null
 *       (rule #3) or j passes i.</li>
 * </ol>
 *
 * <p>NOTE(review): the first leaf is always created with coordinates (0,0),
 * so this looks intended to be called once on a fresh instance - confirm
 * before reusing a builder. The sample texts in {@code main} end in a
 * character that occurs nowhere else in the text.
 *
 * @param text the text to index
 * @throws Exception declared for callers; nothing in this method's own
 *         statements throws a checked exception
 */
public void buildSuffixTree(String text) throws Exception {
    int m = text.length();
    if (m == 0) {
        return;
    }

    if (root == null) {
        root = new SuffixNode(sb);
        root.link = root; // link to itself
    }

    // Explicit element type: the implicit-extension loop below iterates this
    // list as SuffixNode, which a raw List does not allow.
    List<SuffixNode> leaves = new ArrayList<SuffixNode>();

    // add first node
    sb.append(text.charAt(0));
    SuffixNode node = new SuffixNode(sb, 0, 0, 1);
    leaves.add(node);
    root.children.add(node);

    int j_star = 0; // j_{i-1}
    SuffixNode u = root;
    SuffixNode v = root;
    for (int i = 1; i <= m - 1; i++) {
        // do phase i
        sb.append(text.charAt(i));

        // step 1: do implicit extensions
        for (SuffixNode leafnode : leaves) {
            leafnode.end++;
            leafnode.pathlen++;
        }

        // step 2: do explicit extensions until rule #3 is applied
        State state = new State();

        // for the first explicit extension, we reuse the last phase's u and do slowscan
        // also note: suffix link doesn't span two phases.
        int j = j_star + 1;
        SuffixNode s = u;
        int k = s.pathlen + j;
        state.u = s;
        state.v = s;
        SuffixNode newleaf = slowscan(state, s, k);
        if (newleaf == null) {
            // if rule #3 is applied, then we can terminate this phase
            j_star = j - 1;
            // Note: no need to update state.v because it is not going to be used
            // at the next phase
            u = state.u;
            continue;
        } else {
            j_star = j;
            leaves.add(newleaf);
            u = state.u;
            v = state.v;
        }
        j++;

        // for other explicit extensions, we start with fast scan.
        for (; j <= i; j++) {
            s = u.link;

            int uvLen = v.pathlen - u.pathlen;
            if (u.isRoot() && !v.isRoot()) {
                uvLen--;
            }
            // starting with index k of the text
            k = s.pathlen + j;

            // init state
            state.u = s;
            state.v = s; // if uvLen = 0

            // execute fast scan
            newleaf = fastscan(state, s, uvLen, k);
            // establish the suffix link with v
            v.link = state.v;

            if (newleaf == null) {
                // if rule #3 is applied, then we can terminate this phase
                j_star = j - 1;
                u = state.u;
                break;
            } else {
                j_star = j;
                leaves.add(newleaf);
                u = state.u;
                v = state.v;
            }
        }
    }
}
//slow scan from currNode until state.v is found
//return the new leaf if a new one is created right after v;
//return null otherwise (i.e. when rule #3 is applied)
//
//state    - in/out record; the branches below write the nodes to resume from into state.u / state.v
//currNode - node whose children are examined
//k        - index into sb where the part of the text still to be matched begins
//
//NOTE(review): the text of this routine is damaged and is not valid Java as it stands.
//Wherever a '<' appeared, everything up to a later '>' seems to have been dropped
//(presumably by markup stripping), fusing code with the ASCII diagrams that followed.
//The missing pieces must be restored from the original source before this can compile.
private SuffixNode slowscan(State state,SuffixNode currNode,int k){
SuffixNode newleaf = null;
boolean done = false;
//number of characters from index k to the end of the buffer
int keyLen = sb.length() - k;
//NOTE(review): truncated line - the loop header is cut off after "i", and the statements
//that define child, len, delta and childKeyLen (all used below) are missing.
for(int i=0;i / | \
// e f insert "c" c e f
int pathlen = sb.length() - k + currNode.pathlen;
SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);
currNode.children.add(i,node);
//state.u = currNode; //currNode is already registered as state.u, so commented out
state.v = currNode;
newleaf = node;
done = true;
break;
}else{ //key.charAt(0)>child.key.charAt(0)
//don't forget to add the largest new key after iterating all children
continue;
}
}else{//current child's key partially matches with the new key
if(delta==len){
if(keyLen==childKeyLen){
//e.g. child="ab"
// ab ab
// / \ =========> / \
// e f insert "ab" e f
//terminate this phase (implicit tree with rule #3)
state.u = child;
state.v = currNode;
}else if(keyLen>childKeyLen){
//TODO: still need an example to test this condition
//e.g. child="ab"
// ab ab
// / \ ==========> / | \
// e f insert "abc" c e f
//recursion
state.u = child;
state.v = child;
k += childKeyLen;
//state.k = k;
newleaf = slowscan(state,child,k);
}
//NOTE(review): truncated line - the rest of the "keyLen<..." comment and the start of its diagram are missing.
else{ //keyLen / \
// e f insert "ab" e f
//
//terminate this phase (implicit tree with rule #3)
//state.u = currNode;
state.v = currNode;
}
//NOTE(review): truncated line - the rest of the "0<..." comment and the start of its diagram are missing.
}else{//0 / \
// e f insert "abd" c d
// / \
// e f
//insert the new node: ab
int nodepathlen = child.pathlen
- (child.getLength()-delta);
SuffixNode node = new SuffixNode(sb,
child.start,child.start + delta - 1,nodepathlen);
node.children = new LinkedList();
int leafpathlen = (sb.length() - (k + delta)) + nodepathlen;
SuffixNode leaf = new SuffixNode(sb,
k+delta,sb.length()-1,leafpathlen);
//update child node: c
child.start += delta;
//NOTE(review): truncated line - the comparison and everything up to the next diagram are missing.
//The code after this point uses uvLen, which is a parameter of fastscan (see its call sites),
//so the remainder presumably belongs to fastscan, whose header and opening lines were lost
//together with the end of slowscan.
if(sb.charAt(k+delta) / \
// e f suffix part: "abd" c d
// / \
// e f
//insert the new node: ab; child is now c
int nodepathlen = child.pathlen
- (child.getLength()-uvLen);
SuffixNode node = new SuffixNode(sb,
child.start,child.start + uvLen - 1,nodepathlen);
node.children = new LinkedList();
int leafpathlen = (sb.length() - (k + uvLen)) + nodepathlen;
SuffixNode leaf = new SuffixNode(sb,
k+uvLen,sb.length()-1,leafpathlen);
//update child node: c
child.start += uvLen;
//NOTE(review): truncated line - the comparison, the code that links node/leaf/child together
//and the head of the "uvLen>len" branch are missing.
if(sb.charAt(k+uvLen)len
//e.g. child="abc", uvLen = 4
// abc
// / \ ================>
// e f suffix part: "abcde"
//
//
//jump to next node
uvLen -= len;
state.u = child;
//state.v = child;
k += len;
//state.k = k;
newleaf = fastscan(state,child,uvLen,k);
}
done = true;
break;
}
}
if(!done){
//TODO: still need an example to test this condition
//add a leaf under the currNode
int pathlen = sb.length() - k + currNode.pathlen;
SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);
currNode.children.add(node);
//state.u = currNode; //currNode is already registered as state.u, so commented out
state.v = currNode;
newleaf = node;
}
return newleaf;
}
/**
 * Prints the text consumed so far, followed by one line per node of the
 * tree. For test purposes only.
 *
 * <p>When no tree has been built yet (for instance after
 * {@code buildSuffixTree("")}) only the header line is printed.
 */
public void printTree() {
    System.out.format("The suffix tree for S = %s is: %n", this.sb);
    if (this.root == null) {
        // root is only created by a non-empty build; print() dereferences
        // its node argument, so stop here instead of failing with a
        // NullPointerException.
        return;
    }
    this.print(0, this.root);
}
/**
 * Writes one line for {@code node} and then recurses into its children in
 * list order, one level deeper. A line consists of {@code level} spaces,
 * a bar, {@code level} dashes and the node's (start,end) coordinates.
 *
 * @param level depth of {@code node}; 0 for the node the dump starts at
 * @param node  node to print
 */
private void print(int level, SuffixNode node) {
    // Assemble the whole prefix first and emit it in one go.
    StringBuilder prefix = new StringBuilder();
    for (int remaining = level; remaining > 0; remaining--) {
        prefix.append(' ');
    }
    prefix.append('|');
    for (int remaining = level; remaining > 0; remaining--) {
        prefix.append('-');
    }
    System.out.print(prefix);
    System.out.format("(%d,%d)%n", node.start, node.end);

    int childLevel = level + 1;
    for (SuffixNode child : node.children) {
        print(childLevel, child);
    }
}
/**
 * Demo driver: for each sample text prints a separator line, builds a
 * suffix tree in a fresh {@code Ukkonen} instance and dumps it.
 *
 * @param args unused
 * @throws Exception propagated from {@code buildSuffixTree}
 */
public static void main(String[] args) throws Exception {
    String[] samples = {
        // the last char must be unique!
        "xbxb^",
        "mississippi^",
        "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA^",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ^",
        "AAAAAAAAAAAAAAAAAAAAAAAAAA^",
        // the last char e is different from other chars, so it is ok.
        "minimize",
        // the example from McCreight's: A Space-Economical Suffix Tree Construction Algorithm
        "bbbbbababbbaabbbbbc^"
    };

    // test suffix-tree
    for (String text : samples) {
        System.out.println("****************************");
        Ukkonen stree = new Ukkonen();
        stree.buildSuffixTree(text);
        stree.printTree();
    }
}
}
****************************
The suffix tree for S = xbxb^ is:
|(-1,-1)
|-(4,4)
|-(1,1)
|--(4,4)
|--(2,4)
|-(0,1)
|--(4,4)
|--(2,4)
****************************
The suffix tree for S = mississippi^ is:
|(-1,-1)
|-(11,11)
|-(1,1)
|--(11,11)
|--(8,11)
|--(2,4)
|---(8,11)
|---(5,11)
|-(0,11)
|-(8,8)
|--(10,11)
|--(9,11)
|-(2,2)
|--(4,4)
|---(8,11)
|---(5,11)
|--(3,4)
|---(8,11)
|---(5,11)
****************************
The suffix tree for S = GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA^ is:
|(-1,-1)
|-(15,15)
|--(16,16)
|---(17,17)
|----(18,18)
|-----(35,35)
|------(36,36)
|-------(37,37)
|--------(38,38)
|---------(39,39)
|----------(40,40)
|-----------(41,41)
|------------(42,42)
|-------------(43,43)
|--------------(44,44)
|---------------(45,45)
|----------------(46,46)
|-----------------(47,47)
|------------------(48,48)
|-------------------(49,49)
|--------------------(50,50)
|---------------------(51,51)
|----------------------(52,53)
|----------------------(53,53)
|---------------------(53,53)
|--------------------(53,53)
|-------------------(53,53)
|------------------(53,53)
|-----------------(53,53)
|----------------(53,53)
|---------------(53,53)
|--------------(53,53)
|-------------(53,53)
|------------(53,53)
|-----------(53,53)
|----------(53,53)
|---------(53,53)
|--------(53,53)
|-------(53,53)
|------(53,53)
|-----(19,53)
|-----(53,53)
|----(19,53)
|----(53,53)
|---(19,53)
|---(53,53)
|--(19,19)
|---(27,27)
|----(32,53)
|----(28,29)
|-----(32,53)
|-----(30,53)
|---(20,20)
|----(25,53)
|----(21,53)
|--(53,53)
|-(12,12)
|--(15,15)
|---(16,53)
|---(26,53)
|--(13,13)
|---(22,53)
|---(14,53)
|-(0,0)
|--(22,22)
|---(32,53)
|---(23,23)
|----(29,29)
|-----(32,53)
|-----(30,53)
|----(24,53)
|--(12,12)
|---(15,15)
|----(16,53)
|----(26,53)
|---(13,13)
|----(22,53)
|----(14,53)
|--(1,1)
|---(12,53)
|---(2,2)
|----(12,53)
|----(3,3)
|-----(12,53)
|-----(4,4)
|------(12,53)
|------(5,5)
|-------(12,53)
|-------(6,6)
|--------(12,53)
|--------(7,7)
|---------(12,53)
|---------(8,8)
|----------(12,53)
|----------(9,9)
|-----------(12,53)
|-----------(10,10)
|------------(12,53)
|------------(11,53)
|-(53,53)
****************************
The suffix tree for S = ABCDEFGHIJKLMNOPQRSTUVWXYZ^ is:
|(-1,-1)
|-(0,26)
|-(1,26)
|-(2,26)
|-(3,26)
|-(4,26)
|-(5,26)
|-(6,26)
|-(7,26)
|-(8,26)
|-(9,26)
|-(10,26)
|-(11,26)
|-(12,26)
|-(13,26)
|-(14,26)
|-(15,26)
|-(16,26)
|-(17,26)
|-(18,26)
|-(19,26)
|-(20,26)
|-(21,26)
|-(22,26)
|-(23,26)
|-(24,26)
|-(25,26)
|-(26,26)
****************************
The suffix tree for S = AAAAAAAAAAAAAAAAAAAAAAAAAA^ is:
|(-1,-1)
|-(0,0)
|--(1,1)
|---(2,2)
|----(3,3)
|-----(4,4)
|------(5,5)
|-------(6,6)
|--------(7,7)
|---------(8,8)
|----------(9,9)
|-----------(10,10)
|------------(11,11)
|-------------(12,12)
|--------------(13,13)
|---------------(14,14)
|----------------(15,15)
|-----------------(16,16)
|------------------(17,17)
|-------------------(18,18)
|--------------------(19,19)
|---------------------(20,20)
|----------------------(21,21)
|-----------------------(22,22)
|------------------------(23,23)
|-------------------------(24,24)
|--------------------------(25,26)
|--------------------------(26,26)
|-------------------------(26,26)
|------------------------(26,26)
|-----------------------(26,26)
|----------------------(26,26)
|---------------------(26,26)
|--------------------(26,26)
|-------------------(26,26)
|------------------(26,26)
|-----------------(26,26)
|----------------(26,26)
|---------------(26,26)
|--------------(26,26)
|-------------(26,26)
|------------(26,26)
|-----------(26,26)
|----------(26,26)
|---------(26,26)
|--------(26,26)
|-------(26,26)
|------(26,26)
|-----(26,26)
|----(26,26)
|---(26,26)
|--(26,26)
|-(26,26)
****************************
The suffix tree for S = minimize is:
|(-1,-1)
|-(7,7)
|-(1,1)
|--(4,7)
|--(2,7)
|--(6,7)
|-(0,1)
|--(2,7)
|--(6,7)
|-(2,7)
|-(6,7)
****************************
The suffix tree for S = bbbbbababbbaabbbbbc^ is:
|(-1,-1)
|-(19,19)
|-(5,5)
|--(12,19)
|--(6,6)
|---(7,19)
|---(9,10)
|----(11,19)
|----(16,19)
|-(0,0)
|--(5,5)
|---(12,19)
|---(6,6)
|----(7,19)
|----(9,19)
|--(1,1)
|---(5,5)
|----(12,19)
|----(6,19)
|---(2,2)
|----(5,5)
|-----(12,19)
|-----(6,19)
|----(3,3)
|-----(5,19)
|-----(4,4)
|------(5,19)
|------(18,19)
|-----(18,19)
|----(18,19)
|---(18,19)
|--(18,19)
|-(18,19)