本文继续“根据后缀树得出后缀数组”的讨论,进一步探讨从后缀树计算后缀数组的LCP。
后缀数组的LCP(Longest Common Prefix)问题等价于后缀树的最近公共祖先LCA(Lowest Common Ancestor)问题:两个后缀的LCP长度就是它们对应的两个叶子结点的LCA的路径长度(pathlen)。下面的方法将采用“根据后缀树得出后缀数组”一文中介绍的方法,即以词典顺序(lexicographic order)遍历后缀树,但是这里在得出后缀数组的过程中,需记录每一个结点(包含内部结点)跟上一个前驱叶子结点的LCA(下文的“LCA值”均指该LCA结点的路径长度)。如果当前结点(currNode)是内部结点,需要分两种情况考虑来计算它的孩子结点的LCA值:1)它的第一个孩子结点(不管是内部结点还是叶子结点)的LCA值等于currNode的LCA值;2)它的其他孩子结点(不管是内部结点还是叶子结点)的LCA值等于currNode.pathlen。显然currNode是叶子结点的情况不需要考虑,因为LCA值是采用top-down(自顶向下)方式计算的,在考虑它的parent结点时已经得出了它的LCA值。
实现(后缀树结点添加了lca属性):
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
*
* Derive the suffix array and its LCP(Longest Common Prefix) from the suffix tree
* (The suffix tree is built with ukk algorithm)
* @author ljs
* 2011-07-21
*
*/
public class SuffixTree2EnhancedArray {
/**
 * One node of the suffix tree. The label of the edge entering the node is the
 * slice {@code sb[start..end]} (both ends inclusive) of the shared text buffer.
 * The root is marked by {@code start == -1} and has an empty label.
 */
private class SuffixNode {
    //text buffer that the start/end offsets index into
    private StringBuilder sb;
    //child nodes; the element type is declared so that reads such as
    //children.get(0) yield a SuffixNode without a cast
    private List<SuffixNode> children = new LinkedList<SuffixNode>();
    //suffix link; null until the tree builder assigns one
    private SuffixNode link;
    //first index of the edge label in sb (-1 for the root)
    private int start;
    //last index (inclusive) of the edge label in sb (-1 for the root)
    private int end;
    //number of characters on the path from the root to the end of this label
    private int pathlen;
    //LCA with its predecessor node
    private int lca;

    /**
     * Creates a non-root node.
     *
     * @param sb      shared text buffer
     * @param start   first index of the edge label
     * @param end     last index (inclusive) of the edge label
     * @param pathlen length of the root-to-node path, in characters
     */
    public SuffixNode(StringBuilder sb,int start,int end,int pathlen){
        this.sb = sb;
        this.start = start;
        this.end = end;
        this.pathlen = pathlen;
    }

    /**
     * Creates the root node: empty label, path length 0.
     *
     * @param sb shared text buffer
     */
    public SuffixNode(StringBuilder sb){
        this.sb = sb;
        this.start = -1;
        this.end = -1;
        this.pathlen = 0;
    }

    /** @return the length of the edge label; 0 for the root */
    public int getLength(){
        if(start == -1){
            return 0;
        }
        return end - start + 1;
    }

    /** @return the edge label as a string; "" for the root */
    public String getString(){
        if(start != -1){
            //substring's upper bound is exclusive, hence end+1
            return this.sb.substring(start,end+1);
        }else{
            return "";
        }
    }

    /** @return true when this node is the root (start == -1) */
    public boolean isRoot(){
        return start == -1;
    }

    /** @return the label coordinates formatted as "[start..end/pathlen]" */
    public String getCoordinate(){
        return "[" + start+".." + end + "/" + this.pathlen + "]";
    }

    /** @return label, coordinates, suffix-link coordinates (or N/A) and child count */
    @Override
    public String toString(){
        return getString() + "(" + getCoordinate()
            + ",link:" + ((this.link==null)?"N/A":this.link.getCoordinate())
            + ",children:" + children.size() +")";
    }
}
/**
 * Mutable holder handed to the scan routines; the tree builder reads
 * {@code u} and {@code v} back from it after each scan.
 */
private class State {
    //node v as left behind by the most recent scan
    private SuffixNode v;
    //parent(v)
    private SuffixNode u;
}
//root of the suffix tree; stays null until buildSuffixTree is called with a non-empty text
private SuffixNode root;
//text indexed so far; the start/end offsets of every node refer to this buffer
private StringBuilder sb = new StringBuilder();
/**
 * Builds a suffix tree for {@code text} (ukk algorithm), one phase per character.
 * Each phase first extends every existing leaf by the new character (implicit
 * extensions) and then performs explicit extensions until rule #3 applies or
 * all suffixes of the current prefix have been handled.
 *
 * An empty text leaves the tree untouched. A null text fails with a
 * NullPointerException from {@code text.length()}.
 *
 * NOTE(review): the text is appended to the shared buffer and an existing root
 * is reused, while the indices used here restart at 0; calling this more than
 * once on the same instance is presumably unsupported - confirm.
 *
 * @param text the string to index
 * @throws Exception declared by the original signature; nothing in this body throws a checked exception
 */
public void buildSuffixTree(String text) throws Exception{
    int m = text.length();
    if(m==0){
        return;
    }
    if(root==null){
        root = new SuffixNode(sb);
        root.link = root; //link to itself
    }
    //the element type is required: the list is iterated as SuffixNode below
    List<SuffixNode> leaves = new ArrayList<SuffixNode>();
    //add first node
    sb.append(text.charAt(0));
    SuffixNode node = new SuffixNode(sb,0,0,1);
    leaves.add(node);
    root.children.add(node);
    int j_star = 0; //j_{i-1}
    SuffixNode u = root;
    SuffixNode v = root;
    for(int i=1;i<=m-1;i++){
        //do phase i
        sb.append(text.charAt(i));
        //step 1: do implicit extensions
        for(SuffixNode leafnode:leaves){
            leafnode.end++;
            leafnode.pathlen++;
        }
        //step 2: do explicit extensions until rule #3 is applied
        State state = new State();
        //for the first explicit extension, we reuse the last phase's u and do slowscan
        //also note: suffix link doesn't span two phases.
        int j=j_star+1;
        SuffixNode s = u;
        int k = s.pathlen + j;
        state.u = s;
        state.v = s;
        SuffixNode newleaf = slowscan(state,s,k);
        if(newleaf == null){
            //if rule #3 is applied, then we can terminate this phase
            j_star = j - 1;
            //Note: no need to update state.v because it is not going to be used
            //at the next phase
            u = state.u;
            continue;
        }else{
            j_star = j;
            leaves.add(newleaf);
            u = state.u;
            v = state.v;
        }
        j++;
        //for other explicit extensions, we start with fast scan.
        for(;j<=i;j++){
            s = u.link;
            int uvLen=v.pathlen - u.pathlen;
            //following the root's self link drops no character, so the leading
            //character of the u->v label is skipped here instead
            if(u.isRoot() && !v.isRoot()){
                uvLen--;
            }
            //starting with index k of the text
            k = s.pathlen + j;
            //init state
            state.u = s;
            state.v = s; //if uvLen = 0
            //execute fast scan
            newleaf = fastscan(state,s,uvLen,k);
            //establish the suffix link with v
            v.link = state.v;
            if(newleaf == null){
                //if rule #3 is applied, then we can terminate this phase
                j_star = j - 1;
                u = state.u;
                break;
            }else{
                j_star = j;
                leaves.add(newleaf);
                u = state.u;
                v = state.v;
            }
        }
    }
}
//slow scan from currNode until state.v is found
//return the new leaf if a new one is created right after v;
//return null otherwise (i.e. when rule #3 is applied)
private SuffixNode slowscan(State state,SuffixNode currNode,int k){
SuffixNode newleaf = null;
boolean done = false;
int keyLen = sb.length() - k;
for(int i=0;i / | \
// e f insert "c" c e f
int pathlen = sb.length() - k + currNode.pathlen;
SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);
currNode.children.add(i,node);
//state.u = currNode; //currNode is already registered as state.u, so commented out
state.v = currNode;
newleaf = node;
done = true;
break;
}else{ //key.charAt(0)>child.key.charAt(0)
//don't forget to add the largest new key after iterating all children
continue;
}
}else{//current child's key partially matches with the new key
if(delta==len){
if(keyLen==childKeyLen){
//e.g. child="ab"
// ab ab
// / \ =========> / \
// e f insert "ab" e f
//terminate this phase (implicit tree with rule #3)
state.u = child;
state.v = currNode;
}else if(keyLen>childKeyLen){
//TODO: still need an example to test this condition
//e.g. child="ab"
// ab ab
// / \ ==========> / | \
// e f insert "abc" c e f
//recursion
state.u = child;
state.v = child;
k += childKeyLen;
//state.k = k;
newleaf = slowscan(state,child,k);
}
else{ //keyLen / \
// e f insert "ab" e f
//
//terminate this phase (implicit tree with rule #3)
//state.u = currNode;
state.v = currNode;
}
}else{//0 / \
// e f insert "abd" c d
// / \
// e f
//insert the new node: ab
int nodepathlen = child.pathlen
- (child.getLength()-delta);
SuffixNode node = new SuffixNode(sb,
child.start,child.start + delta - 1,nodepathlen);
node.children = new LinkedList();
int leafpathlen = (sb.length() - (k + delta)) + nodepathlen;
SuffixNode leaf = new SuffixNode(sb,
k+delta,sb.length()-1,leafpathlen);
//update child node: c
child.start += delta;
if(sb.charAt(k+delta) / \
// e f suffix part: "abd" c d
// / \
// e f
//insert the new node: ab; child is now c
int nodepathlen = child.pathlen
- (child.getLength()-uvLen);
SuffixNode node = new SuffixNode(sb,
child.start,child.start + uvLen - 1,nodepathlen);
node.children = new LinkedList();
int leafpathlen = (sb.length() - (k + uvLen)) + nodepathlen;
SuffixNode leaf = new SuffixNode(sb,
k+uvLen,sb.length()-1,leafpathlen);
//update child node: c
child.start += uvLen;
if(sb.charAt(k+uvLen)len
//e.g. child="abc", uvLen = 4
// abc
// / \ ================>
// e f suffix part: "abcde"
//
//
//jump to next node
uvLen -= len;
state.u = child;
//state.v = child;
k += len;
//state.k = k;
newleaf = fastscan(state,child,uvLen,k);
}
done = true;
break;
}
}
if(!done){
//TODO: still need an example to test this condition
//add a leaf under the currNode
int pathlen = sb.length() - k + currNode.pathlen;
SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);
currNode.children.add(node);
//state.u = currNode; //currNode is already registered as state.u, so commented out
state.v = currNode;
newleaf = node;
}
return newleaf;
}
/**
 * Fills {@code S} with the suffix array and {@code LCP} with the matching LCP
 * values by walking the whole tree from the root.
 *
 * Does nothing when no tree exists yet (root is only created once
 * buildSuffixTree has been given a non-empty text).
 *
 * @param S   receives the suffix start positions, one entry per leaf
 * @param LCP receives the value recorded for each leaf, parallel to S
 */
public void toEnhancedSuffixArray(int[] S,int[] LCP){
    if(root==null){
        //nothing has been indexed, so there is nothing to report
        return;
    }
    toEnhancedSuffixArray(root,S,LCP,0);
}
public int toEnhancedSuffixArray(SuffixNode currNode,int[] S,int[] LCP,int j){
//if the first child
if(currNode.children.size()>0){
SuffixNode child = currNode.children.get(0);
child.lca = currNode.lca; //first child's lca is equal to parent's lca
if(child.children.size()==0){
S[j] = sb.length() - child.pathlen;
LCP[j] = child.lca; //collect LCP data
j++;
}else{
j=toEnhancedSuffixArray(child,S,LCP,j);
}
}
for(int i=1;i
测试:
****************************
Suffix array for "mississippi$" is:
11 10 7 4 1 0 9 8 6 3 5 2
LCP:
0 0 1 1 4 0 0 1 0 2 1 3
****************************
Suffix array for "GACCCACCACC$" is:
11 8 5 1 10 7 4 9 6 3 2 0
LCP:
0 0 3 3 0 1 4 1 2 5 2 0