根据后缀树LCA计算后缀数组及其LCP

本文继续“根据后缀树得出后缀数组”的讨论,进一步探讨从后缀树计算后缀数组的LCP。


后缀数组的LCP(Longest Common Prefix,最长公共前缀)问题等价于后缀树的最近公共祖先LCA(Lowest Common Ancestor)问题:两个后缀的LCP长度就是它们对应的两个叶子结点的LCA结点的路径长度(pathlen)。下面的方法将采用“根据后缀树得出后缀数组”一文中介绍的方法,即以词典顺序(lexicographic order)遍历后缀树,但是这里在得出后缀数组的过程中,需记录每一个结点(包含内部结点)跟上一个前驱叶子结点的LCA值(即该LCA结点的pathlen,下文简称LCA)。如果当前结点(currNode)是内部结点,需要分两种情况考虑来计算它的孩子结点的LCA值:1)它的第一个孩子结点(不管是内部结点还是叶子结点)的LCA等于currNode的LCA;2)它的其他孩子结点(不管是内部结点还是叶子结点)的LCA等于currNode.pathlen。显然currNode是叶子结点的情况不需要考虑,因为LCA是采用top-down(自顶向下)方式计算的,在考虑它的parent结点时已经得出了它的LCA。

实现(后缀树结点添加了lca属性):

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List; 
 
/**
 * 
 * Derive the suffix array and its LCP(Longest Common Prefix) from the suffix tree
 * (The suffix tree is built with ukk algorithm)
 * @author ljs
 * 2011-07-21
 *
 */
public class SuffixTree2EnhancedArray {
	/**
	 * A node of the suffix tree. The edge leading into the node is labelled by
	 * the text range {@code sb[start..end]} (both inclusive); the root has no
	 * incoming edge and uses {@code start == end == -1}.
	 */
	private class SuffixNode {
		/** Shared text buffer that the {@code start}/{@code end} indices refer to. */
		private StringBuilder sb;

		/**
		 * Child nodes. Typed as {@code List<SuffixNode>} so that callers can read
		 * elements without casts (a raw list would hand back {@code Object}).
		 */
		private List<SuffixNode> children = new LinkedList<SuffixNode>();

		/** Suffix link; {@code null} until it is assigned. */
		private SuffixNode link;
		/** Index in {@code sb} of the first character of the incoming edge label, or -1 for the root. */
		private int start;
		/** Index in {@code sb} of the last character of the incoming edge label, or -1 for the root. */
		private int end;
		/** Number of characters on the path from the root down to (and including) this node's edge. */
		private int pathlen;

		/** LCA value of this node with its predecessor node. */
		private int lca;

		/**
		 * Creates a non-root node.
		 *
		 * @param sb      shared text buffer
		 * @param start   first index of the edge label in {@code sb}
		 * @param end     last index (inclusive) of the edge label in {@code sb}
		 * @param pathlen path length from the root to this node
		 */
		public SuffixNode(StringBuilder sb,int start,int end,int pathlen){
			this.sb = sb;
			this.start = start;
			this.end = end;
			this.pathlen = pathlen;
		}

		/**
		 * Creates the root node: no edge label ({@code start == end == -1}) and
		 * a path length of 0.
		 *
		 * @param sb shared text buffer
		 */
		public SuffixNode(StringBuilder sb){
			this.sb = sb;
			this.start = -1;
			this.end = -1;
			this.pathlen = 0;
		}

		/** @return the length of the incoming edge label; 0 for the root */
		public int getLength(){
			if(start == -1) return 0;
			else return end - start + 1;
		}

		/** @return the incoming edge label, or the empty string for the root */
		public String getString(){
			if(start != -1){
				// substring's end index is exclusive, while this.end is inclusive
				return this.sb.substring(start,end+1);
			}else{
				return "";
			}
		}

		/** @return {@code true} if this node is the root (marked by {@code start == -1}) */
		public boolean isRoot(){
			return start == -1;
		}

		/** @return a debug string of the form {@code [start..end/pathlen]} */
		public String getCoordinate(){
			return "[" + start+".." + end + "/" + this.pathlen + "]";
		}

		/** @return the edge label followed by coordinate, suffix-link target and child count */
		@Override
		public String toString(){
			return getString() + "(" + getCoordinate()
				+ ",link:" + ((this.link==null)?"N/A":this.link.getCoordinate())
				+ ",children:" + children.size() +")";
		}
	}
	/**
	 * Mutable pair of node references handed to the scan routines, which write
	 * their resulting position into it; buildSuffixTree reads both fields back
	 * after each scan.
	 */
	private class State{
		/** The node v of the current position. */
		private SuffixNode v;
		/** The node u, i.e. parent(v). */
		private SuffixNode u;
	}
	
	private SuffixNode root;
	private StringBuilder sb = new StringBuilder();
	 
	
	//build a suffix-tree for a string of text
	/**
	 * Builds the suffix tree of {@code text} phase by phase: phase {@code i}
	 * appends {@code text.charAt(i)} to the shared buffer, extends every known
	 * leaf by one character (implicit extensions) and then performs explicit
	 * extensions until one of them ends without creating a leaf (rule #3).
	 *
	 * <p>An empty {@code text} is a no-op; in that case the root is not created.
	 *
	 * <p>NOTE(review): the method appends to the instance-wide buffer while the
	 * first leaf is always created at index 0, so it looks intended to be
	 * called once per instance - confirm before reusing an instance.
	 *
	 * @param text the string to index
	 * @throws Exception declared by the original signature and kept for
	 *         compatibility; no statement visible in this method throws a
	 *         checked exception
	 */
	public void  buildSuffixTree(String text) throws Exception{
		int m = text.length();

		if(m==0)
			return;

		if(root==null){
			root = new SuffixNode(sb);
			root.link = root; //link to itself
		}

		// Typed list: the implicit-extension loop below iterates it as SuffixNode,
		// which does not compile against a raw List.
		List<SuffixNode> leaves = new ArrayList<SuffixNode>();

		//add first node
		sb.append(text.charAt(0));
		SuffixNode node = new SuffixNode(sb,0,0,1);
		leaves.add(node);
		root.children.add(node);
		int j_star = 0; //j_{i-1}: last extension index that created a leaf

		SuffixNode u = root;
		SuffixNode v = root;
		for(int i=1;i<=m-1;i++){
			//do phase i
			sb.append(text.charAt(i));

			//step 1: do implicit extensions (every existing leaf grows by one character)
			for(SuffixNode leafnode:leaves){
				leafnode.end++;
				leafnode.pathlen++;
			}

			//step 2: do explicit extensions until rule #3 is applied
			State state = new State();

			//for the first explicit extension, we reuse the last phase's u and do slowscan
			//also note: suffix link doesn't span two phases.
			int j=j_star+1;
			SuffixNode s = u;
			int k = s.pathlen + j;
			state.u = s;
			state.v = s;
			SuffixNode newleaf = slowscan(state,s,k);
			if(newleaf == null){
				//if rule #3 is applied, then we can terminate this phase
				j_star = j - 1;
				//Note: no need to update state.v because it is not going to be used
				//at the next phase
				u = state.u;
				continue;
			}else{

				j_star = j;
				leaves.add(newleaf);

				u = state.u;
				v = state.v;
			}
			j++;

			//for other explicit extensions, we start with fast scan.
			for(;j<=i;j++){
				s = u.link;

				//number of characters between u and v that fast scan has to skip
				int uvLen=v.pathlen - u.pathlen;
				if(u.isRoot() && !v.isRoot()){
					uvLen--;
				}
				//starting with index k of the text
				k = s.pathlen + j;


				//init state
				state.u = s;
				state.v = s; //if uvLen = 0

				//execute fast scan
				newleaf = fastscan(state,s,uvLen,k);
				//establish the suffix link with v
				v.link = state.v;

				if(newleaf == null){
					//if rule #3 is applied, then we can terminate this phase
					j_star = j - 1;
					u = state.u;
					break;
				}else{

					j_star = j;
					leaves.add(newleaf);

					u = state.u;
					v = state.v;
				}
			}
		}
	}
	//slow scan from currNode until state.v is found
	//return the new leaf if a new one is created right after v;
	//return null otherwise (i.e. when rule #3 is applied)
	// NOTE(review): this listing is damaged. Text that originally stood between a
	// '<' and a later '>' appears to have been lost, so the loop header below is cut
	// off, several branches are missing, and no declaration of fastscan (which is
	// called from buildSuffixTree and from the tail of this span) is present.
	// The span starts as slowscan; from the recursive fastscan(...) call near the
	// end, the closing part presumably belongs to fastscan. Restore from the
	// original source before compiling.
	//
	// Parameters as used by the visible code:
	//   state    - receives the resulting u/v nodes
	//   currNode - node whose children are examined
	//   k        - index into sb where the part still to be matched begins
	private SuffixNode slowscan(State state,SuffixNode currNode,int k){
		SuffixNode newleaf = null;
		
		boolean done = false;		
		// number of characters from index k to the end of the buffer
		int keyLen = sb.length() - k;
		// NOTE(review): loop header truncated here (see note above)
		for(int i=0;i      / | \
					//   e    f   insert "c"     c  e  f
					int pathlen = sb.length() - k + currNode.pathlen;
					SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);
					currNode.children.add(i,node);		
					//state.u = currNode; //currNode is already registered as state.u, so commented out
					state.v = currNode;
					newleaf = node;
					done = true;
					break;					
				}else{ //key.charAt(0)>child.key.charAt(0)
					//don't forget to add the largest new key after iterating all children
					continue;
				}
			}else{//current child's key partially matches with the new key	
				if(delta==len){
					if(keyLen==childKeyLen){						
						//e.g. child="ab"
						//	   ab                    ab
						//    /  \    =========>    /  \
						//   e    f   insert "ab"  e    f
						//terminate this phase  (implicit tree with rule #3)		
						state.u = child;
						state.v = currNode;
					}else if(keyLen>childKeyLen){ 
						//TODO: still need an example to test this condition
						//e.g. child="ab"
						//	   ab                      ab
						//    /  \    ==========>     / | \ 							
						//   e    f   insert "abc"   c e  f		
						//recursion
						state.u = child;
						state.v = child;
						k += childKeyLen;
						//state.k = k;
						newleaf = slowscan(state,child,k);
					}
					else{ //keyLen     /  \ 
						//   e     f     insert "ab"   e   f	   
						//					          
						//terminate this phase  (implicit tree with rule #3)
						//state.u = currNode;
						state.v = currNode;
					}
				}else{//0     / \
					//   e    f   insert "abd"    c   d 
					//                           /  \
					//                          e    f					
					//insert the new node: ab 
					int nodepathlen = child.pathlen 
							- (child.getLength()-delta);
					SuffixNode node = new SuffixNode(sb,
							child.start,child.start + delta - 1,nodepathlen); 
					node.children = new LinkedList();
					
					int leafpathlen = (sb.length() - (k + delta)) + nodepathlen;
					SuffixNode leaf = new SuffixNode(sb,
							k+delta,sb.length()-1,leafpathlen);
					
					//update child node: c
					child.start += delta;
					// NOTE(review): text truncated on the next line; the rest of slowscan
					// and the beginning of fastscan are missing from the listing
					if(sb.charAt(k+delta)     / \
					//   e    f   suffix part: "abd"   c   d 
					//                                /  \
					//                               e    f				
					
					//insert the new node: ab; child is now c 
					int nodepathlen = child.pathlen 
							- (child.getLength()-uvLen);
					SuffixNode node = new SuffixNode(sb,
							child.start,child.start + uvLen - 1,nodepathlen); 
					node.children = new LinkedList();
					
					int leafpathlen = (sb.length() - (k + uvLen)) + nodepathlen;
					SuffixNode leaf = new SuffixNode(sb,
							k+uvLen,sb.length()-1,leafpathlen);
					
					//update child node: c
					child.start += uvLen;
					// NOTE(review): text truncated on the next line
					if(sb.charAt(k+uvLen)len
					//e.g. child="abc", uvLen = 4
					//	   abc                          
					//    /  \    ================>      
					//   e    f   suffix part: "abcde"   
					//                                
					//                  
					//jump to next node
					uvLen -= len;
					state.u = child;
					//state.v = child;
					k += len;
					//state.k = k;
					newleaf = fastscan(state,child,uvLen,k);
				}
				done = true;
				break;
			}
		}		
		if(!done){			
			//TODO: still need an example to test this condition
			//add a leaf under the currNode
			int pathlen = sb.length() - k + currNode.pathlen;
			SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);
			currNode.children.add(node);
			//state.u = currNode; //currNode is already registered as state.u, so commented out
			state.v = currNode;	
			newleaf = node;
		}
		
		return newleaf;
	}
	
	
	/**
	 * Fills {@code S} with the suffix array and {@code LCP} with the matching
	 * LCP values by walking the tree from the root, starting at output index 0.
	 *
	 * <p>If no tree has been built yet (buildSuffixTree was never called, or was
	 * called with an empty string) the root does not exist; the arrays are then
	 * left untouched instead of failing with a NullPointerException.
	 *
	 * @param S   output array for suffix start positions; one slot is written
	 *            per leaf, so it presumably needs one entry per suffix - confirm
	 *            against the caller
	 * @param LCP output array for LCP values, written at the same indices as {@code S}
	 */
	public void toEnhancedSuffixArray(int[] S,int[] LCP){
		if(root == null){
			// nothing was indexed, so there is nothing to emit
			return;
		}
		toEnhancedSuffixArray(root,S,LCP,0);
	}
	/**
	 * Recursive worker for the public toEnhancedSuffixArray(S, LCP): handles the
	 * children of {@code currNode}, writing one entry of {@code S} and
	 * {@code LCP} per leaf child, starting at output index {@code j}.
	 *
	 * NOTE(review): the listing breaks off inside the second loop header, so the
	 * handling of the remaining children and the return statement are not
	 * visible. Judging by the recursive call below, the method presumably
	 * returns the next free output index - confirm against the original source.
	 *
	 * @param currNode node whose children are processed
	 * @param S        output array for suffix start positions
	 * @param LCP      output array for LCP values
	 * @param j        next index to write in S and LCP
	 */
	public int toEnhancedSuffixArray(SuffixNode currNode,int[] S,int[] LCP,int j){
		
		//handle the first child separately: it inherits the parent's lca
		if(currNode.children.size()>0){			
			SuffixNode child = currNode.children.get(0);
			child.lca = currNode.lca; //first child's lca is equal to parent's lca
			
			if(child.children.size()==0){
				//leaf: its suffix starts at (text length - path length)
				S[j] = sb.length() - child.pathlen;	
				LCP[j] = child.lca; //collect LCP data
				j++;
			}else{
				//internal node: recurse and continue from the index it hands back
				j=toEnhancedSuffixArray(child,S,LCP,j);
			}
		} 
		
		//remaining children, starting at index 1 (listing truncated on the next line)
		for(int i=1;i


测试:

****************************
Suffix array for "mississippi$" is:
11 10 7 4 1 0 9 8 6 3 5 2
LCP:
0 0 1 1 4 0 0 1 0 2 1 3
****************************
Suffix array for "GACCCACCACC$" is:
11 8 5 1 10 7 4 9 6 3 2 0
LCP:
0 0 3 3 0 1 4 1 2 5 2 0


你可能感兴趣的:(数据结构和算法,Stringology)