后缀数组的自底向上(bottom-up)遍历算法

后缀数组自底向上遍历等价于后缀树的自底向上遍历。由于后缀数组不是树形结构,在遍历时除了SA本身之外还需要额外的信息,这时Suffix Array就是一个增强的后缀数组(Enhanced Suffix Array)了。该算法使用后缀数组的一个增强信息---LCP表,并通过堆栈模拟自底向上的遍历。遍历的结果就是一棵虚拟的lcp-interval树,其中每一个结点对应后缀树的一个内部结点。有些应用中,遍历时需要知道每个结点的孩子信息,因此在下面的实现中提供了两个版本:bottomUpTraverseWithoutChildren和bottomUpTraverseWithChildren。

需要说明的是,树中每一个lcp-interval结点表示为:lcp-[i..j],其中lcp相当于后缀树的pathlen,i和j分别是以该lcp-interval结点为根结点表示的子树中的最小和最大后缀数组下标(下标从0开始)。例如mississippi#的一个lcp-interval结点是1-[1..4],表示后缀数组中下标1到下标4的这四个后缀是以该lcp-interval结点为根的子树的四个叶子结点。


实现:

import java.util.ArrayList;
import java.util.List;
import java.util.Stack;

/**
 * 
 * Bottom-Up Traversal of a suffix array (with LCP table)
 * (The suffix array is constructed with prefix doubling algorithm)
 * 
 *  
 * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) 
 * 
 * @author ljs
 * 2011-07-23
 *
 */
public class ESA_BottomUpTraversal {

	// Character constant U+00FF (decimal 255).
	// NOTE(review): its use is not visible in this excerpt; presumably the
	// largest character code the sorting phase supports -- confirm.
	public static final char MAX_CHAR = '\u00FF';
	
	// The input text; assigned once in the constructor.
	private String text;
	// NOTE(review): presumably the suffix array of text, filled by
	// computeSuffixArray() -- the assignment is not visible here; confirm.
	private int[] suffixarray;
	// NOTE(review): presumably the rank table (inverse of the suffix array);
	// the assignment is not visible here -- confirm.
	private int[] ranktable;
	// LCP table read by the bottom-up traversals (lcptable[i] is compared
	// with the lcp value of the interval on top of the stack).
	private int[] lcptable;
 
	
	/**
	 * Creates a traversal helper for the given text.
	 *
	 * The text is only stored here; no table is built in the constructor.
	 *
	 * @param text the text to index; must not be null
	 * @throws NullPointerException if text is null (fail fast instead of
	 *         failing later when the text length is first read)
	 */
	public ESA_BottomUpTraversal(String text){
		if(text == null){
			throw new NullPointerException("text");
		}
		this.text = text;
	}

	/**
	 * Plain holder pairing a suffix array with its rank table.
	 */
	class Suffix{
		/** The suffix array. */
		int[] sa;

		/**
		 * The rank table, indexed by suffix start position p (0-based).
		 * The p-th suffix of a text S is S[p..n] with n = S.length-1, and
		 * it is found in the suffix array at sa[rank[p]-1].
		 */
		int[] rank;

		/**
		 * Starts out false.
		 * NOTE(review): presumably set once all ranks are distinct; the
		 * code that writes it is not visible here -- confirm.
		 */
		boolean done;

		/**
		 * Stores both arrays by reference (no copy is made).
		 */
		public Suffix(int[] sa,int[] rank){
			this.rank = rank;
			this.sa = sa;
		}
	}
	

	//a prefix of suffix[isuffix] represented with digits
	class Tuple{
		int isuffix; //the p-th suffix
		int[] digits;
		public Tuple(int suffix,int[] digits){
			this.isuffix = suffix;
			this.digits = digits;			
		}
		public String toString(){
			StringBuffer sb = new StringBuffer();			
			sb.append(isuffix);
			sb.append("(");
			for(int i=0;i=0;j--){
			//C[A[j]] <= A.length 
			tB[--C[tA[j].digits[d]]]=tA[j];			
		}
	}
	
	//tA: input
	//tB: output for rank caculation
	private void radixSort(Tuple[] tA,Tuple[] tB,int max,int digitsLen){
		int len = tA.length;
		int digitsTotalLen = tA[0].digits.length;
			
		for(int d=digitsTotalLen-1,j=0;jrank[q]){
					sa[k++] = q;j++;
				}else{
					if(rank12[p+1]rank[q]){
					sa[k++] = q;j++;
				}else{
					if(rank[p+1]rank[q+1]){
						sa[k++] = q;j++;
					}else{
						if(rank12[p+2]0){
		   int q=sa[r-1];
		   //caculate LCP by definition
		   for(int i=0,j=q;i=1
		//ignore p == sa[0] because LCP=0 for suffix[p] where rank[p]=0				
		for(int p=1;p1){ //for h<=1, caculate LCP by definition (i.e. start with lcp=0)			
				//jump h-1 chars for suffix[p] and suffix[q]						
				lcp = h-1;			    
			}
			for(int i=p+lcp,j=q+lcp,k=0;i children = new ArrayList();
		/**
		 * Creates an lcp-interval node.
		 *
		 * @param lcp the lcp value of the interval
		 * @param lb  left boundary (index into the suffix array)
		 * @param rb  right boundary; the traversals pass -1 here and fill
		 *            in the real value when the interval is popped
		 */
		public LCPInterval(int lcp,int lb,int rb){
			this.rb = rb;
			this.lb = lb;
			this.lcp = lcp;
		}
		/**
		 * Renders the interval as "lcp-[lb..rb]".
		 */
		@Override
		public String toString(){
			final String pattern = "%d-[%d..%d]";
			return String.format(pattern, lcp, lb, rb);
		}
	}	
	
	/**
	 * Prints one lcp-interval to standard output. When the interval has
	 * children, they are appended as a comma-separated list.
	 *
	 * @param interval the interval to print
	 */
	private void reportLCPInterval(LCPInterval interval){
		//no children: print the bare interval and stop
		if(interval.children.size()<=0){
			System.out.format("%s%n", interval);
			return;
		}

		//join the children with "," (separator goes before every child but the first)
		StringBuilder joined = new StringBuilder();
		boolean first = true;
		for(LCPInterval child:interval.children){
			if(!first){
				joined.append(",");
			}
			joined.append(child.toString());
			first = false;
		}
		System.out.format("%s, children={%s}%n",
				interval,joined.toString());
	}
	
	//traverse the corresponding suffix tree with a bottom-up approach
	//each internal node is equivalent to an lcp-interval.
	public void bottomUpTraverseWithoutChildren(){
		int len = text.length();
		
		Stack stack = new Stack();
		int lb = -1;
		//push root's first child
		stack.push(new LCPInterval(0,0,-1));
		for(int i=1;istack.peek().lcp){
				stack.push(new LCPInterval(lcptable[i],lb,-1));
			} //if lcptable[i]==interval.lcp, no push because rb is updated when popped
		}
		
		while(!stack.isEmpty()){
			LCPInterval interval = stack.pop();
			//update the popped interval's rb
			interval.rb = len-1;
			
			reportLCPInterval(interval);				
		}		
	}
	
	public void bottomUpTraverseWithChildren(){
		int len = text.length();
		
		Stack stack = new Stack();
		int lb = -1;
		LCPInterval lastInterval = null;
		//push root's first child
		stack.push(new LCPInterval(0,0,-1));
		for(int i=1;inext>i, then top is next's child
					LCPInterval next = stack.peek();	
					next.children.add(lastInterval);
					lastInterval = null;
				}
			}
			if(lcptable[i]>stack.peek().lcp){
				LCPInterval curr=new LCPInterval(lcptable[i],lb,-1);
				if(lastInterval != null){ 
					//case 2: top>i>next, then top is i's child
					curr.children.add(lastInterval);
					lastInterval = null;
				}
				stack.push(curr);
			} //if lcptable[i]==interval.lcp, no push because rb is updated when popped
		}
		
		while(!stack.isEmpty()){
			lastInterval = stack.pop();
			//update the popped interval's rb
			lastInterval.rb = len-1;
			
			reportLCPInterval(lastInterval);	
			
			if(!stack.isEmpty()){//case 1: top > next, i.e. top is next's child
				LCPInterval next = stack.peek();	
				next.children.add(lastInterval);
			}
		}		
	}
	/**
	 * Runs the whole demo for the stored text: computes the suffix array
	 * and the LCP table, calls reportLCP(), then runs both bottom-up
	 * traversals, each preceded by a heading line on standard output.
	 */
	public void solve(){
		//the tables are computed before they are reported and traversed
		computeSuffixArray();
		computeLCPtable();
		reportLCP();

		System.out.printf("%nbottom-up traversal with no children list: %n");
		bottomUpTraverseWithoutChildren();

		System.out.printf("%nbottom-up traversal with children list: %n");
		bottomUpTraverseWithChildren();
	}
	
	/**
	 * Demo entry point: runs solve() on two sample texts and prints a
	 * separator line after each run.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String[] samples = { "mississippi#", "GACCCACCACC#" };
		for(int k=0;k<samples.length;k++){
			ESA_BottomUpTraversal esa = new ESA_BottomUpTraversal(samples[k]);
			esa.solve();
			System.out.format("%n********************************%n");
		}
	}
}



测试:

Text: mississippi#
suffix array:
 11 10 7 4 1 0 9 8 6 3 5 2
rank table:
 6 5 12 10 4 11 9 3 8 7 2 1
lcp table:
 0 0 1 1 4 0 0 1 0 2 1 3
bottom-up traversal with no children list:
4-[3..4]
1-[1..4]
1-[6..7]
2-[8..9]
3-[10..11]
1-[8..11]
0-[0..11]

bottom-up traversal with children list:
4-[3..4]
1-[1..4], children={4-[3..4]}
1-[6..7]
2-[8..9]
3-[10..11]
1-[8..11], children={2-[8..9],3-[10..11]}
0-[0..11], children={1-[1..4],1-[6..7],1-[8..11]}

********************************
Text: GACCCACCACC#
suffix array:
 11 8 5 1 10 7 4 9 6 3 2 0
rank table:
 12 4 11 10 7 3 9 6 2 8 5 1
lcp table:
 0 0 3 3 0 1 4 1 2 5 2 0
bottom-up traversal with no children list:
3-[1..3]
4-[5..6]
5-[8..9]
2-[7..10]
1-[4..10]
0-[0..11]

bottom-up traversal with children list:
3-[1..3]
4-[5..6]
5-[8..9]
2-[7..10], children={5-[8..9]}
1-[4..10], children={4-[5..6],2-[7..10]}
0-[0..11], children={3-[1..3],1-[4..10]}

********************************

参考:

Mohamed Ibrahim Abouelhoda, Stefan Kurtz, Enno Ohlebusch: Replacing suffix trees with enhanced suffix arrays. Journal of Discrete Algorithms (2004)


你可能感兴趣的:(Stringology,数据结构和算法,traversal,string,table,list,null,arrays)