后缀数组自底向上遍历等价于后缀树的自底向上遍历。由于后缀数组不是树型结构,在遍历时除了SA本身之外还需要额外的信息,这时Suffix Array就是一个增强的后缀数组(Enhanced Suffix Array)了。该算法使用后缀数组的一个增强信息---LCP表,并通过堆栈模拟自底向上的遍历。遍历的结果就是一颗虚拟的lcp-interval树,其中每一个结点对应后缀树的一个内部结点。有些应用中,遍历时需要知道每个结点的孩子信息,因此在下面的实现中提供了两个版本bottomUpTraverseWithoutChildren和bottomUpTraverseWithChildren。
需要说明的是,树中每一个lcp-interval结点表示为:lcp-[i..j],其中lcp相当于后缀树的pathlen,i和j分别是以该lcp-interval结点为根结点表示的子树中的最小和最大后缀数组下标,例如mississippi$的一个lcp-interval结点是1-[1..4],表示后缀数组中第1个后缀到第4个后缀这总共四个后缀作为该lcp-interval结点为根的子树的四个叶子结点。
实现:
import java.util.ArrayList; import java.util.List; import java.util.Stack; /** * * Bottom-Up Traversal of a suffix array (with LCP table) * (The suffix array is constructed with prefix doubling algorithm) * * * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/) * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) * * @author ljs * 2011-07-23 * */ public class ESA_BottomUpTraversal { public static final char MAX_CHAR = '\u00FF'; private String text; private int[] suffixarray; private int[] ranktable; private int[] lcptable; public ESA_BottomUpTraversal(String text){ this.text = text; } class Suffix{ int[] sa; //Note: the p-th suffix in sa: SA[rank[p]-1]]; //p is the index of the array "rank", start with 0; //a text S's p-th suffix is S[p..n], n=S.length-1. int[] rank; boolean done; public Suffix(int[] sa,int[] rank){ this.sa = sa; this.rank = rank; } } //a prefix of suffix[isuffix] represented with digits class Tuple{ int isuffix; //the p-th suffix int[] digits; public Tuple(int suffix,int[] digits){ this.isuffix = suffix; this.digits = digits; } public String toString(){ StringBuffer sb = new StringBuffer(); sb.append(isuffix); sb.append("("); for(int i=0;i<digits.length;i++){ sb.append(digits[i]); if(i<digits.length-1) sb.append("-"); } sb.append(")"); return sb.toString(); } } //d: the digit to do countingsort //max: A value's range is 0...max private void countingSort(int d,Tuple[] tA,Tuple[] tB,int max){ //init the counter array int[] C = new int[max+1]; for(int i=0;i<=max;i++){ C[i] = 0; } //stat the count for(int j=0;j<tA.length;j++){ C[tA[j].digits[d]]++; } //process the counter array C for(int i=1;i<=max;i++){ C[i]+=C[i-1]; } //distribute the values for(int j=tA.length-1;j>=0;j--){ //C[A[j]] <= A.length tB[--C[tA[j].digits[d]]]=tA[j]; } } //tA: input //tB: output for rank caculation private void radixSort(Tuple[] tA,Tuple[] tB,int max,int digitsLen){ int len = tA.length; int digitsTotalLen = tA[0].digits.length; for(int d=digitsTotalLen-1,j=0;j<digitsLen;d--,j++){ this.countingSort(d, tA, tB, max); //assign tB to tA if(j<digitsLen-1){ for(int i=0;i<len;i++){ tA[i] = tB[i]; } } } } //max is the maximum value in any digit of TA.digits[], used for counting sort //tA: input //tB: the place holder, reused between iterations private Suffix rank(Tuple[] tA,Tuple[] tB,int max,int digitsLen){ int len = tA.length; radixSort(tA,tB,max,digitsLen); int digitsTotalLen = tA[0].digits.length; //caculate rank and sa int[] sa = new int[len]; sa[0] = tB[0].isuffix; int[] rank = new int[len+2]; //add 2 for sentinel rank[len]=1;rank[len+1] = 1; int r = 1; //rank starts with 1 rank[tB[0].isuffix] = r; for(int i=1;i<len;i++){ sa[i] = tB[i].isuffix; boolean equalLast = true; for(int j=digitsTotalLen-digitsLen;j<digitsTotalLen;j++){ if(tB[i].digits[j]!=tB[i-1].digits[j]){ equalLast = false; break; } } if(!equalLast){ r++; } rank[tB[i].isuffix] = r; } Suffix suffix = new Suffix(sa,rank); //judge if we are done if(r==len){ suffix.done = true; }else{ suffix.done = false; } return suffix; } private int[] orderSuffixes(Tuple[] tA,Tuple[] tB,int max,int digitsLen){ int len = tA.length; radixSort(tA,tB,max,digitsLen); //caculate rank and sa int[] sa = new int[len]; for(int i=0;i<len;i++){ sa[i] = tB[i].isuffix; } return sa; } //rank needs sentinel: len+2 private Suffix reduce(int[] rank,int max){ int len = rank.length - 2; int n1 = (len+1)/3; int n2 = len/3; Tuple[] tA = new Tuple[n1+n2]; Tuple[] tB = new Tuple[n1+n2]; for(int i=0,j=1;i<n1;i++,j+=3){ int r1 = rank[j]; int r2 = rank[j+1]; int r3 = rank[j+2]; tA[i] = new Tuple(i,new int[]{r1,r2,r3}); } for(int i=n1,j=2;i<n1+n2;i++,j+=3){ int r1 = rank[j]; int r2 = rank[j+1]; int r3 = rank[j+2]; tA[i] = new Tuple(i,new int[]{r1,r2,r3}); } return rank(tA,tB,max,3); } private int[] skew(int[] rank,int max){ int len = rank.length - 2; //step 1: caculate sa12 Suffix suffixT12 = reduce(rank,max); int[] sa12 = null; if(!suffixT12.done){ int[] rankT12 = suffixT12.rank; int maxT12 = rankT12[suffixT12.sa[suffixT12.sa.length-1]]; sa12 = skew(rankT12,maxT12); // debug for string: GACCCACCACC# //s12 = new Suffix(); //s12.rank = new int[]{3,6,5,4,7,2,1,1,1}; //s12.sa = new int[]{7,6,5,0,3,2,1,4}; //s12.done =true; }else{ sa12 = suffixT12.sa; } //index conversion for sa12 int n1 = (len+1)/3; for(int j=0;j<sa12.length;j++){ if(sa12[j]<n1){ sa12[j] = 1 + 3*sa12[j]; }else{ sa12[j] = 2 + 3*(sa12[j]-n1); } } //recaculate rank for sa12 int[] rank12 = new int[len+2]; rank12[len] = 1;rank12[len+1] = 1; for(int k=0;k<sa12.length;k++){ rank12[sa12[k]] = k+1; } //step 2: caculate sa0 int n0=(len+2)/3; Tuple[] tA = new Tuple[n0]; Tuple[] tB = new Tuple[n0]; for(int i=0,j=0;i<n0;i++,j+=3){ int r1 = rank[j]; int r2 = rank12[j+1]; tA[i] = new Tuple(i,new int[]{r1,r2}); } int max12 = rank12[sa12[sa12.length-1]]; int[] sa0 = orderSuffixes(tA,tB,max<max12?max12:max,2); //index conversion for sa0 for(int j=0;j<n0;j++){ sa0[j] = 3*sa0[j]; } //step 3: merge sa12 and sa0 int[] sa = new int[len]; int i=0,j=0; int k=0; while(i<sa12.length && j<sa0.length){ int p = sa12[i]; int q = sa0[j]; if(p%3==1){ //case 1 if(rank[p]<rank[q]){ sa[k++] = p;i++; }else if(rank[p]>rank[q]){ sa[k++] = q;j++; }else{ if(rank12[p+1]<rank12[q+1]){ sa[k++] = p;i++; }else{ sa[k++] = q;j++; } } }else{ //case 2 if(rank[p]<rank[q]){ sa[k++] = p;i++; }else if(rank[p]>rank[q]){ sa[k++] = q;j++; }else{ if(rank[p+1]<rank[q+1]){ sa[k++] = p;i++; }else if(rank[p+1]>rank[q+1]){ sa[k++] = q;j++; }else{ if(rank12[p+2]<rank12[q+2]){ sa[k++] = p;i++; }else{ sa[k++] = q;j++; } } } } } for(int m=i;m<sa12.length;m++){ sa[k++] = sa12[m]; } for(int m=j;m<sa0.length;m++){ sa[k++] = sa0[m]; } return sa; } //Precondition: the last char in text must be less than other chars. public void computeSuffixArray(){ if(text == null)return; int len = text.length(); if(len == 0) return ; char base = text.charAt(len-1); //the smallest char Tuple[] tA = new Tuple[len]; Tuple[] tB = new Tuple[len]; //placeholder for(int i=0;i<len;i++){ tA[i] = new Tuple(i,new int[]{0,text.charAt(i)-base}); } Suffix suffix = rank(tA,tB,MAX_CHAR-base,1); int max = suffix.rank[suffix.sa[len-1]]; int[] sa = skew(suffix.rank,max); //caculate rank for result suffix array int[] r = new int[len]; for(int k=0;k<sa.length;k++){ r[sa[k]] = k+1; } this.suffixarray = sa; this.ranktable = r; } //rank[p]'s index starts with 1 (not 0) public void computeLCPtable(){ if(text == null)return; int len = text.length(); if(len == 0) return ; int[] sa = this.suffixarray; int[] rank = this.ranktable; int[] lcpz = new int[len]; //base case: p=0 //caculate LCP of suffix[0] int lcp = 0; int r = rank[0]-1; if(r>0){ int q=sa[r-1]; //caculate LCP by definition for(int i=0,j=q;i<len && j<len;i++,j++){ if(text.charAt(i) != text.charAt(j)){ lcp=i; break; } } } lcpz[0] = lcp; //other cases: p>=1 //ignore p == sa[0] because LCP=0 for suffix[p] where rank[p]=0 for(int p=1;p<len && p != sa[0];p++){ int h = lcpz[p-1]; int q=sa[rank[p]-2]; lcp = 0; if(h>1){ //for h<=1, caculate LCP by definition (i.e. start with lcp=0) //jump h-1 chars for suffix[p] and suffix[q] lcp = h-1; } for(int i=p+lcp,j=q+lcp,k=0;i<len && j<len;i++,j++,k++){ if(text.charAt(i) != text.charAt(j)){ lcp+=k; break; } } lcpz[p] = lcp; } //caculate LCP int[] LCP = new int[len]; for(int i=0;i<len;i++){ LCP[i] = lcpz[sa[i]]; } this.lcptable = LCP; } private void reportLCP(){ System.out.format("Text: %s%n",text); int len = this.text.length(); System.out.println("suffix array:"); for(int i=0;i<len;i++){ System.out.format(" %s", this.suffixarray[i]); } System.out.println(); System.out.println("rank table:"); for(int i=0;i<len;i++){ System.out.format(" %s", this.ranktable[i]); } System.out.println(); System.out.println("lcp table:"); for(int i=0;i<len;i++){ System.out.format(" %s", this.lcptable[i]); } } class LCPInterval{ int lcp; //the lcp value of the lcp-interval int lb; //the left boundary suffix index int rb; //the right boundary suffix index List<LCPInterval> children = new ArrayList<LCPInterval>(); public LCPInterval(int lcp,int lb,int rb){ this.lcp = lcp; this.lb = lb; this.rb = rb; } public String toString(){ return String.format("%d-[%d..%d]", this.lcp,this.lb,this.rb); } } private void reportLCPInterval(LCPInterval interval){ if(interval.children.size()>0){ StringBuilder sb = new StringBuilder(); for(LCPInterval child:interval.children){ sb.append(child.toString()); sb.append(","); } sb.deleteCharAt(sb.length()-1); System.out.format("%s, children={%s}%n", interval,sb.toString()); }else{ System.out.format("%s%n", interval); } } //traverse the corresponding suffix tree with a bottom-up approach //each internal node is equivalent to an lcp-interval. public void bottomUpTraverseWithoutChildren(){ int len = text.length(); Stack<LCPInterval> stack = new Stack<LCPInterval>(); int lb = -1; //push root's first child stack.push(new LCPInterval(0,0,-1)); for(int i=1;i<len;i++){ //save lb lb = i - 1; //pop all the deeper suffixes while(lcptable[i]<stack.peek().lcp){ LCPInterval interval = stack.pop(); //update the popped interval's rb interval.rb = i-1; reportLCPInterval(interval); //update lb lb = interval.lb; } if(lcptable[i]>stack.peek().lcp){ stack.push(new LCPInterval(lcptable[i],lb,-1)); } //if lcptable[i]==interval.lcp, no push because rb is updated when popped } while(!stack.isEmpty()){ LCPInterval interval = stack.pop(); //update the popped interval's rb interval.rb = len-1; reportLCPInterval(interval); } } public void bottomUpTraverseWithChildren(){ int len = text.length(); Stack<LCPInterval> stack = new Stack<LCPInterval>(); int lb = -1; LCPInterval lastInterval = null; //push root's first child stack.push(new LCPInterval(0,0,-1)); for(int i=1;i<len;i++){ //save lb lb = i - 1; //pop all the deeper suffixes while(lcptable[i]<stack.peek().lcp){ lastInterval = stack.pop(); //update the popped interval's rb lastInterval.rb = i-1; //Note: when lastInterval is popped out, if it has children then its //children are already known! reportLCPInterval(lastInterval); //update lb lb = lastInterval.lb; if(lcptable[i]<=stack.peek().lcp){ //case 1: top>next>i, then top is next's child LCPInterval next = stack.peek(); next.children.add(lastInterval); lastInterval = null; } } if(lcptable[i]>stack.peek().lcp){ LCPInterval curr=new LCPInterval(lcptable[i],lb,-1); if(lastInterval != null){ //case 2: top>i>next, then top is i's child curr.children.add(lastInterval); lastInterval = null; } stack.push(curr); } //if lcptable[i]==interval.lcp, no push because rb is updated when popped } while(!stack.isEmpty()){ lastInterval = stack.pop(); //update the popped interval's rb lastInterval.rb = len-1; reportLCPInterval(lastInterval); if(!stack.isEmpty()){//case 1: top > next, i.e. top is next's child LCPInterval next = stack.peek(); next.children.add(lastInterval); } } } public void solve(){ this.computeSuffixArray(); this.computeLCPtable(); this.reportLCP(); System.out.format("%nbottom-up traversal with no children list: %n"); this.bottomUpTraverseWithoutChildren(); System.out.format("%nbottom-up traversal with children list: %n"); this.bottomUpTraverseWithChildren(); } public static void main(String[] args) { String text = "mississippi#"; ESA_BottomUpTraversal esa = new ESA_BottomUpTraversal(text); esa.solve(); System.out.format("%n********************************%n"); text = "GACCCACCACC#"; esa = new ESA_BottomUpTraversal(text); esa.solve(); System.out.format("%n********************************%n"); } }
测试:
Text: mississippi#
suffix array:
11 10 7 4 1 0 9 8 6 3 5 2
rank table:
6 5 12 10 4 11 9 3 8 7 2 1
lcp table:
0 0 1 1 4 0 0 1 0 2 1 3
bottom-up traversal with no children list:
4-[3..4]
1-[1..4]
1-[6..7]
2-[8..9]
3-[10..11]
1-[8..11]
0-[0..11]
bottom-up traversal with children list:
4-[3..4]
1-[1..4], children={4-[3..4]}
1-[6..7]
2-[8..9]
3-[10..11]
1-[8..11], children={2-[8..9],3-[10..11]}
0-[0..11], children={1-[1..4],1-[6..7],1-[8..11]}
********************************
Text: GACCCACCACC#
suffix array:
11 8 5 1 10 7 4 9 6 3 2 0
rank table:
12 4 11 10 7 3 9 6 2 8 5 1
lcp table:
0 0 3 3 0 1 4 1 2 5 2 0
bottom-up traversal with no children list:
3-[1..3]
4-[5..6]
5-[8..9]
2-[7..10]
1-[4..10]
0-[0..11]
bottom-up traversal with children list:
3-[1..3]
4-[5..6]
5-[8..9]
2-[7..10], children={5-[8..9]}
1-[4..10], children={4-[5..6],2-[7..10]}
0-[0..11], children={3-[1..3],1-[4..10]}
********************************
参考:
Mohamed Ibrahim Abouelhoda, Stefan Kurtz, Enno Ohlebusch: Replacing suffix tree with enhanced suffix arrays (2004)