源码分享-go语言实现qsufsort后缀数组生成算法

qsufsort是开源差分工具bsdiff使用的后缀数组生成算法。

qsufsort的实现原理来自N. Jesper Larsson与Kunihiko Sadakane的论文《Faster Suffix Sorting》中提出的后缀排序算法。

package main

import "fmt"

// split refines the order of the entries I[start : start+len_] using the key
// V[I[i]+h] of each entry.
//
// On return the range is arranged so that keys are non-decreasing, every
// entry's V[I[i]] holds the index of the last slot of its equal-key run, and
// every run consisting of a single entry has its I slot overwritten with -1.
//
// Ranges shorter than 16 entries are handled by a selection-style pass;
// longer ranges are three-way partitioned around the key of the middle entry
// and the "less than" and "greater than" parts are processed recursively.
func split(I []int, V []int, start, len_, h int) {
	// The cutoff must look at the size of the range being split, not at the
	// size of the whole array; otherwise large inputs never take this path.
	if len_ < 16 {
		j := 0
		for k := start; k < start+len_; k += j {
			// Collect every entry holding the smallest remaining key into
			// I[k : k+j]; j counts how many were found.
			j = 1
			x := V[I[k]+h]
			for i := 1; k+i < start+len_; i++ {
				if V[I[k+i]+h] < x {
					// New minimum: restart the run of equal keys.
					x = V[I[k+i]+h]
					j = 0
				}
				if V[I[k+i]+h] == x {
					I[k+i], I[k+j] = I[k+j], I[k+i]
					j++
				}
			}
			// Number the run by the index of its last slot.
			for i := 0; i < j; i++ {
				V[I[k+i]] = k + j - 1
			}
			// A run of one entry is marked with -1.
			if j == 1 {
				I[k] = -1
			}
		}
		return
	}

	// Pivot key taken from the middle entry of the range.
	x := V[I[start+len_/2]+h]

	// Count keys below and equal to the pivot so the final boundaries are
	// known up front: [start, jj) is "less", [jj, kk) is "equal", the rest
	// is "greater".
	jj := 0
	kk := 0
	for i := start; i < start+len_; i++ {
		if V[I[i]+h] < x {
			jj++
		}
		if V[I[i]+h] == x {
			kk++
		}
	}
	jj += start
	kk += jj

	// Fill the "less" region, pushing other entries into the next free slot
	// of the "equal" (jj+j) or "greater" (kk+k) region.
	j := 0
	k := 0
	for i := start; i < jj; {
		if V[I[i]+h] < x {
			i++
		} else if V[I[i]+h] == x {
			I[i], I[jj+j] = I[jj+j], I[i]
			j++
		} else {
			I[i], I[kk+k] = I[kk+k], I[i]
			k++
		}
	}

	// Finish the "equal" region; anything that is not equal here is moved to
	// the "greater" region.
	for jj+j < kk {
		if V[I[jj+j]+h] == x {
			j++
		} else {
			I[jj+j], I[kk+k] = I[kk+k], I[jj+j]
			k++
		}
	}

	if jj > start {
		split(I, V, start, jj-start, h)
	}

	// Number the "equal" run by the index of its last slot and mark it with
	// -1 if it holds a single entry.
	for i := 0; i < kk-jj; i++ {
		V[I[jj+i]] = kk - 1
	}
	if jj == kk-1 {
		I[jj] = -1
	}

	if start+len_ > kk {
		split(I, V, kk, start+len_-kk, h)
	}
}

// Qsufsort computes the suffix array of old.
//
// The returned slice has len(old)+1 elements: element r is the starting
// offset of the r-th smallest suffix, with the empty suffix (offset
// len(old)) counted as the smallest and therefore stored at index 0.
func Qsufsort(old []byte) []int {
	n := len(old)
	total := n + 1 // one extra slot for the empty suffix

	// Histogram of byte values, then converted in place so that buckets[c]
	// is the number of bytes strictly smaller than c.
	var buckets [256]int
	for _, c := range old {
		buckets[c]++
	}
	below := 0
	for c := 0; c < len(buckets); c++ {
		count := buckets[c]
		buckets[c] = below
		below += count
	}

	// Distribute offsets by first byte. Slot 0 is left for the empty suffix,
	// so each bucket is filled by pre-incrementing its cursor; afterwards
	// buckets[c] is the index of the last slot belonging to byte c.
	sa := make([]int, total)
	for pos, c := range old {
		buckets[c]++
		sa[buckets[c]] = pos
	}

	// Initial group number of every suffix is the last slot of its bucket;
	// group[n] (the empty suffix) keeps its zero value.
	group := make([]int, total)
	for pos, c := range old {
		group[pos] = buckets[c]
	}

	// Buckets holding exactly one suffix are marked with -1, as is the slot
	// of the empty suffix.
	for c := 1; c < len(buckets); c++ {
		if buckets[c]-buckets[c-1] == 1 {
			sa[buckets[c]] = -1
		}
	}
	sa[0] = -1

	// Doubling passes: negative entries encode the length of a run of
	// finished slots; the loop ends once a single run covers everything.
	for h := 1; sa[0] != -total; h *= 2 {
		run := 0 // length of the finished run currently being accumulated
		pos := 0
		for pos < total {
			if skip := sa[pos]; skip < 0 {
				run -= skip
				pos -= skip
				continue
			}
			if run != 0 {
				// Merge the finished slots just walked over into one run.
				sa[pos-run] = -run
			}
			size := group[sa[pos]] + 1 - pos
			split(sa, group, pos, size, h)
			pos += size
			run = 0
		}
		if run != 0 {
			sa[pos-run] = -run
		}
	}

	// Every group now holds one suffix, so group[] is the inverse of the
	// suffix array; invert it into the result.
	for pos, g := range group {
		sa[g] = pos
	}
	return sa
}

// main prints every suffix of a sample string in the order given by
// Qsufsort, one per line, prefixed with its position in that order.
func main() {
	const text = "hello word"
	for rank, offset := range Qsufsort([]byte(text)) {
		fmt.Println(rank, text[offset:])
	}
}

编译并运行后的输出结果:

0
1  word     
2 d         
3 ello word 
4 hello word
5 llo word  
6 lo word   
7 o word    
8 ord       
9 rd        
10 word     

你可能感兴趣的:(源码分析,golang,算法,qsufsort,bsdiff,后缀数组)