POJ 2774 解题报告

这道题可能是近期写过的最纠结的一道题了。之前一直没有看过suffix array,这次必须看了。

geeksforgeeks上面有一个通俗易懂的O(n log n log n)(即O(n log² n))的实现:http://www.geeksforgeeks.org/suffix-array-set-2-a-nlognlogn-algorithm/。但我不清楚这个复杂度在本题中是否会超时。

最好的资料还是discuss中大家都提到的罗穗骞大神的实现:https://github.com/oeddyo/algorithm/blob/master/resources/%E7%89%9B%E4%BA%BA%E8%B0%88ACM%E7%BB%8F%E9%AA%8C(%E5%8C%85%E6%8B%AC%E5%9B%BD%E5%AE%B6%E9%9B%86%E8%AE%AD%E9%98%9F%E8%AE%BA%E6%96%87)/%E5%9B%BD%E5%AE%B6%E9%9B%86%E8%AE%AD%E9%98%9F%E8%AE%BA%E6%96%87/%E5%9B%BD%E5%AE%B6%E9%9B%86%E8%AE%AD%E9%98%9F2009%E8%AE%BA%E6%96%87%E9%9B%86/11.%E7%BD%97%E7%A9%97%E9%AA%9E%E3%80%8A%E5%90%8E%E7%BC%80%E6%95%B0%E7%BB%84%E2%80%94%E2%80%94%E5%A4%84%E7%90%86%E5%AD%97%E7%AC%A6%E4%B8%B2%E7%9A%84%E6%9C%89%E5%8A%9B%E5%B7%A5%E5%85%B7%E3%80%8B/%E5%90%8E%E7%BC%80%E6%95%B0%E7%BB%84%E2%80%94%E2%80%94%E5%A4%84%E7%90%86%E5%AD%97%E7%AC%A6%E4%B8%B2%E7%9A%84%E6%9C%89%E5%8A%9B%E5%B7%A5%E5%85%B7.pdf

后缀数组本不容易,所以需要费些功夫理解,虽然罗穗骞大神的解释已经非常通俗易懂了。

最终我只是大致了解了大神的程序,这里照搬了源程序(俗称“模板”)。

解题思路是将两个string合成一个,然后看后缀的最长共同前缀(longest common prefix, LCP)。这是后缀数组的一个常见应用。

需要注意的是,必须保证比较的两个后缀来自不同的string。最简单的办法是在第一个string后面加一个在两个字符串中都不会出现的分隔符,比如'$'(下面的代码用的是'z' + 1)。这里有很好的解释:http://poj.org/showmessage?message_id=85977。

由于模板程序需要在整个字符串的末尾再补一个`0`(它对应的后缀在sa中会排在第一位),所以n、n - 1、0、1这些下标和边界要区分清楚。

贡献了很多WA和RE,不过也是个理解加深的过程。

thestoryofsnow 2774 Accepted 5640K 344MS C++ 5079B
/* 
ID: thestor1 
LANG: C++ 
TASK: poj2774 
*/
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

// Capacity reserved for one input string: 100000 characters plus one spare slot.
const int MAXS = 100000 + 1;

// Capacity of the combined sequence handed to da():
//   first string + separator + second string + trailing 0
// Two strings of MAXS each, plus the separator and the trailing 0.
const int MAXN = 2 * MAXS + 2;

// Work arrays used by da().
int wa[MAXN];	// class (rank) buffer, da() starts with it as x
int wb[MAXN];	// second buffer, da() starts with it as y
int wv[MAXN];	// first-half keys in second-half order
int wc[MAXN];	// counting-sort buckets

// Suffix array that main() asks da() to fill.
int sa[MAXN];

// Tells whether positions a and b carry the same two-part key.
// r holds the classes of blocks of length l, so the key of position p is the
// pair (r[p], r[p + l]).  da() calls this with a = sa[i - 1] and b = sa[i] to
// decide whether two neighbouring suffixes share a class.
// Returns 1 when both parts match, 0 otherwise.  The second part is only read
// when the first part already matches.
int cmp(int *r, int a, int b, int l)
{
	if (r[a] != r[b])
	{
		return 0;
	}
	return r[a + l] == r[b + l];
}

// r is the input char sequence (expressed as int[])
// r[n - 1] == 0 and r[i] > 0 (0 <= r < n - 1) for simplicity of computation
// n is the length of r
// m is the range of r, that is, 0 <= r[i] < m
void da(int *r, int *sa, int n, int m)
{
	int i, j, p, *x = wa, *y = wb, *t;

	// radix sort, j = 1
	for (i = 0; i < m; i++)
	{
		wc[i] = 0;
	}
	for (i = 0; i < n;i++) 
	{
		wc[x[i] = r[i]]++;
	}
	for (i = 1; i < m; i++)
	{
		wc[i] += wc[i - 1];
	}
	for (i = n - 1; i >= 0; i--)
	{
		sa[--wc[x[i]]] = i;
	}

	for (j = 1, p = 1; p < n; j *= 2, m = p)
	{
		// rank of second part can take advantage of sa
		// i + j >= n for i in [n - j ~ n - 1]
		// that is, this range do not have second part
		// that is, second part should be smallest
		for (p = 0, i = n - j; i < n; i++)
		{
			y[p++] = i;
		}
		for (i = 0; i < n; i++)
		{
			// this position can be second part (sa[i] - j >= 0)
			if (sa[i] >= j)
			{
				// rank them according to sa
				y[p++] = sa[i] - j;
			}
		}

		// radix sort, according to first part rank (x)
		// both parts (x and y) have size of j
		// now the total size is 2 * j
		for (i = 0; i < n; i++)
		{
			// get its first part
			wv[i] = x[y[i]];
		}
		for (i = 0; i < m; i++)
		{
			wc[i] = 0;
		}
		for (i = 0; i < n; i++)
		{
			wc[wv[i]]++;
		}
		for (i = 1; i < m; i++)
		{
			wc[i] +=  wc[i - 1];
		}
		for (i = n - 1; i >= 0; i--)
		{
			sa[--wc[wv[i]]] = y[i];
		}

		// swap x and y
		// after swap, y stands for current rank (based on two parts)
		// x will be overwritten
		t = x, x = y, y = t;
		
		p = 1;
		x[sa[0]] = 0;
		for (i = 1; i < n; i++)
		{
			x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++; 
		}
	 }

	 // in the end, sa[0] will be meaningless as it will be n - 1
	 // remember r[n - 1] is always 0 and others are larger than 0?
	 return; 
}

// rank[p]   : index of the suffix starting at p inside sa, i.e. rank[sa[i]] == i
// height[i] : length of the longest common prefix of suffixes sa[i - 1] and sa[i]
int rank[MAXN];
int height[MAXN];

// Fills rank[] and height[] from a suffix array produced by da().
//
// r  : the sequence the suffix array was built from (trailing 0 included)
// sa : its suffix array
// n  : number of entries in r
//
// Positions are visited in text order.  With h[i] = height[rank[i]] the
// inequality h[i] >= h[i - 1] - 1 holds, so each comparison resumes from the
// previous match length minus one instead of starting from zero.
// height[0] is not written and has no meaning.
void calheight(int *r, int *sa, int n)
{
	for (int idx = 1; idx < n; idx++)
	{
		rank[sa[idx]] = idx;
	}

	int lcp = 0;
	for (int start = 0; start < n - 1; start++)
	{
		lcp = (lcp > 0) ? lcp - 1 : lcp;

		// suffix that sits directly before `start` in sorted order
		const int prev = sa[rank[start] - 1];
		while (r[start + lcp] == r[prev + lcp])
		{
			lcp++;
		}
		height[rank[start]] = lcp;
	}
}

// Reads two strings of lowercase letters and prints the length of their
// longest common substring.
//
// The inputs are joined as  first + separator + second + 0 , the suffix array
// and height table of the joined sequence are built, and the answer is the
// largest height between two neighbouring suffixes that start in different
// inputs.
//
// Returns 0 on success and 1 (printing nothing) when an input string is
// missing or contains a character outside 'a'..'z'.
int main()
{
	// Static storage: together these buffers take about 1 MB, which can exceed
	// the default stack limit on some platforms.
	static char str[MAXN];
	static int r[MAXN];

	// Each input holds at most 100000 characters (MAXS - 1); the field width
	// keeps scanf from writing past the buffer on longer input.
	if (scanf("%100000s", str) != 1)
	{
		return 1;
	}
	int N1 = (int)strlen(str);

	// Separator: one code above 'z', so it cannot occur inside either input
	// and no common prefix can run across the boundary.
	str[N1] = 'z' + 1;
	if (scanf(" %100000s", str + N1 + 1) != 1)
	{
		return 1;
	}

	// +1 for the trailing 0 that da() needs as the unique smallest symbol.
	int N = (int)strlen(str) + 1;

	for (int i = 0; i < N - 1; ++i)
	{
		// Anything outside 'a'..'z' would fall outside the bucket range
		// [0, 28) used by da(); index N1 is the separator written above.
		if (i != N1 && (str[i] < 'a' || str[i] > 'z'))
		{
			return 1;
		}
		r[i] = str[i] - 'a' + 1;
	}
	r[N - 1] = 0;

	// Symbols are 'a' (1) .. 'z' (26) and the separator (27), so the exclusive
	// upper bound passed as m is 28.
	da(r, sa, N, 28);
	calheight(r, sa, N);

	int ans = 0;
	for (int i = 1; i < N; ++i)
	{
		// Only neighbours that start on opposite sides of the separator count;
		// a suffix starting at the separator itself (index N1) is skipped.
		const bool crosses = (sa[i] < N1 && sa[i - 1] > N1) ||
		                     (sa[i] > N1 && sa[i - 1] < N1);
		if (crosses && height[i] > ans)
		{
			ans = height[i];
		}
	}

	printf("%d\n", ans);

	return 0;
}


你可能感兴趣的:(POJ)