The answer is given in the 146th episode of Star Trek - The Next Generation, titled The Chase. It turns out that the vast majority of the quadrant's life forms ended up with a large fragment of common DNA.
Given the DNA sequences of several life forms represented as strings of letters, you are to find the longest substring that is shared by more than half of them.
Standard input contains several test cases. Each test case begins with 1 ≤ n ≤ 100, the number of life forms. n lines follow; each contains a string of lower case letters representing the DNA sequence of a life form. Each DNA sequence contains at least one and not more than 1000 letters. A line containing 0 follows the last test case.
For each test case, output the longest string or strings shared by more than half of the life forms. If there are many, output all of them in alphabetical order. If there is no solution with at least one letter, output "?". Leave an empty line between test cases.
3 abcdefg bcdefgh cdefghi 3 xxx yyy zzz 0
bcdefg cdefgh ?
有N个DNA序列,求一个长度最大的字符串,使它在超过一半的DNA序列中出现。如果有多解,按字典序输出。
首先用互不相同的分隔符把输入字符串拼接起来,并记录位置i的字符属于第j个DNA序列,即idx[i]=j。二分最大长度p,扫一遍height数组,把后缀分成若干段:当height[i]<p时新开一段,这样每一段内所有后缀的前p个字符都相同。再判断这一段中的后缀是否来自超过N/2个不同的DNA序列,直接根据idx判断就可以了(这一段后缀的前p个字符肯定都在同一个DNA序列中,不会同时跨越两个DNA,因为DNA之间的分隔符各不相同)。找到最大长度后再扫一遍height,用同样的分段方法,找到满足条件的段就输出;后缀数组本身已按字典序排列,因此输出顺序就是字典序。
#include<iostream>
#include<queue>
#include<cstring>
#include<cstdio>
#include<cmath>
#include<set>
#include<map>
#include<vector>
#include<stack>
#include<algorithm>
#define INF 0x3f3f3f3f
#define eps 1e-9
#define MAXNODE 105
#define MOD 10000007
#define SIGMA_SIZE 4
typedef long long LL;
using namespace std;

const int MAXN=110000;  // capacity of the concatenated text (sequences + separators + sentinel)
const int MAXM=110;     // capacity of the per-sequence bookkeeping array

int N;           // number of DNA sequences in the current test case
int flag[MAXM];  // scratch marks used by good(): flag[j]!=0 once sequence j has been counted
int idx[MAXN];   // idx[i] = index of the sequence that text position i belongs to;
                 // N for separator and sentinel positions
char str[1010];  // input buffer for one DNA sequence

// Suffix array (prefix doubling with counting sort) plus the height (LCP) array.
// The caller fills s[0..n-1] and n before calling build_sa(); main() ends the text
// with a 0 that is smaller than every other symbol and occurs exactly once.
struct SuffixArray{
    int s[MAXN];       // the text, as integer symbols
    int sa[MAXN];      // sa[i] = start position in s of the i-th smallest suffix
    int rank[MAXN];    // rank[i] = position of the suffix starting at i within sa
    int height[MAXN];  // height[i] = longest common prefix of suffixes sa[i-1] and sa[i]
    int c[MAXN];       // counting-sort buckets
    int t[MAXN],t2[MAXN];  // work arrays (x / y below)
    int n;             // number of symbols in s

    // Empties the text and zeroes sa.
    void clear(){
        n=0;
        memset(sa,0,sizeof(sa));
    }

    // Builds sa for s[0..n-1]. m must be (largest symbol value)+1.
    void build_sa(int m){
        int *x=t,*y=t2;
        // Counting sort of the suffixes by their first symbol.
        for(int i=0;i<m;i++) c[i]=0;
        for(int i=0;i<n;i++) c[x[i]=s[i]]++;
        for(int i=1;i<m;i++) c[i]+=c[i-1];
        for(int i=n-1;i>=0;i--) sa[--c[x[i]]]=i;
        for(int k=1;k<=n;k<<=1){
            // Order by the second key (the k symbols after the first k) using the
            // current sa: suffixes too short to have a second key come first.
            int p=0;
            for(int i=n-k;i<n;i++) y[p++]=i;
            for(int i=0;i<n;i++) if(sa[i]>=k) y[p++]=sa[i]-k;
            // Stable counting sort by the first key.
            for(int i=0;i<m;i++) c[i]=0;
            for(int i=0;i<n;i++) c[x[y[i]]]++;
            for(int i=1;i<m;i++) c[i]+=c[i-1];
            for(int i=n-1;i>=0;i--) sa[--c[x[y[i]]]]=y[i];
            // Recompute the class of every suffix; y now holds the previous classes.
            swap(x,y);
            p=1;
            x[sa[0]]=0;
            for(int i=1;i<n;i++){
                // The second comparison is only evaluated when the first keys match.
                const bool same=y[sa[i-1]]==y[sa[i]]&&y[sa[i-1]+k]==y[sa[i]+k];
                x[sa[i]]=same?p-1:p++;
            }
            if(p>=n) break;  // every suffix already has its own class
            m=p;
        }
    }

    // Builds rank and height from sa. Visits suffixes in text order so the previous
    // match length k can be reused after decrementing it by one.
    void build_height(){
        int k=0;
        for(int i=0;i<n;i++) rank[sa[i]]=i;
        height[0]=0;
        // Position n-1 is skipped: with the 0 sentinel written by main() that
        // suffix is the smallest, so rank[i]-1 is a valid index for every other i.
        for(int i=0;i<n-1;i++){
            if(k) k--;
            const int j=sa[rank[i]-1];
            while(s[i+k]==s[j+k]) k++;
            height[rank[i]]=k;
        }
    }
}sa;

// Returns true when the suffixes sa.sa[L..R-1] start inside more than N/2
// distinct DNA sequences (separator and sentinel positions are ignored).
bool good(int L,int R){
    if(R-L<=N/2) return false;  // too few suffixes to reach the required count
    memset(flag,0,sizeof(flag));
    int cnt=0;
    for(int i=L;i<R;i++){
        const int owner=idx[sa.sa[i]];
        if(owner!=N&&!flag[owner]){
            flag[owner]=1;
            cnt++;
        }
    }
    return cnt>N/2;
}

// Returns true when some run of consecutive suffixes whose adjacent heights are
// all >= len is accepted by good(). Runs are half-open ranges [L,R) over sa.
bool check(int len){
    int L=0;
    for(int R=1;R<=sa.n;R++){
        if(R==sa.n||sa.height[R]<len){
            if(good(L,R)) return true;
            L=R;
        }
    }
    return false;
}

// Prints, one per line, the first len symbols of the first suffix of every run
// (same partition as check()) that good() accepts. Runs are visited in sa order.
void print(int len){
    int L=0;
    for(int R=1;R<=sa.n;R++){
        if(R==sa.n||sa.height[R]<len){
            if(good(L,R)){
                const int start=sa.sa[L];
                for(int i=start;i<start+len;i++) putchar(sa.s[i]+'a'-1);  // symbol 1 -> 'a'
                puts("");
            }
            L=R;
        }
    }
}

// Binary-searches the largest len in [1,maxlen] for which check(len) holds and
// prints the matching strings; prints "?" when even length 1 fails.
void solve(int maxlen){
    if(!check(1)){
        printf("?\n");
        return;
    }
    int L=1,R=maxlen;
    while(L<R){
        const int mid=L+(R-L+1)/2;  // round up so that L=mid always makes progress
        if(check(mid)) L=mid;
        else R=mid-1;
    }
    print(L);
}

// Reads test cases from standard input until a count of 0 (or unreadable or
// out-of-range input) and prints the answers, separated by one empty line.
int main(){
    int cas=0;
    // ==1 (not !=EOF): a matching failure returns 0 and would leave N stale.
    while(scanf("%d",&N)==1&&N>0){
        if(N>=MAXM) break;  // flag[] cannot hold that many sequences
        if(++cas>1) puts("");
        sa.clear();
        int maxlen=0;
        for(int i=0;i<N;i++){
            // Width 1000 keeps str in bounds and the concatenated text below MAXN.
            if(scanf("%1000s",str)!=1) return 0;
            const int len=static_cast<int>(strlen(str));
            maxlen=max(maxlen,len);
            for(int j=0;j<len;j++){
                idx[sa.n]=i;
                sa.s[sa.n++]=str[j]-'a'+1;  // 'a' -> symbol 1
            }
            // A separator distinct for every sequence, so no common prefix can
            // run from one sequence into the next.
            idx[sa.n]=N;
            sa.s[sa.n++]=i+100;
        }
        idx[sa.n]=N;
        sa.s[sa.n++]=0;  // terminating sentinel
        if(N==1){
            // A single sequence is trivially shared by more than half.
            printf("%s\n",str);
            continue;
        }
        sa.build_sa(110+N);  // separators go up to 99+N, so this exceeds every symbol
        sa.build_height();
        solve(maxlen);
    }
    return 0;
}