uva10526 - Intellectual Property 后缀数组

Problem A: Intellectual Property

TDP Inc. has decided to sue JCN Inc. for copyright infringement. To this end, TDP wishes to find infringing segments within JCN's code base, to show to selected media representatives. Since TDP has fired all its technical staff, it is looking to hire a consultant to be paid on a contingency basis should the lawsuit be successful. To demonstrate your qualifications for this position, you are to solve the problem on a number of test cases.

Each test case begins with a positive integer k, the number of infringing segments to be found. Following this line are two code bases. The first code base is preceded by the line "BEGIN TDP CODEBASE" and contains a number of lines followed by the line "END TDP CODEBASE". The second code base is preceded by "BEGIN JCN CODEBASE" and followed by "END JCN CODEBASE". The line "END TDP CODEBASE" does not appear in the first code base and the line "END JCN CODEBASE" does not appear in the second. A line containing 0 follows the last test case.

For each test case you should output: (1) a line containing "CASE n" where n is the number of the test case; (2) up to k infringing segments. Each segment should be printed exactly as it appears in the JCN code base (including newlines and whitespace) and should be preceded by a line containing "INFRINGING SEGMENT m LENGTH l POSITION p" where m is the number of the segment within the test case, l is the length of the segment (in characters) and p is the position of the segment (in characters from the start of the JCN codebase). Output an empty line between test cases.

A code base is simply a string of characters. An infringing segment is a non-empty contiguous sequence of characters in the JCN code base that is textually identical to some contiguous sequence of characters in the TDP code base, and is not contained in a larger infringing segment. You should consider all characters in the code base, including spaces and the newline character at the end of each line.

If there are k or fewer common segments, print them all in decreasing order by length; if several segments have the same length, print them in the order they occur in JCN's code base. If there are more than k segments, print the first k according to the given order.

You may assume that no code base contains more than 50,000 characters.

Sample Input

6
BEGIN TDP CODEBASE
the quick brown fox
jumps over the lazy dog.
so there!
END TDP CODEBASE
BEGIN JCN CODEBASE
now is the time for all
good men to come to the aid
of the party.
so there!
END JCN CODEBASE
100
BEGIN TDP CODEBASE
xyzzy
END TDP CODEBASE
BEGIN JCN CODEBASE
xyzzabczzyy
END JCN CODEBASE
0

Output for Sample Input

CASE 1
INFRINGING SEGMENT 1 LENGTH 12 POSITION 64
.
so there!

INFRINGING SEGMENT 2 LENGTH 5 POSITION 6
 the 
INFRINGING SEGMENT 3 LENGTH 5 POSITION 42
o the
INFRINGING SEGMENT 4 LENGTH 5 POSITION 43
 the 
INFRINGING SEGMENT 5 LENGTH 5 POSITION 54
 the 
INFRINGING SEGMENT 6 LENGTH 3 POSITION 15
 fo

CASE 2
INFRINGING SEGMENT 1 LENGTH 4 POSITION 0
xyzz
INFRINGING SEGMENT 2 LENGTH 3 POSITION 7
zzy
INFRINGING SEGMENT 3 LENGTH 2 POSITION 10
y

  输出N个第二段抄袭第一段的地方,如果少于N个输出全部。如果多于N个,优先输出长的,长度相等优先输出位置靠前的。

  把两个串连起来中间用分隔符隔开,求出height数组。有些后缀是在第一段中的,有些后缀是在第二段中的,可以根据sa判断这个后缀是在第一段还是第二段。按后缀大小顺序,正着遍历一遍可以得到第二段每个位置在第一段中后缀排它前面的抄袭长度,倒着遍历一遍可以得到第二段每个位置在第一段中后缀排它后面的抄袭长度,两者里面取大的。得到每个位置的抄袭后,处理掉覆盖的,排序输出。

  注意换行符也算抄袭,输入会有空行的情况。

#include <cstdio>
#include <cstring>
#include <algorithm>
#include <string>
#include <iostream>
using namespace std;

// Capacity of every fixed-size array in this file (symbol buffer, suffix-array
// work arrays, per-position match lengths, answer list, line buffer).
const int MAXN=100010*2;
// Not referenced anywhere in the visible code.
const int MAXNODE=4*MAXN;
// Not referenced anywhere in the visible code.
const int LOGMAXN=50;
// "No limit yet" value: solve() resets its running minimum to this whenever it
// passes a suffix that starts in the first code base.
const int INF=0x3f3f3f3f;

// T   : not referenced anywhere in the visible code.
// N   : number of segments requested for the current test case (read in main,
//       used by solve() to cap how many segments are printed).
// pos : index in sa.s of the separator symbol placed between the two code
//       bases; suffixes starting before it belong to the first code base.
// len : len[i] is the longest match found for the suffix of the second code
//       base that starts at offset i (offsets are relative to pos+1).
int T,N,pos,len[MAXN];
// Line buffer used by main while reading the input.
char str[MAXN];

// One candidate segment: p is its start offset inside the second code base,
// l is its length in characters.
struct st{
    int p,l;
    // Output order: longer segments first; equal lengths are ordered by
    // increasing start offset.
    bool operator < (const st& rhs)const{
        if(l!=rhs.l) return rhs.l<l;
        return p<rhs.p;
    }
}ans[MAXN];

struct SuffixArray{
    int s[MAXN];
    int c[MAXN];
    int sa[MAXN];
    int height[MAXN];
    int rank[MAXN];
    int t[MAXN],t2[MAXN];
    int n;

    void clear(){
        n=0;
        memset(sa,0,sizeof(sa));
    }
    void build_sa(int m){
        int i,*x=t,*y=t2;
        for(i=0;i<m;i++) c[i]=0;
        for(i=0;i<n;i++) c[x[i]=s[i]]++;
        for(i=1;i<m;i++) c[i]+=c[i-1];
        for(i=n-1;i>=0;i--) sa[--c[x[i]]]=i;
        for(int k=1;k<n;k<<=1){
            int p=0;
            for(i=n-k;i<n;i++) y[p++]=i;
            for(i=0;i<n;i++) if(sa[i]>=k) y[p++]=sa[i]-k;
            for(i=0;i<m;i++) c[i]=0;
            for(i=0;i<n;i++) c[x[y[i]]]++;
            for(i=1;i<m;i++) c[i]+=c[i-1];
            for(i=n-1;i>=0;i--) sa[--c[x[y[i]]]]=y[i];
            swap(x,y);
            p=1;
            x[sa[0]]=0;
            for(i=1;i<n;i++) x[sa[i]]=y[sa[i-1]]==y[sa[i]]&&y[sa[i-1]+k]==y[sa[i]+k]?p-1:p++;
            if(p>=n) break;
            m=p;
        }
    }
    void build_height(){
        int k=0;
        for(int i=0;i<n;i++) rank[sa[i]]=i;
        height[0]=0;
        for(int i=0;i<n-1;i++){
            if(k) k--;
            int j=sa[rank[i]-1];
            while(s[i+k]==s[j+k]) k++;
            height[rank[i]]=k;
        }
    }
}sa;

/**
 * Prints up to N infringing segments for the test case currently held in `sa`.
 *
 * Expects sa.s to contain: first code base, a separator at index `pos`, second
 * code base, and a final terminator, with sa.sa and sa.height already built.
 *
 * Steps:
 *  1. For every start offset in the second code base, compute the longest
 *     prefix shared with any suffix of the first code base. In suffix-array
 *     order this is the running minimum of `height` back to the nearest
 *     first-code-base suffix, taken once from each direction.
 *  2. Drop every candidate that lies inside an earlier, longer-reaching one.
 *  3. Sort by (length descending, offset ascending) and print at most N.
 */
void solve(){
    const int jcnStart=pos+1;               // index in sa.s of the second code base's first symbol
    const int jcnLen=sa.n-jcnStart-1;       // second code base length, terminator excluded
    std::memset(len,0,sizeof(len));

    // Forward sweep: nearest first-code-base suffix ranked *before* each suffix.
    int MIN=-1;                             // -1 means no first-code-base suffix seen yet
    for(int i=0;i<sa.n;i++){
        const int start=sa.sa[i];
        if(start<pos){
            MIN=INF;
            continue;
        }
        if(MIN==-1) continue;
        MIN=std::min(MIN,sa.height[i]);
        const int idx=start-jcnStart;
        // The separator (idx == -1) and the terminator (idx == jcnLen) are not
        // positions of the second code base; writing len[-1] would be out of bounds.
        if(idx<0||idx>=jcnLen) continue;
        len[idx]=std::max(len[idx],MIN);
    }

    // Backward sweep: nearest first-code-base suffix ranked *after* each suffix.
    MIN=-1;
    for(int i=sa.n-1;i>=0;i--){
        const int start=sa.sa[i];
        if(start<pos){
            MIN=INF;
            continue;
        }
        if(MIN==-1) continue;
        // MIN != -1 implies some rank above i was already visited, so i+1 < sa.n.
        MIN=std::min(MIN,sa.height[i+1]);
        const int idx=start-jcnStart;
        if(idx<0||idx>=jcnLen) continue;
        len[idx]=std::max(len[idx],MIN);
    }

    // Keep only candidates whose end extends past every earlier candidate's end.
    int now=-1,k=0;
    for(int i=0;i<jcnLen;i++){
        if(len[i]<=0||i+len[i]<=now) continue;
        ans[k].p=i;
        ans[k].l=len[i];
        k++;
        now=i+len[i];
    }

    std::sort(ans,ans+k);
    k=std::min(k,N);
    for(int i=0;i<k;i++){
        printf("INFRINGING SEGMENT %d LENGTH %d POSITION %d\n",i+1,ans[i].l,ans[i].p);
        for(int j=0;j<ans[i].l;j++) putchar(sa.s[jcnStart+ans[i].p+j]);
        puts("");
    }
}

int main(){
    freopen("in.txt","r",stdin);
    int cas=0;
    while(scanf("%d",&N)!=EOF&&N){
        sa.clear();
        getchar();
        gets(str);
        while(gets(str),strcmp(str,"END TDP CODEBASE")){
            for(int i=0;str[i];i++) sa.s[sa.n++]=str[i];
            sa.s[sa.n++]='\n';
        }
        pos=sa.n;
        sa.s[sa.n++]=260;
        gets(str);
        while(gets(str),strcmp(str,"END JCN CODEBASE")){
            for(int i=0;str[i];i++) sa.s[sa.n++]=str[i];
            sa.s[sa.n++]='\n';
        }
        sa.s[sa.n++]=0;
        sa.build_sa(270);
        sa.build_height();
        if(cas) puts("");
        printf("CASE %d\n",++cas);
        solve();
    }
    return 0;
}



你可能感兴趣的:(uva10526 - Intellectual Property 后缀数组)