后缀树的定义
后缀树(Suffix tree)是一种数据结构,能快速解决很多关于字符串的问题。后缀树的概念最早由Weiner于1973年提出,继而由McCreight在1976年、Ukkonen在1992年和1995年加以改进完善。
后缀,顾名思义,通俗点来说,就是字符串后面"尾巴"的那一部分。比如说给定一个长度为n的字符串S=S1S2..Si..Sn和整数i(1 <= i <= n),子串SiSi+1…Sn就是字符串S的一个后缀。
以字符串S=XMADAMYX为例,它的长度为8,所以S[1..8], S[2..8], … , S[8..8]都算S的后缀,我们一般还把空字串也算成后缀。这样,我们一共有如下后缀。对于后缀S[i..n],我们说这项后缀起始于i。
S[1..8], XMADAMYX, 也就是字符串本身,起始位置为1
S[2..8], MADAMYX,起始位置为2
S[3..8], ADAMYX,起始位置为3
S[4..8], DAMYX,起始位置为4
S[5..8], AMYX,起始位置为5
S[6..8], MYX,起始位置为6
S[7..8], YX,起始位置为7
S[8..8], X,起始位置为8
空字串,记为$。
后缀树的结构
首先,说明一下后缀树中的结构。在该程序中,定义了后缀树的类,以及后缀树的节点的类。节点的结构包括3个部分:1. 节点的ID;2. 节点所存的字符串;3. 该节点的孩子指针。具体结构如下:
map
// One node of the suffix tree. `str` holds the label of the edge that leads
// to this node and `children` is keyed by the first character of each child's
// label.
//
// A node owns its children: destroying a node frees its whole subtree.
class suffix_node {
public:
    map<char,suffix_node*> children; // child nodes, keyed by the first character of their label
    string str;                      // edge label leading to this node
    int id;                          // taken from the global num_node counter; -1 until assigned

    suffix_node():id(-1){}

    ~suffix_node()
    {
        for(map<char,suffix_node*>::iterator it=children.begin();it!=children.end();++it)
            delete it->second; // deleting a NULL entry is a no-op
    }

    // Inserts the string `s` (one suffix of the indexed text) below this node.
    //
    // The walk follows existing edges while they match `s`. It ends in one of
    // three ways:
    //   * no edge starts with the next character -> the rest of `s` becomes a new leaf;
    //   * `s` diverges in the middle of an edge  -> the edge is split and a leaf is added;
    //   * `s` is used up along existing edges    -> nothing is added.
    // An empty `s` is ignored.
    void insertstring(string s)
    {
        if(s.empty())
            return; // s[0] below needs at least one character
        suffix_node* parent=this;
        for(;;)
        {
            map<char,suffix_node*>::iterator slot=parent->children.find(s[0]);
            if(slot==parent->children.end()||slot->second==NULL)
            {
                parent->children[s[0]]=make_node(s);
                return;
            }
            suffix_node* child=slot->second;
            // compare() scans over its first argument, so pass the shorter string first.
            const size_t matched=static_cast<size_t>(
                s.length()<=child->str.length()?compare(s,child->str):compare(child->str,s));
            if(matched<child->str.length())
            {
                if(matched<s.length())
                {
                    // `s` and the edge disagree after `matched` characters: put a
                    // branching node there, holding the old child and a new leaf.
                    suffix_node* branch=make_node(s.substr(0,matched));
                    suffix_node* leaf=make_node(s.substr(matched));
                    child->str=child->str.substr(matched);
                    branch->children[child->str[0]]=child;
                    branch->children[leaf->str[0]]=leaf;
                    slot->second=branch;
                }
                // Otherwise `s` ends inside this edge and is already represented.
                return;
            }
            if(matched>=s.length())
                return; // `s` ends exactly at `child`
            // Whole edge matched and `s` continues: descend.
            s=s.substr(matched);
            parent=child;
        }
    }
private:
    // Copying would leave two nodes owning the same children, so copy
    // operations are declared but never defined.
    suffix_node(const suffix_node&);
    suffix_node& operator=(const suffix_node&);

    // Allocates a node labelled `label` and gives it the next ID from num_node.
    static suffix_node* make_node(const string& label)
    {
        suffix_node* created=new suffix_node();
        created->id=num_node;
        num_node++;
        created->str=label;
        return created;
    }
};
class suffix_tree {
public:
suffix_node* root;
suffix_tree(string s)
{
string str;
root=new suffix_node();
root->id=num_node;
num_node++;
for(int i=0;i<s.length();i++)
{
str=s.substr(i);
root->insertstring(str);
}
}
~suffix_tree(){}
};
4.后缀树的运用。
1. 查找字符串o是否在字符串S中。
方案:用S构造后缀树,按在trie中搜索字串的方法搜索o即可。
原理:若o在S中,则o必然是S的某个后缀的前缀。
例如S: leconte,查找o: con是否在S中,则o(con)必然是S(leconte)的后缀之一conte的前缀.有了这个前提,采用trie搜索的方法就不难理解了。 代码如下:
int find_str(string T,string s) //寻找字符串S是否在字符串T中
{
suffix_tree tree(T);
if(T.length()<s.length())
{
cout<<"字符串S不在字符串T中"<<endl;
return 0;
}
string str=s;
suffix_node* node=NULL;
int i;
node=tree.root->children[s[0]];
while(str.length()!=0)
{
if(str.length()<=node->str.length())
{
i=compare(str,node->str);
if(i>=str.length())
{
cout<<"字符串S在字符串T中"<<endl;
return 1;
}
else
{
cout<<"字符串S不在字符串T中"<<endl;
return 0;
}
}
else
{
i=compare(node->str,str);
if(i>=node->str.length())
{
str=str.substr(i,str.length()-i);
node=node->children[str[0]];
}
else
{
cout<<"字符串S不在字符串T中"<<endl;
return 0;
}
}
}
}
`
2.指定字符串T在字符串S中的重复次数。
方案:用S+’$’构造后缀树,搜索T节点下的叶节点数目即为重复次数
原理:如果T在S中重复了两次,则S应有两个后缀以T为前缀,重复次数就自然统计出来了。代码如下:
// Counts the leaves in the subtree rooted at `node`; a node without children
// counts as one leaf. `tree` is only handed on to the recursive calls.
int num_leaf(suffix_tree tree,suffix_node* node)
{
    if(node->children.empty())
        return 1;
    int total=0;
    map<char,suffix_node*>::iterator child=node->children.begin();
    while(child!=node->children.end())
    {
        total+=num_leaf(tree,child->second);
        ++child;
    }
    return total;
}
// Counts how many times the string `s` occurs in the string `T`
// (overlapping occurrences included).
//
// The tree is built from T+"$": with a terminator every suffix ends in its
// own leaf, so the number of leaves below the end of `s` equals the number of
// occurrences. Assumes `T` does not itself contain '$'.
// Returns 0 for an empty `s` or when `s` does not occur. find_str() is used
// for the membership test and prints its verdict to stdout.
int num_echo(string T,string s)
{
    if(s.empty())
        return 0;
    if(find_str(T,s)==0)
        return 0;
    suffix_tree tree(T+"$");
    // `s` occurs in T, so every edge looked up below is known to exist.
    suffix_node* node=tree.root->children.find(s[0])->second;
    while(s.length()>node->str.length())
    {
        s=s.substr(node->str.length());
        node=node->children.find(s[0])->second;
    }
    return num_leaf(tree,node);
}
3.字符串S中的最长重复子串
方案:原理同2,具体做法就是找到最深的非叶节点。
这个深是指从root所经历过的字符个数,最深非叶节点所经历的字符串起来就是最长重复子串。
为什么要非叶节点呢?因为既然是要重复,当然叶节点个数要>=2。 代码如下:
// Returns 1 when `node` has no children, 0 otherwise. `tree` is not used.
int is_leaf(suffix_node* node,suffix_tree tree)
{
    return node->children.empty()?1:0;
}
string longest_echo_string(string T)
{
map<char,suffix_node*>::iterator its;
queue<suffix_node*> q;
string long_str;
int max=0;
suffix_node* node=NULL;
suffix_tree tree(T);
q.push(tree.root);
while(!q.empty())
{
node=q.front();
q.pop();
if(max<node->str.length())
{
max=node->str.length();
long_str=node->str;
}
for(its=node->children.begin();its!=node->children.end();its++)
{
if(is_leaf(its->second,tree)==0)
{
(its->second)->str+=node->str;
q.push(its->second);
}
}
}
return long_str;
}
4.两个字符串S1,S2的最长公共部分
方案:将S1#S2作为字符串压入后缀树,找到最深的非叶节点,且该节点下的叶节点既有包含#的(即起始于S1的后缀),也有不包含#的(即起始于S2的后缀)。代码如下:
int has_$(suffix_node *node,suffix_tree tree)
{
map<char,suffix_node*>::iterator its;
string s;
int num1=1;int num2=1;int i;
for(its=node->children.begin();its!=node->children.end();++its)
{
s+=its->second->str;
}
for(i=0;i<s.length();i++)
{
if(s[i]=='y')
num1--;
if(s[i]=='z')
num2--;
if(num1<=0&&num2<=0)
break;
}
if(i>=s.length())
return 0;
else
return 1;
}
string longest_common(string t,string s)
{
suffix_node* node=NULL;
map<char,suffix_node*>::iterator its;
queue<suffix_node*> q;
string long_str,str;
int max=0;int num;
t+="y";s+="z";
str=t+s;
suffix_tree tree(str);
q.push(tree.root);
while(!q.empty())
{
node=q.front();
q.pop();
num=0;
for(its=node->children.begin();its!=node->children.end();its++)
{
if(is_leaf(its->second,tree)==0)
num++;
}
if(num==0&&has_$(node,tree))
{
if(max<node->str.length())
{
max=node->str.length();
long_str=node->str;
}
}
for(its=node->children.begin();its!=node->children.end();its++)
{
if(is_leaf(its->second,tree)==0)
{
(its->second)->str=node->str+(its->second)->str;
q.push(its->second);
}
}
}
return long_str;
}
5.最后给一个最终代码:
#include <iostream>
#include<map>
#include<string>
#include<iterator>
#include<queue>
using namespace std;
int num_node=0;
// Returns the length of the longest common prefix of `a` and `b`.
//
// The scan stops at the end of the shorter string, so the arguments may be
// passed in either order and neither is indexed past its end.
int compare(const string& a,const string& b)
{
    const size_t limit=a.length()<b.length()?a.length():b.length();
    size_t matched=0;
    while(matched<limit&&a[matched]==b[matched])
        matched++;
    return static_cast<int>(matched);
}
// One node of the suffix tree. `str` holds the label of the edge that leads
// to this node and `children` is keyed by the first character of each child's
// label.
//
// A node owns its children: destroying a node frees its whole subtree.
class suffix_node
{
public:
    map<char,suffix_node*> children; // child nodes, keyed by the first character of their label
    string str;                      // edge label leading to this node
    int id;                          // taken from the global num_node counter; -1 until assigned

    suffix_node():id(-1){}

    ~suffix_node()
    {
        for(map<char,suffix_node*>::iterator it=children.begin();it!=children.end();++it)
            delete it->second; // deleting a NULL entry is a no-op
    }

    // Inserts the string `s` (one suffix of the indexed text) below this node.
    //
    // The walk follows existing edges while they match `s`. It ends in one of
    // three ways:
    //   * no edge starts with the next character -> the rest of `s` becomes a new leaf;
    //   * `s` diverges in the middle of an edge  -> the edge is split and a leaf is added;
    //   * `s` is used up along existing edges    -> nothing is added.
    // An empty `s` is ignored.
    void insertstring(string s)
    {
        if(s.empty())
            return; // s[0] below needs at least one character
        suffix_node* parent=this;
        for(;;)
        {
            map<char,suffix_node*>::iterator slot=parent->children.find(s[0]);
            if(slot==parent->children.end()||slot->second==NULL)
            {
                parent->children[s[0]]=make_node(s);
                return;
            }
            suffix_node* child=slot->second;
            // compare() scans over its first argument, so pass the shorter string first.
            const size_t matched=static_cast<size_t>(
                s.length()<=child->str.length()?compare(s,child->str):compare(child->str,s));
            if(matched<child->str.length())
            {
                if(matched<s.length())
                {
                    // `s` and the edge disagree after `matched` characters: put a
                    // branching node there, holding the old child and a new leaf.
                    suffix_node* branch=make_node(s.substr(0,matched));
                    suffix_node* leaf=make_node(s.substr(matched));
                    child->str=child->str.substr(matched);
                    branch->children[child->str[0]]=child;
                    branch->children[leaf->str[0]]=leaf;
                    slot->second=branch;
                }
                // Otherwise `s` ends inside this edge and is already represented.
                return;
            }
            if(matched>=s.length())
                return; // `s` ends exactly at `child`
            // Whole edge matched and `s` continues: descend.
            s=s.substr(matched);
            parent=child;
        }
    }
private:
    // Copying would leave two nodes owning the same children, so copy
    // operations are declared but never defined.
    suffix_node(const suffix_node&);
    suffix_node& operator=(const suffix_node&);

    // Allocates a node labelled `label` and gives it the next ID from num_node.
    static suffix_node* make_node(const string& label)
    {
        suffix_node* created=new suffix_node();
        created->id=num_node;
        num_node++;
        created->str=label;
        return created;
    }
};
class suffix_tree
{
public:
suffix_node* root;
suffix_tree(string s)
{
string str;
root=new suffix_node();
root->id=num_node;
num_node++;
for(int i=0;i<s.length();i++)
{
str=s.substr(i);
root->insertstring(str);
}
}
~suffix_tree(){}
};
int find_str(string T,string s) //寻找字符串S是否在字符串T中
{
suffix_tree tree(T);
if(T.length()<s.length())
{
cout<<"字符串S不在字符串T中"<<endl;
return 0;
}
string str=s;
suffix_node* node=NULL;
int i;
node=tree.root->children[s[0]];
while(str.length()!=0)
{
if(str.length()<=node->str.length())
{
i=compare(str,node->str);
if(i>=str.length())
{
cout<<"字符串S在字符串T中"<<endl;
return 1;
}
else
{
cout<<"字符串S不在字符串T中"<<endl;
return 0;
}
}
else
{
i=compare(node->str,str);
if(i>=node->str.length())
{
str=str.substr(i,str.length()-i);
node=node->children[str[0]];
}
else
{
cout<<"字符串S不在字符串T中"<<endl;
return 0;
}
}
}
}
// Counts the leaves in the subtree rooted at `node`; a node without children
// counts as one leaf. `tree` is only handed on to the recursive calls.
int num_leaf(suffix_tree tree,suffix_node* node)
{
    if(node->children.empty())
        return 1;
    int total=0;
    map<char,suffix_node*>::iterator child=node->children.begin();
    while(child!=node->children.end())
    {
        total+=num_leaf(tree,child->second);
        ++child;
    }
    return total;
}
// Counts how many times the string `s` occurs in the string `T`
// (overlapping occurrences included).
//
// The tree is built from T+"$": with a terminator every suffix ends in its
// own leaf, so the number of leaves below the end of `s` equals the number of
// occurrences. Assumes `T` does not itself contain '$'.
// Returns 0 for an empty `s` or when `s` does not occur. find_str() is used
// for the membership test and prints its verdict to stdout.
int num_echo(string T,string s)
{
    if(s.empty())
        return 0;
    if(find_str(T,s)==0)
        return 0;
    suffix_tree tree(T+"$");
    // `s` occurs in T, so every edge looked up below is known to exist.
    suffix_node* node=tree.root->children.find(s[0])->second;
    while(s.length()>node->str.length())
    {
        s=s.substr(node->str.length());
        node=node->children.find(s[0])->second;
    }
    return num_leaf(tree,node);
}
// Returns 1 when `node` has no children, 0 otherwise. `tree` is not used.
int is_leaf(suffix_node* node,suffix_tree tree)
{
    return node->children.empty()?1:0;
}
string longest_echo_string(string T)
{
map<char,suffix_node*>::iterator its;
queue<suffix_node*> q;
string long_str;
int max=0;
suffix_node* node=NULL;
suffix_tree tree(T);
q.push(tree.root);
while(!q.empty())
{
node=q.front();
q.pop();
if(max<node->str.length())
{
max=node->str.length();
long_str=node->str;
}
for(its=node->children.begin();its!=node->children.end();its++)
{
if(is_leaf(its->second,tree)==0)
{
(its->second)->str+=node->str;
q.push(its->second);
}
}
}
return long_str;
}
int has_$(suffix_node *node,suffix_tree tree)
{
map<char,suffix_node*>::iterator its;
string s;
int num1=1;int num2=1;int i;
for(its=node->children.begin();its!=node->children.end();++its)
{
s+=its->second->str;
}
for(i=0;i<s.length();i++)
{
if(s[i]=='y')
num1--;
if(s[i]=='z')
num2--;
if(num1<=0&&num2<=0)
break;
}
if(i>=s.length())
return 0;
else
return 1;
}
string longest_common(string t,string s)
{
suffix_node* node=NULL;
map<char,suffix_node*>::iterator its;
queue<suffix_node*> q;
string long_str,str;
int max=0;int num;
t+="y";s+="z";
str=t+s;
suffix_tree tree(str);
q.push(tree.root);
while(!q.empty())
{
node=q.front();
q.pop();
num=0;
for(its=node->children.begin();its!=node->children.end();its++)
{
if(is_leaf(its->second,tree)==0)
num++;
}
if(num==0&&has_$(node,tree))
{
if(max<node->str.length())
{
max=node->str.length();
long_str=node->str;
}
}
for(its=node->children.begin();its!=node->children.end();its++)
{
if(is_leaf(its->second,tree)==0)
{
(its->second)->str=node->str+(its->second)->str;
q.push(its->second);
}
}
}
return long_str;
}
// Program entry point. All demo calls below are disabled, so as written the
// program does nothing and exits with status 0. Enable the lines you want to
// try (they need: string T,s,str; int a;).
int main()
{
    // Read the text T and the pattern s:
    //     cin>>T;
    //     cin>>s;
    // Substring test / occurrence count:
    //     a=find_str(T,s);
    //     a=num_echo(T,s);
    //     cout<<a<<endl;
    // Longest repeated substring / longest common substring:
    //     str=longest_echo_string(T);
    //     str=longest_common("keks","khkr");
    //     cout<<str<<endl;
    // Manual inspection of a tree:
    //     suffix_tree tree(T);
    //     suffix_node* node=(tree.root)->children['e'];
    //     suffix_node* node1=node->children['k'];
    //     cout<<node1->children['a']->str<<endl;
    return 0;
}
`