后缀树的生成以及一些运用

  1. 后缀树的定义
    后缀树(Suffix tree)是一种数据结构,能快速解决很多关于字符串的问题。后缀树的概念最早由Weiner于1973年提出,继而由McCreight在1976年、Ukkonen在1992年和1995年加以改进完善。

    后缀,顾名思义,通俗点说就是字符串后面的“尾巴”。给定一个长度为n的字符串S=S1S2..Si..Sn和整数i(1 <= i <= n),子串SiSi+1…Sn就是字符串S的一个后缀。

    以字符串S=XMADAMYX为例,它的长度为8,所以S[1..8], S[2..8], … , S[8..8]都算S的后缀,我们一般还把空字串也算成后缀。这样,我们一共有如下后缀。对于后缀S[i..n],我们说这项后缀起始于i。

    S[1..8], XMADAMYX, 也就是字符串本身,起始位置为1
    S[2..8], MADAMYX,起始位置为2
    S[3..8], ADAMYX,起始位置为3
    S[4..8], DAMYX,起始位置为4
    S[5..8], AMYX,起始位置为5
    S[6..8], MYX,起始位置为6
    S[7..8], YX,起始位置为7
    S[8..8], X,起始位置为8
    空字串,记为$。

  2. 后缀树的结构
    首先,说明一下后缀树中的结构。在该程序中,定义了后缀树的类,以及后缀树的节点的类。节点的结构包括3个部分:1.节点的ID.2.节点所存的字符串。3.该节点的孩子指针。具体结构如下:
    其中孩子指针用 map<char, suffix_node*> 保存,键为子节点所存字符串的首字符。

// One node of the suffix tree.
//
// children : outgoing edges, keyed by the first character of the child's label
// str      : the label (substring) stored in this node
// id       : number taken from the global counter num_node when the node is created
class suffix_node {
public:
    map<char,suffix_node*> children;
    string str;
    int id;

    // id starts at -1 ("not numbered yet"); the code that creates a node assigns the real id.
    suffix_node():id(-1){}

    // Inserts the string s (one suffix of the text) below this node.
    //
    // The walk follows the edge that starts with s[0]:
    //  - no such edge: s becomes a new leaf;
    //  - the whole edge label matches and s has characters left: descend and repeat;
    //  - the match stops inside the label while s still has characters left:
    //    the edge is split into a shared-prefix node with two children
    //    (the old node with the rest of its label, and a new leaf with the rest of s);
    //  - s is used up: nothing is added, s is already spelled by the tree.
    // An empty s is ignored.
    void insertstring(string s)
    {
        if(s.empty())
            return;                               // never index s[0] of an empty string

        suffix_node* parent=this;                 // node whose edge map is being searched
        suffix_node* child=NULL;                  // node reached through the edge s[0]
        string::size_type matched=0;              // length of the common prefix of s and child->str
        for(;;)
        {
            map<char,suffix_node*>::iterator edge=parent->children.find(s[0]);
            if(edge==parent->children.end()||edge->second==NULL)
            {
                parent->children[s[0]]=make_node(s);   // no edge for s[0]: s becomes a leaf
                return;
            }
            child=edge->second;
            matched=common_prefix(s,child->str);
            if(matched<child->str.length()||matched>=s.length())
                break;                            // mismatch inside the label, or s is used up
            s=s.substr(matched);                  // label fully matched: continue below child
            parent=child;
        }

        if(matched>=s.length())
            return;                               // s is already contained in the tree

        // Split: matched < child->str.length() and matched < s.length().
        // The shared-prefix node is numbered before the new leaf.
        suffix_node* branch=make_node(s.substr(0,matched));
        suffix_node* leaf=make_node(s.substr(matched));
        child->str=child->str.substr(matched);
        branch->children[child->str[0]]=child;
        branch->children[leaf->str[0]]=leaf;
        parent->children[branch->str[0]]=branch;  // replaces the old edge to child
    }

private:
    // Creates a node holding label and gives it the next id from num_node.
    static suffix_node* make_node(const string& label)
    {
        suffix_node* fresh=new suffix_node();
        fresh->str=label;
        fresh->id=num_node;
        num_node++;
        return fresh;
    }

    // Length of the longest common prefix of a and b; never reads past either string.
    static string::size_type common_prefix(const string& a,const string& b)
    {
        const string::size_type limit=a.length()<b.length()?a.length():b.length();
        string::size_type k=0;
        while(k<limit&&a[k]==b[k])
            ++k;
        return k;
    }
};

class suffix_tree {

public:
    suffix_node* root;
    suffix_tree(string s)
    {
        string str;
        root=new suffix_node();
        root->id=num_node;
        num_node++;
        for(int i=0;i<s.length();i++)
        {
            str=s.substr(i);
           root->insertstring(str);
        }
    }
    ~suffix_tree(){}
};

4.后缀树的运用。
1. 查找字符串o是否在字符串S中。
方案:用S构造后缀树,按在trie中搜索字串的方法搜索o即可。
原理:若o在S中,则o必然是S的某个后缀的前缀。
例如S: leconte,查找o: con是否在S中,则o(con)必然是S(leconte)的后缀之一conte的前缀.有了这个前提,采用trie搜索的方法就不难理解了。 代码如下:

 int find_str(string T,string s)          //寻找字符串S是否在字符串T中
{
    suffix_tree tree(T);
      if(T.length()<s.length())
    {
        cout<<"字符串S不在字符串T中"<<endl;
        return 0;
    }
    string str=s;
    suffix_node* node=NULL;
        int i;
       node=tree.root->children[s[0]];
       while(str.length()!=0)
       {
           if(str.length()<=node->str.length())
           {
              i=compare(str,node->str);
              if(i>=str.length())
              {
                cout<<"字符串S在字符串T中"<<endl;
                return 1;
              }
              else
              {
                cout<<"字符串S不在字符串T中"<<endl;
                 return 0;
              }
           }
           else
           {
              i=compare(node->str,str);
              if(i>=node->str.length())
              {
                str=str.substr(i,str.length()-i);
                node=node->children[str[0]];
              }
              else
              {
                cout<<"字符串S不在字符串T中"<<endl;
                 return 0;
              }
           }
       }
}

`
2.指定字符串T在字符串S中的重复次数。
方案:用S+’$’构造后缀树,搜索T节点下的叶节点数目即为重复次数
原理:如果T在S中重复了两次,则S应有两个后缀以T为前缀,重复次数就自然统计出来了。代码如下:

 // Counts the leaves of the subtree rooted at node; a node without children
 // counts as one leaf. The tree argument is only handed on to the recursive calls.
 int num_leaf(suffix_tree tree,suffix_node* node)
{
    if(node->children.empty())
        return 1;

    int leaves=0;
    map<char,suffix_node*>::iterator child=node->children.begin();
    while(child!=node->children.end())
    {
        leaves+=num_leaf(tree,child->second);
        ++child;
    }
    return leaves;
}
int num_echo(string T,string s)        //指定字符串S在字符串T中的重复次数
{
   suffix_tree tree(T);
   suffix_node* node=NULL;
   int i,num=0;
   i=find_str(T,s);
   if(i==0)
    return 0;
   node=tree.root->children[s[0]];
      while(s.length()>node->str.length())
      {
          s=s.substr(node->str.length(),s.length()-node->str.length());
          node=node->children[s[0]];
      }

      return num_leaf(tree,node);
}

3.字符串S中的最长重复子串
方案:原理同2,具体做法就是找到最深的非叶节点。
这个深是指从root所经历过的字符个数,最深非叶节点所经历的字符串起来就是最长重复子串。
为什么要非叶节点呢?因为既然是要重复,当然叶节点个数要>=2。 代码如下:

// Returns 1 when node has no children, 0 otherwise. The tree argument is unused.
int is_leaf(suffix_node* node,suffix_tree tree)
{
   return node->children.empty()?1:0;
}

string longest_echo_string(string T)
{
    map<char,suffix_node*>::iterator its;
    queue<suffix_node*> q;
    string long_str;
    int max=0;
    suffix_node* node=NULL;
    suffix_tree tree(T);
    q.push(tree.root);
    while(!q.empty())
    {
        node=q.front();
        q.pop();
        if(max<node->str.length())
        {
            max=node->str.length();
            long_str=node->str;
        }
        for(its=node->children.begin();its!=node->children.end();its++)
        {
            if(is_leaf(its->second,tree)==0)
            {
                (its->second)->str+=node->str;
                q.push(its->second);
            }
        }
    }
    return long_str;
}

4.两个字符串S1,S2的最长公共部分
方案:将S1#S2作为字符串压入后缀树,找到最深的非叶节点,且该节点下的叶节点中既有包含#的(起始于S1的后缀),也有不包含#的(起始于S2的后缀),代码如下:

// Returns 1 when the labels of node's direct children, taken together, contain
// both the character 'y' and the character 'z'; returns 0 otherwise.
// Only direct children are inspected. The tree argument is unused.
int has_$(suffix_node *node,suffix_tree tree)
{
    string labels;
    map<char,suffix_node*>::iterator child=node->children.begin();
    while(child!=node->children.end())
    {
        labels+=child->second->str;
        ++child;
    }

    const bool seen_y=labels.find('y')!=string::npos;
    const bool seen_z=labels.find('z')!=string::npos;
    return (seen_y&&seen_z)?1:0;
}

string longest_common(string t,string s)
{
    suffix_node* node=NULL;
    map<char,suffix_node*>::iterator its;
    queue<suffix_node*> q;
    string long_str,str;
    int max=0;int num;
    t+="y";s+="z";
    str=t+s;
    suffix_tree tree(str);
    q.push(tree.root);
    while(!q.empty())
    {
        node=q.front();
        q.pop();
        num=0;
        for(its=node->children.begin();its!=node->children.end();its++)
        {
            if(is_leaf(its->second,tree)==0)
                num++;
        }

        if(num==0&&has_$(node,tree))
        {
         if(max<node->str.length())
         {
            max=node->str.length();
            long_str=node->str;
         }
        }
        for(its=node->children.begin();its!=node->children.end();its++)
        {
            if(is_leaf(its->second,tree)==0)
            {
                (its->second)->str=node->str+(its->second)->str;
                q.push(its->second);
            }
        }
    }
    return long_str;
}

5.最后给一个最终代码:

#include <iostream>
#include<map>
#include<string>
#include<iterator>
#include<queue>
using namespace std;
// Global counter used to number nodes: each place in this file that creates a
// node stores the current value in the node's id and then increments it.
int num_node=0;
// Returns the length of the longest common prefix of a and b.
//
// The scan stops at the end of the shorter string, so neither string is ever
// indexed past its length and the arguments may be passed in either order.
int compare(const std::string& a,const std::string& b)
{
    const std::string::size_type limit=a.length()<b.length()?a.length():b.length();
    std::string::size_type i=0;
    while(i<limit&&a[i]==b[i])
        ++i;
    return static_cast<int>(i);
}
// One node of the suffix tree.
//
// children : outgoing edges, keyed by the first character of the child's label
// str      : the label (substring) stored in this node
// id       : number taken from the global counter num_node when the node is created
class suffix_node
{
public:
    map<char,suffix_node*> children;
    string str;
    int id;

    // id starts at -1 ("not numbered yet"); the code that creates a node assigns the real id.
    suffix_node():id(-1){}

    // Inserts the string s (one suffix of the text) below this node.
    //
    // The walk follows the edge that starts with s[0]:
    //  - no such edge: s becomes a new leaf;
    //  - the whole edge label matches and s has characters left: descend and repeat;
    //  - the match stops inside the label while s still has characters left:
    //    the edge is split into a shared-prefix node with two children
    //    (the old node with the rest of its label, and a new leaf with the rest of s);
    //  - s is used up: nothing is added, s is already spelled by the tree.
    // An empty s is ignored.
    void insertstring(string s)
    {
        if(s.empty())
            return;                               // never index s[0] of an empty string

        suffix_node* parent=this;                 // node whose edge map is being searched
        suffix_node* child=NULL;                  // node reached through the edge s[0]
        string::size_type matched=0;              // length of the common prefix of s and child->str
        for(;;)
        {
            map<char,suffix_node*>::iterator edge=parent->children.find(s[0]);
            if(edge==parent->children.end()||edge->second==NULL)
            {
                parent->children[s[0]]=make_node(s);   // no edge for s[0]: s becomes a leaf
                return;
            }
            child=edge->second;
            matched=common_prefix(s,child->str);
            if(matched<child->str.length()||matched>=s.length())
                break;                            // mismatch inside the label, or s is used up
            s=s.substr(matched);                  // label fully matched: continue below child
            parent=child;
        }

        if(matched>=s.length())
            return;                               // s is already contained in the tree

        // Split: matched < child->str.length() and matched < s.length().
        // The shared-prefix node is numbered before the new leaf.
        suffix_node* branch=make_node(s.substr(0,matched));
        suffix_node* leaf=make_node(s.substr(matched));
        child->str=child->str.substr(matched);
        branch->children[child->str[0]]=child;
        branch->children[leaf->str[0]]=leaf;
        parent->children[branch->str[0]]=branch;  // replaces the old edge to child
    }

private:
    // Creates a node holding label and gives it the next id from num_node.
    static suffix_node* make_node(const string& label)
    {
        suffix_node* fresh=new suffix_node();
        fresh->str=label;
        fresh->id=num_node;
        num_node++;
        return fresh;
    }

    // Length of the longest common prefix of a and b; never reads past either string.
    static string::size_type common_prefix(const string& a,const string& b)
    {
        const string::size_type limit=a.length()<b.length()?a.length():b.length();
        string::size_type k=0;
        while(k<limit&&a[k]==b[k])
            ++k;
        return k;
    }
};

class suffix_tree
{

public:
    suffix_node* root;
    suffix_tree(string s)
    {
        string str;
        root=new suffix_node();
        root->id=num_node;
        num_node++;
        for(int i=0;i<s.length();i++)
        {
            str=s.substr(i);
           root->insertstring(str);
        }
    }
    ~suffix_tree(){}
};

int find_str(string T,string s)          //寻找字符串S是否在字符串T中
{
    suffix_tree tree(T);
      if(T.length()<s.length())
    {
        cout<<"字符串S不在字符串T中"<<endl;
        return 0;
    }
    string str=s;
    suffix_node* node=NULL;
        int i;
       node=tree.root->children[s[0]];
       while(str.length()!=0)
       {
           if(str.length()<=node->str.length())
           {
              i=compare(str,node->str);
              if(i>=str.length())
              {
                cout<<"字符串S在字符串T中"<<endl;
                return 1;
              }
              else
              {
                cout<<"字符串S不在字符串T中"<<endl;
                 return 0;
              }
           }
           else
           {
              i=compare(node->str,str);
              if(i>=node->str.length())
              {
                str=str.substr(i,str.length()-i);
                node=node->children[str[0]];
              }
              else
              {
                cout<<"字符串S不在字符串T中"<<endl;
                 return 0;
              }
           }
       }
}

// Counts the leaves of the subtree rooted at node; a node without children
// counts as one leaf. The tree argument is only handed on to the recursive calls.
int num_leaf(suffix_tree tree,suffix_node* node)
{
    if(node->children.empty())
        return 1;

    int leaves=0;
    map<char,suffix_node*>::iterator child=node->children.begin();
    while(child!=node->children.end())
    {
        leaves+=num_leaf(tree,child->second);
        ++child;
    }
    return leaves;
}
int num_echo(string T,string s)        //指定字符串S在字符串T中的重复次数
{
   suffix_tree tree(T);
   suffix_node* node=NULL;
   int i,num=0;
   i=find_str(T,s);
   if(i==0)
    return 0;
   node=tree.root->children[s[0]];
      while(s.length()>node->str.length())
      {
          s=s.substr(node->str.length(),s.length()-node->str.length());
          node=node->children[s[0]];
      }

      return num_leaf(tree,node);
}

// Returns 1 when node has no children, 0 otherwise. The tree argument is unused.
int is_leaf(suffix_node* node,suffix_tree tree)
{
   return node->children.empty()?1:0;
}

string longest_echo_string(string T)
{
    map<char,suffix_node*>::iterator its;
    queue<suffix_node*> q;
    string long_str;
    int max=0;
    suffix_node* node=NULL;
    suffix_tree tree(T);
    q.push(tree.root);
    while(!q.empty())
    {
        node=q.front();
        q.pop();
        if(max<node->str.length())
        {
            max=node->str.length();
            long_str=node->str;
        }
        for(its=node->children.begin();its!=node->children.end();its++)
        {
            if(is_leaf(its->second,tree)==0)
            {
                (its->second)->str+=node->str;
                q.push(its->second);
            }
        }
    }
    return long_str;
}
// Returns 1 when the labels of node's direct children, taken together, contain
// both the character 'y' and the character 'z'; returns 0 otherwise.
// Only direct children are inspected. The tree argument is unused.
int has_$(suffix_node *node,suffix_tree tree)
{
    string labels;
    map<char,suffix_node*>::iterator child=node->children.begin();
    while(child!=node->children.end())
    {
        labels+=child->second->str;
        ++child;
    }

    const bool seen_y=labels.find('y')!=string::npos;
    const bool seen_z=labels.find('z')!=string::npos;
    return (seen_y&&seen_z)?1:0;
}

string longest_common(string t,string s)
{
    suffix_node* node=NULL;
    map<char,suffix_node*>::iterator its;
    queue<suffix_node*> q;
    string long_str,str;
    int max=0;int num;
    t+="y";s+="z";
    str=t+s;
    suffix_tree tree(str);
    q.push(tree.root);
    while(!q.empty())
    {
        node=q.front();
        q.pop();
        num=0;
        for(its=node->children.begin();its!=node->children.end();its++)
        {
            if(is_leaf(its->second,tree)==0)
                num++;
        }

        if(num==0&&has_$(node,tree))
        {
         if(max<node->str.length())
         {
            max=node->str.length();
            long_str=node->str;
         }
        }
        for(its=node->children.begin();its!=node->children.end();its++)
        {
            if(is_leaf(its->second,tree)==0)
            {
                (its->second)->str=node->str+(its->second)->str;
                q.push(its->second);
            }
        }
    }
    return long_str;
}
// Entry point. It currently performs no work: the sample calls below are kept
// commented out and can be enabled one at a time to try the functions above.
int main()
{
    // Variables used by the sample calls.
    string T;
    string s;
    string str;
    int a;

    // Read a text T and a pattern s, then search for / count s in T:
// cin>>T;
// cin>>s;
// a=find_str(T,s);
// a=num_echo(T,s);
// cout<<a<<endl;
    // Longest repeated substring / longest common substring:
// str=longest_echo_string(T);
// str=longest_common("keks","khkr");
// cout<<str<<endl;
    // Manual inspection of a few nodes of the tree:
// suffix_tree tree(T);
// suffix_node* node=(tree.root)->children['e'];
// suffix_node* node1=node->children['k'];
// cout<<node1->children['a']->str<<endl;

    return 0;
}

`

你可能感兴趣的:(后缀树-C++)