原创:数据挖掘十大算法之4——Apriori算法+编码

  1. 基本Apriori算法:

主要思想是:连接步+Apriori性质

  • 统计单项在的支持度,再利用连接步生成2项;
  • 对2项采取Apriori性质剪枝,得到可能的2项;
  • 再将2项的再原始中统计得出其支持度,并减去达不到支持度的项。
  • 按照上面步骤重复,直到不能产生新的更多项。

因为从网上找的程序都不太好调试,特贴出自己编写的程序,当然这个针对的是固定的一个对象,通过修改能达到同样的目的,欢迎指出错误!

主程序:

#include <iostream>

#include <string>

#include <math.h>

#include "Apri.h"

int main(){

    Apri aa(2);

    First_Iteam First_set[9];

    First_set[0].name = "T100";

    First_set[0].Iteams[0] = "I1";

    First_set[0].Iteams[1] = "I2";

    First_set[0].Iteams[2] = "I5";

    

    First_set[1].name = "T200";

    First_set[1].Iteams[0] = "I2";

    First_set[1].Iteams[1] = "I4";

    

    First_set[2].name = "T300";

    First_set[2].Iteams[0] = "I2";

    First_set[2].Iteams[1] = "I3";

    

    First_set[3].name = "T400";

    First_set[3].Iteams[0] = "I1";

    First_set[3].Iteams[1] = "I2";

    First_set[3].Iteams[2] = "I4";

    

    First_set[4].name = "T500";

    First_set[4].Iteams[0] = "I1";

    First_set[4].Iteams[1] = "I3";

    

    First_set[5].name = "T600";

    First_set[5].Iteams[0] = "I2";

    First_set[5].Iteams[1] = "I3";

    

    First_set[6].name = "T700";

    First_set[6].Iteams[0] = "I1";

    First_set[6].Iteams[1] = "I3";

    

    First_set[7].name = "T800";

    First_set[7].Iteams[0] = "I1";

    First_set[7].Iteams[1] = "I2";

    First_set[7].Iteams[2] = "I3";

    First_set[7].Iteams[3] = "I5";

    

    First_set[8].name = "T900";

    First_set[8].Iteams[0] = "I1";

    First_set[8].Iteams[1] = "I2";

    First_set[8].Iteams[2] = "I3";

    

    aa.setFirstSet(First_set);

    aa.Find_frequent_1_itemsets();

    

    

    

    aa.print();

    aa.apriori_gen();

    

    cin.get();

    return 0;   

}

 Apriori类

.h文件:

#include <iostream>

#include <string>

using namespace std;



struct First_Iteam{

       string name;

       string Iteams[5];

       };

       

struct Ci{

       string Items[5];

       int confident;

       };

        

class Apri{

private:

        First_Iteam First_set[9];//存放事物,相当于从数据库中读取的信息,固定为9 

        Ci Ck_Items[100];//计算的K-1的统计信息 

        Ci Cl_Items[100];//新计算的第k个统计信息 

        int Items_k;//k-1的统计信息的数目 

        int Items_l;//k个统计信息的数目 

        int Gen;//现在的代数,开始值为1 

        int min_sup;//最小支持度 

        

public:

       Apri(int);

       void print();

       void setFirstSet(First_Iteam*);

       void Find_frequent_1_itemsets();//查找第一代的数据及其信息

       void Find_ck(string item);//查找是否在K-1中存在此对象

       void apriori_gen();//连接步,将K-1中的数据连接成K的数据 

       bool Apriori_check(Ci);//在K-1中检查一个k-1的字符串是否达到了min_sup,或者是否存在这样的K-1 

       bool Ci_Equal(Ci,Ci);//判断两个串是否相等 

       int ContainCi(Ci);//判断在原集合中对Ci的支持度 

        

};

 .cpp文件:

#include "Apri.h"

Apri::Apri(int a){

    min_sup = a; 

}

void Apri::print(){

     cout << "原始项目集:" << endl; 

     for(int i = 0; i < 9; i++)

     {

         cout << First_set[i].name << ": ";

         for( int j = 0; j < 5; j++)

             if(First_set[i].Iteams[j] != "")

                 cout << First_set[i].Iteams[j] << " ";

         cout << endl;

     }

}

     

     

void Apri::setFirstSet(First_Iteam a[]){

     for(int i = 0; i < 9; i++)

             First_set[i] = a[i];

}

void Apri::Find_frequent_1_itemsets(){

     Gen = 1;

     Items_k = 0;

     for(int i = 0; i < 9; i++)

     {

         int j = 0;

         while(First_set[i].Iteams[j] != ""){

             Find_ck(First_set[i].Iteams[j]);

             j++;

         }

     }

}

//查找是否在第一次中存在这样的项,并统计信息,

//在统计信息时要做到判断准确,不能失误。 

void Apri::Find_ck(string item){

     int i;

     bool isbreak = false;

     for(i = 0; i < Items_k; i++)

         if(item == Ck_Items[i].Items[0])

         {

             isbreak =!isbreak;

             break;

         }

     if(isbreak)

         Ck_Items[i].confident++;

     else

     {

         Ck_Items[i].Items[0] = item;

         Ck_Items[i].confident = 1;

         Items_k++;

     }

}

void Apri::apriori_gen(){

    while(Items_k != 0)

    {

         cout << "-----------------" << "L" << Gen << "------------------" << endl;

         for(int i = 0; i < Items_k; i++)

         {

             for(int j = 0; j < 5; j++)

                 if(Ck_Items[i].Items[j] != "")

                     cout << Ck_Items[i].Items[j] <<" ";

             cout << Ck_Items[i].confident<< endl; 

         }           

               

    

         Items_l = 0;

         for(int i = 0; i < Items_k-1; i++)

         {

             for(int j = i+1; j < Items_k; j++)

             {

                 bool isLink = true;

                 //判断是否能够连接,前面k-2个都相同,但是K-1是不同的 

                 for(int m = 0; m < Gen-1; m++)

                 {

                     if(Ck_Items[i].Items[m] != Ck_Items[j].Items[m])

                     {

                         isLink = false;   

                         break;

                     }                  

                 }

                 if(isLink)

                 {

                     if(Ck_Items[i].Items[Gen-1] != Ck_Items[j].Items[Gen-1])

                     //连接条件成立,进行连接; 

                     {

                         Ci newItem;//生成新连接后得到的串 

                         for(int n = 0; n < Gen; n++)

                             newItem.Items[n] = Ck_Items[i].Items[n];

                         newItem.Items[Gen] = Ck_Items[j].Items[Gen-1]; 

                         //对新生成的串进行数据Apriori剪枝,判断是否合法

                         Ci AllItem;//生成k的所有的k-1子串,并进行检查 

                         bool ThroughApriori = true;//是否通过Apriori检查 

                         for(int g = 0; g <= Gen; g++)

                         {

                              int start = 0;

                              for(int h=0; h <= Gen; h++)

                              {

                                   if(h != g)

                                   {

                                        AllItem.Items[start] = newItem.Items[h]; 

                                        start++;

                                   }      

                              }        

                              //对K-1的子串检查是否达到min_sup

                              if(!Apriori_check(AllItem))

                              {

                                  ThroughApriori = false;

                                  break;                           

                              }

                         } 

                         if(ThroughApriori)//通过了Apriori性质检查,将此条加入到K中 

                         {

                              Cl_Items[Items_l] = newItem;

                              Cl_Items[Items_l].confident = 0;

                              Items_l++;                     

                         } 

                     }

                 }

             } 

         }

         Gen++;

         //接下来处理L的数据的支持度,数据存放在Cl_Items中 

         for(int i = 0; i < Items_l; i++)

         {

             Cl_Items[i].confident =  ContainCi(Cl_Items[i]);

         }

         

         //剪去小于最小支持度的并将CL_Items中的数据转移到Ck_Items中 

         for(int m = 0; m < Items_k; m++)

         {

              for(int j = 0; j < 5; j++)

                  Ck_Items[m].Items[j] = "";

              Ck_Items[m].confident = 0;             

         }

         int new_Items_l = 0;

         for(int i = 0; i < Items_l; i++)

         {

              if(Cl_Items[i].confident >= min_sup)

              { 

                  for(int j = 0; j < 5; j++)

                      Ck_Items[new_Items_l].Items[j] = Cl_Items[i].Items[j];

                  Ck_Items[new_Items_l].confident = Cl_Items[i].confident;

                  new_Items_l++;

              } 

         }

         Items_k = new_Items_l;

         Items_l = 0;

    }  

} 

//当检查时不存在这样的K-1或者存在相等的K-1但是支持度小于min_sup则返回falsse 

bool Apri::Apriori_check(Ci OldCi){

     bool pass = false;

     for(int i = 0; i < Items_k; i++)

     {

         /*if(Ci_Equal(OldCi,Ck_Items[i] && Ck_Items[i].confident < min_sup)

         {

             pass = false;

             break;

         } 

         */

         if(Ci_Equal(OldCi,Ck_Items[i])) 

         {

             pass = true;

             break;

         } 

     }

     return pass;

} 

//判断该两个Ci是否相等 

bool Apri::Ci_Equal(Ci a,Ci b){

    bool isequal  = false;

    int num_a = 0,num_b = 0; 

    while(a.Items[num_a] != "")

        num_a++;

    while(b.Items[num_b] != "")

        num_b++;

    if(num_a != num_b)

        return false;

        

    for(int i = 0;i < Gen; i++)

    {

        isequal = false;

        for(int j = 0; j < Gen; j++)

            if(a.Items[i] == b.Items[j])

            {

                isequal = true;

                break;

            }

        if(!isequal)

            break;                

    }

    return isequal;

}

int Apri::ContainCi(Ci NewCi){

    int num_new = 0, num_item, num_contain = 0; 

    while(NewCi.Items[num_new] != "")

        num_new++;

    bool isequal  = false;

    for(int i = 0; i < 9; i++)

    {

         num_item = 0;          

         while(First_set[i].Iteams[num_item] != "")

             num_item++;

         if(num_new > num_item)

             continue;

         else

         {

             for(int j = 0; j < num_new; j++)

             {

                 isequal = false;

                 for(int m = 0; m < num_item; m++)

                 {

                      if(NewCi.Items[j] == First_set[i].Iteams[m])

                      {

                          isequal = true;

                          break;                  

                      }

                 }

                 if(!isequal)

                     break;     

             }

             if(isequal)

                 num_contain++;

         } 

    }

    return num_contain;

}

 

宿舍

你可能感兴趣的:(数据挖掘)