apriori算法实现挖掘商品关联规则(python)

     Apriori算法是一种挖掘关联规则的频繁项集算法,可用于消费市场价格分析,猜测顾客的消费习惯。网上有很多详细介绍,这里就不再赘述。

主要步骤如下:

1.依据支持度找出所有频繁项集(频度)

2.依据置信度产生关联规则(强度)

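支持度:同时包含 X 和 Y 的交易记录数占全部 N 条交易记录的比例,即 $\mathrm{support}(X \Rightarrow Y) = P(X \cup Y) = \dfrac{\mathrm{count}(X \cup Y)}{N}$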

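置信度:在包含 X 的交易记录中,同时也包含 Y 的记录所占的比例,即 $\mathrm{confidence}(X \Rightarrow Y) = P(Y \mid X) = \dfrac{\mathrm{count}(X \cup Y)}{\mathrm{count}(X)}$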


接下来解释python代码:

 
  
#--coding=utf-8
import numpy as np
import pandas as pd

class AssociationRule:
    """Level-wise (Apriori-style) miner for association rules.

    The data set is a pandas DataFrame with one row per transaction and one
    column per item; a cell value greater than 0 means the item is present
    in that transaction.  Items are referred to by column *position*
    (0, 1, 2, ...) throughout, so the column labels may be anything.

    Attributes:
        sentences: the transaction DataFrame passed to the constructor.
        minSupport: minimum support an itemset must reach to be kept.
        minConf: minimum confidence a rule must reach to be kept.
        rows: number of transactions (rows of ``sentences``).
        columns: number of items (columns of ``sentences``).
        L: list of per-level results filled in by ``apriori``/``scan``.
    """

    def __init__(self,dataSet,minSupport=0.5,minCconf=0.98):
        """Store the data set and the two thresholds.

        Args:
            dataSet: pandas DataFrame of transactions (rows) by items
                (columns).
            minSupport: minimum support threshold.
            minCconf: minimum confidence threshold (the spelling of the
                parameter name is kept for backward compatibility).
        """
        self.sentences = dataSet
        self.minSupport = minSupport
        self.minConf = minCconf
        self.rows = len(self.sentences)
        self.columns = len(self.sentences.columns)
        self.L = []

    def CkStats(self,Ck):
        """Compute the support of every single item in ``Ck``.

        Args:
            Ck: iterable of column positions.

        Returns:
            A list of ``(column position, support)`` tuples in the order of
            ``Ck``, where support is the fraction of rows whose value in
            that column is greater than 0.  For an empty data set every
            support is 0.0.
        """
        if not self.rows:
            # No transactions: avoid dividing by zero.
            return [(colj, 0.0) for colj in Ck]
        total = float(self.rows)
        statslist = []
        for colj in Ck:
            # Positional access; ``.ix`` no longer exists in current pandas.
            colValues = self.sentences.iloc[:, colj]
            statslist.append((colj, int((colValues > 0).sum()) / total))
        # The list could be sorted by support here if desired:
        # statslist.sort(key=lambda x: x[1], reverse=True)
        return statslist

    def scan(self,Ck,k):
        """Extend the k-itemsets in ``Ck`` by one item and keep the strong ones.

        Every itemset ``colj`` in ``Ck`` is extended with each item taken
        from every other itemset in ``Ck``.  The extended list ``Lk`` is
        read as the rule "items of ``colj`` -> appended item", so the same
        set of items can appear several times in different orders.

        Args:
            Ck: list of itemsets, each a list of column positions.
            k: size of the itemsets in ``Ck``.

        Returns:
            The list of ``(Lk, support, confidence)`` tuples that satisfy
            both thresholds.  The same list is appended to ``self.L``.
        """
        retList = []
        seen = []
        total = float(self.rows)
        # An item is present when its value is > 0, the same test CkStats uses.
        present = self.sentences > 0
        for colj in Ck:
            for colk in Ck:
                if colj == colk:
                    continue
                # Progress trace of the pair being combined.
                print(colj,colk)
                for elek in colk:
                    # Build the (k+1)-itemset, dropping duplicates while
                    # keeping the original order.
                    Lktmp = colj + [elek]
                    Lk = sorted(set(Lktmp), key=Lktmp.index)
                    if len(Lk) <= k or Lk in seen:
                        continue
                    seen.append(Lk)
                    # Confidence denominator: rows containing every item of colj.
                    NJ = int(present.iloc[:, colj].all(axis=1).sum())
                    if NJ == 0:
                        continue
                    # Support numerator: rows containing every item of Lk.
                    JK = int(present.iloc[:, Lk].all(axis=1).sum())
                    support = JK / total
                    conf = JK / float(NJ)
                    if support >= self.minSupport and conf >= self.minConf:
                        retList.append((Lk, support, conf))
        self.L.append(retList)
        return self.L[-1]

    def apriori(self, maxK=3):
        """Run the level-wise search and print every level.

        Args:
            maxK: size of the largest itemsets that are extended once
                more; with the default of 3 the search produces levels for
                itemsets of size 1 to 4.

        Returns:
            ``self.L``: element 0 is the ``CkStats`` result for all single
            items (unfiltered), each following element is the result of
            one ``scan`` call.
        """
        # Start from a clean slate so repeated calls do not pile up levels.
        self.L = []
        C1 = range(self.columns)
        C1stats = self.CkStats(C1)
        print(C1stats)
        self.L.append(C1stats)
        # Only single items that reach the minimum support are extended.
        C1 = [[x[0]] for x in C1stats if x[1] >= self.minSupport]
        Ck = self.scan(C1, 1)
        k = 2
        # Iterate level by level.
        while k <= maxK:
            Ck = [x[0] for x in Ck]
            Ck = self.scan(Ck, k)
            k += 1
        for ele in self.L:
            print(ele)
        return self.L

def read_file(filepath):
    """Load the whitespace-separated affinity data set into a DataFrame.

    The number of rows and the first five rows are printed as a quick
    sanity check.

    Args:
        filepath: path of a text file readable by ``numpy.loadtxt`` with
            five numeric values per line.

    Returns:
        A pandas DataFrame with the columns 'bread', 'milk', 'cheese',
        'apple' and 'banana', one row per line of the file.
    """
    # ndmin=2 keeps a file with a single line as a 1x5 table; without it
    # loadtxt returns a 1-D array and the DataFrame construction fails.
    affinityX = np.loadtxt(filepath, ndmin=2)
    print(len(affinityX))
    print(affinityX[:5])
    affinityColumns = ['bread','milk','cheese','apple','banana']
    return pd.DataFrame(affinityX,columns=affinityColumns)

if __name__ == "__main__":
    # Demo run: load the sample affinity data set and mine it with a
    # minimum support of 0.1 and a minimum confidence of 0.5.
    transactions = read_file("../dataset/" + "affinity_dataset.txt")
    miner = AssociationRule(transactions, 0.1, 0.5)
    miner.apriori()
    print("OK")

程序运行结果如下:
#数据行数
100
#前5行数据
[[ 0.  0.  1.  1.  1.]
 [ 1.  1.  0.  1.  0.]
 [ 1.  0.  1.  1.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  1.  0.  0.  1.]]
#C1统计信息
[(0, 0.27), (1, 0.46), (2, 0.41), (3, 0.36), (4, 0.59)]
#迭代时输出中间信息
[0] [1]
[0] [2]
[0] [3]
[0] [4]
[1] [0]
[1] [2]
[1] [3]
[1] [4]
[2] [0]
[2] [1]
[2] [3]
[2] [4]
[3] [0]
[3] [1]
[3] [2]
[3] [4]
[4] [0]
[4] [1]
[4] [2]
[4] [3]
[0, 1] [0, 4]
[0, 1] [2, 3]
[0, 1] [2, 4]
[0, 1] [3, 2]
[0, 1] [3, 4]
[0, 4] [0, 1]
[0, 4] [2, 3]
[0, 4] [2, 4]
[0, 4] [3, 2]
[0, 4] [3, 4]
[2, 3] [0, 1]
[2, 3] [0, 4]
[2, 3] [2, 4]
[2, 3] [3, 2]
[2, 3] [3, 4]
[2, 4] [0, 1]
[2, 4] [0, 4]
[2, 4] [2, 3]
[2, 4] [3, 2]
[2, 4] [3, 4]
[3, 2] [0, 1]
[3, 2] [0, 4]
[3, 2] [2, 3]
[3, 2] [2, 4]
[3, 2] [3, 4]
[3, 4] [0, 1]
[3, 4] [0, 4]
[3, 4] [2, 3]
[3, 4] [2, 4]
[3, 4] [3, 2]
[2, 3, 4] [2, 4, 3]
[2, 3, 4] [3, 2, 4]
[2, 3, 4] [3, 4, 2]
[2, 4, 3] [2, 3, 4]
[2, 4, 3] [3, 2, 4]
[2, 4, 3] [3, 4, 2]
[3, 2, 4] [2, 3, 4]
[3, 2, 4] [2, 4, 3]
[3, 2, 4] [3, 4, 2]
[3, 4, 2] [2, 3, 4]
[3, 4, 2] [2, 4, 3]
[3, 4, 2] [3, 2, 4]
#k=1
[(0, 0.27), (1, 0.46), (2, 0.41), (3, 0.36), (4, 0.59)]
 
  
#k=2
[([0, 1], 0.14, 0.5185185185185185), ([0, 4], 0.17, 0.6296296296296297), ([2, 3], 0.25, 0.6097560975609756), ([2, 4], 0.27, 0.6585365853658537), ([3, 2], 0.25, 0.6944444444444444), ([3, 4], 0.21, 0.5833333333333334)]
 
  
#k=3
[([2, 3, 4], 0.17, 0.68), ([2, 4, 3], 0.17, 0.6296296296296297), ([3, 2, 4], 0.17, 0.68), ([3, 4, 2], 0.17, 0.8095238095238095)]
#k=4(没有满足条件的4项集,结果为空)
[]
OK
  时间有限,先写到这里。如有不当之处,欢迎指正!

你可能感兴趣的:(数据挖掘)