Apriori算法是一种挖掘关联规则的频繁项集算法,可用于消费市场价格分析,猜测顾客的消费习惯。网上有很多详细介绍,这里就不再赘述。
主要步骤如下:
1.依据支持度找出所有频繁项集(频度)
2.依据置信度产生关联规则(强度)
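支持度:同时包含 $X$ 和 $Y$ 的记录数占总记录数 $N$ 的比例,即 $\mathrm{support}(X \Rightarrow Y) = \dfrac{\mathrm{count}(X \cup Y)}{N}$。
置信度:在包含 $X$ 的记录中同时包含 $Y$ 的比例,即 $\mathrm{confidence}(X \Rightarrow Y) = \dfrac{\mathrm{count}(X \cup Y)}{\mathrm{count}(X)}$。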
接下来给出Python实现代码:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd


class AssociationRule:
    """Apriori-style miner for frequent itemsets and association rules.

    The data set is a pandas DataFrame with one row per transaction and one
    column per item.  Items are referred to by their integer column
    *position* everywhere in this class, never by column label.
    """

    def __init__(self, dataSet, minSupport=0.5, minCconf=0.98):
        """Store the data set and the two thresholds.

        Args:
            dataSet: DataFrame of transactions (rows) by items (columns).
            minSupport: minimum support an itemset needs to be kept.
            minCconf: minimum confidence a rule needs to be kept.  The
                spelling of the parameter name is kept for backward
                compatibility with existing keyword callers.
        """
        self.sentences = dataSet
        self.minSupport = minSupport
        self.minConf = minCconf
        self.rows = len(self.sentences)
        self.columns = len(self.sentences.columns)
        # One entry per level, appended in the order the levels are computed.
        self.L = []

    def CkStats(self, Ck):
        """Compute the support of every single item in ``Ck``.

        Args:
            Ck: iterable of integer column positions.

        Returns:
            A list of ``(column_position, support)`` tuples in the order of
            ``Ck``, where support is the fraction of rows whose value in
            that column is greater than zero.  The list is not sorted by
            support.
        """
        statslist = []
        for colj in Ck:
            # .iloc makes the positional lookup explicit; DataFrame.ix no
            # longer exists in current pandas releases.
            colValues = self.sentences.iloc[:, colj]
            support = len(colValues[colValues > 0]) / float(self.rows)
            statslist.append((colj, support))
        return statslist

    def scan(self, Ck, k):
        """Grow the itemsets in ``Ck`` by one item and keep the strong ones.

        Every itemset ``colj`` in ``Ck`` is extended with each element of
        every other itemset in ``Ck``.  The result is an *ordered* list: the
        leading ``colj`` part is the rule antecedent and the appended
        element is the consequent, so ``[2, 3]`` and ``[3, 2]`` are distinct
        candidates with different confidences.

        Args:
            Ck: list of itemsets, each a list of integer column positions.
            k: size of the itemsets in ``Ck``; only candidates with more
                than ``k`` distinct items are evaluated.

        Returns:
            A list of ``(itemset, support, confidence)`` tuples for the
            candidates that reach both ``minSupport`` and ``minConf``.  The
            same list is also appended to ``self.L``.
        """
        retList = []
        seen = set()  # candidates already evaluated, stored as tuples
        for colj in Ck:
            # Confidence denominator: rows in which every antecedent item is
            # present (row mean is exactly 1.0).  It depends only on colj,
            # so compute it once per antecedent.
            antecedent_mean = self.sentences.iloc[:, colj].mean(axis=1)
            NJ = len(antecedent_mean[antecedent_mean == 1.0])
            for colk in Ck:
                if colj == colk:
                    continue
                print(colj, colk)
                for elek in colk:
                    # Append the new item and drop duplicates while keeping
                    # first-occurrence order.
                    Lk = list(dict.fromkeys(colj + [elek]))
                    candidate = tuple(Lk)
                    if len(Lk) <= k or candidate in seen:
                        continue
                    seen.add(candidate)
                    if NJ == 0:
                        # Antecedent never occurs: confidence is undefined.
                        continue
                    # Support numerator: rows containing every item of Lk.
                    itemset_mean = self.sentences.iloc[:, Lk].mean(axis=1)
                    JK = len(itemset_mean[itemset_mean == 1.0])
                    support = JK / self.rows
                    conf = JK / NJ
                    if support >= self.minSupport and conf >= self.minConf:
                        retList.append((Lk, support, conf))
        self.L.append(retList)
        return self.L[-1]

    def apriori(self):
        """Run the level-wise search and print every level.

        Level 1 is the per-item support list from ``CkStats``.  ``scan`` is
        then called for k = 1, 2 and 3 (the upper bound is fixed in this
        method).  Each level is appended to ``self.L`` and all levels are
        printed at the end.
        """
        C1 = range(self.columns)
        C1stats = self.CkStats(C1)
        print(C1stats)
        self.L.append(C1stats)
        # Keep only the single items that reach the minimum support.
        C1 = [[x[0]] for x in C1stats if x[1] >= self.minSupport]
        Ck = self.scan(C1, 1)
        k = 2
        while k <= 3:
            # Strip support/confidence; only the itemsets seed the next level.
            Ck = [x[0] for x in Ck]
            Ck = self.scan(Ck, k)
            k += 1
        for ele in self.L:
            print(ele)


def read_file(filepath):
    """Load the affinity data set from a whitespace-separated text file.

    Prints the number of rows and the first five rows, then returns the
    data as a DataFrame with the five fixed item columns.
    """
    affinityX = np.loadtxt(filepath)
    print(len(affinityX))
    print(affinityX[:5])
    affinityColumns = ['bread', 'milk', 'cheese', 'apple', 'banana']
    return pd.DataFrame(affinityX, columns=affinityColumns)


if __name__ == "__main__":
    datasetPath = "../dataset/"
    filename = "affinity_dataset.txt"
    filepath = datasetPath + filename
    sentences = read_file(filepath)
    minSupport = 0.1
    minConf = 0.5
    assrules = AssociationRule(sentences, minSupport, minConf)
    assrules.apriori()
    print("OK")

# The output of running the program is shown below:
#数据行数
100
#前5行数据 [[ 0. 0. 1. 1. 1.] [ 1. 1. 0. 1. 0.] [ 1. 0. 1. 1. 0.] [ 0. 0. 1. 1. 1.] [ 0. 1. 0. 0. 1.]]
#C1统计信息 [(0, 0.27), (1, 0.46), (2, 0.41), (3, 0.36), (4, 0.59)]
#迭代时输出中间信息 [0] [1] [0] [2] [0] [3] [0] [4] [1] [0] [1] [2] [1] [3] [1] [4] [2] [0] [2] [1] [2] [3] [2] [4] [3] [0] [3] [1] [3] [2] [3] [4] [4] [0] [4] [1] [4] [2] [4] [3] [0, 1] [0, 4] [0, 1] [2, 3] [0, 1] [2, 4] [0, 1] [3, 2] [0, 1] [3, 4] [0, 4] [0, 1] [0, 4] [2, 3] [0, 4] [2, 4] [0, 4] [3, 2] [0, 4] [3, 4] [2, 3] [0, 1] [2, 3] [0, 4] [2, 3] [2, 4] [2, 3] [3, 2] [2, 3] [3, 4] [2, 4] [0, 1] [2, 4] [0, 4] [2, 4] [2, 3] [2, 4] [3, 2] [2, 4] [3, 4] [3, 2] [0, 1] [3, 2] [0, 4] [3, 2] [2, 3] [3, 2] [2, 4] [3, 2] [3, 4] [3, 4] [0, 1] [3, 4] [0, 4] [3, 4] [2, 3] [3, 4] [2, 4] [3, 4] [3, 2] [2, 3, 4] [2, 4, 3] [2, 3, 4] [3, 2, 4] [2, 3, 4] [3, 4, 2] [2, 4, 3] [2, 3, 4] [2, 4, 3] [3, 2, 4] [2, 4, 3] [3, 4, 2] [3, 2, 4] [2, 3, 4] [3, 2, 4] [2, 4, 3] [3, 2, 4] [3, 4, 2] [3, 4, 2] [2, 3, 4] [3, 4, 2] [2, 4, 3] [3, 4, 2] [3, 2, 4]
#k=1 [(0, 0.27), (1, 0.46), (2, 0.41), (3, 0.36), (4, 0.59)]
#k=2 [([0, 1], 0.14, 0.5185185185185185), ([0, 4], 0.17, 0.6296296296296297), ([2, 3], 0.25, 0.6097560975609756), ([2, 4], 0.27, 0.6585365853658537), ([3, 2], 0.25, 0.6944444444444444), ([3, 4], 0.21, 0.5833333333333334)]
#k=3 [([2, 3, 4], 0.17, 0.68), ([2, 4, 3], 0.17, 0.6296296296296297), ([3, 2, 4], 0.17, 0.68), ([3, 4, 2], 0.17, 0.8095238095238095)]
时间有限,先写到这里。如有不当之处,欢迎指正!