First, let's get acquainted with the Apriori algorithm.
The Apriori algorithm is one of the most influential algorithms for mining frequent itemsets for Boolean association rules. Its core is a recursive, level-wise procedure built on the two-phase frequent-itemset idea, and the rules it mines are single-dimensional, single-level, Boolean association rules. Here, every itemset whose support is no less than the minimum support is called a frequent itemset (frequent set for short).
The basic idea of the algorithm is as follows. First, find all frequent itemsets, i.e. the itemsets that occur at least as often as a predefined minimum support. Then generate strong association rules from these frequent itemsets; such rules must satisfy both the minimum support and the minimum confidence. Concretely, each frequent itemset found in the first step is used to produce all rules that contain only items of that set and whose right-hand side holds exactly one item; among these, only the rules whose confidence exceeds the user-specified minimum confidence are kept. A level-wise, recursive procedure is used to generate all the frequent itemsets.
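As a quick illustration of these two measures, take the four transactions used in the test at the end of this post: {1,3,4}, {2,3,5}, {1,2,3,5} and {2,5}. The itemset {2,5} appears in 3 of the 4 transactions, so its support is 3/4 = 0.75; the confidence of the rule 2 => 5 is support({2,5}) / support({2}) = 0.75 / 0.75 = 1.0, meaning every transaction that contains 2 also contains 5.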
Let's look at a Python implementation of the algorithm:
# Build the candidate 1-item sets (C1) from the raw transactions
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    # use frozensets so an itemset can be used as a dict key
    return list(map(frozenset, C1))
# Scan the transactions in D, count every candidate in Ck, and keep the
# candidates whose support is at least minSupport
def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
# Build the candidate k-item sets Ck by joining frequent (k-1)-item sets
def aprioriGen(Lk, k):
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            L1 = list(Lk[i])[:k - 2]; L2 = list(Lk[j])[:k - 2]
            L1.sort(); L2.sort()
            if L1 == L2:  # join only if the first k-2 elements are equal
                retList.append(Lk[i] | Lk[j])  # set union
    return retList
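# Note: comparing only the first k-2 elements is the classic Apriori join step.
# When the frequent (k-1)-itemsets are kept in sorted order, two of them are
# merged only if they differ in their last element, so each candidate
# k-itemset is generated once rather than once for every pair that could form it.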
# Core of the algorithm: grow the frequent itemsets level by level
def apriori(dataSet, minSupport=0.5):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)  # scan the data set to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
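To make the flow concrete: for the four sample transactions used in the test below, the first scan over C1 keeps {1}, {2}, {3} and {5} (each with support of at least 0.5) and discards {4} (support 0.25); aprioriGen then joins these four singletons into six candidate pairs, which are scanned to produce L2, and the process repeats until a level comes back empty.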
Now let's implement it in Java (the snippets below assume import java.util.*;).
First, build the candidate 1-item sets:
// Build the candidate 1-item sets (C1) from the raw transactions
private static List<List<String>> createC1(List<List<String>> dataSet) {
    List<String> C1 = new ArrayList<String>();
    for (int i = 0; i < dataSet.size(); i++) {
        for (String item : dataSet.get(i)) {
            if (!C1.contains(item)) C1.add(item);
        }
    }
    Collections.sort(C1);
    List<List<String>> rs = new ArrayList<List<String>>();
    for (String item : C1) {
        List<String> tmp = new ArrayList<String>();
        tmp.add(item);
        rs.add(tmp);
    }
    return rs;
}
Next, scan the data set against the candidates and keep those that meet the minimum support:
// Scan the transactions D, count every candidate in Ck, and keep the
// candidates whose support is at least minSupport
private static ScanInfo scanD(List<List<String>> D, List<List<String>> Ck, double minSupport) {
    Map<List<String>, Integer> ssCnt = new HashMap<List<String>, Integer>();
    for (List<String> tid : D) {
        for (List<String> can : Ck) {
            if (tid.containsAll(can)) {
                if (ssCnt.containsKey(can)) {
                    ssCnt.put(can, ssCnt.get(can) + 1);
                } else {
                    ssCnt.put(can, 1);
                }
            }
        }
    }
    List<List<String>> retList = new ArrayList<List<String>>();
    Map<List<String>, Double> supportData = new HashMap<List<String>, Double>();
    for (Map.Entry<List<String>, Integer> entry : ssCnt.entrySet()) {
        double support = (double) entry.getValue() / (double) D.size();
        if (support >= minSupport) {
            retList.add(entry.getKey());
        }
        supportData.put(entry.getKey(), support);
    }
    return new ScanInfo(retList, supportData);
}
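The scanD method returns a ScanInfo object whose class the original listing never shows. Judging from how it is constructed here and consumed in apriori below (new ScanInfo(retList, supportData) and getRetList()), a minimal sketch of such a holder could look as follows; the getSupportData accessor is an assumption added for completeness, since only getRetList is actually used above.
// Minimal sketch of the ScanInfo holder assumed by scanD and apriori
public class ScanInfo {
    private final List<List<String>> retList;            // frequent itemsets of this level
    private final Map<List<String>, Double> supportData; // support of every counted candidate

    public ScanInfo(List<List<String>> retList, Map<List<String>, Double> supportData) {
        this.retList = retList;
        this.supportData = supportData;
    }

    public List<List<String>> getRetList() {
        return retList;
    }

    public Map<List<String>, Double> getSupportData() {
        return supportData;
    }
}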
Next, join the frequent itemsets of the previous level to assemble the candidates of the next level:
// Build the candidate k-item sets Ck by joining frequent (k-1)-item sets
// whose first k-2 elements (after sorting) are equal, mirroring the Python version
private static List<List<String>> aprioriGen(List<List<String>> Lk, int k) {
    List<List<String>> retList = new ArrayList<List<String>>();
    for (int i = 0; i < Lk.size(); i++) {
        for (int j = i + 1; j < Lk.size(); j++) {
            List<String> L1 = new ArrayList<String>();
            List<String> L2 = new ArrayList<String>();
            for (int n = 0; n < k - 2; n++) {
                L1.add(Lk.get(i).get(n));
                L2.add(Lk.get(j).get(n));
            }
            Collections.sort(L1);
            Collections.sort(L2);
            // for k == 2 both prefixes are empty, so every pair of 1-item sets is joined
            if (L1.equals(L2)) {
                List<String> tmp = new ArrayList<String>();
                tmp.addAll(Lk.get(i));
                tmp.addAll(Lk.get(j));
                retList.add(removeDuplicate(tmp)); // union of the two item sets
            }
        }
    }
    return retList;
}
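aprioriGen also relies on a removeDuplicate helper that the post does not show. Since it is used to turn the concatenation of two itemsets into their union, a minimal sketch might be:
// Sketch of the removeDuplicate helper used by aprioriGen: it removes repeated
// items so that the concatenation of two itemsets becomes their union
private static List<String> removeDuplicate(List<String> items) {
    List<String> rs = new ArrayList<String>();
    for (String item : items) {
        if (!rs.contains(item)) {
            rs.add(item);
        }
    }
    return rs;
}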
Finally, the main body of Apriori, which loops level by level and collects the itemsets that satisfy the minimum support at each level:
// Core of the algorithm: derive the frequent itemsets level by level
// until a level comes back empty
public static List<List<List<String>>> apriori(List<List<String>> dataSet, double minSupport) {
    List<List<String>> C1 = createC1(dataSet);
    ScanInfo si = scanD(dataSet, C1, minSupport);
    List<List<List<String>>> L = new ArrayList<List<List<String>>>();
    L.add(si.getRetList());
    int k = 2;
    while (L.get(k - 2).size() > 0) {
        List<List<String>> Ck = aprioriGen(L.get(k - 2), k);
        ScanInfo tmp = scanD(dataSet, Ck, minSupport);
        L.add(tmp.getRetList()); // unlike the Python version, the support map is discarded here
        k += 1;
    }
    return L;
}
Now let's test it:
List<String[]> datas = new ArrayList<String[]>();
datas.add(new String[]{"1","3","4"});
datas.add(new String[]{"2","3","5"});
datas.add(new String[]{"1","2","3","5"});
datas.add(new String[]{"2","5"});
List<List<String>> dataIns = new ArrayList<List<String>>();
for (String[] item : datas) {
    dataIns.add(Arrays.asList(item));
}
List<List<List<String>>> tmp = apriori(dataIns, 0.5);
System.out.println(tmp.toString());
The result is correct: the printed list contains the frequent 1-itemsets {1}, {2}, {3}, {5}, the frequent 2-itemsets {1,3}, {2,3}, {2,5}, {3,5}, the frequent 3-itemset {2,3,5}, and a trailing empty level that ends the loop. (The order of the elements within each level may vary between runs, since it depends on HashMap iteration order.)