本人代码能力确实有限,算法实现比较粗糙,并且在实现Apriori算法的时候,写了之后才想到了更好的实现方法,但是当时已经凌晨两点了,就懒得再改了,这也导致后续跑大数据集的时候很慢!!!!算法的原理主要是根据人民邮电出版社出版的《数据挖掘与分析 概念与算法》一书中p192,算法8.2
这里我是用的是Anaconda3 python=3.7的环境,如果没有能力装Anaconda的话,只要一个pyhton=3.7的解释器就够用。具体调用的库函数,大家在编译调试的时候可以根据遇到的BUG进行导入,如有问题可以私信或评论。
要求每个事务单独一行,项与项之间有一空格,文件格式为txt格式:
A B D E
B C E
A B D E
A B C E
A B C D E
B C D
代码如下(示例):
def ReadDataset(filename):
#读入数据,dataset = [[项集1],[项集2],……,[项集n]]
file = open(filename)
line = ""
line_list = []
dataset = []
while 1:
line = file.readline()
if not line:
break
else:
line_list = line.split()
dataset.append(line_list)
# for i in dataset:
# print(i)
return dataset
该处可能需要导入一些库,大家注意下。
def CountI(D):#计算字母表alphabet
I = []
for itemset in D:
for item in itemset:
if item in I:
continue
else:
I.append(item)
# print(I)
return I
def Apriori(D,I,minsup):
#初始化参数F,C[0],
F = {'0':["null"]}
C = [{0:["null",len(D)]}]
# 初始化参数C[1],用单个项初始化前缀树
C1 = {}
# 通过sup(i)<-0,使得i成为C[0]中的子集的子节点,其实也就是把项集全部添加进去
for i in range(len(I)):
C1[i] = [[I[i]],0]
C.append(C1)
k = 1#k代表层数
while C[k] != {}:
poplist = []
C[k] = ComputeSupport(C[k],D)
# print(C[k])
F[str(k)] = []
for key in C[k].keys():
#遍历叶子节点
if C[k][key][1] >= minsup:
F[str(k)].append(set(C[k][key][0]))
else:
#删除阈值低的
poplist.append(key)
for key in poplist:
del C[k][key]
# print(C[k])
C.append({})
C[k+1] = ExtendPrefixFree(C[k],F,k)
# print(C[k+1])
k+=1
print(F)
return F
def ComputeSupport(CK,D):
for K_itemset in list(CK.values()):
for itemset in D:
if set(K_itemset[0]) <= set(list(itemset)):
K_itemset[1] = K_itemset[1] + 1
return CK
def ExtendPrefixFree(CK,F,k):
CKNext = {}
count = 0
CK_list = list(CK.keys())
for i in range(len(CK_list)):
for j in range(i+1,len(CK_list)):
flag = 0
K_itemset = list(set(list(CK[CK_list[i]][0])+list(CK[CK_list[j]][0])))
# print("------------")
# print(K_itemset)
# print("_________")
for item in K_itemset:
K_temp = K_itemset[:]
# print(K_temp)
K_temp.remove(item)
# print(K_temp)
if set(K_temp) not in F[str(k)]:
flag = 1
break
if flag == 0 and [set(K_itemset),0] not in CKNext.values():
count += 1
CKNext[count] = [set(K_itemset),0]
return CKNext
minsup = 0.05
dataset = ReadDataset("retail.txt")
print("read finished")
I = CountI(dataset)
print("count finished")
Apriori(dataset,I,minsup*len(dataset))
正常来说,Apriori算法的复杂度应该在O(n^3),但是我写的感觉至少是四次方的,而且python的速度确实也和java没法比,所以在大数据集上迟迟不出结果(500w条的数据集需要跑30min)
def ReadDataset(filename):
#读入数据,dataset = [[项集1],[项集2],……,[项集n]]
file = open(filename)
line = ""
line_list = []
dataset = []
while 1:
line = file.readline()
if not line:
break
else:
line_list = line.split()
dataset.append(line_list)
# for i in dataset:
# print(i)
return dataset
def CountI(D):#计算字母表alphabet
I = []
for itemset in D:
for item in itemset:
if item in I:
continue
else:
I.append(item)
# print(I)
return I
def Apriori(D,I,minsup):
#初始化参数F,C[0],
F = {'0':["null"]}
C = [{0:["null",len(D)]}]
# 初始化参数C[1],用单个项初始化前缀树
C1 = {}
# 通过sup(i)<-0,使得i成为C[0]中的子集的子节点,其实也就是把项集全部添加进去
for i in range(len(I)):
C1[i] = [[I[i]],0]
C.append(C1)
k = 1#k代表层数
while C[k] != {}:
poplist = []
C[k] = ComputeSupport(C[k],D)
# print(C[k])
F[str(k)] = []
for key in C[k].keys():
#遍历叶子节点
if C[k][key][1] >= minsup:
F[str(k)].append(set(C[k][key][0]))
else:
#删除阈值低的
poplist.append(key)
for key in poplist:
del C[k][key]
# print(C[k])
C.append({})
C[k+1] = ExtendPrefixFree(C[k],F,k)
# print(C[k+1])
k+=1
print(F)
return F
def ComputeSupport(CK,D):
for K_itemset in list(CK.values()):
for itemset in D:
if set(K_itemset[0]) <= set(list(itemset)):
K_itemset[1] = K_itemset[1] + 1
return CK
def ExtendPrefixFree(CK,F,k):
CKNext = {}
count = 0
CK_list = list(CK.keys())
for i in range(len(CK_list)):
for j in range(i+1,len(CK_list)):
flag = 0
K_itemset = list(set(list(CK[CK_list[i]][0])+list(CK[CK_list[j]][0])))
# print("------------")
# print(K_itemset)
# print("_________")
for item in K_itemset:
K_temp = K_itemset[:]
# print(K_temp)
K_temp.remove(item)
# print(K_temp)
if set(K_temp) not in F[str(k)]:
flag = 1
break
if flag == 0 and [set(K_itemset),0] not in CKNext.values():
count += 1
CKNext[count] = [set(K_itemset),0]
return CKNext
minsup = 0.05
dataset = ReadDataset("retail.txt")
print("read finished")
I = CountI(dataset)
print("count finished")
Apriori(dataset,I,minsup*len(dataset))