import os
import types
import sys
# Raise the interpreter's recursion limit to one million frames.
# NOTE(review): per the `sys` documentation a limit this high can let runaway
# recursion overflow the C stack and crash the interpreter instead of raising
# an exception -- confirm that such a large value is really required.
sys.setrecursionlimit(1000000)
'''
author liuzhenhua
date 20131113
the apriori algorithm for python
'''
# Module-level state filled in by loaddata():
#   original maps each transaction id to the list of items on its line,
#   mp maps each item to the number of times it was seen while loading.
original = {}
mp = {}


def loaddata(dgree, path="D:/apriori.txt"):
    """Load the transaction file and count how often each item occurs.

    Every useful line consists of a transaction id, a tab character, and
    the transaction's items joined by '#'. Lines without a tab (for
    example a trailing blank line) are skipped instead of raising
    IndexError.

    The module-level dicts ``original`` and ``mp`` are emptied and then
    refilled in place, so calling this function again does not add the
    new counts on top of the old ones.

    Args:
        dgree: support threshold; items that occur ``dgree`` times or
            fewer are removed from the returned counts.
        path: file to read. Defaults to the location that used to be
            hard-coded.

    Returns:
        The module-level ``mp`` dict (item -> occurrence count) after
        pruning.
    """
    original.clear()
    mp.clear()
    # The context manager closes the file even if a line fails to parse.
    with open(path, "r") as f:
        for line in f:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                continue
            lvals = fields[1].split("#")
            original[fields[0]] = lvals
            for word in lvals:
                mp[word] = mp.get(word, 0) + 1
    # Iterate over a snapshot of the keys: entries are deleted on the way.
    for word in list(mp):
        # "<=" rather than "==": an item rarer than the threshold must be
        # dropped as well, not only one that matches it exactly.
        if mp[word] <= dgree:
            del mp[word]
    return mp
def isContain(list1, list2):
    """Return True when every element of list2 also occurs in list1.

    An empty list2 is contained in anything. An empty list1 contains
    nothing except an empty list2; the previous flag-based loop reported
    True in that case because its inner loop never ran.
    """
    return all(wanted in list1 for wanted in list2)
def isOriginal(ori, list1):
    """Return True when at least one value of ori contains all of list1.

    Args:
        ori: dict whose values are item lists.
        list1: the items to look for.

    Returns:
        True as soon as isContain() succeeds for one value of ori, False
        when it succeeds for none (including when ori is empty).
    """
    return any(isContain(items, list1) for items in ori.values())
'''
sz: the size (number of items) of the frequent itemsets to look for
dgree: the support count used to prune candidates that are not frequent enough
'''
def apriori(dic, dicty, ori, sz, dgree):
    """Grow frequent itemsets level by level until they contain sz items.

    Each level extends every itemset of the previous level by one item
    from dicty, counts in how many transactions the extended itemset
    occurs, and discards the ones whose count is dgree or less. One
    summary line is printed per level.

    Itemsets are stored as strings made of their sorted characters, so
    the function only works as intended when every item is a single
    character.

    Args:
        dic: dict whose keys are the itemsets to start from.
        dicty: dict whose keys are the single items used for extension.
        ori: dict mapping a transaction id to its list of items.
        sz: the itemset size at which to stop.
        dgree: support threshold; candidates counted dgree times or fewer
            are dropped.

    Returns:
        A dict mapping each surviving itemset of size sz to its count, or
        an empty dict when no candidate survives a level. The previous
        implementation recursed without end in that situation.
    """
    items = list(dicty)
    # The transactions do not change between levels: build the sets once.
    transactions = [set(txn) for txn in ori.values()]
    # A candidate that occurs in no transaction is never kept, whatever
    # the threshold is.
    threshold = max(dgree, 0)
    current = dic
    while True:
        counts = {}
        seen = set()
        for item in items:
            item_chars = set(item)
            for itemset in current:
                # Skip itemsets that already include this item.
                if item_chars.issubset(itemset):
                    continue
                chars = sorted(item + itemset)
                key = ''.join(chars)
                # Different (item, itemset) pairs can produce the same
                # candidate; count each candidate only once.
                if key in seen:
                    continue
                seen.add(key)
                support = sum(1 for txn in transactions if txn.issuperset(chars))
                if support > threshold:
                    counts[key] = support
        size = len(next(iter(counts))) if counts else 0
        print("frequency item: %s items: %s" % (size, counts))
        # Stop at the requested size, or when nothing is left to extend.
        if size == sz or not counts:
            return counts
        current = counts
def stop(res):
    """Return the length of the first key of res, or 0 when res is empty.

    Only the first key in iteration order is inspected.
    """
    for key in res:
        return len(key)
    return 0
# Demo run: load the transactions, show the raw data and the frequent single
# items, then search for frequent itemsets of three items. Each print call
# takes one pre-formatted string so the output is the same on Python 2 and 3.
bp = loaddata(1)
print("the original data: %s" % (original,))
print("frequency item: %s items: %s" % (1, bp))
apriori(bp, bp, original, 3, 1)
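# Contents of apriori.txt (loaddata splits each line on a tab character, so the
# transaction id and the '#'-joined items must be separated by a tab):
# 10 A#C#D
# 20 B#C#E
# 30 A#B#C#E
# 40 B#E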