# Sample dataset: each inner list is one transaction's items.
dataset = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def creat_c1(data_set):
    """Build the candidate 1-itemsets (C1) from a collection of transactions.

    Args:
        data_set: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list holding one single-element ``frozenset`` per distinct item,
        ordered by ascending item value. ``frozenset`` is used because it is
        immutable and can therefore serve as a dictionary key later on.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    unique_items = {item for transaction in data_set for item in transaction}
    return [frozenset({item}) for item in sorted(unique_items)]
# Build the candidate 1-itemsets for the sample dataset; the bare string
# below records the resulting value of c1.
c1 = creat_c1(dataset)
'''
[frozenset({1}),
frozenset({2}),
frozenset({3}),
frozenset({4}),
frozenset({5})]
'''
def scanD(D, Ck, min_support=0.1):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    Typical use: turn the candidate k-itemsets Ck into the frequent
    k-itemsets Lk (e.g. C1 -> L1).

    Args:
        D: Sized collection of transactions (``len(D)`` is used), each an
            iterable of hashable items.
        Ck: Re-iterable collection of candidate itemsets (``frozenset``).
        min_support: Minimum support; a candidate is frequent when the
            fraction of transactions containing it is >= this value.

    Returns:
        A tuple ``(LK, support_data)``. ``LK`` lists the frequent candidates
        in the order they were first matched. ``support_data`` maps every
        candidate found in at least one transaction to its support;
        candidates that never occur are absent from the mapping.
    """
    support_dic = {}
    # Walk the original transaction records.
    for transaction in D:
        # Convert once per transaction so each subset test below is a
        # set-to-set comparison.
        items = set(transaction)
        for candidate in Ck:
            if candidate.issubset(items):
                # .get() covers the first time a candidate is seen.
                support_dic[candidate] = support_dic.get(candidate, 0) + 1

    num_transactions = len(D)
    support_data = {}  # support of every candidate that was matched
    LK = []  # frequent itemsets
    for itemset, count in support_dic.items():
        support = count / num_transactions
        support_data[itemset] = support
        if support >= min_support:
            LK.append(itemset)
    return LK, support_data
def apriori_gen(LK):
    """Join frequent k-itemsets pairwise into candidate (k+1)-itemsets.

    Example: L1 (frequent 1-itemsets) -> C2 (candidate 2-itemsets).

    Args:
        LK: Sequence of ``frozenset`` itemsets, expected to share one size.

    Returns:
        A list of the distinct unions of two members of ``LK`` that contain
        exactly one more item than ``LK[0]``, in the order they are first
        produced. Empty when ``LK`` holds fewer than two itemsets.
    """
    from itertools import combinations

    if len(LK) < 2:
        return []

    target_size = len(LK[0]) + 1
    Ck = []
    seen = set()  # O(1) duplicate check instead of scanning Ck
    # combinations() yields (LK[i], LK[j]) with i < j in index order.
    for left, right in combinations(LK, 2):
        f_set = left | right
        # Only keep new itemsets that grew by exactly one item.
        if len(f_set) == target_size and f_set not in seen:
            seen.add(frozenset(f_set))
            Ck.append(f_set)
    return Ck
import time
def apriori(D, min_support=0.1):
    """Mine the frequent itemsets of ``D`` level by level.

    Starts from the frequent 1-itemsets and repeatedly joins the current
    level into larger candidates (``apriori_gen``) and filters them by
    support (``scanD``) until a level yields no frequent itemset.

    Args:
        D: Collection of transactions, passed to ``creat_c1`` and ``scanD``.
        min_support: Minimum support forwarded to every ``scanD`` call.

    Returns:
        A tuple ``(levels, support)``. ``levels[k]`` is the list of frequent
        itemsets found at pass ``k`` (the first entry holds the frequent
        1-itemsets and is present even when empty). ``support`` merges the
        support mappings returned by every ``scanD`` pass.
    """
    frequent, all_support = scanD(D, creat_c1(D), min_support)
    levels = [frequent]
    while True:
        # Candidates for the next size, then keep only the frequent ones.
        candidates = apriori_gen(frequent)
        frequent, level_support = scanD(D, candidates, min_support)
        all_support.update(level_support)
        if not frequent:
            return levels, all_support
        levels.append(frequent)
def calculate_conf(freqSet, H, supportData, brl, minConf=0.5):
    """Score candidate consequents of one frequent itemset and record strong rules.

    For each consequent ``s`` in ``H`` the rule ``(freqSet - s) -> s`` gets
    confidence ``support(freqSet) / support(freqSet - s)``; for example
    conf(3,5 -> 1) = P(1,3,5) / P(3,5).

    Args:
        freqSet: The frequent itemset (``frozenset``) the rules come from.
        H: Iterable of candidate consequents, e.g.
            ``[frozenset({i}) for i in freqSet]``.
        supportData: Mapping from itemset to support, e.g. the second value
            returned by ``apriori``. Must contain ``freqSet`` and every
            antecedent ``freqSet - s``.
        brl: List that collects the strong rules; mutated in place. Each
            accepted rule is appended as the tuple
            ``(antecedent, "->", consequent, ' = ', confidence)``.
        minConf: Minimum confidence; a rule is strong when its confidence
            is >= this value.

    Returns:
        The consequents from ``H`` whose rule met ``minConf``, in ``H`` order.

    Raises:
        KeyError: If ``supportData`` lacks ``freqSet`` or an antecedent.
    """
    consequents = list(H)
    if not consequents:
        # Nothing to score, so supportData is never consulted.
        return []

    # Loop-invariant: the support of the whole itemset.
    joint_support = supportData[freqSet]
    newH = []
    for s in consequents:
        antecedent = freqSet - s
        conf = joint_support / supportData[antecedent]
        # Rules at or above the minimum confidence are strong rules.
        if conf >= minConf:
            brl.append((antecedent, "->", s, ' = ', conf))
            newH.append(s)
    return newH
# Test calculate_conf with a 2-itemset: for a 2-itemset the function finds
# every association rule that meets the confidence requirement.
# 2-itemset under test, with each of its items as a single-item consequent.
freqSet = frozenset({1, 3})
H = [frozenset({i}) for i in freqSet]
L, support_data = apriori(dataset, min_support=0.2)
brl = [ ] # list that collects the strong association rules
# display(freqSet, H)
# Compute the confidence of the rules for this single itemset
calculate_conf(freqSet, H, support_data, brl, minConf=0.1)
brl
'''
[(frozenset({3}), '->', frozenset({1}), ' = ', 0.6666666666666666),
(frozenset({1}), '->', frozenset({3}), ' = ', 1.0)]
'''
# 3-itemset
freqSet = frozenset({1, 3, 5})
H = [frozenset({i}) for i in freqSet]
L, support_data = apriori(dataset, min_support=0.2)
brl = [ ] # list that collects the strong association rules
# display(freqSet, H)
# Compute the confidence of the rules for this single itemset
calculate_conf(freqSet, H, support_data, brl, minConf=0.1)
brl
'''
[(frozenset({3, 5}), '->', frozenset({1}), ' = ', 0.5),
(frozenset({1, 5}), '->', frozenset({3}), ' = ', 1.0),
(frozenset({1, 3}), '->', frozenset({5}), ' = ', 0.5)]
'''
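# Observation: the 3-itemset case exposes a problem. Only rules whose
# antecedent is a 2-itemset are produced and none whose antecedent is a
# 1-itemset, so the result is incomplete. To obtain the complete set of rules,
# another function has to be written to complement calculate_conf().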
def rules_from_freq(freqSet, H, supportData, brl, minConf=0.7):
    """Collect the strong rules of one frequent itemset for growing consequents.

    Works for 2-itemsets, 3-itemsets, 4-itemsets and so on. Each round
    scores the current consequents with ``calculate_conf`` and then joins
    the ones that passed into consequents one item larger with
    ``apriori_gen``. Consequents that fail ``minConf`` are not extended.

    Args:
        freqSet: The frequent itemset (``frozenset``) to derive rules from.
        H: Initial candidate consequents.
        supportData: Mapping from itemset to support.
        brl: List that receives the strong rules; mutated in place by
            ``calculate_conf``.
        minConf: Minimum confidence forwarded to ``calculate_conf``.

    Returns:
        None. Results are delivered through ``brl``.
    """
    full_size = len(freqSet)
    consequents = H
    while True:
        kept = calculate_conf(freqSet, consequents, supportData, brl, minConf=minConf)
        consequents = apriori_gen(kept)
        # Stop when nothing is left to extend, or when the consequent would
        # cover the whole itemset and leave no antecedent.
        if not consequents or len(consequents[0]) == full_size:
            break
# Test: the results show that the completed function obtains every
# association rule that meets the required confidence.
# 3-itemset
freqSet = frozenset({1, 3, 5})
H = [frozenset({i}) for i in freqSet]
L, support_data = apriori(dataset, min_support=0.2)
brl = [ ] # list that collects the strong association rules
# display(freqSet, H)
# Compute the confidence of the rules for this single itemset
rules_from_freq(freqSet, H, support_data, brl, minConf=0.1)
brl
'''
[(frozenset({3, 5}), '->', frozenset({1}), ' = ', 0.5),
(frozenset({1, 5}), '->', frozenset({3}), ' = ', 1.0),
(frozenset({1, 3}), '->', frozenset({5}), ' = ', 0.5),
(frozenset({5}), '->', frozenset({1, 3}), ' = ', 0.3333333333333333),
(frozenset({3}), '->', frozenset({1, 5}), ' = ', 0.3333333333333333),
(frozenset({1}), '->', frozenset({3, 5}), ' = ', 0.5)]
'''
def gen_rules(L, support_data, min_conf=0.5):
    """Generate the strong association rules for every multi-item frequent itemset.

    Args:
        L: Frequent itemsets grouped by level, as returned first by
            ``apriori``. The first level is skipped.
        support_data: Mapping from itemset to support.
        min_conf: Minimum confidence forwarded to ``rules_from_freq``.

    Returns:
        A list of rule tuples accumulated by ``rules_from_freq`` across all
        itemsets in levels after the first.
    """
    big_rule_list = []
    # Skip the first level and visit every itemset of each later level.
    for level in L[1:]:
        for freqSet in level:
            # Start from each item of the itemset as a single-item consequent.
            singletons = [frozenset({item}) for item in freqSet]
            rules_from_freq(
                freqSet, singletons, support_data, big_rule_list, minConf=min_conf
            )
    return big_rule_list