# Minor modifications + added comments
"""
# Python 2.7
# Filename: apriori.py
# Author: llhthinker
# Email: hangliu56[AT]gmail[DOT]com
# Blog: http://www.cnblogs.com/llhthinker/p/6719779.html
# Date: 2017-04-16
"""
"""
data_set = list[list[]]
L = list[set(frozenset())]
support_data = dic{frozenset()} = value (support count)
C1 = set(frozenset())
Lk = set(frozenset())
item_count = dic{frozenset()}
Lksub1 = set(frozenset())
Ck_item = frozenset()
Ck = set(frozenset())
"""
#return a list(list)
def load_data_set():
    """
    Build the hard-coded sample transaction database used by the demo.

    The original author attributes the data to "Data Mining: Concepts and
    Techniques, 3rd Edition".

    Returns:
        A list of transactions (a list of lists). Each transaction is a
        list of item names.
    """
    # A fresh list of lists is built on every call.
    return [
        ['s1', 's2', 's5'],
        ['s2', 's4'],
        ['s2', 's3'],
        ['s1', 's2', 's4'],
        ['s1', 's3'],
        ['s2', 's3'],
        ['s1', 's3'],
        ['s1', 's2', 's3', 's5'],
        ['s1', 's2', 's3'],
    ]
#return a set(frozenset)
def create_C1(data_set):
    """
    Collect the candidate 1-itemsets C1 with a single scan of data_set.

    Args:
        data_set: A list of transactions. Each transaction contains several items.
    Returns:
        C1: A set with one single-element frozenset per distinct item that
            appears in any transaction.
    """
    # Each item is wrapped in a frozenset so it is hashable and can be a
    # set member or a dictionary key; the enclosing set removes duplicates.
    return {
        frozenset([entry])
        for transaction in data_set
        for entry in transaction
    }
# returns a bool -- the check used by the **pruning step**
def is_apriori(Ck_item, Lksub1):
    """
    Check whether a candidate k-itemset satisfies the Apriori property.

    The candidate passes only if every (k-1)-subset obtained by removing
    one of its items is present in Lksub1.

    Args:
        Ck_item: A candidate k-itemset (a frozenset of items).
        Lksub1: Lk-1, a set which contains the frequent (k-1)-itemsets.
    Returns:
        True: every (k-1)-subset of Ck_item is in Lksub1.
        False: at least one (k-1)-subset is missing from Lksub1.
    """
    # all() stops at the first missing subset, like an early return.
    return all(
        (Ck_item - frozenset([member])) in Lksub1
        for member in Ck_item
    )
# returns a set(frozenset()) -- the **join (self-connection) step**
def create_Ck(Lksub1, k):
    """
    Create Ck, the set of candidate k-itemsets, by joining Lk-1 with itself.

    Two (k-1)-itemsets are joined when the first k-2 items of their sorted
    forms are equal. The union is kept only if it passes the Apriori
    pruning check done by is_apriori.

    Args:
        Lksub1: Lk-1, a set which contains the frequent (k-1)-itemsets.
        k: the number of items in each candidate to generate.
    Return:
        Ck: a set which contains the candidate k-itemsets.
    """
    Ck = set()
    itemsets = list(Lksub1)  # indexable view so unordered pairs can be enumerated
    count = len(itemsets)
    if count < 2:
        # No pair to join, so nothing can be generated.
        return Ck

    # Sort each itemset once and keep only the (k-2)-item prefix used by the
    # join test, instead of re-sorting both itemsets for every pair.
    prefixes = [sorted(itemset)[0:k - 2] for itemset in itemsets]

    for i in range(count):
        for j in range(i + 1, count):
            if prefixes[i] == prefixes[j]:
                # Join step: union of the two (k-1)-itemsets.
                Ck_item = itemsets[i] | itemsets[j]
                # Prune step: drop candidates with an infrequent subset.
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck
# returns a set(frozenset) -- **scanning the data set**
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """
    Generate Lk by dropping the candidates in Ck that are not frequent.

    Args:
        data_set: A list of transactions. Each transaction contains several items.
        Ck: A set which contains the candidate k-itemsets.
        min_support: The minimum support, as a fraction of the number of
            transactions.
        support_data: A dictionary that is updated in place. The key is a
            frequent itemset and the value is its support COUNT (the number
            of transactions containing it), not the support ratio.
    Returns:
        Lk: A set which contains the frequent k-itemsets.
    """
    Lk = set()
    if not Ck:
        # No candidates: nothing can be frequent, so skip the scan.
        return Lk

    item_count = {}
    for t in data_set:
        # Build the transaction's item set once rather than letting
        # issubset() rebuild it from the list for every candidate.
        transaction_items = frozenset(t)
        for item in Ck:
            if item.issubset(transaction_items):
                item_count[item] = item_count.get(item, 0) + 1

    t_num = float(len(data_set))  # float so the ratio is never integer-divided
    for item, count in item_count.items():
        if (count / t_num) >= min_support:
            Lk.add(item)
            support_data[item] = count
    return Lk
#return L = list(set(frozenset)) , support_data = dic()
def generate_L(data_set, k, min_support):
    """
    Generate the frequent itemsets of size 1 up to k.

    Args:
        data_set: A list of transactions. Each transaction contains several items.
        k: Maximum number of items for all frequent itemsets.
        min_support: The minimum support.
    Returns:
        L: The list of Lk; L[0] holds the frequent 1-itemsets and each later
            entry holds the frequent itemsets of the next size.
        support_data: A dictionary filled in by generate_Lk_by_Ck. The key is
            a frequent itemset and the value is what that function records
            for it.
    """
    support_data = {}

    # Level 1: candidates come straight from the items in the data.
    previous_level = generate_Lk_by_Ck(
        data_set, create_C1(data_set), min_support, support_data)
    L = [previous_level]

    # Levels 2..k: each level's candidates are built from the level before.
    for size in range(2, k + 1):
        candidates = create_Ck(previous_level, size)
        previous_level = generate_Lk_by_Ck(
            data_set, candidates, min_support, support_data)
        L.append(previous_level)

    return L, support_data
def generate_big_rules(L, support_data, min_conf):
    """
    Generate big rules from frequent itemsets.

    For every frequent itemset, each previously visited frequent itemset
    that is a subset of it is tried as the consequent. The confidence of
    the rule is support(freq_set) / support(freq_set - sub_set).

    Args:
        L: The list of Lk, ordered by itemset size.
        support_data: A dictionary. The key is a frequent itemset and the
            value is its support (a count or a ratio; only the quotient of
            two values is used).
        min_conf: Minimal confidence.
    Returns:
        big_rule_list: A list which contains all big rules. Each big rule is
            a 3-tuple (antecedent, consequent, confidence).
    Raises:
        KeyError: if a required itemset is missing from support_data.
    """
    big_rule_list = []
    sub_set_list = []  # frequent itemsets seen so far, all possible consequents
    for level in L:
        for freq_set in level:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    antecedent = freq_set - sub_set
                    # Force float division: support_data may hold integer
                    # counts, and under Python 2 int / int truncates, which
                    # would turn every confidence below 1 into 0.
                    conf = float(support_data[freq_set]) / support_data[antecedent]
                    big_rule = (antecedent, sub_set, conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        big_rule_list.append(big_rule)
            sub_set_list.append(freq_set)
    return big_rule_list
if __name__ == "__main__":
    # Demo: mine the sample data set and print each level of frequent
    # itemsets together with the value stored for it in support_data.
    data_set = load_data_set()  # load data
    L, support_data = generate_L(data_set, k=3, min_support=0.2)
    for Lk in L:
        if not Lk:
            # generate_L still returns an (empty) entry for sizes that have
            # no frequent itemsets; there is no element to take the size
            # from, so skip the level instead of raising IndexError.
            continue
        print ("="*50)
        print ("frequent " + str(len(next(iter(Lk)))) + "-itemsets\t\tsupport")
        print ("="*50)
        for freq_set in Lk:
            print (freq_set, support_data[freq_set])
        print ()
    # Rule generation is currently disabled:
    # big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)
    # print ("Big Rules")
    # for item in big_rules_list:
    #     print (item[0], "=>", item[1], "conf: ", item[2])