Python 3.4 Apriori Algorithm

'''
Approach:  1. Generate L1;
           2. Generate Lk from Ck;
           3. Generate Ck+1 from Lk;
           4. Obtain the frequent itemsets (only support is used up to this point);
           5. Generate association rules from the frequent itemsets (this step uses confidence).
'''


import collections

datasets = [
    ["I1","I2","I5"],
    ["I2","I4"],
    ["I2","I3"],
    ["I1","I2","I4"],
    ["I1","I3"],
    ["I2","I3"],
    ["I1","I3"],
    ["I1","I2","I3","I5"],
    ["I1","I2","I3"]
    ]
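
# A rough sketch of the level-wise search for these 9 transactions (support counts
# worked out by hand, so treat them as illustrative rather than authoritative):
#   L1: I1(6)  I2(7)  I3(6)  I4(2)  I5(2)
#   L2: I1,I2(4)  I1,I3(4)  I1,I5(2)  I2,I3(4)  I2,I4(2)  I2,I5(2)
#   L3: I1,I2,I3(2)  I1,I2,I5(2)   <- the final frequent itemsets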

# Use an absolute minimum support, i.e. a raw count; relative support is a probability, obtained by dividing that count by the number of transactions n.
min_support = 2
min_confidence = 0.5
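# e.g. with the 9 transactions above, an absolute support of 2 corresponds to a relative support of 2/9 ≈ 0.22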
Lk_all = {}      # support counts of every frequent itemset found at any level (used later when computing confidence)

# Join the items of each transaction into a single comma-separated string
def getdatasetsString():
    result = []
    for transaction in datasets:
        tmp = ""
        for item in transaction:
            tmp += item
            if item != transaction[-1]:
                tmp += ","
        result.append(tmp)
    return result

#print(getdatasetsString())
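# Expected to print something like ['I1,I2,I5', 'I2,I4', 'I2,I3', ...];
# for transactions without duplicate items the loop above is equivalent to [",".join(t) for t in datasets].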

# Get L1 (the frequent 1-itemsets)
def getL1():
    tmp = []
    L1 = {}
    for transaction in datasets:
        for item in transaction:
            tmp.append(item)  
    rs = collections.Counter(tmp)      # use Counter from the standard library to tally occurrences instead of counting by hand
    for key in sorted(rs):
        if(rs[key] >= min_support):      # prune with the minimum support to obtain L1
            L1[key] = rs[key]
    return L1
#L1 = getL1()
#print("L1:",L1)

# Generate Lk from Ck: count each candidate against the transactions and prune by min_support
def getLk(Ck):
    Lk = {}
    result = getdatasetsString()
    #print(result)
    for items in Ck:
        tmpitems = items.split(",")
        count = 0
        for r in result:
            flag = True
            for item in tmpitems:
                if item in r:
                    continue
                else:
                    flag = False
                    break
            if flag:
                count = count + 1
        if (count >= min_support):
            Lk[items] = count
    return Lk
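
# Quick illustration (counts worked out by hand): getLk(["I1,I2", "I1,I4"]) should return
# {'I1,I2': 4}, since I1 and I4 only co-occur in a single transaction.
# Note that `item in r` is a substring test on the joined transaction string; it works for this
# dataset but would misfire if one item name were a prefix of another (e.g. "I1" vs "I10").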

# Generate Ck+1 from Lk (self-join followed by the Apriori pruning step)
def getCk1(Lk): 
    Ck= []
    keys = list(Lk.keys())
    keys1 = list(Lk.keys())
    for key in keys:
        keys1.remove(key)       # drop keys already visited so each pair is self-joined only once
        for key1 in keys1:
            key_last = key.split(",")[-1]        # get the last item of each itemset
            key1_last = key1.split(",")[-1]
            key_pre = key.replace(key_last,"")
            key1_pre = key1.replace(key1_last,"")
            if( key_pre == key1_pre):
                if (key_last>key1_last):
                    s = key_pre+key1_last+","+key_last      # keep the items in order; every (k-1)-subset of s must still be checked for frequency
                else:
                    s = key_pre+key_last+","+key1_last      # same as above, items already in order
                subc = getSubString(s)                  # all subsets of s with one item removed, stored in the same form as a candidate list
                subl = getLk(subc)                      # count those subsets against the transactions
                if(len(subc) == len(subl)):             # if nothing was pruned, every subset is frequent, so s is a valid candidate
                    Ck.append(s)
    return Ck
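
# Hand-worked illustration: starting from
# L2 = {'I1,I2': 4, 'I1,I3': 4, 'I1,I5': 2, 'I2,I3': 4, 'I2,I4': 2, 'I2,I5': 2},
# getCk1 should return ['I1,I2,I3', 'I1,I2,I5']; joins such as 'I1,I3,I5' are pruned
# because their subset 'I3,I5' is not frequent.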

# Iterate level by level until no new frequent itemsets appear; also fills Lk_all with every level's counts
def getFiallyLk(L1):
    Lk = L1
    while(True):
        Ck1 = getCk1(Lk)
        Lk1 = getLk(Ck1)
        #print("Lk:",Lk)
        for key in Lk:
            Lk_all[key] = Lk[key]
        if (len(Lk1) == 0):
            break
        Lk = Lk1
    return Lk

# Get every subset of s with one item removed, stored as a list in the same form as the candidate itemsets; e.g. 'abc' gives 'ab', 'ac' and 'bc'
def getSubString(s):
    substring = []
    items = s.split(",")
    for item in items:
        if item == items[-1]:
            substring.append(s.replace(","+item,""))
        else:
            substring.append(s.replace(item+",",""))
    return substring
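
# e.g. getSubString("I1,I2,I5") should return ['I2,I5', 'I1,I5', 'I1,I2']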

#Lk = getFiallyLk(L1)
#print("最终频繁项集:",Lk)

# Next, derive strong association rules from the frequent itemsets
from itertools import combinations


from functools import reduce    # in Python 3, reduce lives in functools

# f returns every non-empty proper subset of a list (all subsets except the empty set and the full list)
f = lambda s: reduce(list.__add__, (
    list(map(list, combinations(s, l)))
        for l in range(1, len(s))),
    [])
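
# e.g. f(['I1','I2','I5']) should give [['I1'], ['I2'], ['I5'], ['I1','I2'], ['I1','I5'], ['I2','I5']]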

# Derive the strong association rules (confidence >= min_confidence)
def getRule(Lk):
    rules = {}
    for Lk_item in Lk:
        items = Lk_item.split(",")
        #n = len(datasets)      # both probabilities would be divided by n, so n cancels out and never needs to be computed
        N = Lk[Lk_item]           # confidence(X->Y) = P(XY)/P(X) = N(XY)/N(X); N here is N(XY)
        pre_item_list = f(items)
        for pre_item in pre_item_list:
            back_item = sorted(list(set(items)^set(pre_item)) )         # difference of the two lists, i.e. the rule's consequent
            #print(pre_item,"=>",back_item)
            pre_s = ""
            for item in pre_item:
                if len(pre_item) == 1:
                    pre_s += item
                else:
                    if item == pre_item[-1]:
                        pre_s += item
                    else:
                        pre_s += item + ","
            back_s = ""
            for item in back_item:
                if len(back_item) == 1:
                    back_s += item
                else:
                    if item == back_item[-1]:
                        back_s += item
                    else:
                        back_s += item + ","
            # now look up N(X), the support count of the antecedent
            N1 = Lk_all[pre_s]
            p = N/N1
            if(p >= min_confidence):
                rule = pre_s+" => "+back_s
                #print(rule)
                rules[rule] = p
    return rules        
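
# Hand-worked illustration: the itemset 'I1,I2,I5' and its antecedent 'I1,I5' both have
# support count 2, so the rule I1,I5 => I2 gets confidence 2/2 = 1.0, whereas
# I1 => I2,I5 only reaches 2/6 ≈ 0.33 and is dropped by min_confidence = 0.5.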
#print(Lk_all)  # holds the support count of every frequent itemset found at any level

if __name__ == "__main__":
    L1 = getL1()
    Lk = getFiallyLk(L1)
    print("最终频繁项集:",Lk)
    rules = getRule(Lk)
    for rule in rules:
        print(rule,":",rules[rule])
    '''
    C2 = getCk1(L1)
    print("C2:",C2)
    L2 = getLk(C2)
    print("L2:",L2)
    C3 = getCk1(L2)
    print("C3:",C3)
    L3 = getLk(C3)
    print("L3:",L3)
    '''

Screenshot of the final output:
[screenshot: console output of the final frequent itemsets and association rules]
