数据挖掘习题选做--第六章:Apriori算法、FP-growth

数据挖掘概念与技术习题选做

第六章习题

数据挖掘习题选做--第六章:Apriori算法、FP-growth_第1张图片

(1) 用python简单实现Apriori算法

# -*- coding: utf-8 -*-
__author__ = "Yunfan Yang"

def gen_L1(TID):
    """从事务集中产生频繁1项集"""
    initial_C1 = {}        # 定义一个空字典用于统计初始项集信息,键值对形如{"M": 3}

    for tid in TID:
        for item in tid:
            if item not in initial_C1.keys():
                initial_C1[item] = 1
            else:
                initial_C1[item] += 1

    # 候选1项集
    C1 = []
    for key, value in initial_C1.items():
        tmp_tuple = ([key], value)
        C1.append(tmp_tuple)
    # C1结果为[(['M'], 3), (['O'], 4), (['N'], 2), (['K'], 5), (['E'], 4), (['Y'], 3), (['D'], 1), (['A'], 1), (['U'], 1), (['C'], 2), (['I'], 1)]

    # 频繁1项集
    L1 = []
    for item in C1:
        if item[1] / len(TID) >= min_sup:
            L1.append(item)

    # L1结果为[(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)]
    return L1

def generateC_k(Lk_1):
    """产生候选k项集的函数"""
    C_k = []

    # 执行连接步骤
    for i in range(len(Lk_1)-1):     # 遍历Lk_1中的项集
        # print(C)
        for j in range(i+1, len(Lk_1)):
            if len(Lk_1[i][0]) <= 1:    # 频繁项集只有1项,则直接连接,产生候选项
                C = Lk_1[i][0][:]  # 复制Lk_1[i][0]给C
                C.append(Lk_1[j][0][-1])
            elif len(Lk_1[i][0]) > 1:
                if if_equal(Lk_1[i][0][:-1], Lk_1[j][0][:-1]) and Lk_1[i][0][-1] != Lk_1[j][0][-1]: # 连接条件
                    C = Lk_1[i][0][:]
                    C.append(Lk_1[j][0][-1])   # 连接步
                else:
                    break
            if has_infrequent_subset(C, Lk_1):       # 剪枝步:删除非频繁候选
                pass
            else:
                C_k.append(C)
    return C_k

# Lk_1 = [(['O', 'K'], 3), (['O', 'E'], 3)]
def if_equal(A, B):
    """判断两个长度相等的列表是否相等"""
    for i in range(len(A)):
        if A[i] == B[i]:
            continue
        else:
            return False
    return True

# L1 = [(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)]
# C = ['M', 'I']
def has_infrequent_subset(C, Lk_1):
    """利用先验知识,判断候选项的子集是否有子集不属于频繁集"""
    L = []  # 创建一个空列表只包含频繁项集,形如[['M'], ['O']]
    for i in range(len(Lk_1)):
        L.append(Lk_1[i][0])
    # print(L)
    for element in subset(C):
        # print(element)
        if element not in L:   # 如果候选项的子集不在Lk_1中,返回True
            return True
    return False

def subset(C):
    """产生一个列表的长度减1项子集,比如[1,2,3]生成[[1,2],[1,3],[2,3]]"""
    result = []
    for i in range(len(C)):
        copyC = C[:]
        copyC.pop(i)
        subsetC = copyC
        result.append(subsetC)
    return result

def AinB(A, B):
    """定义函数检测列表B是否全部包含列表A的元素"""
    for a in A:
        if a in B:
            continue
        else:
            return False
    return True

def main_apriori(Lk_1):
    """Apriori主程序"""

    while Lk_1[-1] != []:
        Ck = generateC_k(Lk_1[-1])   # 产生候选项集
        # print(Ck)
        Lk = []                     # 初始化每次要生成的频繁项集
        for i in range(len(Ck)):
            count = 0               # 初始化计数
            for tid in TID:    # 扫描事务库,进行计数
                if AinB(Ck[i], tid):   # 判断候选项集中的候选项是否出现在事务集的事务中
                    count += 1
                    lk = (Ck[i], count)

            if lk[1] / len(TID) >= min_sup:  # 判断是否满足最小支持度
                Lk.append(lk)
        Lk_1.append(Lk)

    return Lk_1      # 得出每次生成的频繁项集列表

def gen_rule_set(Lk_1):
    """生成只包含关联规则项集的列表"""
    # 因为最后的频繁集的所有非空子集及支持度计数都出现在列表Lk_1中,故可以跟据Lk_1的列表寻找关联规则
    # 首先生成一个新列表只包含最终频繁集的及其非空子集
    rule_set = []
    for i in range(len(Lk_1) - 1):  # 遍历Lk_1的每个频繁项集
        for L in Lk_1[i]:  # 对于每个频繁项
            if AinB(L[0], Lk_1[-2][0][0]):  # 如果频繁项的所有元素都在最终频繁项中,则取出该列表至rule_set中
                rule_set.append(L)
    return rule_set

def gen_rule(rule_set):
    """生成规则"""
    for i in range(len(rule_set)):
        for j in range(len(rule_set)):    # 遍历规则列表rule_set
            if not AinB(rule_set[i][0], rule_set[j][0]) and not AinB(rule_set[j][0], rule_set[i][0]) and (
                len(rule_set[i][0]) + len(rule_set[j][0])) <= len(rule_set[-1][0]): # 取出没有项本身的另一个项,生成规则
                rule = [rule_set[i], rule_set[j]]
                join = rule_set[i][0] + rule_set[j][0]   # 将规则里的两个项集合并,即取并集

                for k in range(len(rule_set)):   # 再一次遍历关联规则的项集合
                    if len(join) == len(rule_set[k][0]) and AinB(join, rule_set[k][0]):  # 取出并集的支持度计数
                        rule_union = rule_set[k]
                        # print(rule_union)
                        conf = rule_union[1] / rule[0][1]     # 计算置信度
                        if conf >= min_conf:                  # 判断是否满足最小置信度
                            print(rule[0][0], '-->', rule[1][0], '置信度为:{}%'.format(conf*100))

if __name__ == "__main__":

    # 原始事务集
    t100 = ('M', 'O', 'N', 'K', 'E', 'Y')
    t200 = ('D', 'O', 'N', 'K', 'E', 'Y')
    t300 = ('M', 'A', 'K', 'E')
    t400 = ('M', 'U', 'C', 'K', 'Y')
    t500 = ('C', 'O', 'O', 'K', 'I', 'E')
    TID = [t100, t200, t300, t400, t500]

    # 设置最小支持度以及置信度
    min_sup = 0.6
    min_conf = 0.8

    # # 原始事务集
    # t100 = ('I1', 'I2', 'I5')
    # t200 = ('I2', 'I4')
    # t300 = ('I2', 'I3')
    # t400 = ('I1', 'I2', 'I4')
    # t500 = ('I1', 'I3')
    # t600 = ('I2', 'I3')
    # t700 = ('I1', 'I3')
    # t800 = ('I1', 'I2', 'I3', 'I5')
    # t900 = ('I1', 'I2', 'I3')
    # TID = [t100, t200, t300, t400, t500, t600, t700, t800, t900]
    #
    # # 设置最小支持度以及置信度
    # min_sup = 2/9
    # min_conf = 0.7

    Lk_1 = [gen_L1(TID)]   # 将每个频繁项集作为一个子列表储存在Lk_1中
    Lk_1 = main_apriori(Lk_1)
    print("\n生成的频繁项集列表为:", Lk_1[:-1])

    Lk = Lk_1[-2]
    print("最后生成的频繁项集为:", Lk)

    rule_set = gen_rule_set(Lk_1)
    print("产生关联规则的项集为:", rule_set)

    print("\n生成的强关联规则有:")
    gen_rule(rule_set)

运行结果为:

生成的频繁项集列表为: [[(['M'], 3), (['O'], 4), (['K'], 5), (['E'], 4), (['Y'], 3)], [(['M', 'K'], 3), (['O', 'K'], 3), (['O', 'E'], 3), (['K', 'E'], 4), (['K', 'Y'], 3)], [(['O', 'K', 'E'], 3)]]
最后生成的频繁项集为: [(['O', 'K', 'E'], 3)]
产生关联规则的项集为: [(['O'], 4), (['K'], 5), (['E'], 4), (['O', 'K'], 3), (['O', 'E'], 3), (['K', 'E'], 4), (['O', 'K', 'E'], 3)]

生成的强关联规则有:
['K'] --> ['E'] 置信度为:80.0%
['E'] --> ['K'] 置信度为:100.0%
['O', 'K'] --> ['E'] 置信度为:100.0%
['O', 'E'] --> ['K'] 置信度为:100.0%

(2) FP-growth 算法
再说吧,先往下进行,有空再写。另外Apriori写的着实太烂,还是太菜了,继续加油把。

这里写图片描述

数据挖掘习题选做--第六章:Apriori算法、FP-growth_第2张图片

你可能感兴趣的:(数据挖掘概念与技术读书笔记,python及算法学习)