数据挖掘 Apriori 算法的 Python 简单实现

这周学习了数据挖掘中用于计算频繁项集的 Apriori 算法,
老师要求用 Python 实现一下,于是我按照《数据挖掘:概念与技术》第六章讲解的过程自己实现了一遍。
如果有大佬发现问题,欢迎提出意见。

实现如下

"""
Apriori algorithm
@author: liuyinxin
"""

# Set the support threshold to 2
support = 2


def load_data():
    """
    Read the transactions stored in ``test_data.txt``.

    Every line of the file becomes one transaction: the line is split on
    whitespace and each token is converted with ``int``. A line that holds
    only whitespace produces an empty transaction.

    :return: list of transactions, each a list of ints
    """
    with open('test_data.txt', 'r') as f:
        # Iterating the file yields the same lines as repeated readline()
        # calls and stops at end of file.
        return [[int(token) for token in line.split()] for line in f]


def freq_1_items(data, sup=2):
    """
    Find the frequent 1-itemsets.

    The support of an item is the number of transactions that contain it,
    so an item repeated inside a single transaction is counted only once.

    :param data: iterable of transactions, each an iterable of hashable,
        mutually comparable items
    :param sup: minimum number of transactions an item must appear in
    :return: sorted list of single-element lists, one per frequent item
    """
    counts = {}
    for transaction in data:
        # De-duplicate per transaction so repeats do not inflate support.
        for item in set(transaction):
            counts[item] = counts.get(item, 0) + 1
    return sorted([item] for item, seen in counts.items() if seen >= sup)


def check_join(a, b, k):
    """
    Tell whether two k-itemsets can be joined into a (k+1)-candidate.

    The join is allowed when the first ``k - 1`` items of ``a`` and ``b``
    are equal and the k-th item of ``a`` is smaller than the k-th item
    of ``b``.

    :param a: first itemset, an indexable sequence
    :param b: second itemset, an indexable sequence
    :param k: number of items in each itemset
    :return: True when ``a`` and ``b`` can be joined, otherwise False
    """
    last = k - 1
    # Any mismatch in the shared prefix rules the join out.
    if any(a[pos] != b[pos] for pos in range(last)):
        return False
    # Strict ordering on the final item avoids producing a pair twice.
    return a[last] < b[last]


def iscut(L, c):
    """
    Pruning test: decide whether candidate ``c`` must be cut.

    A candidate is cut when at least one of its subsets obtained by
    removing a single item is missing from ``L``. The previous version
    checked the opposite relation (whether every itemset of ``L`` was a
    subset of ``c``), which pruned valid candidates such as ``[1, 2]``
    when ``L`` was ``[{1}, {2}]``.

    :param L: frequent itemsets of the previous level (sets or lists)
    :param c: candidate itemset, one item larger than the itemsets in L
    :return: True when ``c`` must be pruned, False when it may be kept
    """
    c_s = frozenset(c)
    # The only proper subset of a single item is the empty set, which is
    # contained in every transaction, so there is nothing to prune on.
    if len(c_s) <= 1:
        return False
    # Hashable copies give constant-time membership tests.
    known = {frozenset(l) for l in L}
    return any((c_s - {item}) not in known for item in c_s)


def aprioir_gen(L, k):
    """
    Build the candidate (k+1)-itemsets from the k-itemsets in ``L``.

    Every pair of itemsets in ``L`` (earlier one first) is offered to
    ``check_join``. Accepted pairs are merged by appending the last item
    of the second itemset to a copy of the first. A merged candidate is
    dropped when ``iscut`` reports that it should be pruned.

    :param L: list of k-itemsets, each a list
    :param k: number of items in each itemset of L
    :return: list of candidate itemsets, in generation order
    """
    itemsets_as_sets = [set(itemset) for itemset in L]
    candidates = []
    for pos, left in enumerate(L):
        for right in L[pos + 1:]:
            if not check_join(left, right, k):
                continue
            merged = list(left) + [right[-1]]
            if iscut(itemsets_as_sets, merged):
                continue
            candidates.append(merged)
    return candidates


def aprioir(data, min_sup):
    """
    Run the level-wise Apriori search over ``data``.

    Starts from the frequent 1-itemsets and repeatedly generates
    candidates one item larger, counts how many transactions contain each
    candidate, and keeps those reaching ``min_sup``. The search stops once
    a level comes out empty; that empty level is still part of the result.

    :param data: list of transactions, each a list of items
    :param min_sup: minimum number of transactions containing an itemset
    :return: list of levels; entry ``i`` holds the sorted frequent
        (i+1)-itemsets, and the final entry is an empty list
    """
    levels = [freq_1_items(data, min_sup)]
    transactions = [set(row) for row in data]
    size = 1
    while levels[-1]:
        candidates = aprioir_gen(levels[-1], size)
        candidate_sets = [set(candidate) for candidate in candidates]

        # Count, per candidate index, the transactions that contain it.
        hits = {}
        for transaction in transactions:
            for idx, wanted in enumerate(candidate_sets):
                if wanted.issubset(transaction):
                    hits[idx] = hits.get(idx, 0) + 1

        # Keep only the candidates that reach the support threshold.
        frequent = sorted(
            candidates[idx] for idx, seen in hits.items() if seen >= min_sup
        )
        levels.append(frequent)
        size += 1
    return levels


# Mine the frequent itemsets of the loaded data using the module-level
# `support` threshold (instead of repeating the literal), then print one
# line per level, numbered from 1.
L = aprioir(load_data(), support)
for level, itemsets in enumerate(L, start=1):
    print('i = ', level, '  L=', itemsets)

输出结果

第 i 行就是频繁 i 项集的集合(最后一行为空,表示不存在更大的频繁项集):

i =  1   L= [[1], [2], [3], [4], [5]]
i =  2   L= [[1, 2], [1, 3], [1, 5], [2, 3], [2, 4], [2, 5]]
i =  3   L= [[1, 2, 3], [1, 2, 5]]
i =  4   L= []

你可能感兴趣的:(算法,python)