频繁项集挖掘算法实现

本文代码依赖的哈希树模块(hashtree,提供 HashTree 与 sameNodes)的实现请参看上一篇博客:https://blog.csdn.net/CSDNgaoqingrui/article/details/93852227

 

# coding=utf-8

"""
A simple frequent itemset mining algorithm implementation
一种简单的频繁项集挖掘算法实现
"""

import itertools
from hashtree import HashTree, sameNodes
from functools import reduce


class FreqItem(object):
    """Level-wise frequent itemset miner backed by a candidate hash tree.

    Attributes:
        transactions: the non-empty input transactions, each one sorted.
        sup_theta: absolute support threshold, i.e. the relative threshold
            passed to ``__init__`` multiplied by the total number of input
            transactions (empty transactions are included in that total).
        freqset: flat list of frequent itemsets found by the last mining run;
            every itemset is a list of hash-tree nodes. Empty until
            ``genFreqItemSets`` has been called.
    """

    def __init__(self, transactions, sup_theta=.1):
        """Store the transactions and derive the absolute support threshold.

        Args:
            transactions: iterable of transactions; each transaction is an
                iterable of mutually sortable items.
            sup_theta: relative minimum support, as a fraction of the number
                of input transactions.
        """
        # Materialize once so one-shot iterables (generators, iterators) can be
        # both iterated and counted.
        transactions = list(transactions)
        self.transactions = [sorted(t) for t in transactions if t]
        # The threshold is relative to ALL input transactions, empty ones too.
        self.sup_theta = sup_theta * len(transactions)
        self.freqset = []

    def filterCandidates(self, cand):
        """Count the support of candidate itemsets and filter them.

        A HashTree is built from ``cand`` and asked to count the stored
        transactions; the result is whatever
        ``hashtree.getNodes(self.sup_theta)`` returns.
        NOTE(review): presumably those are the candidates whose support
        reaches the threshold -- confirm against the HashTree implementation.

        Args:
            cand: list of candidate itemsets, each a list of item names.
        """
        hashtree = HashTree(cand)
        hashtree.count(self.transactions)
        return hashtree.getNodes(self.sup_theta)

    def freqOneSet(self):
        """Return the frequent 1-itemsets ordered by item name."""
        distinct_items = {item for t in self.transactions for item in t}
        singletons = [[item] for item in distinct_items]
        return sorted(self.filterCandidates(singletons),
                      key=lambda nodes: nodes[0].name)

    def genNextCand(self, preItems):
        """Build (k+1)-item candidates by joining k-itemsets sharing a prefix.

        ``preItems`` is scanned for maximal runs [start, end) of consecutive
        itemsets whose first k-1 nodes match according to ``sameNodes``; e.g.
        [1,2,3,4] and [1,2,3,5] share the prefix [1,2,3] and belong to one
        run. Every 2-combination inside a run yields one candidate: the first
        itemset extended with the last node of the second.

        Only adjacent itemsets are grouped, so itemsets with equal prefixes
        must be next to each other in ``preItems`` for all joins to be found.

        Args:
            preItems: list of k-itemsets, each a list of hash-tree nodes.

        Returns:
            List of candidate itemsets, each a list of item names.
        """
        joined = []
        total = len(preItems)
        start, end = 0, 0
        while start < total:
            if end < total and sameNodes(preItems[end][:-1], preItems[start][:-1]):
                end += 1
                continue
            # The run [start, end) is complete: emit all pairwise joins.
            joined.extend(
                first + [second[-1]]
                for first, second in itertools.combinations(preItems[start:end], 2)
            )
            start = end
        return [[node.name for node in items] for items in joined]

    def genFreqItemSets(self):
        """Mine all frequent itemsets and return them with their frequency.

        The mined node lists are cached in ``self.freqset``; later calls skip
        the mining step but return the same formatted result as the first.

        Returns:
            List of ``(item_names, frequency)`` tuples, in reverse order of
            discovery. ``frequency`` is read from the ``val`` attribute of the
            last node of each itemset.
        """
        if not self.freqset:
            levels = []
            current = self.freqOneSet()
            while current:
                levels.append(current)
                current = self.filterCandidates(self.genNextCand(current))
            self.freqset = [itemset for level in levels for itemset in level]

        return [
            ([node.name for node in itemset], itemset[-1].val)
            for itemset in reversed(self.freqset)
        ]


if __name__ == '__main__':
    # Demo: mine a tiny hand-written data set with a 30% relative support
    # threshold and print every frequent itemset with its frequency.
    demo_transactions = [
        [1, 2, 3],
        [1, 2, 4],
        [2, 4, 6, 8],
        [1, 3, 5, 7],
        [5, 7, 2],
        [1, 2, 3, 5],
    ]
    miner = FreqItem(demo_transactions, sup_theta=.3)
    frequent_itemsets = miner.genFreqItemSets()
    print(frequent_itemsets)

 

你可能感兴趣的:(NLP)