机器学习-Python编写FP-growth进行关联性分析

代码及数据集下载:FP-growth
当数据集较大时,Apriori算法需要多次扫描整个数据集,处理较慢,因此需要一个更加高效的算法。FP-growth在寻找频繁项集时只需扫描数据集两次。虽然该算法能够高效地发现频繁项集,但它并不能发现关联规则。

循环一遍数据集:
    计算单个元素项出现的频度,保存到头指针表
根据最小频数筛选单个元素项
循环一遍数据集:
    将每个数据的元素按照头指针表排序
    创建FP树

创建FP树

当前项是否在当前节点的子节点中
    如果不在就创建新的子节点
    如果在就将该子节点的计数加上这条数据的出现次数
新建节点时,该项的头指针是否为none
    如果为none则指向当前节点
    如果不为none则沿指针找到最后一个点,并将最后一个点指向当前节点
import numpy as np
def loadSimpDat():
    """Return the six-transaction toy data set used by the demo below.

    Each transaction is a list of single-character item names. A new list
    of lists is built on every call.
    """
    # One string per transaction; every character is one item.
    rows = (
        'rzhjp',
        'zyxwvuts',
        'z',
        'rxnos',
        'yrxzqtp',
        'yzxeqstm',
    )
    return [list(row) for row in rows]

def createInitSet(dataSet):
    """Collapse a list of transactions into a ``{frozenset: count}`` dict.

    Every transaction is converted to a ``frozenset`` of its items;
    transactions containing the same items share one key whose value is
    the number of times that item set occurred in ``dataSet``.
    """
    counts = {}
    for transaction in dataSet:
        key = frozenset(transaction)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

class treeNode:
    """A single node of the FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        # Item label carried by this node.
        self.name = nameValue
        # Occurrence count accumulated on this node.
        self.count = numOccur
        # Parent node; None marks the root.
        self.parent = parentNode
        # Child nodes keyed by item label.
        self.children = {}
        # Next node carrying the same item (header-table chain), or None.
        self.nodeLink = None

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count = self.count + numOccur

    def disp(self, ind=1):
        """Print this node and its whole subtree, one node per line.

        ``ind`` is the number of leading spaces for this node; each level
        below is indented by one more.
        """
        print(' ' * ind, self.name, ' ', self.count)
        deeper = ind + 1
        for label in self.children:
            self.children[label].disp(deeper)

def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table from counted transactions.

    Parameters:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. a ``frozenset``) to the number of times it occurs.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        ``(retTree, headerTable)`` where ``retTree`` is the root
        ``treeNode`` (named ``'Null Set'``) and ``headerTable`` maps every
        kept item to ``[total_count, first_node_or_None]``.  Returns
        ``(None, None)`` when no item reaches ``minSup``.
    """
    # Pass 1: total support of every individual item.
    support = {}
    for trans, count in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + count

    # Keep only frequent items; slot 1 will hold the head of the node chain.
    headerTable = {item: [total, None]
                   for item, total in support.items() if total >= minSup}
    if not headerTable:
        return None, None

    # One global ordering shared by every transaction: descending support,
    # ties resolved by header-table order (sorted() is stable).  Sorting each
    # transaction by support alone would leave tied items in arbitrary set
    # order, so the same pair of items could end up on differently ordered
    # paths and their joint support would be split or lost during mining.
    ranked = sorted(headerTable, key=lambda item: headerTable[item][0],
                    reverse=True)
    rank = {item: position for position, item in enumerate(ranked)}

    retTree = treeNode('Null Set', 0, None)

    # Pass 2: insert each transaction's frequent items in rank order.
    for transSet, count in dataSet.items():
        orderedItems = sorted((item for item in transSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable



def updateTree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the FP-tree below ``inTree``.

    Parameters:
        items: sequence of item labels, already in the order they should
            appear along the path.  An empty sequence is a no-op.
        inTree: node under which the path starts.
        headerTable: dict mapping item -> ``[total_count, first_node]``;
            slot 1 is filled in, or its chain extended, for every node
            created here.
        count: occurrence count to add to every node on the path.
    """
    # Walk down iteratively: no per-item recursion and no repeated slicing
    # of the remaining items.
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is None:
            child = treeNode(item, count, node)
            node.children[item] = child
            # A new node must be reachable from the header table's chain.
            entry = headerTable[item]
            if entry[1] is None:
                entry[1] = child
            else:
                updateHeader(entry[1], child)
        else:
            child.inc(count)
        node = child

def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of a ``nodeLink`` chain.

    Starting at ``nodeToTest``, follow ``nodeLink`` until a node whose
    ``nodeLink`` is ``None`` is reached, then point that node's
    ``nodeLink`` at ``targetNode``.
    """
    tail = nodeToTest
    while tail.nodeLink is not None:
        tail = tail.nodeLink
    tail.nodeLink = targetNode


def ascendTree(leafNode, prefixPath):
    """Append to ``prefixPath`` the names on the way from ``leafNode`` up.

    The name of ``leafNode`` comes first, followed by each ancestor in
    turn.  A node whose ``parent`` is ``None`` (the root) is not appended.
    ``prefixPath`` is modified in place; nothing is returned.
    """
    current = leafNode
    while current.parent is not None:
        prefixPath.append(current.name)
        current = current.parent

def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern base reachable from a node chain.

    Parameters:
        basePat: the item being mined; not used by the computation and
            kept only for interface compatibility.
        treeNode: first node of the item's ``nodeLink`` chain, or ``None``.
            (The parameter name shadows the ``treeNode`` class inside this
            function; the class is not needed here.)

    Returns:
        dict mapping ``frozenset`` of ancestor item names (excluding the
        node itself and the root) to the summed ``count`` of the chain
        nodes that have exactly that ancestor set.  Nodes sitting directly
        under the root contribute nothing.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        # Climb from the parent upward; the root (parent is None) is skipped.
        ancestors = []
        above = node.parent
        while above is not None and above.parent is not None:
            ancestors.append(above.name)
            above = above.parent
        if ancestors:
            key = frozenset(ancestors)
            # Accumulate rather than overwrite: two chain nodes can share the
            # same ancestor *set* when their paths list it in different order.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats

def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively append item sets mined from a header table.

    Parameters:
        inTree: the tree that ``headerTable`` belongs to; not read here.
        headerTable: dict mapping item -> ``[count, first_node]``.
        minSup: minimum count passed on to ``createTree`` when building
            each conditional tree.
        preFix: set of items already fixed on this recursion branch.
        freqItemList: output list; every item set found is appended to it.
    """
    # Visit items from the lowest header count to the highest.
    ascending = sorted(headerTable.items(), key=lambda entry: entry[1][0])
    for basePat, _ in ascending:
        extended = preFix.copy()
        extended.add(basePat)
        freqItemList.append(extended)

        # Build the conditional tree for this item from its prefix paths.
        firstNode = headerTable[basePat][1]
        condTree, condHeader = createTree(findPrefixPath(basePat, firstNode),
                                          minSup)
        if condHeader is None:
            continue
        mineTree(condTree, condHeader, minSup, extended, freqItemList)



# Demo: mine the toy transactions for item sets, using a minimum count of 3
# both for the initial tree and for every conditional tree.
data = createInitSet(loadSimpDat())
tree, header = createTree(data, minSup=3)
freqItems = []
mineTree(tree, header, minSup=3, preFix=set(), freqItemList=freqItems)
print(freqItems)

你可能感兴趣的:(机器学习)