#FP树的数据结构
class treeNode:
    """A single node of an FP-tree.

    Attributes (all set by ``__init__``):
        name: the item this node stands for.
        count: the occurrence count accumulated on this node.
        nodeLink: the next node holding the same item; starts as ``None``.
        parent: the parent node handed to the constructor.
        children: dict mapping a child's item name to the child node.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        """Create a node named ``nameValue`` with count ``numOccur`` under ``parentNode``."""
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this node and its subtree as text, one node per line.

        ``ind`` is the number of leading spaces for this node; every level
        of children is printed with one more.
        """
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)
#构建FP树
def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table from counted transactions.

    Args:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. a frozenset) to the number of times it occurs.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        ``(root, headerTable)`` where ``root`` is the ``'Null Set'`` root
        node and ``headerTable`` maps each kept item to
        ``[total_count, first_node_for_that_item]``.
        ``(None, None)`` when no item reaches ``minSup``.
    """
    # Pass 1: total count of every item over all transactions.
    support = {}
    for trans, count in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + count

    # Keep only frequent items; each entry also holds the head of the
    # node-link chain for that item (filled in while the tree is built).
    headerTable = {
        item: [total, None]
        for item, total in support.items()
        if total >= minSup
    }
    if not headerTable:
        return None, None

    def orderKey(item):
        # Most frequent first. Ties are broken by repr() so that every
        # transaction orders equally frequent items the same way; relying on
        # the iteration order of each transaction would let the same items
        # end up on different branches.
        return (-headerTable[item][0], repr(item))

    retTree = treeNode('Null Set', 1, None)
    # Pass 2: insert each transaction, restricted to its frequent items.
    for tranSet, count in dataSet.items():
        # dict.fromkeys drops repeated items while keeping first-seen order.
        frequent = list(dict.fromkeys(
            item for item in tranSet if item in headerTable))
        if frequent:
            frequent.sort(key=orderKey)
            updateTree(frequent, retTree, headerTable, count)
    return retTree, headerTable
#用频繁项集更新FP树
def updateTree(items, inTree, headerTable, count):
    """Insert one ordered list of items into the tree below ``inTree``.

    Walks down from ``inTree`` one item at a time. An existing child has
    ``count`` added to it; a missing child is created with ``count`` and
    appended to that item's node-link chain in ``headerTable``.

    Args:
        items: the items of one transaction, already in insertion order.
            An empty sequence leaves the tree untouched.
        inTree: the node under which the path is inserted.
        headerTable: dict mapping item -> ``[count, first_node]``; slot 1 is
            updated when the first node for an item is created.
        count: the amount to add to every node on the path.
    """
    node = inTree
    # Iterative descent: no per-level list slicing and no recursion depth
    # limit for long transactions.
    for item in items:
        child = node.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = treeNode(item, count, node)
            node.children[item] = child
            entry = headerTable[item]
            if entry[1] is None:
                # First node for this item: it becomes the chain head.
                entry[1] = child
            else:
                updateHeader(entry[1], child)
        node = child
#更新头指针表,确保节点链接指向树中该元素项的每一个实例
def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the node-link chain that starts at ``nodeToTest``.

    Follows ``nodeLink`` from ``nodeToTest`` until a node whose ``nodeLink``
    is ``None`` is reached, then points that node's ``nodeLink`` at
    ``targetNode``.
    """
    tail = nodeToTest
    while tail.nodeLink is not None:
        tail = tail.nodeLink
    tail.nodeLink = targetNode
#挖掘频繁项集
#获取条件模式基
def ascendTree(leafNode, prefixPath):
    """Append the names on the path from ``leafNode`` up towards the root.

    Starting at ``leafNode`` and following ``parent`` links, the name of
    every node that has a parent is appended to ``prefixPath`` (in place).
    The node whose ``parent`` is ``None`` is not included.

    Args:
        leafNode: the node to start from.
        prefixPath: list that receives the names, starting node first.
    """
    node = leafNode
    # A loop instead of recursion, so deep paths cannot hit the recursion limit.
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
#该函数的第一个参数为头指针表字典中的key,第二个参数为对应的value值的第二个元素,即相似项链表的指针
def findPrefixPath(basePat, treeNode):
    """Collect the prefix paths of every node on a node-link chain.

    Args:
        basePat: the item the chain belongs to. Not used by the function;
            kept so existing callers keep working.
        treeNode: first node of the chain (note: this parameter name hides
            the ``treeNode`` class inside this function). ``None`` gives an
            empty result.

    Returns:
        dict mapping ``frozenset(prefix items)`` to a count. For each node on
        the chain the prefix is the set of ancestor names (the node itself
        excluded), and the node's ``count`` is added to that prefix's entry.
        Nodes without any ancestor items contribute nothing.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        prefixPath = []
        ascendTree(node, prefixPath)
        # prefixPath[0] is the node itself; only its ancestors form the prefix.
        if len(prefixPath) > 1:
            key = frozenset(prefixPath[1:])
            # Accumulate instead of overwrite: two chain nodes can share the
            # same set of ancestors, and both counts must be kept.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats
#递归查找频繁项集
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively collect frequent item sets from a header table.

    For every item in ``headerTable`` (lowest count first) the set
    ``preFix + {item}`` is appended to ``freqItemList``; then a conditional
    tree is built from that item's prefix paths and mined in turn with the
    new set as prefix.

    Args:
        inTree: the tree that belongs to ``headerTable``. Not used by the
            function; kept so existing callers keep working.
        headerTable: dict mapping item -> ``[count, first_node]``.
        minSup: minimum count passed on to ``createTree``.
        preFix: set of items already fixed on this branch of the recursion.
        freqItemList: output list; the found sets are appended in place.
    """
    # Order by the numeric count. Sorting by str([count, node]) would compare
    # counts as text ("10" before "3") and depend on object addresses.
    bigL = [item for item, _ in
            sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead is not None:
            print("conditional tree for:", newFreqSet)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
#数据集
def loadSimpDat():
    """Return the built-in sample data: six transactions of single-letter items."""
    simDat = [
        ['r', 'z', 'h', 'j', 'p'],
        ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
        ['z'],
        ['r', 'x', 'n', 'o', 's'],
        ['y', 'r', 'x', 'z', 'q', 't', 'p'],
        ['y', 'z', 'x', 'e', 'q', 's', 't', 'm'],
    ]
    return simDat
#元素项及其对应频数的字典
def createInitSet(dataSet):
    """Turn a list of transactions into a ``{frozenset(transaction): count}`` dict.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict whose keys are the transactions as frozensets and whose values
        are how many times that set of items occurs in ``dataSet``.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # Count repeats: assigning 1 here would drop every duplicate transaction.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
#测试FP-Growth算法
if __name__ == '__main__':
    # Demo run on the built-in sample data.
    minSup = 3  # minimum support count
    simDat = loadSimpDat()
    initSet = createInitSet(simDat)
    myFPtree, myHeaderTab = createTree(initSet, minSup)
    myFreqList = []
    # createTree returns (None, None) when no item reaches minSup; in that
    # case there is nothing to display or mine.
    if myFPtree is not None:
        myFPtree.disp()  # show the FP-tree as text
        mineTree(myFPtree, myHeaderTab, minSup, set(), myFreqList)
    print(myFreqList)
# Sample output from one run of the script (ordering may differ between runs):
# conditional tree for: {'t'}
# conditional tree for: {'t', 'x'}
# conditional tree for: {'s'}
# conditional tree for: {'y'}
# conditional tree for: {'z', 'y'}
# conditional tree for: {'x', 'y'}
# conditional tree for: {'z', 'x', 'y'}
# conditional tree for: {'x'}
# [{'r'}, {'t'}, {'t', 'z'}, {'t', 'x'}, {'t', 'z', 'x'}, {'s'}, {'x', 's'}, {'y'}, {'t', 'y'}, {'z', 'y'}, {'t', 'z', 'y'}, {'x', 'y'}, {'t', 'x', 'y'}, {'z', 'x', 'y'}, {'t', 'z', 'x', 'y'}, {'x'}, {'z', 'x'}, {'z'}]