将数据集存储在一个特定的被称为FP树的结构中,之后从中发现频繁项集或者频繁项对,即经常在一块出现的元素项的集合。(发现频繁项集)
FP-growth算法只需要对数据库进行两次扫描,而Apriori算法对于每个潜在的频繁项集都会扫描数据集判定给定模式是否频繁。
优点:一般要快于Apriori
缺点:实现比较困难,在某些数据集上性能会下降
适用数据类型:标称型数据(由于存储的是集合,若要处理连续数据,需要量化为离散值)
class treeNode:
    """A node of the FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue      # item name stored at this node
        self.count = numOccur      # occurrence count along this path
        self.nodeLink = None       # link to the next node carrying the same item
        self.parent = parentNode   # parent node (needed to ascend the tree)
        self.children = {}         # child nodes keyed by item name

    def inc(self, numOccur):
        """Add numOccur to this node's count.

        Bug fix: the original reassigned (count = numOccur) instead of
        incrementing, so repeated occurrences were lost.
        """
        self.count += numOccur

    def disp(self, ind=1):
        """Print the subtree rooted here as indented text (debugging aid)."""
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)
# Quick smoke test of treeNode: build a tiny tree and display it.
rootNode = treeNode('pyramid', 9, None)
# Bug fix: parent must be the node itself, not the string 'pyramid'.
rootNode.children['eye'] = treeNode('eye', 13, rootNode)
rootNode.children['phoenix'] = treeNode('phoenix', 3, rootNode)
rootNode.disp()
def createTree(dataSet, minSup=1):
    """Build an FP-tree from dataSet and return (tree root, header table).

    dataSet: dict mapping frozenset(transaction) -> occurrence count.
    minSup:  minimum support count for an item to be kept.
    Returns (None, None) when no item reaches minSup.
    """
    headerTable = {}
    # First scan: count the frequency of every item.
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    # Drop infrequent items. list(...) avoids mutating the dict while
    # iterating its keys view (RuntimeError in Python 3).
    for k in list(headerTable.keys()):
        if headerTable[k] < minSup:
            del headerTable[k]
    freqItemSet = set(headerTable.keys())
    if len(freqItemSet) == 0:
        return None, None
    # Extend each header entry to [count, link-to-first-node].
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]
    retTree = treeNode('Null Set', 1, None)
    # Second scan: insert each transaction's frequent items, most frequent first.
    for tranSet, count in dataSet.items():
        localD = {}
        for item in tranSet:
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(),
                                                 key=lambda p: p[1],
                                                 reverse=True)]
            # Bug fix: the original assigned a tuple to the name `updateTree`
            # instead of calling the function, so the tree was never grown.
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable
def updateHeader(nodeToTest, targetNode):
    """Append targetNode to the end of the node-link chain starting at nodeToTest."""
    current = nodeToTest
    while current.nodeLink is not None:
        current = current.nodeLink
    current.nodeLink = targetNode
def updateTree(items, inTree, headerTable, count):
    """Grow the FP-tree with one sorted, frequency-filtered transaction.

    items:       items of the transaction, sorted by descending support.
    inTree:      node under which the first item is inserted.
    headerTable: {item: [count, link-to-first-node]} — node links get updated.
    count:       occurrence count of this transaction.
    """
    if items[0] in inTree.children:
        # Item already a child: bump its count.
        # Bug fix: original wrote inTree.children[items[0].inc(count)],
        # calling .inc on the key string instead of the child node.
        inTree.children[items[0]].inc(count)
    else:
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] is None:
            # Bug fix: original used == (comparison) where = (assignment)
            # was intended, so the header link was never set.
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            # Bug fix: original recursed into updateTree here instead of
            # linking the new node into the header chain.
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:
        # Recurse on the remaining items under the child just visited.
        updateTree(items[1:], inTree.children[items[0]], headerTable, count)
def loadSimpDat():
    """Return the small example transaction list used to exercise FP-growth."""
    return [
        ['r', 'z', 'h', 'j', 'p'],
        ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
        ['z'],
        ['r', 'x', 'n', 'o', 's'],
        ['y', 'r', 'x', 'z', 'q', 't', 'p'],
        ['y', 'z', 'x', 'e', 'q', 's', 't', 'm'],
    ]
def createInitSet(dataSet):
    """Convert a list of transactions to {frozenset(transaction): count}.

    Bug fix: the original always stored 1, so duplicate transactions
    (same item set appearing several times) lost their true count.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
# Demo: build the FP-tree for the sample data with a minimum support of 3.
simDat = loadSimpDat()
print(simDat)
initSet = createInitSet(simDat)
print(initSet)
myFPTree, myHeaderTab = createTree(initSet, 3)
print(myFPTree)
print(myHeaderTab)
myFPTree.disp()
def ascendTree(leafNode, prefixPath):
    """Append node names from leafNode up to (excluding) the root onto prefixPath."""
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern bases for basePat.

    Walks every occurrence of basePat via the header-table node links and
    returns {frozenset(prefix path): count of that occurrence}.
    """
    condPats = {}
    while treeNode is not None:
        # Bug fix: was a dict literal {}; must be a list since ascendTree
        # appends to it and it is sliced below.
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            # prefixPath[0] is basePat itself — exclude it from the base.
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats
def mineTree(inTree, headerTable, minSup, prefix, freqItemList):
    """Recursively mine frequent itemsets from an FP-tree.

    Appends each frequent itemset (a set) to freqItemList.
    """
    # Sort header items by ascending support. Each value is [count, node],
    # so key on p[1][0]: comparing the whole list would try to compare
    # treeNode objects when counts tie (TypeError in Python 3).
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:
        newFreqSet = prefix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])  # conditional pattern bases
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead is not None:
            # Bug fix: recurse with the extended prefix (newFreqSet), not the
            # old prefix — otherwise mined itemsets lose basePat.
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
P.S. 一个有用的过滤函数
def textParse(bigString):
    """Split raw text into lowercase tokens longer than 2 chars, URLs removed."""
    import re
    # Bug fix: the original character class sequence '[a-z][A-Z][0-9]' required
    # a literal lower-upper-digit triple; alternation matches URL bodies.
    urlsRemoved = re.sub(r'(http[s]?:[/][/]|www\.)([a-z]|[A-Z]|[0-9]|[.]|[-]|[/])*',
                         '', bigString)
    # Bug fix: \W+ instead of \W* — \W* matches the empty string, which makes
    # re.split break between every character in Python 3.7+.
    listOfTokens = re.split(r'\W+', urlsRemoved)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
总结一下:本章最后一节举了一个“从新闻网站点击流中挖掘”的例子,可以挖掘出点击率较多的新闻报道。