FP-growth(频繁模式增长)
- 数据库的第一遍扫描用来统计出现的频率;第二遍扫面中考虑那些频繁元素
优点:
缺点:
适用数据类型:
简单数据集及数据包装器
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet):
retDict = {}
for trans in dataSet:
retDict[frozenset(trans)] = 1
return retDict
simpDat = loadSimpDat()
initSet = createInitSet(simpDat)
initSet
{frozenset({'h', 'j', 'p', 'r', 'z'}): 1,
frozenset({'s', 't', 'u', 'v', 'w', 'x', 'y', 'z'}): 1,
frozenset({'z'}): 1,
frozenset({'n', 'o', 'r', 's', 'x'}): 1,
frozenset({'p', 'q', 'r', 't', 'x', 'y', 'z'}): 1,
frozenset({'e', 'm', 'q', 's', 't', 'x', 'y', 'z'}): 1}
构建FP树
#节点数据结构
class treeNode:
def __init__(self,nameValue,numOccur,parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
self.parent = parentNode
self.children = {}
def inc(self,numOccur):
self.count += numOccur
def disp(self,ind=1):
print(' '*ind,self.name,' ',self.count)
for child in self.children.values(): #self.children.values()是一个节点
child.disp(ind+1)
rootNode = treeNode('pyramid',9,None) #创建一个单节点
rootNode.children['eye'] = treeNode('eye', 13, None)
rootNode.children['phoenix'] = treeNode('phoenix', 3, None)
rootNode.disp()
pyramid 9
eye 13
phoenix 3
#构建FP树
def createTree(dataSet, minSup=1):
headerTable = {}
#第一次扫描D
for trans in dataSet:#每个事务
for item in trans:#某个事务的每个元素
headerTable[item] = headerTable.get(item, 0) + dataSet[trans] #统计每个元素出现的频率
headerTableCopy = headerTable.copy()
for k in headerTableCopy.keys(): #过滤
if headerTable[k] < minSup:
del(headerTable[k])
#单元素频繁项集
freqItemSet = set(headerTable.keys())
# print ('freqItemSet: ',freqItemSet)#freqItemSet: {'y', 'z', 's', 't', 'x', 'r'}
if len(freqItemSet) == 0: return None, None #若所有项都不频繁
for k in headerTable:
headerTable[k] = [headerTable[k], None] #格式化 headerTable
# print(headerTable)#{'z': [5, None], 'r': [3, None], 'y': [3, None], 's': [3, None], 't': [3, None], 'x': [4, None]}
retTree = treeNode('Null Set', 1, None) #创建根节点
#第二次扫描D
for tranSet, count in dataSet.items(): #遍历每一个事务
localD = {}
for item in tranSet: #遍历每一个元素
if item in freqItemSet:#若元素是频繁的
localD[item] = headerTable[item][0] #记录个数
# print(localD)#{'z': 5, 'r': 3}
if len(localD) > 0:#一个事务中,频繁项至少有一个,则增长分支
#先排序,方便增长分支,排序之后的头指针表
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
# print(orderedItems)#[1'z', 'r']
#增长分支
updateTree(orderedItems, retTree, headerTable, count)#增长
return retTree, headerTable
#增长分支
def updateTree(items, inTree, headerTable, count):
if items[0] in inTree.children:#若第一个元素是子节点,inTree.children:dict
inTree.children[items[0]].inc(count) #计数+1
else:
inTree.children[items[0]] = treeNode(items[0], count, inTree)#创建分支
if headerTable[items[0]][1] == None: #若指针为空
headerTable[items[0]][1] = inTree.children[items[0]]#指针指向本节点
else:#若指针已经有指向,则再更新,在末尾添加一个指向
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
if len(items) > 1:#items[1::]:删除第一个元素,继续创建分支
updateTree(items[1::], inTree.children[items[0]], headerTable, count)
#在末尾添加指针
def updateHeader(nodeToTest, targetNode):
while (nodeToTest.nodeLink != None): #沿着nodelink到达链表末尾
nodeToTest = nodeToTest.nodeLink
nodeToTest.nodeLink = targetNode #添加下一个指向
myFPtree,myHeaderTab = createTree(initSet, minSup=3)
myFPtree.disp()
Null Set 1
z 5
r 1
x 3
y 2
s 2
t 2
r 1
y 1
t 1
x 1
r 1
s 1
myHeaderTab
{'r': [3, <__main__.treeNode at 0x7f5cfc1e1e80>],
'z': [5, <__main__.treeNode at 0x7f5cfc1a5c88>],
'x': [4, <__main__.treeNode at 0x7f5cfc1e1e48>],
'y': [3, <__main__.treeNode at 0x7f5cfc1e1eb8>],
's': [3, <__main__.treeNode at 0x7f5cfc1e1da0>],
't': [3, <__main__.treeNode at 0x7f5cfc1e1fd0>]}
发现以给定元素结尾的所有路径的函数
#上溯一条路径
def ascendTree(leafNode, prefixPath):
if leafNode.parent != None:#若父节点存在
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent, prefixPath)#继续向上
#找到给定元素的所有前缀路径
def findPrefixPath(basePat, treeNode):
condPats = {}
while treeNode != None: #若节点存在
prefixPath = []
ascendTree(treeNode, prefixPath) #上溯路径
if len(prefixPath) > 1:
condPats[frozenset(prefixPath[1:])] = treeNode.count
treeNode = treeNode.nodeLink #跳到下一个指向的位置
return condPats
findPrefixPath('x', myHeaderTab['x'][1])
{frozenset({'z'}): 3}
myHeaderTab
{'r': [3, <__main__.treeNode at 0x7f5cfc1e1e80>],
'z': [5, <__main__.treeNode at 0x7f5cfc1a5c88>],
'x': [4, <__main__.treeNode at 0x7f5cfc1e1e48>],
'y': [3, <__main__.treeNode at 0x7f5cfc1e1eb8>],
's': [3, <__main__.treeNode at 0x7f5cfc1e1da0>],
't': [3, <__main__.treeNode at 0x7f5cfc1e1fd0>]}
递归查找频繁项集的 mineTree 函数
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
#对频繁项排序,频繁数从小到大
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[0])]
# print(bigL)#['r', 's', 't', 'x', 'y', 'z']
for basePat in bigL: #start from bottom of header table
newFreqSet = preFix.copy()#每换一次元素,都初始化一次
newFreqSet.add(basePat)
# print ('finalFrequent Item: ',newFreqSet)
freqItemList.append(newFreqSet)
# print(freqItemList)
#1. 找到条件模式基
condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
# print ('condPattBases :',basePat, condPattBases)
#2. 构建条件FP树
myCondTree, myHead = createTree(condPattBases, minSup)
# print ('head from conditional tree: ', myHead)
if myHead != None: #3. mine cond. FP-tree
# print('conditional tree for: ',newFreqSet)
# myCondTree.disp(1)
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
myFPtree.disp()
Null Set 1
z 5
r 1
x 3
y 2
s 2
t 2
r 1
y 1
t 1
x 1
r 1
s 1
freqItemList = []
mineTree(myFPtree, headerTable=myHeaderTab, minSup=3, preFix=set([]), freqItemList=freqItemList)
print(freqItemList)
[{'r'}, {'s'}, {'s', 'x'}, {'t'}, {'t', 'x'}, {'t', 'y', 'x'}, {'t', 'y'}, {'t', 'z'}, {'t', 'z', 'x'}, {'y', 't', 'z', 'x'}, {'y', 't', 'z'}, {'x'}, {'z', 'x'}, {'y'}, {'y', 'x'}, {'y', 'x', 'z'}, {'y', 'z'}, {'z'}]
示例:从新闻网站点击流中挖掘
parseDat = [line.split() for line in open('../../Reference Code/Ch12/kosarak.dat').readlines()]
parseDat
[['1', '2', '3'],
['1'],
['4', '5', '6', '7'],
['1', '8'],
['9', '10'],
['11', '6', '12', '13', '14', '15', '16'],
['1', '3', '7'],
['17', '18'],
['11', '6', '19', '20', '21', '22', '23', '24'],
['1', '25', '3'],
['26', '3'],
['11',
'27',
'6',
'3',
'28',
'7',
'29',
'30',
'31',
'32',
'33',
'34',
'35',
'36',
'37'],
['6', '2', '38'],
['39',
'11',
'27',
'1',
'40',
'6',
'41',
'42',
'43',
'44',
'45',
'46',
'47',
'3',
'48',
'7',
'49',
'50',
'51'],
['52', '6', '3', '53'],
['54', '1', '6', '55'],
['11', '6', '56', '57', '58', '59', '60', '61', '62', '63', '64'],
['3'],
['1', '65', '66', '67', '68', '3'],
['69', '11', '1', '6'],
['11', '70', '6'],
['6', '3', '71'],
['72', '6', '73'],
['74'],
['75', '76'],
['6', '3', '77'],
['78', '79', '80', '81'],
['82', '6', '83', '7', '84', '85', '86', '87', '88'],
['11',
'27',
'1',
'6',
'89',
'90',
'91',
'92',
'93',
'14',
'94',
'95',
'96',
'97',
'98',
'99',
'100',
'101',
'102',
'103',
'104',
'105',
'106',
'107',
'108',
'109',
'110',
'111',
'112',
'113',
'114',
'115',
'116',
'117',
'118',
'119',
'120',
'121',
'122',
'123',
'124',
'125',
'126',
'127',
'128',
'129',
'130',
'131',
'132',
'133',
'64',
'134',
'135',
'136',
'137'],
['6', '138'],
对初始集合格式化,构建FP树,寻找至少被10w人浏览过的新闻报道
initSet = createInitSet(parseDat)
myFPtree, myHeaderTab = createTree(initSet, 100000)
myFPtree.disp()
Null Set 1
3 76514
1 12917
1 16829
6 412762
11 261773
3 117401
1 34141
1 43366
3 68888
1 13436
1 16461
11 21190
3 9718
1 1565
1 1882
myFreqList = []
mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList)
myFreqList
[{'1'},
{'1', '6'},
{'11'},
{'11', '6'},
{'3'},
{'11', '3'},
{'11', '3', '6'},
{'3', '6'},
{'6'}]