首先我们得了解一下什么是FP-growth算法,如下:
FP-Growth算法是韩嘉炜等人在2000年提出的关联分析算法,它采取如下分治策略:将提供频繁项集的数据库压缩到一棵频繁模式树(FP-tree),但仍保留项集关联信息。
在算法中使用了一种称为频繁模式树(Frequent Pattern Tree)的数据结构。FP-tree是一种特殊的前缀树,由频繁项头表和项前缀树构成。FP-Growth算法基于以上的结构加快整个挖掘过程。
我们看看FP-growth算法的python实现,如下:
#树的节点对象
class treeNode:
def __init__(self, nameValue, numOccur, parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
self.parent = parentNode #needs to be updated
self.children = {}
def inc(self, numOccur):
self.count += numOccur
def disp(self, ind=1):
print ' '*ind, self.name, ' ', self.count
for child in self.children.values():
child.disp(ind+1)
#创建树的方法
def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine
headerTable = {}
#go over dataSet twice
for trans in dataSet:#first pass counts frequency of occurance
for item in trans:
headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
for k in headerTable.keys(): #remove items not meeting minSup
if headerTable[k] < minSup:
del(headerTable[k])
freqItemSet = set(headerTable.keys())
#print 'freqItemSet: ',freqItemSet
if len(freqItemSet) == 0: return None, None #if no items meet min support -->get out
for k in headerTable:
headerTable[k] = [headerTable[k], None] #reformat headerTable to use Node link
#print 'headerTable: ',headerTable
retTree = treeNode('Null Set', 1, None) #create tree
for tranSet, count in dataSet.items(): #go through dataset 2nd time
localD = {}
for item in tranSet: #put transaction items in order
if item in freqItemSet:
localD[item] = headerTable[item][0]
if len(localD) > 0:
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
updateTree(orderedItems, retTree, headerTable, count)#populate tree with ordered freq itemset
return retTree, headerTable #return tree and header table
#递归构建FP树和更新头部映射表
def updateTree(items, inTree, headerTable, count):
if items[0] in inTree.children:#check if orderedItems[0] in retTree.children
inTree.children[items[0]].inc(count) #incrament count
else: #add items[0] to inTree.children
inTree.children[items[0]] = treeNode(items[0], count, inTree)
if headerTable[items[0]][1] == None: #update header table
headerTable[items[0]][1] = inTree.children[items[0]]
else:
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
if len(items) > 1:#call updateTree() with remaining ordered items
updateTree(items[1::], inTree.children[items[0]], headerTable, count)
#更新头部映射表
def updateHeader(nodeToTest, targetNode): #this version does not use recursion
while (nodeToTest.nodeLink != None): #Do not use recursion to traverse a linked list!
nodeToTest = nodeToTest.nodeLink
nodeToTest.nodeLink = targetNode
#根据后缀上溯FP树
def ascendTree(leafNode, prefixPath): #ascends from leaf node to root
if leafNode.parent != None:
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent, prefixPath)
#遍历整颗树,找出该后缀所有条件模式基
def findPrefixPath(basePat, treeNode): #treeNode comes from header table
condPats = {}
while treeNode != None:
prefixPath = []
ascendTree(treeNode, prefixPath)
if len(prefixPath) > 1:
condPats[frozenset(prefixPath[1:])] = treeNode.count
treeNode = treeNode.nodeLink
return condPats
#遍历所有后缀
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]#(sort header table)
for basePat in bigL: #start from bottom of header table
newFreqSet = preFix.copy()
newFreqSet.add(basePat)
#print 'finalFrequent Item: ',newFreqSet #append to set
freqItemList.append(newFreqSet)
condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
#print 'condPattBases :',basePat, condPattBases
#2. construct cond FP-tree from cond. pattern base
myCondTree, myHead = createTree(condPattBases, minSup)
#print 'head from conditional tree: ', myHead
if myHead != None: #3. mine cond. FP-tree
#print 'conditional tree for: ',newFreqSet
#myCondTree.disp(1)
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
现在我们开始用java实现,首先得定义好树节点类
package com.algorithm;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class FPNode {
private String name;
private double count;
private FPNode nodeLink;
private FPNode parent;
private Map children = new HashMap();
FPNode(String name,double count){
this.name = name;
this.count = count;
}
FPNode(String name,double count,FPNode parent){
this.name = name;
this.count = count;
this.parent = parent;
}
public void inc(double numOccur) {
this.count += numOccur;
}
public void disp(int ind) {
for(int i=0;i child : this.children.entrySet()) {
child.getValue().disp(ind+1);
}
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public double getCount() {
return count;
}
public void setCount(double count) {
this.count = count;
}
public FPNode getNodeLink() {
return nodeLink;
}
public void setNodeLink(FPNode nodeLink) {
this.nodeLink = nodeLink;
}
public FPNode getParent() {
return parent;
}
public void setParent(FPNode parent) {
this.parent = parent;
}
public Map getChildren() {
return children;
}
public void setChildren(Map children) {
this.children = children;
}
}
然后是树的类
package com.algorithm;
import java.util.List;
import java.util.Map;
public class FPTreeInfo {
private FPNode retTree;
private Map headerTable;
FPTreeInfo(FPNode retTree,Map headerTable){
this.retTree = retTree;
this.headerTable = headerTable;
}
public FPNode getRetTree() {
return retTree;
}
public void setRetTree(FPNode retTree) {
this.retTree = retTree;
}
public Map getHeaderTable() {
return headerTable;
}
public void setHeaderTable(Map headerTable) {
this.headerTable = headerTable;
}
}
头部映射表的类
package com.algorithm;
import java.util.List;
import java.util.Map;
public class HeaderTable {
private double hTCount;
private FPNode fpNode;
HeaderTable(double hTCount,FPNode fpNode){
this.hTCount = hTCount;
this.fpNode = fpNode;
}
public double gethTCount() {
return hTCount;
}
public void sethTCount(double hTCount) {
this.hTCount = hTCount;
}
public FPNode getFpNode() {
return fpNode;
}
public void setFpNode(FPNode fpNode) {
this.fpNode = fpNode;
}
}
然后是初始化基础字典
private static Map,Double> createInitSet(List> dataSet){
Map,Double> retDict = new HashMap,Double>();
for(int i=0;i
更新头部映射表的方法
private static void updateHeader(FPNode nodeToTest,FPNode targetNode) {
if(nodeToTest.getNodeLink() != null) {
nodeToTest = nodeToTest.getNodeLink();
}
nodeToTest.setNodeLink(targetNode);
}
递归构建FP树核心方法
private static void updateTree(List items,FPNode inTree,Map headerTable,double count) {
if(inTree.getChildren().containsKey(items.get(0))) {
inTree.getChildren().get(items.get(0)).inc(count);
}else {
inTree.getChildren().put(items.get(0), new FPNode(items.get(0), count, inTree));
if(headerTable.get(items.get(0)).getFpNode() == null) {
headerTable.get(items.get(0)).setFpNode(inTree.getChildren().get(items.get(0)));
}else {
updateHeader(headerTable.get(items.get(0)).getFpNode(),inTree.getChildren().get(items.get(0)));
}
}
if(items.size() > 1) {
updateTree(items.subList(1, items.size()),inTree.getChildren().get(items.get(0)), headerTable, count);
}
}
构建FP树
public static FPTreeInfo createTree(Map,Double> dataSet,double minSup) {
Map tmp = new HashMap();
for (Map.Entry,Double> trans : dataSet.entrySet()) {
for(String item : trans.getKey()) {
if(tmp.containsKey(item)) {
tmp.replace(item, tmp.get(item)+trans.getValue());
}else {
tmp.put(item,trans.getValue());
}
}
}
Map ht = new HashMap();
for (Map.Entry trans : tmp.entrySet()) {
if(trans.getValue() >= minSup) {
List list = new ArrayList();
list.add(trans.getKey());
ht.put(trans.getKey(),new HeaderTable(trans.getValue(),null));
}
}
Set freqItemSet = ht.keySet();
//System.out.println("freqItemSet: "+freqItemSet.toString());
if(freqItemSet.size() == 0) {
return new FPTreeInfo(null,null);
}
FPNode retTree = new FPNode("Null Set", 1);
for (Map.Entry,Double> trans : dataSet.entrySet()) {
List localD = new ArrayList();
List> list = new ArrayList>(ht.entrySet());
Collections.sort(list,new Comparator>() {
public int compare(Entry o1,
Entry o2) {
if(o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
return 1;
}else if (o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
return 0;
}else {
return -1;
}
}
});
for(Map.Entry mapping:list){
if(trans.getKey().contains(mapping.getKey()) && freqItemSet.contains(mapping.getKey())) {
localD.add(mapping.getKey());
}
}
if(localD.size() > 0) {
updateTree(localD, retTree, ht, trans.getValue());
}
}
return new FPTreeInfo(retTree,ht);
}
我们先构建FP树看看效果,如下:
List datas = new ArrayList();
datas.add(new String[]{"r", "z", "h", "j", "p"});
datas.add(new String[]{"z", "y", "x", "w", "v", "u", "t", "s"});
datas.add(new String[]{"z"});
datas.add(new String[]{"r", "x", "n", "o", "s"});
datas.add(new String[]{"y", "r", "x", "z", "q", "t", "p"});
datas.add(new String[]{"y", "z", "x", "e", "q", "s", "t", "m"});
List> dataIns = new ArrayList>();
for(String[] item : datas) {
dataIns.add(Arrays.asList(item));
}
Map,Double> initSet = createInitSet(dataIns);
System.out.println(initSet.toString());
FPTreeInfo fpt = createTree(initSet,3);
fpt.getRetTree().disp(1);
构建FP树成功
然后我们用FP树寻找条件模式基和频繁项
首先是根据前缀递归上溯方法
private static void ascendTree(FPNode leafNode,List prefixPath) {
if(leafNode.getParent() != null) {
prefixPath.add(leafNode.getName());
ascendTree(leafNode.getParent(), prefixPath);
}
}
根据该前缀遍历整颗树找出所有条件模式基的方法
private static Map,Double> findPrefixPath(String basePat,FPNode treeNode){
Map,Double> condPats = new HashMap,Double>();
while(treeNode != null) {
List prefixPath = new ArrayList();
ascendTree(treeNode, prefixPath);
if(prefixPath.size() > 1) {
condPats.put(prefixPath.subList(1, prefixPath.size()), treeNode.getCount());
}
treeNode = treeNode.getNodeLink();
}
return condPats;
}
最后是遍历所有的前缀
private static void mineTree(FPNode inTree,Map headerTable,double minSup,List preFix,List> freqItemList) {
List bigL = new ArrayList();
List> list = new ArrayList>(headerTable.entrySet());
Collections.sort(list,new Comparator>() {
public int compare(Entry o1,
Entry o2) {
if(o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
return -1;
}else if (o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
return 0;
}else {
return 1;
}
}
});
for(Map.Entry mapping:list){
if(!bigL.contains(mapping.getKey()))
bigL.add(mapping.getKey());
}
for(String basePat : bigL) {
//System.out.println(basePat);
List newFreqSet = new ArrayList<>(preFix);
newFreqSet.add(basePat);
freqItemList.add(newFreqSet);
Map,Double> condPattBases = findPrefixPath(basePat, headerTable.get(basePat).getFpNode());
// System.out.println(condPattBases.toString());
FPTreeInfo ct = createTree(condPattBases,minSup);
if(ct.getHeaderTable() != null) {
System.out.println("conditional tree for: "+newFreqSet);
ct.getRetTree().disp(1);
mineTree(ct.getRetTree(), ct.getHeaderTable(), minSup, newFreqSet, freqItemList);
}
}
}
测试结果如下: