The Apriori algorithm is the most basic algorithm for mining frequent itemsets, and its name reflects its core idea: prior knowledge. It searches level by level, iteratively, mining every possible itemset, with the frequent k-itemsets used to mine the (k+1)-itemsets.
The Apriori property
All non-empty subsets of a frequent itemset must also be frequent.
This property reflects the anti-monotonicity of itemset mining: if a k-itemset is not frequent, then any (k+1)-itemset containing it cannot be frequent either. Based on this, the basic idea of the algorithm is:
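For example, if {beer, diapers} is not frequent, then no superset such as {beer, diapers, milk} can be frequent, so such supersets never need to be counted.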
Step 1: Join
To search for the k-itemsets, self-join the frequent (k-1)-itemsets to produce candidate k-itemsets, called the candidate set.
To implement the join efficiently, first sort the items within each itemset; then join two itemsets only if they satisfy the join condition.
Join condition: the first k-2 items are identical and the (k-1)-th items differ.
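For example, the sorted 3-itemsets {1, 2, 3} and {1, 2, 4} share the prefix {1, 2} and differ in the last item, so they join into the candidate 4-itemset {1, 2, 3, 4}, whereas {1, 2, 3} and {1, 4, 5} fail the condition and are not joined.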
Step 2: Prune
A candidate k-itemset is kept only if every one of its (k-1)-subsets appears among the frequent (k-1)-itemsets and its support meets the minimum support threshold.
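For example, the candidate {1, 2, 3, 4} can be discarded immediately if any of its 3-subsets, say {1, 3, 4}, is missing from the frequent 3-itemsets, before scanning the transactions at all.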
Pseudocode:

Ck: candidate itemsets of size k
Lk: frequent itemsets of size k

L1 = {frequent 1-itemsets}
for (k = 1; Lk is not empty; k++)
    Ck+1 = candidates generated by joining and pruning Lk
    for each transaction t in the dataset
        increment the count of all candidates in Ck+1 that are contained in t
    Lk+1 = candidates in Ck+1 with support >= min_support
return the union of all Lk

The Java implementation:
I abstracted the itemset into an entity class and gave it a method that decides whether two itemsets can be joined. This method was originally implemented with TreeSet.headSet, but testing showed that the performance bottleneck was concentrated in that method and it even caused an OOM, which is quite puzzling; I will write up a summary once I understand the cause.
/**
*
*/
package org.waitingfortime.datamining.association;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
* @author mazhiyuan
*
*/
public class Apriori {
private int minNum;// minimum support count
private List<Set<Integer>> records;
private String output;
private List<List<ItemSet>> result = new ArrayList<List<ItemSet>>();
public Apriori(double minDegree, String input, String output) {
this.output = output;
init(input);
if (records.size() == 0) {
System.err.println("不符合计算条件。退出!");
System.exit(1);
}
minNum = (int) (minDegree * records.size());
}
private void init(String path) {
records = new ArrayList<Set<Integer>>();
try {
BufferedReader br = new BufferedReader(new FileReader(
new File(path)));
String line = null;
Set<Integer> record;
while ((line = br.readLine()) != null) {
if (!"".equals(line.trim())) {
record = new TreeSet<Integer>();
String[] items = line.split(" ");
for (String item : items) {
record.add(Integer.valueOf(item));
}
records.add(record);
}
}
br.close();
} catch (IOException e) {
System.err.println("读取事务文件失败。");
}
}
private List<ItemSet> first() {
List<ItemSet> first = new ArrayList<ItemSet>();
Map<Integer, Integer> _first = new HashMap<Integer, Integer>();
for (Set<Integer> si : records)
for (Integer i : si) {
if (_first.get(i) == null)
_first.put(i, 1);
else
_first.put(i, _first.get(i) + 1);
}
for (Integer i : _first.keySet())
if (_first.get(i) >= minNum)
first.add(new ItemSet(i, _first.get(i)));
return first;
}
private void loop(List<ItemSet> items) {
List<ItemSet> copy = new ArrayList<ItemSet>(items);
List<ItemSet> res = new ArrayList<ItemSet>();
int size = items.size();
// join
for (int i = 0; i < size; i++)
for (int j = i + 1; j < size; j++)
if (copy.get(i).isMerge(copy.get(j))) {
ItemSet is = new ItemSet(copy.get(i));
is.merge(copy.get(j).item.last());
res.add(is);
}
// prune
pruning(copy, res);
if (res.size() != 0) {
result.add(res);
loop(res);
}
}
private void pruning(List<ItemSet> pre, List<ItemSet> res) {
// step 1: every (k-1)-subset of a candidate must belong to the frequent (k-1)-itemsets
Iterator<ItemSet> ir = res.iterator();
while (ir.hasNext()) {
// collect all (k-1)-subsets of the candidate
ItemSet now = ir.next();
List<List<Integer>> ss = subSet(now);
// check whether every subset is present in pre
boolean flag = false;
for (List<Integer> li : ss) {
if (flag)
break;
for (ItemSet pis : pre) {
if (pis.item.containsAll(li)) {
flag = false;
break;
}
flag = true;
}
}
if (flag) {
ir.remove();
continue;
}
// step 2: support count check
int i = 0;
for (Set<Integer> sr : records)
if (sr.containsAll(now.item))
i++;
now.value = i;
if (now.value < minNum)
ir.remove();
}
}
private List<List<Integer>> subSet(ItemSet is) {
List<Integer> li = new ArrayList<Integer>(is.item);
List<List<Integer>> res = new ArrayList<List<Integer>>();
for (int i = 0, j = li.size(); i < j; i++) {
List<Integer> _li = new ArrayList<Integer>(li);
_li.remove(i);
res.add(_li);
}
return res;
}
private void output() throws FileNotFoundException {
if (result.size() == 0) {
System.err.println("无结果集。退出!");
return;
}
FileOutputStream out = new FileOutputStream(output);
PrintStream ps = new PrintStream(out);
for (List<ItemSet> li : result) {
ps.println("=============frequent " + li.get(0).item.size() + "-itemsets=============");
for (ItemSet is : li)
ps.println(is.item + " : " + is.value);
ps.println("=====================================");
}
ps.close();
}
/**
* @param args
* @throws FileNotFoundException
*/
public static void main(String[] args) throws FileNotFoundException {
long begin = System.currentTimeMillis();
Apriori apriori = new Apriori(0.25,
"/home/mazhiyuan/code/mushroom.dat",
"/home/mazhiyuan/code/mout.data");
// apriori.first(); // returns the frequent 1-itemsets
apriori.loop(apriori.first());
apriori.output();
System.out.println((System.currentTimeMillis()) - begin);
}
}
class ItemSet {
TreeSet<Integer> item;
int value;
ItemSet(ItemSet is) {
this.item = new TreeSet<Integer>(is.item);
}
ItemSet() {
item = new TreeSet<Integer>();
}
ItemSet(int i, int v) {
this();
merge(i);
setValue(v);
}
void setValue(int i) {
this.value = i;
}
void merge(int i) {
item.add(i);
}
boolean isMerge(ItemSet other) {
if (other == null || other.item.size() != item.size())
return false;
// join condition: all items except the last are identical and the last items differ
/*
 * Original headSet-based version (the bottleneck mentioned above):
 * Iterator<Integer> i = item.headSet(item.last()).iterator();
 * Iterator<Integer> o = other.item.headSet(other.item.last()).iterator();
 * while (i.hasNext() && o.hasNext())
 *     if (!i.next().equals(o.next())) return false;
 */
Iterator<Integer> i = item.iterator();
Iterator<Integer> o = other.item.iterator();
int n = item.size();
while (i.hasNext() && o.hasNext() && --n > 0)
// != on boxed Integers compares references, so compare by value instead
if (!i.next().equals(o.next()))
return false;
return !item.last().equals(other.item.last());
}
}
Using the mushroom dataset, the whole run takes only about 6 seconds, which is satisfactory performance.
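One pitfall worth spelling out: the original isMerge compared boxed Integer values with == and !=, which compares object references rather than values. That happens to work while the item ids stay inside Java's Integer cache (-128 to 127), as they apparently do for the mushroom data, but it silently misses valid joins once ids go above 127; the listing above therefore uses equals instead. A minimal standalone sketch of the pitfall (my own illustration, not part of the original program):

public class BoxingPitfall {
public static void main(String[] args) {
Integer a = 100, b = 100;// inside the cache: both refer to the same cached object
System.out.println(a == b);// true
Integer c = 1000, d = 1000;// outside the cache: two distinct objects
System.out.println(c == d);// false, even though the values are equal
System.out.println(c.equals(d));// true: always compare boxed values with equals
}
}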
This code only computes the frequent itemsets; it does not yet derive the association rules and confidences. I will add those shortly.
========= Added generation of the association rules ========= It was a bit more troublesome than expected.
Association rules can run in both directions: confidence(A --> B) = P(B|A) = support(A ∪ B) / support(A).
So when computing the association rules of a k-itemset, the numerator is always the support of the k-itemset itself, while the denominator is the support of the rule's antecedent: here either a (k-1)-subset or the corresponding 1-itemset.
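A quick numeric sketch of the two directions, with made-up support counts purely for illustration (not taken from the mushroom run):

// Hypothetical counts for the 2-itemset {A, B}:
int supAB = 300;// support of {A, B}
int supA = 400;// support of {A}
int supB = 600;// support of {B}
double confAtoB = (double) supAB / supA;// confidence(A --> B) = 0.75
double confBtoA = (double) supAB / supB;// confidence(B --> A) = 0.50
// With minCon = 0.5 and the strict > test used below, A --> B is kept
// while B --> A sits exactly on the boundary and is dropped.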
/**
*
*/
package org.waitingfortime.datamining.association;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
* @author mazhiyuan
*
*/
public class Apriori {
private int minNum;// minimum support count
private double minCon;// minimum confidence
private List<Set<Integer>> records;// raw transaction records
private String output;// output file path
private List<List<ItemSet>> result = new ArrayList<List<ItemSet>>();// frequent itemset results
private List<ItemSet> fth;// frequent 1-itemsets
public Apriori(double minDegree, double minCon, String input, String output) {
this.output = output;
this.minCon = minCon;
init(input);
if (records.size() == 0) {
System.err.println("不符合计算条件。退出!");
System.exit(1);
}
minNum = (int) (minDegree * records.size());
}
private void init(String path) {
records = new ArrayList<Set<Integer>>();
try {
BufferedReader br = new BufferedReader(new FileReader(
new File(path)));
String line = null;
Set<Integer> record;
while ((line = br.readLine()) != null) {
if (!"".equals(line.trim())) {
record = new TreeSet<Integer>();
String[] items = line.split(" ");
for (String item : items) {
record.add(Integer.valueOf(item));
}
records.add(record);
}
}
br.close();
} catch (IOException e) {
System.err.println("读取事务文件失败。");
}
}
private void first() {
fth = new ArrayList<ItemSet>();
Map<Integer, Integer> first = new HashMap<Integer, Integer>();
for (Set<Integer> si : records)
for (Integer i : si) {
if (first.get(i) == null)
first.put(i, 1);
else
first.put(i, first.get(i) + 1);
}
for (Integer i : first.keySet())
if (first.get(i) >= minNum)
fth.add(new ItemSet(i, first.get(i)));
}
private void loop(List<ItemSet> items) {
List<ItemSet> copy = new ArrayList<ItemSet>(items);
List<ItemSet> res = new ArrayList<ItemSet>();
int size = items.size();
// join
for (int i = 0; i < size; i++)
for (int j = i + 1; j < size; j++)
if (copy.get(i).isMerge(copy.get(j))) {
ItemSet is = new ItemSet(copy.get(i));
is.merge(copy.get(j).item.last());
res.add(is);
}
// prune
pruning(copy, res);
if (res.size() != 0) {
result.add(res);
loop(res);
}
}
private void pruning(List<ItemSet> pre, List<ItemSet> res) {
// step 1: every (k-1)-subset of a candidate must belong to the frequent (k-1)-itemsets
Iterator<ItemSet> ir = res.iterator();
while (ir.hasNext()) {
// collect all (k-1)-subsets of the candidate
ItemSet now = ir.next();
Map<Integer, List<Integer>> ss = subSet(now);
// check whether every subset is present in pre
boolean flag = false;
for (List<Integer> li : ss.values()) {
if (flag)
break;
for (ItemSet pis : pre) {
if (pis.item.containsAll(li)) {
flag = false;
break;
}
flag = true;
}
}
if (flag) {
ir.remove();
continue;
}
// step 2: support count check
int i = 0;
for (Set<Integer> sr : records)
if (sr.containsAll(now.item))
i++;
now.support = i;
if (now.support < minNum) {
ir.remove();
continue;
}
// generate the association rules of this k-itemset
double num = now.support;// numerator: support of the k-itemset
for (Map.Entry<Integer, List<Integer>> me : ss.entrySet()) {
ItemCon ic = new ItemCon(me.getKey(), me.getValue());
int den = 0;
// rule {i} --> (k-1)-subset: the denominator is the support of the 1-itemset {i}
for (ItemSet f : fth)
if (f.item.contains(me.getKey())) {
den = f.support;
break;
}
if (num / den > minCon)
ic.setC1(num / den);
// rule (k-1)-subset --> {i}: the denominator is the support of the (k-1)-subset
for (ItemSet pis : pre)
if (pis.item.size() == me.getValue().size()
&& pis.item.containsAll(me.getValue())) {
den = pis.support;
break;
}
if (num / den > minCon)
ic.setC2(num / den);
// keep the rule pair if either direction passes the confidence threshold
if (ic.confidence1 > minCon || ic.confidence2 > minCon)
now.calcon(ic);
}
}
}
private Map<Integer, List<Integer>> subSet(ItemSet is) {
List<Integer> li = new ArrayList<Integer>(is.item);
Map<Integer, List<Integer>> res = new HashMap<Integer, List<Integer>>();
for (int i = 0, j = li.size(); i < j; i++) {
List<Integer> _li = new ArrayList<Integer>(li);
_li.remove(i);
res.put(li.get(i), _li);
}
return res;
}
private void output() throws FileNotFoundException {
if (result.size() == 0) {
System.err.println("无结果集。退出!");
return;
}
FileOutputStream out = new FileOutputStream(output);
PrintStream ps = new PrintStream(out);
for (List<ItemSet> li : result) {
ps.println("=============frequent " + li.get(0).item.size()
+ "-itemsets=============");
for (ItemSet is : li) {
ps.println(is.item + " : " + is.support);
ps.println();
if (is.ics.size() != 0) {
ps.println("******关联规则******");
for (ItemCon ic : is.ics) {
if (ic.confidence1 > minCon)
ps.println(ic.i + " ---> " + ic.li + " con: "
+ ic.confidence1);
if (ic.confidence2 > minCon)
ps.println(ic.li + " ---> " + ic.i + " con: "
+ ic.confidence2);
}
ps.println("******************");
ps.println();
}
}
ps.println("=====================================");
}
ps.close();
}
/**
* @param args
* @throws FileNotFoundException
*/
public static void main(String[] args) throws FileNotFoundException {
long begin = System.currentTimeMillis();
Apriori apriori = new Apriori(0.25, 0.5,
"/home/mazhiyuan/code/mushroom.dat",
"/home/mazhiyuan/code/mout.data");
apriori.first();// frequent 1-itemsets
apriori.loop(apriori.fth);
apriori.output();
System.out.println("共耗时:" + ((System.currentTimeMillis()) - begin)
+ "ms");
}
}
class ItemSet {
TreeSet<Integer> item;
int support;
List<ItemCon> ics = new ArrayList<ItemCon>(); // association rule results
ItemSet(ItemSet is) {
this.item = new TreeSet<Integer>(is.item);
}
ItemSet() {
item = new TreeSet<Integer>();
}
ItemSet(int i, int v) {
this();
merge(i);
setValue(v);
}
void setValue(int i) {
this.support = i;
}
void merge(int i) {
item.add(i);
}
void calcon(ItemCon ic) {
ics.add(ic);
}
boolean isMerge(ItemSet other) {
if (other == null || other.item.size() != item.size())
return false;
// join condition: all items except the last are identical and the last items differ
/*
 * Original headSet-based version (the bottleneck mentioned above):
 * Iterator<Integer> i = item.headSet(item.last()).iterator();
 * Iterator<Integer> o = other.item.headSet(other.item.last()).iterator();
 * while (i.hasNext() && o.hasNext())
 *     if (!i.next().equals(o.next())) return false;
 */
Iterator<Integer> i = item.iterator();
Iterator<Integer> o = other.item.iterator();
int n = item.size();
while (i.hasNext() && o.hasNext() && --n > 0)
// != on boxed Integers compares references, so compare by value instead
if (!i.next().equals(o.next()))
return false;
return !item.last().equals(other.item.last());
}
}
class ItemCon {
Integer i;
List<Integer> li;
double confidence1;
double confidence2;
ItemCon(Integer i, List<Integer> li) {
this.i = i;
this.li = li;
}
void setC1(double c1) {
this.confidence1 = c1;
}
void setC2(double c2) {
this.confidence2 = c2;
}
}
The performance of the Apriori algorithm itself is a major problem: it generates far too many candidate sets. The FP-TREE algorithm avoids this problem and improves the performance of frequent itemset mining by at least an order of magnitude; the next post will focus on that algorithm.