贝叶斯分类器的分类原理是通过某对象的先验概率,利用贝叶斯公式计算出其后验概率,即该对象属于某一类的概率,选择具有最大后验概率的类作为该对象所属的类。
以下为一个简单的例子:
数据:天气情况和每天是否踢足球的记录表
假设15号去踢球,踢球的概率计算过程如下:
P(踢球) = 踢球天数/总天数 = 9/14
P(晴天|踢) = 踢球天数中晴天踢球的次数/踢球次数 = 2/9
P(凉爽|踢) = 踢球天数中凉爽踢球的次数/踢球次数 = 3/9
P(湿度高|踢) = 踢球天数中湿度高踢球的次数/踢球次数 = 3/9
P(风速高|踢) = 踢球天数中风速高踢球的次数/踢球次数 = 3/9
则15号踢球的概率P = 9/14 * 2/9 * 3/9 * 3/9 * 3/9 = 0.00529
按照上述步骤还可计算出15号不去踢球的概率P = 5/14 * 3/5 * 1/5 * 4/5 * 3/5 = 0.02057
可以看出,15号不去踢球的概率大于去踢球的概率,则可预测说,15号不去踢球。
理解朴素贝叶斯的流程之后,开始设计MR程序。在Mapper中,对训练数据进行拆分,也就是将这条训练数据拆分为类别和训练数据,将训练数据以自定义值类型来保存,然后传递给Reducer。
Mapper:
public class BayesMapper extends Mapper
MyWritable:
// NOTE(review): this class appears truncated by extraction — the body of
// write() after "i" is missing (text following '<' was lost), and the
// readFields(DataInput) counterpart required by Writable is not visible.
// Restore from the original source before compiling.
// Custom Hadoop value type wrapping one training record as an int[] so it
// can be shuffled from the Mapper to the Reducer.
public class MyWritable implements Writable{
private int[] value;
public MyWritable() {
// No-arg constructor required by Hadoop for Writable deserialization.
}
public MyWritable(int[] value){
this.setValue(value);
}
@Override
public void write(DataOutput out) throws IOException {
// Serialize as: element count, then each element in order.
out.writeInt(value.length);
for(int i=0; i
Reducer中,需要在setup中初始化测试数据,由于训练数据与测试数据的属性中均只有0,1两种值,因此在reduce中,统计相同类别的不同属性的值的和(也就是统计出现1的次数,0的次数就是用该类别下数据的总和减去出现1的次数)。用对象CountAll来保存当前类别k、在类别k中每个属性出现1的概率、类别k中数据的条数,然后在cleanup中去计算当前测试数据出现哪种类别的概率最大,并设定这个类别为当前测试数据的类别。
// NOTE(review): several lines in this class are truncated by extraction —
// generic type parameters and everything after '<' in loop headers is
// missing (see reduce(), getSum(), getClasify(), readTestData()). Restore
// from the original source before compiling.
// Reducer for the Naive Bayes trainer/classifier: aggregates per-class
// attribute counts from the training data, then (in cleanup, not fully
// visible here) classifies each test record by maximum posterior probability.
public class BayesReducer extends Reducer{
Logger log = LoggerFactory.getLogger(BayesReducer.class);
// HDFS path of the test file, read from the "TestFilePath" job property.
private String testFilePath;
// Test data rows, one int[] per line, loaded once in setup().
private ArrayList testData = new ArrayList<>();
// Aggregated statistics for each class label k (one CountAll per class).
private ArrayList allData = new ArrayList<>();
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
// Load the test data before any reduce() call runs.
Configuration conf = context.getConfiguration();
testFilePath = conf.get("TestFilePath");
Path path = new Path(testFilePath);
FileSystem fs = path.getFileSystem(conf);
readTestData(fs,path);
}
/***
 * Aggregates all training records sharing one class label.
 * Input shape: key = class label k, values = the records of that class,
 * e.g. k,v => 0 {{0,1,0,1,0,0,1,...},{},{},...}
 */
@Override
protected void reduce(IntWritable key, Iterable values,
Context context)
throws IOException, InterruptedException {
// One slot per attribute (last column of a record is the label).
Double[] myTest = new Double[testData.get(0).length-1];
for(int i=0;i labelG = new HashMap<>();
Long allSum = getSum(allData); //total number of training records
for(int i=0; i allData2) {
// Sums the record counts of all classes (size of the training set).
Long allSum = 0L;
for (CountAll countAll : allData2) {
log.info("类别:"+countAll.getK()+"数据:"+myString(countAll.getValue())+"总数:"+countAll.getSum());
allSum += countAll.getSum();
}
return allSum;
}
/***
 * Picks the most probable class for one test record.
 * @param test one test record (attribute values)
 * @param labelG prior probability of each class label
 * @return the winning class label
 */
private int getClasify(int[] test,HashMap labelG ) {
double[] result = new double[allData.size()]; //one score per class
for(int i = 0; i maxValue){
// maxValue = result[i];
// index = i;
// }
// }
// return allData.get(index).getK();
// NOTE(review): the generic argmax above is commented out; the live code
// assumes exactly two classes (indices 0 and 1) — confirm that holds.
if(result[0] > result[1]){
return 0;
}else{
return 1;
}
}
/***
 * Reads the test file from HDFS into testData, one int[] per CSV line.
 * @param fs filesystem holding the test file
 * @param path path of the test file
 * @throws NumberFormatException if a field is not an integer
 * @throws IOException on read failure
 */
private void readTestData(FileSystem fs, Path path) throws NumberFormatException, IOException {
FSDataInputStream data = fs.open(path);
BufferedReader bf = new BufferedReader(new InputStreamReader(data));
String line = "";
while ((line = bf.readLine()) != null) {
String[] str = line.split(",");
int[] myData = new int[str.length];
for(int i=0;i
CountAll:
/**
 * Per-class aggregate used by the Bayes reducer: the class label {@code k},
 * the probability of each attribute taking the value 1 within that class,
 * and the number of training records observed for the class.
 */
public class CountAll {
    /** Class label. */
    private int k;
    /** Number of training records belonging to class {@code k}. */
    private Long sum;
    /** Per-attribute probability of the value 1 within class {@code k}. */
    private Double[] value;

    /** No-arg constructor; populate via the setters. */
    public CountAll() {
    }

    /**
     * @param sum   record count for the class
     * @param value per-attribute probabilities of 1
     * @param k     class label
     */
    public CountAll(Long sum, Double[] value, int k) {
        this.sum = sum;
        this.value = value;
        this.k = k;
    }

    public int getK() {
        return k;
    }

    public void setK(int k) {
        this.k = k;
    }

    public Long getSum() {
        return sum;
    }

    public void setSum(Long sum) {
        this.sum = sum;
    }

    public Double[] getValue() {
        return value;
    }

    public void setValue(Double[] value) {
        this.value = value;
    }
}
由于在计算P(a1|c) * P(a2|c)...*P(an|c)时,如果每个p值都比较小,当属性很多时会出现精度损失(下溢)的情况,也就是最后每个类别算出的概率都会为0。这里将其取对数,转换为ln(P(a1|c) * P(a2|c)...*P(an|c)) = ln(P(a1|c)) + ln(P(a2|c)) + ...+ln(P(an|c)),可以避免出现精度损失这种情况。
训练数据中的一部分如下:
1,0,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1
1,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1
1,1,0,1,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1
1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,1
1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1
1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,0,0,1,0,1,1,0,0
1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1
1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1
1,1,0,1,1,0,0,1,1,1,0,1,1,1,1,1,1,0,1,1,0,1,1
1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,0,1,1,1,0,1,1,1
1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,1,0
1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,1,1
1,1,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0
1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
验证数据中的一部分如下:
1,1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,0,0
1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1
1,0,1,1,1,0,0,1,0,1,0,0,1,1,1,0,1,0,0,0,0,1,0
1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,1
1,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,1
1,1,0,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,0,1,1,0,1,0,0,0
1,1,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,1,0,0,0
1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1
1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0
1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0
1,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0
1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,1
1,1,0,0,0,0,1,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,1,0,1,1,0,1,0,1,1,0,0,1,0,0,0,1,1,0
1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0
1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,1
1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,0,1,1,0
1,1,1,0,1,1,1,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
1,1,1,1,1,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,0,0