P(C|X) = P(X|C) P(C) / P(X)

Since P(X) is the same for every class it can be dropped, and the naive (conditional independence) assumption lets P(X|C) factor into a product over the individual attributes:

argmax_C P(C|X) = argmax_C P(X|C) P(C) = argmax_C P(C) ∏_i P(Xi|C)
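As a small illustration with made-up numbers: suppose two classes have priors P(C1) = 0.6 and P(C2) = 0.4, and a record X = (x1, x2) has conditional probabilities P(x1|C1) = 0.2, P(x2|C1) = 0.5, P(x1|C2) = 0.4, P(x2|C2) = 0.3. Then P(C1) ∏ P(xi|C1) = 0.6 × 0.2 × 0.5 = 0.06 and P(C2) ∏ P(xi|C2) = 0.4 × 0.4 × 0.3 = 0.048, so the classifier selects C1.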
MapReduce solution

Phase 1: build the classifier from the training data
// key: ignored; value: one training record containing the attribute values followed by the class label
map(key, value) {
    String[] tokens = value.split(",");
    int classIndex = tokens.length - 1;
    String theClass = tokens[classIndex];
    // emit one (attribute, class) pair per attribute
    for (int i = 0; i < classIndex; i++) {
        String reducerKey = tokens[i] + "," + theClass;
        emit(reducerKey, 1);
    }
    // emit one class-count marker pair per record
    String reducerKey = "CLASS," + theClass;
    emit(reducerKey, 1);
}
reduce(key, values) {
    int total = 0;
    for (int value : values) {
        total += value;
    }
    emit(key, total);
}
// the final result is a probability table: the probability of each attribute value occurring in each class
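For example, with a hypothetical training record "sunny,hot,Play", the mapper emits ("sunny,Play", 1), ("hot,Play", 1) and the class-count pair ("CLASS,Play", 1). After the reducer sums the counts, dividing each (attribute, class) count by its class count gives P(Xi|C), and dividing each class count by the total number of records gives the prior P(C).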
Phase 2: classify new data with the classifier
// key: ignored; value: one record to classify, "x1,x2,...,xm"
map(key, value) {
    emit(value, 1);
}
public class NaiveBayesClassifierReducer ... {
    // probability table and class list built in setup() from the phase-1 output
    private theProbabilityTable = ...;
    private List<String> classifications = ...;

    public void setup() {
        theProbabilityTable = buildTheProbabilityTable();
        classifications = buildClassifications();
    }

    reduce(key, values) {
        // key: (x1,x2,...,xm), the record to classify
        String[] attributes = key.split(",");
        String selectedClass = null;
        double maxPosterior = 0.0;
        for (String aClass : classifications) {
            // start from the class prior, then multiply in each attribute's conditional probability
            double posterior = theProbabilityTable.getClassProbability(aClass);
            for (int i = 0; i < attributes.length; i++) {
                posterior *= theProbabilityTable.getConditionalProbability(attributes[i], aClass);
            }
            if (selectedClass == null) {
                selectedClass = aClass;
                maxPosterior = posterior;
            } else if (posterior > maxPosterior) {
                selectedClass = aClass;
                maxPosterior = posterior;
            }
        }
        String reducerOutputValue = selectedClass + "," + maxPosterior;
        emit(key, reducerOutputValue);
    }
}

Naive Bayes with continuous numeric attributes: when computing P(Xi|C) for a continuous attribute, a Gaussian distribution can be used. First estimate the Gaussian parameters (mean and variance) of attribute Xi for class C from the training data; for a test record, plugging the attribute value into that Gaussian density gives the conditional probability of the single attribute.
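A minimal sketch of that Gaussian density computation, assuming the per-class mean and variance have already been estimated from the training data (the helper name and signature are illustrative, not part of the original):

// Hypothetical helper: models P(Xi = x | C) with a normal density whose mean and
// variance were estimated from the attribute-Xi values of the class-C training records.
public static double gaussianConditionalProbability(double x, double mean, double variance) {
    double exponent = -((x - mean) * (x - mean)) / (2.0 * variance);
    return Math.exp(exponent) / Math.sqrt(2.0 * Math.PI * variance);
}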
Spark implementation of the naive Bayes classifier

Phase 1: build the Bayes classifier

public class BuildNaiveBayesClassifier implements java.io.Serializable {

    // convert the in-memory probability table into Hadoop writable pairs so it can be saved
    static List<Tuple2<PairOfStrings, DoubleWritable>> toWritableList(Map<Tuple2<String, String>, Double> PT) {
        List<Tuple2<PairOfStrings, DoubleWritable>> list = new ArrayList<Tuple2<PairOfStrings, DoubleWritable>>();
        for (Map.Entry<Tuple2<String, String>, Double> entry : PT.entrySet()) {
            list.add(new Tuple2<PairOfStrings, DoubleWritable>(
                    new PairOfStrings(entry.getKey()._1, entry.getKey()._2),
                    new DoubleWritable(entry.getValue())));
        }
        return list;
    }

    public static void main(String[] args) throws Exception {
        // handle input parameters
        final String trainingDataFilename = args[0];

        // create the Spark context object
        JavaSparkContext ctx = SparkUtil.createJavaSparkContext("naive-bayes");

        // read the training data
        JavaRDD<String> training = ctx.textFile(trainingDataFilename);
        long trainingDataSize = training.count();

        // map over every element of the training data
        JavaPairRDD<Tuple2<String, String>, Integer> pairs = training.flatMapToPair(
                new PairFlatMapFunction<String, Tuple2<String, String>, Integer>() {
            public Iterable<Tuple2<Tuple2<String, String>, Integer>> call(String s) {
                List<Tuple2<Tuple2<String, String>, Integer>> result =
                        new ArrayList<Tuple2<Tuple2<String, String>, Integer>>();
                String[] tokens = s.split(",");
                int classificationIndex = tokens.length - 1;
                String theClassification = tokens[classificationIndex];
                // one ((attribute, class), 1) pair per attribute
                for (int i = 0; i < classificationIndex; i++) {
                    Tuple2<String, String> K = new Tuple2<String, String>(tokens[i], theClassification);
                    result.add(new Tuple2<Tuple2<String, String>, Integer>(K, 1));
                }
                // one (("CLASS", class), 1) pair per record to count class frequencies
                Tuple2<String, String> K = new Tuple2<String, String>("CLASS", theClassification);
                result.add(new Tuple2<Tuple2<String, String>, Integer>(K, 1));
                return result;
            }
        });

        // reduce over every element of the training data
        JavaPairRDD<Tuple2<String, String>, Integer> counts = pairs.reduceByKey(
                new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

        // collect the reducer data as a map
        Map<Tuple2<String, String>, Integer> countsAsMap = counts.collectAsMap();

        // build the classifier data structures: the probability table and the class list
        Map<Tuple2<String, String>, Double> PT = new HashMap<Tuple2<String, String>, Double>();
        List<String> CLASSIFICATIONS = new ArrayList<String>();
        for (Map.Entry<Tuple2<String, String>, Integer> entry : countsAsMap.entrySet()) {
            Tuple2<String, String> k = entry.getKey();
            String classification = k._2;
            if (k._1.equals("CLASS")) {
                // class prior P(C) = class count / total number of training records
                PT.put(k, ((double) entry.getValue()) / ((double) trainingDataSize));
                CLASSIFICATIONS.add(k._2);
            } else {
                // conditional probability P(Xi|C) = (attribute, class) count / class count
                Tuple2<String, String> k2 = new Tuple2<String, String>("CLASS", classification);
                Integer count = countsAsMap.get(k2);
                if (count == null) {
                    PT.put(k, 0.0);
                } else {
                    PT.put(k, ((double) entry.getValue()) / ((double) count.intValue()));
                }
            }
        }

        // save the classifier
        List<Tuple2<PairOfStrings, DoubleWritable>> list = toWritableList(PT);
        JavaPairRDD<PairOfStrings, DoubleWritable> ptRDD = ctx.parallelizePairs(list);
        ptRDD.saveAsHadoopFile("...", PairOfStrings.class, DoubleWritable.class, SequenceFileOutputFormat.class);
        JavaRDD<String> classificationsRDD = ctx.parallelize(CLASSIFICATIONS);
        classificationsRDD.saveAsTextFile("...");
    }
}

Phase 2: classify the new data with the classifier

public class NaiveBayesClassifier implements java.io.Serializable {
    public static void main(String[] args) throws Exception {
        // handle input parameters
        ...

        // create the Spark context object
        JavaSparkContext ctx = SparkUtil.createJavaSparkContext("naive-bayes");

        // read the new data to be classified
        JavaRDD<String> newdata = ctx.textFile(...);

        // read the classifier from Hadoop
        JavaPairRDD<PairOfStrings, DoubleWritable> ptRDD = ctx.hadoopFile(
                path,
                SequenceFileInputFormat.class,
                PairOfStrings.class,
                DoubleWritable.class);
        JavaPairRDD<Tuple2<String, String>, Double> classifierRDD = ptRDD.mapToPair(
                new PairFunction<Tuple2<PairOfStrings, DoubleWritable>, Tuple2<String, String>, Double>() {
            public Tuple2<Tuple2<String, String>, Double> call(Tuple2<PairOfStrings, DoubleWritable> rec) {
                PairOfStrings pair = rec._1;
                Tuple2<String, String> k2 = new Tuple2<String, String>(pair.getLeftElement(), pair.getRightElement());
                Double v2 = new Double(rec._2.get());
                return new Tuple2<Tuple2<String, String>, Double>(k2, v2);
            }
        });

        // cache the classifier components so that any node in the cluster can use them
        Map<Tuple2<String, String>, Double> classifier = classifierRDD.collectAsMap();
        final Broadcast<Map<Tuple2<String, String>, Double>> broadcastClassifier = ctx.broadcast(classifier);
        JavaRDD<String> classesRDD = ctx.textFile(...);
        List<String> CLASSES = classesRDD.collect();
        final Broadcast<List<String>> broadcastClasses = ctx.broadcast(CLASSES);

        // classify the new data
        JavaPairRDD<String, String> classified = newdata.mapToPair(
                new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String rec) {
                Map<Tuple2<String, String>, Double> CLASSIFIER = broadcastClassifier.value();
                List<String> CLASSES = broadcastClasses.value();
                String[] attributes = rec.split(",");
                String selectedClass = null;
                double maxPosterior = 0.0;
                for (String aClass : CLASSES) {
                    // start from the class prior P(C)
                    double posterior = CLASSIFIER.get(new Tuple2<String, String>("CLASS", aClass));
                    for (int i = 0; i < attributes.length; i++) {
                        Double probability = CLASSIFIER.get(new Tuple2<String, String>(attributes[i], aClass));
                        if (probability == null) {
                            // attribute value never seen with this class in the training data
                            posterior = 0.0;
                            break;
                        } else {
                            posterior *= probability.doubleValue();
                        }
                    }
                    if (selectedClass == null) {
                        selectedClass = aClass;
                        maxPosterior = posterior;
                    } else if (posterior > maxPosterior) {
                        selectedClass = aClass;
                        maxPosterior = posterior;
                    }
                }
                return new Tuple2<String, String>(rec, selectedClass);
            }
        });
    }
}

Spark MLlib integrates commonly used machine learning algorithms, which can be called directly:

JavaRDD<LabeledPoint> training = ...;
JavaRDD<LabeledPoint> test = ...;

// given an RDD of (label, features) pairs, train a naive Bayes model
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), theSmoothingParameter);

// predict each test record and pair the prediction with the true label
JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair(
        new PairFunction<LabeledPoint, Double, Double>() {
    @Override
    public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
    }
});

// accuracy = fraction of test records whose prediction matches the true label
double accuracy = 1.0 * predictionAndLabel.filter(
        new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
        return pl._1().equals(pl._2());
    }
}).count() / test.count();
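The training and test RDDs above are elided in the text; the sketch below shows one way they might be built, assuming every input line is a list of nonnegative numeric features followed by a numeric label (the toLabeledPoints helper and that input format are assumptions, not part of the original):

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;

// Hypothetical helper: parse "f1,f2,...,fm,label" lines into LabeledPoints.
// MLlib's multinomial naive Bayes expects nonnegative feature values.
static JavaRDD<LabeledPoint> toLabeledPoints(JavaRDD<String> lines) {
    return lines.map(new Function<String, LabeledPoint>() {
        public LabeledPoint call(String line) {
            String[] tokens = line.split(",");
            double[] features = new double[tokens.length - 1];
            for (int i = 0; i < features.length; i++) {
                features[i] = Double.parseDouble(tokens[i]);
            }
            double label = Double.parseDouble(tokens[tokens.length - 1]);
            return new LabeledPoint(label, Vectors.dense(features));
        }
    });
}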