  • 1.基于 Spark 的 Java API 来实现
  • 2.基于 Spark 的 Scala API 来实现


1. 先构建分类器

        long trainingDataSize = training.count();

        JavaPairRDD, Integer> pairs = training.flatMapToPair(new PairFlatMapFunction, Integer>() {
            public Iterator, Integer>> call(String rec) throws Exception {
                List, Integer>> result =
                        new ArrayList, Integer>>();
                String[] tokens = rec.split(",");
                int classification = tokens.length - 1;
                String theClassfication = tokens[classification];
                for (int i = 0; i < (classification - 1); i++) {
                    Tuple2 K = new Tuple2<>(tokens[i], theClassfication);
                    result.add(new Tuple2, Integer>(K, 1));
                Tuple2 V = new Tuple2<>("CLASS", theClassfication);
                result.add(new Tuple2<>(V, 1));
                return result.iterator();
        JavaPairRDD, Integer> counts = pairs.reduceByKey(new Function2() {
            public Integer call(Integer i1, Integer i2) throws Exception {
                return i1 + i1;
        // Bring the aggregated counts to the driver as a lookup map.
        Map<Tuple2<String, String>, Integer> countsMap = counts.collectAsMap();

        // Probability table:
        //   ("CLASS", c) -> count(c) / trainingDataSize
        //   (value, c)   -> count(value, c) / count(c)
        HashMap<Tuple2<String, String>, Double> PT = new HashMap<>();
        // Class labels encountered in the training data.
        List<String> CLASSIFICATIONS = new ArrayList<>();

        for (Map.Entry<Tuple2<String, String>, Integer> entry : countsMap.entrySet()) {
            Tuple2<String, String> k = entry.getKey();
            String classification = k._2;
            if ("CLASS".equals(k._1)) {
                // Divide as doubles; integer division would truncate the probability to 0.
                PT.put(k, entry.getValue() / (double) trainingDataSize);
                CLASSIFICATIONS.add(classification);
            } else {
                Tuple2<String, String> k2 = new Tuple2<>("CLASS", classification);
                Integer count = countsMap.get(k2);
                if (count == null) {
                    // No record carries this class label: the conditional probability is 0.
                    PT.put(k, 0.0);
                } else {
                    PT.put(k, entry.getValue() / (double) count.intValue());
                }
            }
        }
        List> list = toWritableList(PT);
        JavaRDD> ptRDD = ctx.parallelize(list);
        ptRDD.saveAsHadoopFile("/naivebayes/pt",              // name of path
                PairOfStrings.class,              // key class
                DoubleWritable.class,             // value class
                SequenceFileOutputFormat.class    // output format class
        JavaRDD classificationsRDD = ctx.parallelize(CLASSIFICATIONS);
        classificationsRDD.saveAsTextFile("/naivebayes/classes"); // name of path

        // done

     static List> toWritableList(HashMap, Double> pt) {
         List> list =
                 new ArrayList>();
         for(Map.Entry,Double> entry : pt.entrySet()){
             list.add(new Tuple2( new PairOfStrings(entry.getKey()._1,entry.getKey()._2);
             new DoubleWritable(entry.getValue());
         return list;

2. 测试训练分类器

        JavaPairRDD ptRDD = ctx.hadoopFile(nbProbTablePath, SequenceFileInputFormat.class, PairOfStrings.class, DoubleWritable.class);

        JavaPairRDD, Double> classifierRDD = ptRDD.mapToPair(
                new PairFunction<
                        Tuple2, // T
                        Tuple2,                // K2,
                        Double                                // V2
                        >() {
                    public Tuple2,Double> call(Tuple2 rec) {
                        PairOfStrings pair = rec._1;
                        Tuple2 K2 = new Tuple2(pair.getLeftElement(), pair.getRightElement());
                        Double V2 = rec._2.get();
                        return new Tuple2,Double>(K2, V2);
        Map, Double> classifier = classifierRDD.collectAsMap();
        Broadcast, Double>> broadcastClassifier = ctx.broadcast(classifier);

        JavaRDD classesRDD = ctx.textFile("/naivebayes/classes", 1);
        List CLASSES = classesRDD.collect();
        final Broadcast> broadcastClasses = ctx.broadcast(CLASSES);

        JavaPairRDD classified =  newdata.mapToPair(new PairFunction() {
            public Tuple2 call(String rec) throws Exception {
                Map, Double> CLASSIFIER = broadcastClassifier.value();
                List CLASSES = broadcastClasses.value();

                String[] arrtibutes = rec.split(",");
                String selectedClass = null;
                double maxPosterior = 0.0;
                for(String aClass : CLASSES){
                    double posterior = CLASSIFIER.get(new Tuple2<>("CLASS",aClass));
                    for (int i=0;i(arrtibutes[i], aClass));
                        if(probability == null){
                        if (selectedClass ==null){
                            maxPosterior = posterior;
                            selectedClass = aClass;
                            if (posterior> maxPosterior){
                                maxPosterior = posterior;
                                selectedClass = aClass;

                return new Tuple2(rec, selectedClass);


1. 先构建分类器

 val training = sc.textFile(input)
    // Number of training records. count() is a Spark action, so it is called with explicit
    // method syntax rather than as a postfix operator.
    val traningDataSize = training.count()

    // Split each training record into counting pairs:
    //   (("CLASS", class), 1) once per record and ((value, class), 1) once per attribute value.
    val pairs = training.flatMap { line =>
      val tokens = line.split(",")
      // The last field of the record is the class label; all preceding fields are attributes.
      val theClassification = tokens.last
      (("CLASS", theClassification), 1) :: tokens.init.map(token => ((token, theClassification), 1)).toList
    }

    // Sum the occurrence counts per key.
    val counts = pairs.reduceByKey(_ + _)

    // Collect the counts to the driver and turn them into probabilities:
    //   ("CLASS", c) -> count(c) / traningDataSize
    //   (value, c)   -> count(value, c) / count(c), or 0 when the class count is missing.
    val countsAsMap = counts.collectAsMap()
    val pt = countsAsMap.map { case (key @ (attribute, classification), count) =>
      if (attribute == "CLASS") {
        (key, count / traningDataSize.toDouble)
      } else {
        val classCount = countsAsMap.getOrElse(("CLASS", classification), 0)
        if (classCount == 0) (key, 0d) else (key, count / classCount.toDouble)
      }
    }
    // Distribute the probability table as an RDD.
    val ptRDD = sc.parallelize(pt.toList)

    // Print every table entry on the driver as "left,right,probability".
    pt.foreach { case ((left, right), probability) =>
      println(s"$left,$right,$probability")
    }

    // Persist the table in Spark's object-file format.
    // NOTE(review): the classification snippet reads a classes file via classesPath, but nothing
    // visible here writes one - confirm where it is produced.
    ptRDD.saveAsObjectFile(output + "/naivebayes/pt")

2. 对数据进行分类

 val newdata = sc.textFile(input)

    // Probability table saved as an object file: ((left, right), probability) entries.
    val classifierRDD = sc.objectFile[((String, String), Double)](nbProbabilityTablePath)
    // Class labels, one per line.
    val classesRDD = sc.textFile(classesPath)

    // Collect the table to the driver and broadcast it to the executors.
    val classifier = classifierRDD.collectAsMap()
    val broadcastClassifier = sc.broadcast(classifier)

    // Broadcast the collected class labels as well.
    val broadcastClasses = sc.broadcast(classesRDD.collect())

    val classified = newdata.map(rec =>{
      val classifier = broadcastClassifier.value
      val classes = broadcastClasses.value
      val attributes = rec.split(",")

      val class_score = classes.map(aClass =>{
        val posterior = classifier.getOrElse(("CLASS", aClass), 1d)
        val probabilitied = attributes.map(attribute =>{
      val maxClass = class_score.maxBy(_._2)

