贝叶斯 算法 实例 scala

package mlia.bayes

import breeze.linalg._

/** Data preparation helpers for the naive Bayes example: a toy data set,
  * vocabulary construction and document-to-vector encoders.
  */
object Prep {

  /** Returns a small hard-coded training set.
    *
    * @return a pair of (tokenised posts, class labels); the label vector holds
    *         one 0/1 entry per post, in the same order as the posts
    */
  def loadDataSet: (Array[Array[String]], Vector[Int]) = {

    val postingList = Array(
      Array("my", "dog", "has", "flea", "problems", "help", "please"),
      Array("maybe", "not", "take", "him", "to", "dog", "park", "stupid"),
      Array("my", "dalmation", "is", "so", "cute", "I", "love", "him"),
      Array("stop", "posting", "stupid", "worthless", "garbage"),
      Array("mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"),
      Array("quit", "buying", "worthless", "dog", "food", "stupid"))

    val classVec = DenseVector(0, 1, 0, 1, 0, 1)

    (postingList, classVec)
  }

  /** Builds the vocabulary: every distinct word of `dataSet`, in order of first appearance. */
  def createVocabList(dataSet: Array[Array[String]]): Array[String] = dataSet.flatten.distinct

  /** Maps each vocabulary word to the index of its FIRST occurrence in `vocabList`
    * (the same index `indexOf` would return), so callers can resolve a word with a
    * single hash lookup instead of repeated linear scans.
    */
  private def firstIndexByWord(vocabList: Array[String]): Map[String, Int] =
    vocabList.iterator.zipWithIndex.foldLeft(Map.empty[String, Int]) {
      case (acc, (word, idx)) => if (acc.contains(word)) acc else acc.updated(word, idx)
    }

  /** Set-of-words encoding: position i of the result is 1 when `vocabList(i)`
    * occurs in `inputSet` (regardless of how often) and 0 otherwise.
    *
    * Words missing from the vocabulary are reported on stdout and otherwise ignored.
    *
    * @param vocabList the vocabulary that defines the vector positions
    * @param inputSet  the tokens of the document to encode
    * @return a vector of length `vocabList.length` containing only 0s and 1s
    */
  def setOfWords2Vec(vocabList: Array[String], inputSet: Array[String]): DenseVector[Int] = {
    val indexByWord = firstIndexByWord(vocabList)
    val returnVec: DenseVector[Int] = DenseVector.zeros[Int](vocabList.length)
    inputSet.foreach { word =>
      indexByWord.get(word) match {
        case Some(idx) => returnVec(idx) = 1
        case None      => println(s"the word: $word is not in my Vocabulary!")
      }
    }
    returnVec
  }

  /** Bag-of-words encoding: position i of the result counts how many times
    * `vocabList(i)` occurs in `inputSet`.
    *
    * Words missing from the vocabulary are reported on stdout and otherwise ignored.
    *
    * @param vocabList the vocabulary that defines the vector positions
    * @param inputSet  the tokens of the document to encode
    * @return a vector of length `vocabList.length` holding per-word occurrence counts
    */
  def bagOfWords2VecMN(vocabList: Array[String], inputSet: Array[String]): DenseVector[Int] = {
    val indexByWord = firstIndexByWord(vocabList)
    val counts: DenseVector[Int] = DenseVector.zeros[Int](vocabList.length)
    inputSet.foreach { word =>
      indexByWord.get(word) match {
        case Some(idx) => counts(idx) = counts(idx) + 1
        case None      => println(s"the word: $word is not in my Vocabulary!")
      }
    }
    counts
  }
}
package mlia.bayes

import breeze.linalg._
import breeze.numerics._

/** A two-class naive Bayes trainer and classifier over word-count vectors. */
object NaiveBayes {

  /** Accumulated word statistics for one class.
    *
    * @param num   per-word occurrence counts (numerator of the conditional probability)
    * @param denom total word count for the class (denominator)
    */
  case class Prob(num: Vector[Int], denom: Double) {

    /** Per-word conditional probability: `num / denom`, element-wise. */
    def probability: Vector[Double] = num.mapValues(_.toDouble) :/ denom

    /** Natural logarithm of [[probability]], element-wise. */
    def logProbability: Vector[Double] = log(num.mapValues(_.toDouble) :/ denom)
  }

  object Prob {

    /** Initial statistics for a vocabulary of `size` words.
      *
      * Every count starts at 1 and the denominator at 2.0, so that a word never
      * seen in a class does not end up with probability 0.
      */
    def apply(size: Int): Prob = Prob(DenseVector.ones[Int](size), 2.0d) // avoid 0
  }

  /** Trains the classifier from a document/word matrix.
    *
    * @param trainMatrix   one row per training document, one column per vocabulary word
    * @param trainCategory one label per row of `trainMatrix`; rows labelled 1 feed the
    *                      class-1 statistics, every other row the class-0 statistics
    * @return (class-0 statistics, class-1 statistics, prior of class 1), where the prior
    *         is the sum of the labels divided by the number of documents
    * @throws IllegalArgumentException if `trainMatrix` has no rows, or if the number of
    *                                  labels differs from the number of rows
    */
  def trainNB0(trainMatrix: DenseMatrix[Int], trainCategory: Vector[Int]): (Prob, Prob, Double) = {

    val numTrainDocs = trainMatrix.rows
    val numWords = trainMatrix.cols

    // Without documents the prior below would be 0 / 0.0 = NaN.
    require(numTrainDocs > 0, "trainMatrix must contain at least one document")
    // Extra labels would silently skew the prior; missing ones would fail mid-fold.
    require(
      trainCategory.length == numTrainDocs,
      s"expected $numTrainDocs labels (one per document) but got ${trainCategory.length}")

    // The fold state is (class-1 statistics, class-0 statistics).
    val (class1, class0) =
      (0 until numTrainDocs).foldLeft((Prob(numWords), Prob(numWords))) {
        case ((p1, p0), i) =>
          val v: Vector[Int] = trainMatrix(i, ::).toDenseVector // [0, 1, 0, 0, 1, 0...]
          // Add the document's word counts to the statistics of its class.
          if (trainCategory(i) == 1) (Prob(p1.num + v, p1.denom + v.sum), p0)
          else (p1, Prob(p0.num + v, p0.denom + v.sum))
      }

    // Returned order: class 0, class 1, prior of class 1.
    (class0, class1, trainCategory.sum / numTrainDocs.toDouble)
  }

  /** Classifies a document vector.
    *
    * Each class score is the sum of the element-wise product of the document vector with
    * that class's weight vector, plus the log of the class prior. Because the scores are
    * combined additively with a log prior, `p0Vec` / `p1Vec` are presumably per-word
    * log-probabilities (e.g. `Prob.logProbability`) - confirm against the caller.
    *
    * @param vec2Classify word vector of the document to classify
    * @param p0Vec        per-word weights for class 0
    * @param p1Vec        per-word weights for class 1
    * @param pClass1      prior probability of class 1
    * @return 1 when the class-1 score is strictly greater than the class-0 score, else 0
    */
  def classifyNB(vec2Classify: Vector[Int], p0Vec: Vector[Double], p1Vec: Vector[Double], pClass1: Double): Int = {
    // Convert once; both class scores use the same feature vector.
    val features = vec2Classify.mapValues(_.toDouble)
    val p1 = (features :* p1Vec: Vector[Double]).sum + log(pClass1)
    val p0 = (features :* p0Vec: Vector[Double]).sum + log(1.0 - pClass1)
    if (p1 > p0) 1 else 0
  }
}

你可能感兴趣的:(算法)