// weights: the weight of each class
// means: the mean of each class
// sigmas: the covariance matrix of each class
package org.apache.spark.mllib.clustering
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, Vector => BV}
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Vectors, BLAS}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils
/**
 * Created by fhqplzj on 16-7-29 at 3:29 PM.
 *
 * Mutable, serializable accumulator holding a running log-likelihood together
 * with per-class weight, mean and sigma sums. The arrays are indexed by class
 * and are updated in place.
 *
 * @param logLikelihood running log-likelihood total
 * @param weights       per-class weight accumulators
 * @param means         per-class mean accumulators
 * @param sigmas        per-class matrix accumulators
 */
class MyExpectationSum(
    var logLikelihood: Double,
    val weights: Array[Double],
    val means: Array[BDV[Double]],
    val sigmas: Array[BDM[Double]]) extends Serializable {

  /**
   * Folds another accumulator into this one, entry by entry, mutating this
   * instance's arrays and Breeze containers in place.
   *
   * @param x the accumulator whose contents are added to this one
   * @return this instance, after the update
   */
  def +=(x: MyExpectationSum): MyExpectationSum = {
    val count = weights.length
    var idx = 0
    while (idx < count) {
      weights(idx) += x.weights(idx)
      means(idx) += x.means(idx)
      sigmas(idx) += x.sigmas(idx)
      idx += 1
    }
    // The scalar total is merged only after every per-class entry has been merged.
    logLikelihood += x.logLikelihood
    this
  }
}
object MyExpectationSum {

  /**
   * Builds an all-zero accumulator.
   *
   * @param k number of entries in each per-class array
   * @param d length of each mean vector and side length of each square sigma matrix
   * @return a new accumulator with a log-likelihood of 0 and zero-filled containers
   */
  def zero(k: Int, d: Int): MyExpectationSum = {
    val zeroWeights = Array.fill(k)(0.0)
    val zeroMeans = Array.fill(k)(BDV.zeros[Double](d))
    val zeroSigmas = Array.fill(k)(BDM.zeros[Double](d, d))
    new MyExpectationSum(0.0, zeroWeights, zeroMeans, zeroSigmas)
  }

  /**
   * Adds the contribution of a single point `x` to `sums`, mutating `sums` in place.
   *
   * For every (weight, distribution) pair the value
   * `MLUtils.EPSILON + weight * dist.pdf(x)` is computed; the log of the total of
   * these values is added to `sums.logLikelihood`, and each value divided by that
   * total is then used to update the matching weight, mean and sigma entry.
   *
   * @param weights per-class weights, paired positionally with `dists`
   * @param dists   per-class Gaussian distributions
   * @param sums    the accumulator to update
   * @param x       the point being added
   * @return the same `sums` instance that was passed in
   */
  def add(weights: Array[Double], dists: Array[MultivariateGaussian])
         (sums: MyExpectationSum, x: BV[Double]): MyExpectationSum = {
    val scores = weights.zip(dists).map { pair =>
      val (w, gaussian) = pair
      MLUtils.EPSILON + w * gaussian.pdf(x)
    }
    val total = scores.sum
    sums.logLikelihood += math.log(total)

    var j = 0
    while (j < scores.length) {
      val share = scores(j) / total
      sums.weights(j) += share
      sums.means(j) += x * share
      // NOTE(review): this relies on Matrices.fromBreeze wrapping (not copying) the
      // Breeze matrix's storage so that the BLAS.syr update (per Spark's BLAS:
      // A := alpha * x * x^T + A) lands in sums.sigmas(j) -- confirm for the Spark
      // version in use.
      BLAS.syr(
        share,
        Vectors.fromBreeze(x),
        Matrices.fromBreeze(sums.sigmas(j)).asInstanceOf[DenseMatrix])
      j += 1
    }
    sums
  }
}