Implementing a parallel k-means algorithm on Spark

Compared with the sequential k-means implemented in Scala in my previous post, this version improves on it in two ways: the computation itself runs in parallel, and several k-means runs (each starting from different initial centers) are executed at the same time, with the lowest-cost result chosen as the final output.

As a beginner, this exercise gave me a first real taste of the appeal of functional programming and a first look at parallel computation; there is still a long way to go.
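The core idea is simple: run k-means several times from different random initial centers and keep the run whose total cost is the lowest. Here is a minimal sketch of just that selection step, outside of Spark (bestOfRuns and kmeansOnce are hypothetical names used only for illustration):

// Run several independent k-means attempts and keep the cheapest one.
// kmeansOnce is a hypothetical single-run k-means that returns (centers, total cost).
def bestOfRuns(points: Seq[Array[Double]], k: Int, runs: Int,
               kmeansOnce: (Seq[Array[Double]], Int) => (Seq[Array[Double]], Double))
    : (Seq[Array[Double]], Double) =
  (0 until runs).map(_ => kmeansOnce(points, k)) // each attempt starts from its own random centers
    .minBy(_._2)                                 // _._2 is the total cost; keep the smallest

In the Spark version below, all of the runs are advanced together inside a single pass over the data per iteration, instead of one after another.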

package zzl

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.mllib.linalg.Vectors

//Case class holding a vector together with its L2 norm
case class VectorInform(var Point:Vector,var norm2:Double)
//Case class holding the id of the closest center for a point and the cost (squared distance) to that center
case class CenterInform(val center_id:Int,val cost:Double)
class Kmeans(val data:RDD[Vector],val numClusters:Int,val MaxIterations:Int,val runs:Int = 1,
    val threshold:Double=1e-4,val savepath:String="/home/hadoop/haha")extends Serializable{
  def output(data:Array[Array[VectorInform]])
  {
    data.foreach {_.foreach {x=>x.Point.foreachActive((index,value)=>print(index+" "+value+"  "));println}}
  }
  //Returns the element-wise sum of two vectors
  def add(p1:Vector,p2:Vector):Vector=
  {
    var p3=new Array[Double](p1.size)
    for(i<- 0 until p1.size)
    {
      p3(i)=p1(i)+p2(i)   
    }
    Vectors.dense(p3)
  }
  //Pick the initial centers: sample numClusters*runs points and split them into one group of numClusters centers per run
  def InitCenterPoint(data:RDD[VectorInform]):Array[Array[VectorInform]]={
    var sample=data.takeSample(false, numClusters*runs)
    Array.tabulate(runs)(r=>sample.slice(numClusters*r, numClusters*(r+1)))
  }
  //For one run's set of centers, find which cluster the given point belongs to
  def FindClostCenter(center:Array[VectorInform],point:VectorInform):CenterInform=
  {
    var bestdistance=Double.PositiveInfinity
    var id=0
    for(i <- 0 until center.length)
    {
      var dist=Vectors.sqdist(center(i).Point, point.Point)
      if(dist<bestdistance)
      {
        bestdistance=dist
        id=i
      }
    }
    CenterInform(id,bestdistance)
  }
  //Merge two (sum vector, count) pairs produced by different partitions
  def plus(a:(Vector,Int),b:(Vector,Int)):(Vector,Int)=(add(a._1,b._1),a._2+b._2)
  //Divide a sum vector by a count to get the new center of a cluster
  def divide(sum:Vector,n:Int):Vector=
  {
    val p=new Array[Double](sum.size)
    if(n>0)
      for(i<- 0 until sum.size) p(i)=sum(i)/n//an empty cluster keeps a zero center
    Vectors.dense(p)
  }
  //Iterate until every run has converged or MaxIterations is reached,
  //then return the centers of the best run together with its cost
  def runAlgorithm(data:RDD[VectorInform]):(Array[VectorInform],Double)=
  {
    val sc=data.sparkContext
    var center=InitCenterPoint(data)//one group of numClusters centers per run
    var activeRuns=ArrayBuffer.tabulate(runs)(i=>i)//ids of the runs that are still iterating
    var runactive=Array.fill(runs)(true)
    var cost=Array.fill(runs)(0.0)
    var iteration=0
    while(iteration<MaxIterations && activeRuns.nonEmpty)
    {
      iteration+=1
      var cost2=activeRuns.map{i=>sc.accumulator(0.0)}//one cost accumulator per active run
      var activecenter=activeRuns.map { x => center(x)}//important: only the runs that have not converged yet are computed in this iteration
      var bestcenter=sc.broadcast(activecenter)//broadcast the active centers to every partition; the parallel computation starts right below
      //each partition processes its own points, for all active runs at once
      var result=data.mapPartitions{points=> 
        /*
         * Pull out the parameters needed inside this partition
         */
       val thiscenter=bestcenter.value //the broadcast centers, available inside every partition
       val runs= thiscenter.length//number of runs that are still active
       val n=thiscenter(0).length//number of centers per run
       val dims=thiscenter(0)(0).Point.size//dimensionality of the centers
       /*
        * For this partition: accumulate, per run and per cluster,
        * the sum of the assigned vectors and how many vectors were assigned
        */
       var sum=Array.fill(runs,n)(Vectors.zeros(dims))//sum of the vectors assigned to each cluster, per run
       var count=Array.fill(runs, n)(0)//number of vectors assigned to each cluster, per run
       points.foreach { point => 
       //assign this point once for every active run
        for(i<- 0 until runs)
          {
          val vp=FindClostCenter(thiscenter(i), point)
          count(i)(vp.center_id)+=1
          sum(i)(vp.center_id)=add(sum(i)(vp.center_id), point.Point)
          cost2(i)+=vp.cost
          }  
       }
       //emit one ((run, cluster), (sum, count)) record per run and per cluster for this partition
       val contribs=for(i<-0 until runs; j<-0 until n)
         yield{
         ((i,j),(sum(i)(j),count(i)(j)))
       }
      contribs.iterator
     }.reduceByKey((a,b)=>plus(a, b)).collectAsMap()//merge the partial sums across partitions and bring them to the driver
     
      /*
       * Update the centers and check the stopping condition for every active run
       */
      for((run,i)<-activeRuns.zipWithIndex)//careful here: some runs have already stopped, so run (the original run id) and i (the position among active runs) are not necessarily equal
      {
        var change=false
        for(j<- 0 until numClusters)
        {
          val (sum,n)=result((i,j))//vector sum and point count of cluster j in the i-th active run
          var newc=divide(sum, n)
          if(Vectors.sqdist(newc, center(run)(j).Point)>threshold)
            change=true
          center(run)(j).Point=newc
          }
        cost(run)=cost2(i).value//latest total cost of this run; cost2 is indexed by position among active runs, not by run id
        if(!change)
        {
          runactive(run)=false
          println("Run "+run+" has stopped")
        }
      }
      activeRuns=activeRuns.filter {runactive(_)}
    }
    /*
     * Among all runs, pick the centers with the smallest cost
     */
    var (mincost,bestrun)=cost.zipWithIndex.min
    (center(bestrun),mincost)
  }
  def run()
  {
    var norm2=data.map {Vectors.norm(_, 2)}
    var zipdata=data.zip(norm2).map(f=>new VectorInform(f._1,f._2))
    var (endcenter,cost)=runAlgorithm(zipdata)//the centers are initialized inside runAlgorithm
    println("-------------------------------")
    endcenter.foreach {x=>x.Point.foreachActive((a,b)=>print(b+" "));println}
    println("最小花费为:"+cost)
  }
}
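The part of the class that took me the longest to digest is the mapPartitions plus reduceByKey aggregation: each partition only produces partial (sum, count) records keyed by (run, cluster), and reduceByKey merges those partial records across partitions before anything is collected to the driver. The standalone sketch below shows the same pattern on made-up (clusterId, value) pairs; the names and data are invented purely for illustration and it is not part of the k-means code:

import org.apache.spark.{SparkConf, SparkContext}

object AggregationPatternDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("demo").setMaster("local"))
    // Toy data: (clusterId, value) pairs spread over two partitions.
    val assigned = sc.parallelize(Seq((0, 1.0), (0, 3.0), (1, 10.0), (1, 20.0)), 2)
    val merged = assigned
      .mapPartitions { iter =>
        // Partial (sum, count) per key inside this partition, like sum/count in the k-means code.
        val acc = scala.collection.mutable.Map[Int, (Double, Int)]().withDefaultValue((0.0, 0))
        iter.foreach { case (k, v) =>
          val (s, c) = acc(k)
          acc(k) = (s + v, c + 1)
        }
        acc.iterator
      }
      .reduceByKey { case ((s1, c1), (s2, c2)) => (s1 + s2, c1 + c2) } // merge across partitions
      .collectAsMap()                                                  // small result, safe to collect
    merged.foreach { case (k, (s, c)) => println("cluster " + k + ": mean = " + s / c) }
    sc.stop()
  }
}

The k-means class above does exactly this, except that the key is (run, cluster) and the merged value is a (vector sum, count) pair combined with plus.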

package zzl

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.mllib.linalg.{Vectors,Vector}
object Main {
 
   def main(args: Array[String]): Unit = {
    var sc=new SparkContext(new SparkConf().setAppName("zzl").setMaster("local"))
    var data=sc.textFile("/home/hadoop/xixi", 2).map { s =>Vectors.dense(s.split(" ").map {_.toDouble})}
    
    var k=new Kmeans(data,2,40,20)
    k.run()
    
    
   }
}
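For reference, the input file that Main reads (the path /home/hadoop/xixi above) is expected to hold one point per line, with the components separated by single spaces, because of the s.split(" ") in the parsing code. A hypothetical example with two well-separated groups of points:

1.0 1.2
0.9 1.1
1.1 0.8
8.0 8.2
7.9 8.1
8.2 7.8

With numClusters=2, the two centers should come out near (1.0, 1.0) and (8.0, 8.0).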
I am still a complete beginner, so if anything here is written incorrectly, please point it out!
