spark求topN问题的自定义分区器的实现

 

 

package com.thy.spark

import java.net.URL
import scala.collection.mutable
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object GroupFavTeacher3 {
  /** Computes the top-N favourite teachers per subject.
    *
    * Strategy: count (subject, teacher) pairs, then repartition with a custom
    * [[SubjectPartitioner]] so that all records of one subject land in one
    * partition; each partition can then be sorted independently in memory.
    */
  def main(args: Array[String]): Unit = {
    // How many top teachers to keep per subject.
    val topN = 3
    // Fixed: app name previously said "GroupFavTeacher2", inconsistent with this object.
    val conf = new SparkConf().setAppName("GroupFavTeacher3").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    // Each input line is a URL such as http://subject.example.com/teacherName
    val data: RDD[String] = sc.textFile("E:\\hdfsDemo\\teacher(1).log")
    // Renamed lambda param from `data` to `line` so it no longer shadows the RDD above.
    val subjectTeacherAndOne: RDD[((String, String), Int)] = data.map(line => {
      val index: Int = line.lastIndexOf("/")
      // Text after the last '/' is the teacher's name.
      val teacher = line.substring(index + 1)
      val httphost: String = line.substring(0, index)
      // Host looks like "subject.example.com": the first dot-separated label is the subject.
      val subject: String = new URL(httphost).getHost.split("[.]")(0)
      ((subject, teacher), 1)
    })
    // Aggregate counts, keyed by the (subject, teacher) tuple.
    val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_ + _)

    // Collect the distinct subjects; the partitioner needs the full list up front.
    val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
    // Custom partitioner: one partition per subject.
    val sbPartition: SubjectPartitioner = new SubjectPartitioner(subjects)
    // Repartition so each subject's records share exactly one partition.
    val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(sbPartition)
    // Sort each partition independently (safe: one partition == one subject).
    // NOTE(review): toList materialises the whole partition in memory — fine for
    // small demo data, would not scale to a huge subject.
    val sorted: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
      // Sort by count descending and keep the top N.
      it.toList.sortBy(-_._2).take(topN).iterator
    })
    val r: Array[((String, String), Int)] = sorted.collect()
    println(r.toBuffer)
    sc.stop()
  }
}
/** Custom partitioner mapping each subject to its own partition.
  *
  * @param sbs the distinct subjects; one partition is created per subject.
  */
class SubjectPartitioner(sbs:Array[String]) extends Partitioner{
  // Built once in the constructor: subject -> partition index (0 until sbs.length).
  val rules = new mutable.HashMap[String,Int]()
  var i =0
  for (sb<-sbs){
    // BUG FIX: the original code did `rules.put(sb, 1)`, mapping EVERY subject to
    // partition 1 — all data ended up in one partition and partition 0 stayed
    // empty. Use the running index so each subject gets a distinct partition.
    rules.put(sb,i)
    i +=1
  }
  // Number of partitions in the resulting RDD: one per subject.
  override def numPartitions: Int = sbs.length
  // Computes the partition index for a key of shape (subject, teacher).
  override def getPartition(key: Any): Int = {
    // Extract the subject (first element of the tuple key).
    val subject: String = key.asInstanceOf[(String,String)]._1
    // Look up the subject's partition; fall back to 0 for an unknown subject
    // instead of throwing NoSuchElementException.
    rules.getOrElse(subject, 0)
  }
}

 

你可能感兴趣的:(Spark)