Custom Partitioner in Spark
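
By default Spark hash-partitions the records of a pair RDD. To control which partition each key lands in, extend the abstract class org.apache.spark.Partitioner and implement its two abstract methods, numPartitions and getPartition(key: Any): Int, as the class below does.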

import org.apache.spark.Partitioner

import scala.collection.mutable

/**
  * @author Jacky
  *         Custom partitioner: Scala_HostNamePartitioner extends the abstract class Partitioner
  */
class Scala_HostNamePartitioner(hostnameArray: Array[String]) extends Partitioner {
  // Map each hostname to a partition index (0 .. hostnameArray.length - 1)
  val map = mutable.Map[String, Int]()
  //  equivalently: val map = mutable.HashMap[String, Int]()

  for (i <- hostnameArray.indices) {
    map.put(hostnameArray(i), i)
  }
  // Return the number of partitions
  override def numPartitions: Int = {
    map.size
  }

  // Return the partition for the given key; unknown hostnames fall back to partition 0
  override def getPartition(key: Any): Int = {
    map.getOrElse(key.toString, 0)
  }
}
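
A minimal sanity check of the partitioner, runnable without a SparkContext (the hostnames below are made-up examples, not taken from the original log):

val partitioner = new Scala_HostNamePartitioner(Array("www.java.com", "www.php.com"))
println(partitioner.numPartitions)                   // 2
println(partitioner.getPartition("www.php.com"))     // 1
println(partitioner.getPartition("www.unknown.com")) // 0, the fallback for unseen hostnames

Because getPartition falls back to 0, records with an unexpected hostname are mixed into the first partition instead of failing the job.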
======================================================================

import java.net.URL

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * @author Jacky
  *         Requirement: save the URL click counts of each college in the same file (partition)
  */
object Scala_UserDefinedPartitioner {
  def main(args: Array[String]): Unit = {
    // Set the logger level
    Logger.getLogger("org").setLevel(Level.WARN)
    // Create the SparkConf object
    val conf = new SparkConf().setAppName("Scala_UserDefinedPartitioner").setMaster("local")
    // Create the SparkContext object
    val sc = new SparkContext(conf)

    // Read the raw log; each line is tab-separated with the URL in field 1
    val logsRDD = sc.textFile("C:\\360.log")
    // (url, 1)
    val urlRDD: RDD[(String, Int)] = logsRDD.map(line => {
      val log = line.split("\t")
      val url = log(1)
      (url, 1)
    })
    // (url, count): total number of accesses per URL
    val urlCountRDD = urlRDD.reduceByKey(_ + _)
    // (hostname, (url, count))
    val resultRDD: RDD[(String, (String, Int))] = urlCountRDD.map(t2 => {
      // Get the URL
      val url = t2._1
      // Extract the hostname from the URL
      val hostname = new URL(url).getHost
      (hostname, t2)
    })
    // Collect the distinct list of hostnames to the driver
    val hostnameList: Array[String] = resultRDD.map(x => x._1).distinct().collect()
    // Repartition with the custom partitioner via partitionBy and write the result to the C: drive
    resultRDD.partitionBy(new Scala_HostNamePartitioner(hostnameList))
      .saveAsTextFile("C:\\out" + System.currentTimeMillis())

    sc.stop()
  }
}
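
To verify that partitionBy really placed each hostname in its own partition, one option is to inspect the partitioned RDD with mapPartitionsWithIndex before saving. A minimal sketch, assuming the resultRDD and hostnameList built in main above:

// Sketch: print which (hostname, (url, count)) records each partition holds
val partitioned = resultRDD.partitionBy(new Scala_HostNamePartitioner(hostnameList))
partitioned
  .mapPartitionsWithIndex((idx, iter) => iter.map(record => s"partition $idx -> $record"))
  .collect()
  .foreach(println)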
