spark: sample 数据采样

package com.ws.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

/**
  * Demonstrates the two RDD sampling APIs on a small key/value data set whose
  * keys are deliberately unevenly distributed: `takeSample` (fixed-size sample
  * returned to the driver as an array) and `sample` (fraction-based sample that
  * stays an RDD).
  */
object SampleTest {

  /**
    * Builds a local Spark context, runs both sampling variants over the demo
    * data and prints the results to standard output.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("WordCount").setMaster("local[*]")

    val sparkContext = new SparkContext(conf)

    // Stop the context in `finally` so it is released even when one of the
    // actions below throws.
    try {
      val data = Array(
        ("hello", 1), ("good", 1), ("good", 1), ("good", 1), ("beautiful", 1),
        ("nice", 1), ("beautiful", 1), ("good", 1), ("good", 1), ("beautiful", 1),
        ("good", 1), ("beautiful", 1), ("good", 1), ("good", 1), ("good", 1)
      )

      val pairs: RDD[(String, Int)] = sparkContext.makeRDD(data)
      // Cached because two separate actions (takeSample, countByKey) read it.
      pairs.cache()

      // Sampling is used here to inspect the key distribution (data skew).
      // takeSample returns a local array, not an RDD, holding the requested
      // number of elements (5 here).
      //   withReplacement: whether a picked element may be picked again
      //   num:             number of elements to take
      val fixedSizeSample: Array[(String, Int)] =
        pairs.takeSample(withReplacement = false, num = 5)
      println(fixedSizeSample.toBuffer)

      // sample returns an RDD.
      //   withReplacement: whether a picked element may be picked again
      //   fraction:        proportion to draw (the resulting size is not exact)
      val fractionSample: RDD[(String, Int)] =
        pairs.sample(withReplacement = false, fraction = 0.2)
      // Count how often each key appears in the sample.
      val sampledKeyCounts: collection.Map[String, Long] = fractionSample.countByKey()

      val keyCountBuffer: mutable.Buffer[(String, Long)] = sampledKeyCounts.toBuffer
      println(keyCountBuffer)
    } finally {
      sparkContext.stop()
    }
  }
}

你可能感兴趣的:(spark)