package rddDemo.action

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/17.
  * collect action operator (not a transformation)
  * Fetches the data from every worker node to the driver for centralized processing (use with caution).
  */
object CollectDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("CollectDemo")
    conf.setMaster("local[2]")
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val numbers = Array(1 to 10: _*)
    val numRdd = sc.parallelize(numbers)
    /**
      * collect fetches the RDD data from all worker nodes to the driver; when the RDD is large,
      * this can easily cause an out-of-memory (OOM) error on the driver.
      * In cluster mode, when the driver and the client are not on the same host, printing with
      * foreach alone (without collect) shows nothing on the driver, because the output is
      * produced on the worker nodes.
      */
    numRdd.map(x => x * 2).collect().foreach(println)

    sc.stop()
  }
}
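As a side note, a minimal sketch (the object name CollectAlternativesSketch below is illustrative, not part of the original code): when the RDD is too large to collect safely, take(n) or toLocalIterator keep the driver's memory footprint small. take(n) fetches only the first n elements, and toLocalIterator streams one partition at a time.

import org.apache.spark.{SparkConf, SparkContext}

object CollectAlternativesSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollectAlternativesSketch").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val bigRdd = sc.parallelize(1 to 1000000, 8)

    // Only the first 5 elements are brought to the driver.
    bigRdd.take(5).foreach(println)

    // Partitions are fetched one at a time, so the driver holds at most one partition in memory.
    bigRdd.toLocalIterator.take(5).foreach(println)

    sc.stop()
  }
}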
package rddDemo.action

import org.apache.spark.{SparkConf, SparkContext}

/**
  * reduceByKey operator
  * (note: reduceByKey is a transformation, not an action; the action here is foreach)
  * Created by asus on 2018/7/7.
  */
object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("ReduceByKeyDemo")
    conf.setMaster("local[2]")
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val scores = List(("lao wang", 10), ("xiao wang", 50), ("lao zhang", 50), ("xiao zhang", 100),
      ("lao wang", 10), ("xiao wang", 50), ("lao zhang", 50), ("xiao zhang", 100))
    val scoresRDD = sc.parallelize(scores, 3)
    scoresRDD.foreach(println)

    // Sum the scores of each key.
    val totalScoresRDD = scoresRDD.reduceByKey(_ + _)
    totalScoresRDD.foreach(println)

    sc.stop()
  }
}
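A minimal companion sketch (the object ReduceVsGroupSketch and its inline data are assumptions for illustration): reduceByKey pre-aggregates values on each partition before the shuffle, while groupByKey().mapValues(_.sum) shuffles every pair first, which is why reduceByKey is usually preferred for sums like the one above.

import org.apache.spark.{SparkConf, SparkContext}

object ReduceVsGroupSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ReduceVsGroupSketch").setMaster("local[2]"))
    val scores = sc.parallelize(List(("lao wang", 10), ("xiao wang", 50), ("lao wang", 10)), 3)

    // Pre-aggregates per partition, then shuffles only the partial sums.
    scores.reduceByKey(_ + _).foreach(println)

    // Shuffles every (key, value) pair, then sums on the reduce side.
    scores.groupByKey().mapValues(_.sum).foreach(println)

    sc.stop()
  }
}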
package rddDemo.action

import org.apache.spark.{SparkConf, SparkContext}

/**
  * take(n) returns the first n elements of the RDD.
  * Created by asus on 2018/7/8.
  */
object TakeDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("TakeDemo")
    conf.setMaster("local[2]")
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val numbers = List(0 to 10: _*)
    val numbersRDD = sc.parallelize(numbers, 3)

    // Fetch the first 3 elements to the driver.
    val take3Num = numbersRDD.take(3)
    for (n <- take3Num) {
      println(n)
    }

    sc.stop()
  }
}
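For comparison, a small sketch (TakeVariantsSketch is a hypothetical name, not from the original code) of the related actions first(), top(n) and takeOrdered(n): they return the first element, the n largest elements, and the n smallest elements (by the implicit ordering) respectively.

import org.apache.spark.{SparkConf, SparkContext}

object TakeVariantsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TakeVariantsSketch").setMaster("local[2]"))
    val numbersRDD = sc.parallelize(0 to 10, 3)

    println(numbersRDD.first())                // 0
    numbersRDD.top(3).foreach(println)         // 10, 9, 8
    numbersRDD.takeOrdered(3).foreach(println) // 0, 1, 2

    sc.stop()
  }
}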
package rddDemo.action

import org.apache.spark.{SparkConf, SparkContext}

/**
  * takeSample(withReplacement, num, [seed]) returns a random sample of num elements.
  * Created by asus on 2018/7/8.
  */
object TakeSampleDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("TakeSampleDemo")
    conf.setMaster("local[2]")
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val names = List("lao wang", "xiao wang", "lao zhang", "xiao zhang", "lao li", "xiao li")
    val namesRDD = sc.parallelize(names, 3)

    // Sample 3 elements without replacement
    println(">>>>>>>>>>>>>>>>>> sample 3 elements without replacement <<<<<<<<<<<<<<<<<<")
    val takeSample_1 = namesRDD.takeSample(false, 3)
    for (name <- takeSample_1) {
      println(name)
    }

    // Sample 3 elements with replacement
    println(">>>>>>>>>>>>>>>>>> sample 3 elements with replacement <<<<<<<<<<<<<<<<<<")
    val takeSample_2 = namesRDD.takeSample(true, 3)
    for (name <- takeSample_2) {
      println(name)
    }

    // Sample 3 elements without replacement, with a fixed seed
    println(">>>>>>>>>>>>>>>>>> sample 3 elements without replacement, fixed seed <<<<<<<<<<<<<<<<<<")
    val takeSample_3 = namesRDD.takeSample(false, 3, 100)
    for (name <- takeSample_3) {
      println(name)
    }

    // Sample 3 elements with replacement, with a fixed seed
    println(">>>>>>>>>>>>>>>>>> sample 3 elements with replacement, fixed seed <<<<<<<<<<<<<<<<<<")
    val takeSample_4 = namesRDD.takeSample(true, 3, 100)
    for (name <- takeSample_4) {
      println(name)
    }

    sc.stop()
  }
}
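A short sketch contrasting takeSample with the sample transformation (the object SampleVsTakeSampleSketch is illustrative): takeSample is an action that brings exactly num elements back to the driver as an Array, while sample keeps roughly a given fraction of the elements and returns a new, still-distributed RDD.

import org.apache.spark.{SparkConf, SparkContext}

object SampleVsTakeSampleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SampleVsTakeSampleSketch").setMaster("local[2]"))
    val names = sc.parallelize(List("lao wang", "xiao wang", "lao zhang", "xiao zhang", "lao li", "xiao li"), 3)

    // Action: exactly 3 elements come back to the driver as an Array.
    names.takeSample(withReplacement = false, num = 3, seed = 100).foreach(println)

    // Transformation: roughly half the elements remain, still distributed as an RDD.
    val sampledRDD = names.sample(withReplacement = false, fraction = 0.5, seed = 100)
    sampledRDD.foreach(println)

    sc.stop()
  }
}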
package rddDemo.action

import org.apache.spark.{SparkConf, SparkContext}

/**
  * countByKey counts how many times each key occurs in the RDD.
  * Created by asus on 2018/7/15.
  */
object CountByKeyDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("CountByKeyDemo")
    conf.setMaster("local[2]")
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val scores = List(("lao wang", 10), ("lao wang", 20), ("lao wang", 30),
      ("lao zhang", 40), ("xiao zhang", 50), ("xiao zhang", 60),
      ("xiao wang", 70))
    val scoresRDD = sc.parallelize(scores, 2)

    // countByKey returns a Map[key, count] on the driver.
    val keyCount = scoresRDD.countByKey()
    for ((key, count) <- keyCount) {
      println("key -> " + key + " , count -> " + count)
    }

    sc.stop()
  }
}
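As a closing sketch (CountVariantsSketch and its reduced data set are assumptions for illustration), countByKey can be reproduced by mapping every value to 1 and reducing by key, and countByValue counts whole (key, value) pairs instead of keys.

import org.apache.spark.{SparkConf, SparkContext}

object CountVariantsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("CountVariantsSketch").setMaster("local[2]"))
    val scoresRDD = sc.parallelize(List(("lao wang", 10), ("lao wang", 20), ("xiao zhang", 50)), 2)

    // Same result as countByKey(), but stays distributed until collectAsMap().
    val manualCount = scoresRDD.mapValues(_ => 1L).reduceByKey(_ + _).collectAsMap()
    manualCount.foreach { case (k, v) => println("key -> " + k + " , count -> " + v) }

    // countByValue() counts distinct (key, score) pairs, not keys.
    scoresRDD.countByValue().foreach(println)

    sc.stop()
  }
}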