package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Cartesian product: the cartesian transformation
* Created by asus on 2018/7/15.
*/
object CartesianDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("CartesianDemo") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val classOne = List("lao wang" , "lao zhang" , "lao li" , "lao zhao")
val classTwo = List("xiao wang" , "xiao zhao" , "xiao zhang" , "xiao li")
val classOneRDD = sc.parallelize(classOne , 2)
val classTwoRDD = sc.parallelize(classTwo , 2)
val classOneCartesianTwoRDD = classOneRDD.cartesian(classTwoRDD)
classOneCartesianTwoRDD.foreach{
s => {
println("( " + s._1 + " , " + s._2 + " )")
}
}
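// Illustrative sanity check (not part of the original demo): with 4 names in each class,
// the cartesian product yields 4 * 4 = 16 pairs.
println("pair count = " + classOneCartesianTwoRDD.count())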
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/16.
* transformation operator coalesce(partitionNum : Int , shuffle : Boolean = false)
* The shuffle flag defaults to false, so partitions are merged locally without moving data over the network
*/
object CoalesceDemo {
def main(args : Array[String]):Unit = {
val conf = new SparkConf()
conf.setAppName("CoalesceDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val numbers = Array(1 to 100 : _*)
val numRdd = sc.parallelize(numbers , 10)
// Print the partition index that each element of the RDD belongs to
// In Scala, wrapping an anonymous function in {} allows a multi-line block, while () only takes a single expression
val numRddWithPartitionIndex = numRdd.mapPartitionsWithIndex{
(index , numIter) => {
var numString = List[String]()
while(numIter.hasNext) {
val num = numIter.next
numString = List("number " + num + " with partition index " + index) ::: numString
}
numString.iterator
}
}
numRddWithPartitionIndex.collect().foreach(println)
// Reduce the RDD to 5 partitions without a shuffle
val coalescedNumRdd = numRdd.coalesce(5)
// Print the partition index of each element after coalesce
val coalescedNumRddWithPartitionIndex = coalescedNumRdd.mapPartitionsWithIndex{
(index , numIter) => {
var numString = List[String]()
while(numIter.hasNext) {
val num = numIter.next()
numString = List("number " + num + " with partition index " + index) ::: numString
}
numString.iterator
}
}
coalescedNumRddWithPartitionIndex.collect().foreach(println)
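// Illustrative sketch: passing shuffle = true makes coalesce behave like repartition,
// which also allows increasing the partition count (getNumPartitions assumes Spark 1.6+).
val coalescedNumRddWithShuffle = numRdd.coalesce(5 , shuffle = true)
println("partitions without shuffle = " + coalescedNumRdd.getNumPartitions +
  " , with shuffle = " + coalescedNumRddWithShuffle.getNumPartitions)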
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/7/18.
*/
object CogroupDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("CogroupDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val scores_1 = List(("lao wang" , 10) , ("lao zhang" , 20) , ("lao zhao" , 30) , ("lao li" , 40))
val scores_2 = List(("lao wang" , 10) , ("xiao zhang" , 20) , ("lao zhao" , 30) , ("xiao li" , 40))
val scoreOneRDD = sc.parallelize(scores_1 , 2)
val scoreTwoRDD = sc.parallelize(scores_2 , 2)
val cogroupRDD = scoreOneRDD.cogroup(scoreTwoRDD)
cogroupRDD.foreach(println)
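// Formatting sketch: each cogroup value is a pair of Iterables, one per source RDD,
// so a key missing from one side simply gets an empty Iterable there.
cogroupRDD.map {
  case (name , (s1 , s2)) =>
    name + " -> scores_1 " + s1.mkString("[" , "," , "]") + " , scores_2 " + s2.mkString("[" , "," , "]")
}.foreach(println)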
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* distinct: removes duplicate elements from an RDD; involves a shuffle
* Created by asus on 2018/7/11.
*/
object DistinctDemo {
def main(args :Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("DistinctDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val names = List("lao wang" , "lao li" , "lao zhang" , "lao zhao" , "lao wang" , "lao li" , "lao zhang" , "lao zhao")
val nameRDD = sc.parallelize(names , 4)
nameRDD.mapPartitionsWithIndex{
(index , names) => {
var nameWithIndex = List[String]()
while(names.hasNext) {
val name = names.next()
nameWithIndex = List("name " + name + " with index " + index) ::: nameWithIndex
}
nameWithIndex.iterator
}
}.foreach(println)
val distinctNameRDD = nameRDD.distinct()
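// For reference, a rough sketch of what distinct does internally (map to a pair,
// reduceByKey, map back), which is where the shuffle comes from.
val distinctNameSketchRDD = nameRDD.map(name => (name , null)).reduceByKey((x , _) => x).map(_._1)
println("distinct count = " + distinctNameRDD.count() + " , sketch count = " + distinctNameSketchRDD.count())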
distinctNameRDD.mapPartitionsWithIndex{
(index , names) => {
var nameWithIndex = List[String]()
while(names.hasNext) {
val name = names.next()
nameWithIndex = List("name " + name + " with index " + index) ::: nameWithIndex
}
nameWithIndex.iterator
}
}.foreach(println)
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/16.
* transformation operator filter(f : T => Boolean)
* Keeps only the elements for which f returns true
* filter is often followed by coalesce to merge partitions and reduce their number, easing the data skew that filtering can cause (a small sketch of this pattern appears at the end of main below)
*/
object FilterDemo {
def main(args : Array[String]):Unit = {
val conf = new SparkConf()
conf.setAppName("FilterDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val numbers = Array(1 to 10 : _*)
val numRdd = sc.parallelize(numbers)
val numLarger5 = numRdd.filter(n => n > 5)
def printNumber(n : Int) : Unit = {
println("number : " + n)
}
numRdd.foreach(printNumber)
numLarger5.foreach(printNumber)
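// Sketch of the filter + coalesce pattern mentioned in the header comment: after filtering
// away roughly half of the data, merging partitions keeps them reasonably full
// (the target partition count here is illustrative).
val coalescedNumLarger5 = numLarger5.coalesce(1)
coalescedNumLarger5.foreach(printNumber)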
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/16.
* transformation operator flatMap(f : T => TraversableOnce[U])
* Applies map first and then flattens the resulting collections
*/
object FlatMapDemo {
def main(args : Array[String]):Unit = {
val conf = new SparkConf()
conf.setAppName("FlatMapDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val sentences = Array("today is a nice day" , "i love you" , "who am i")
val sentenceRdd = sc.parallelize(sentences , 2)
sentenceRdd.collect().foreach(println)
val wordRdd = sentenceRdd.flatMap(s => s.split(" "))
wordRdd.collect().foreach(println)
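// Typical follow-up (illustrative): the flattened words feed a word count by pairing each
// word with 1 and summing the counts with reduceByKey.
val wordCountRdd = wordRdd.map(word => (word , 1)).reduceByKey(_ + _)
wordCountRdd.collect().foreach(println)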
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/17.
* transformation operator groupByKey : (K , V) => (K , Iterable[V])
* Causes a shuffle
*/
object GroupByKeyDemo {
def main(args : Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("GroupByKeyDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val userInfo = List(("zhang" , 100) , ("wang" , 90) , ("li" , 80) ,
("zhang" , 101) , ("wang" , 91) , ("li" , 81) ,
("zhang" , 102) , ("wang" , 92) , ("li" , 82))
val userRdd = sc.parallelize(userInfo)
userRdd.foreach(println)
userRdd.reduceByKey(_ + _).collect().foreach(println)
// Group by key
val userRddGroupByKey = userRdd.groupByKey()
userRddGroupByKey.foreach(println)
// Sum the scores within each group
val userTotalScore = userRddGroupByKey.map{
t => {
var sum = 0
for(s <- t._2) {
sum += s
}
(t._1 , sum)
}
}
userTotalScore.collect().foreach(println)
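// Equivalent sketch using mapValues: the grouped values can be summed directly, though
// reduceByKey (shown above) avoids building the full groups in the first place.
val userTotalScoreByMapValues = userRddGroupByKey.mapValues(scores => scores.sum)
userTotalScoreByMapValues.collect().foreach(println)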
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* intersection: computes the intersection of two RDDs
* Created by asus on 2018/7/15.
*/
object IntersectionDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("IntersectionDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val classOne = List("lao wang" , "lao wang" , "lao zhang" , "lao li" , "lao zhao")
val classTwo = List("lao wang" , "lao wang" , "lao zhao" , "lao zhao" , "xiao wang" , "xiao zhao")
val classOneRDD = sc.parallelize(classOne , 2)
val classTwoRDD = sc.parallelize(classTwo , 2)
val classOneAndTwoRDD = classOneRDD.intersection(classTwoRDD)
classOneAndTwoRDD.foreach {
s => {
println(s)
}
}
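// Note: intersection also removes duplicates, so although "lao wang" appears twice in both
// inputs it shows up only once in the result. A quick illustrative check:
println("intersection count = " + classOneAndTwoRDD.count())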
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/7/18.
*/
object JoinDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("CogroupDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val scores_1 = List(("lao wang" , 10) , ("lao zhang" , 20) , ("lao zhao" , 30) , ("lao li" , 40))
val scores_2 = List(("lao wang" , 10) , ("xiao zhang" , 20) , ("lao zhao" , 30) , ("xiao li" , 40))
val scoreOneRDD = sc.parallelize(scores_1 , 2)
val scoreTwoRDD = sc.parallelize(scores_2 , 2)
val joinRDD = scoreOneRDD.join(scoreTwoRDD)
joinRDD.foreach{
t => {
println(t._1 + " -> " + t._2._1 + " , " + t._2._2)
}
}
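// Related sketch: leftOuterJoin keeps every key of the left RDD; values missing on the
// right come back as None.
val leftJoinRDD = scoreOneRDD.leftOuterJoin(scoreTwoRDD)
leftJoinRDD.foreach{
  t => {
    println(t._1 + " -> " + t._2._1 + " , " + t._2._2.getOrElse("no score"))
  }
}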
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/16.
* transformation operator mapPartitions(f : Iterator[T] => Iterator[U])
* Applies f once per partition instead of once per element; the result is the same as map, and it can run faster when the data per partition is small
* Materializing a large partition at once can cause an OOM
*/
object MapPartitionsDemo {
def main(args : Array[String]):Unit = {
println("MapPartitionsDemoScala")
val conf = new SparkConf()
conf.setAppName("MapPartitionsDemoScala")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val names = List[String]("xuruyun" , "liangjingru" , "wangfei")
val scoreMap = Map("xuruyun" -> 150 , "liangjingru" -> 100 , "wangfei" -> 90)
val nameRdd = sc.parallelize(names)
/**
* Custom mapPartitions handler : Iterator[String] => Iterator[Int]
* @param names
* @return
*/
def getUserScore(names: Iterator[String]) : Iterator[Int] = {
var userScore = List[Int]()
while(names.hasNext) {
val name = names.next()
val score = scoreMap(name)
userScore = List(score) ::: userScore
}
userScore.iterator
}
// Custom print helper
def printScore(score : Int) {println("score :" + score)}
val scoreRdd = nameRdd.mapPartitions(getUserScore)
scoreRdd.foreach(printScore)
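// Alternative sketch: mapping the iterator lazily avoids materializing a whole partition
// in a List, which is the usual way to sidestep the OOM risk mentioned in the header comment.
val lazyScoreRdd = nameRdd.mapPartitions(names => names.map(name => scoreMap(name)))
lazyScoreRdd.foreach(printScore)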
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/16.
* transformation operator mapPartitionsWithIndex(f : (Int , Iterator[T]) => Iterator[U])
* Similar to map / mapPartitions, except that f also receives the partition index
*/
object MapPartitionsWithIndexDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("MapPartitionsWithIndexDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val names = Array("xuruyun" , "liangjingru" , "wangfei")
val nameRdd = sc.parallelize(names)
def getNamePartitionIndex(index: Int , names : Iterator[String]) : Iterator[String] = {
var newNames = List[String]()
while(names.hasNext) {
val name = names.next()
newNames = List("Hello " + name + " with index " + index) ::: newNames
}
newNames.iterator
}
val nameRddWithPartitionsIndex = nameRdd.mapPartitionsWithIndex(getNamePartitionIndex)
nameRddWithPartitionsIndex.foreach(println)
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/5/20.
* transformation operator map(f : T => U)
* Applies f to every element of the RDD
*/
object MapRddDemo {
def printNum(num : Int) : Unit = println(num)
def main(args: Array[String]): Unit = {
println(args.mkString(" "))
val conf = new SparkConf().setAppName("map rdd demo").setMaster("local[2]")
System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val numRdd = sc.parallelize(Array(1 , 2 , 3 , 4))
numRdd.map(num => num + 1).foreach(num => printNum(num))
val sum = numRdd.reduce(_ + _)
println("sum = " + sum)
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/6/16.
* transformation operator repartition(num)
* Used to increase parallelism; always shuffles, equivalent to coalesce(num , shuffle = true)
* Recommendation: use coalesce to decrease the number of partitions and repartition to increase it
* Causes a shuffle
*/
object RepartitionDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("RepartitionDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val numbers = Array(1 to 100 : _*)
val numRdd = sc.parallelize(numbers , 10)
val numRddWithPartitionIndex = numRdd.mapPartitionsWithIndex{
(index , numIter) => {
var numString = List[String]()
while(numIter.hasNext) {
val num = numIter.next()
numString = List("number " + num + "with partition index " + index) ::: numString
}
numString.iterator
}
}
numRddWithPartitionIndex.collect().foreach(println)
val numRddRepartition = numRdd.repartition(5)
val numRddRepartitionWithPartitionIndex = numRddRepartition.mapPartitionsWithIndex{
(index , numIter) => {
var numString = List[String]()
while(numIter.hasNext) {
val num = numIter.next()
numString = List("number " + num + "with partition index " + index) ::: numString
}
numString.iterator
}
}
numRddRepartitionWithPartitionIndex.collect().foreach(println)
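// Quick illustrative check of the partition counts before and after the repartition
// (getNumPartitions assumes Spark 1.6+).
println("partitions before repartition = " + numRdd.getNumPartitions +
  " , after = " + numRddRepartition.getNumPartitions)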
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
* sample(withReplacement : Boolean , fraction : Double , seed : Long) sampling operator
* Created by asus on 2018/7/8.
*/
object SampleDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("SampleDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val names = List("lao wang" , "xiao wang" , "lao zhang" , "xiao zhang" , "lao li" , "xiao li")
val namesRDD = sc.parallelize(names , 3)
// Sample 50% without replacement
println(">>>>>>>>>>>>>>>>>> sample 50% without replacement <<<<<<<<<<<<<<<<<")
namesRDD.sample(false , 0.5).foreach(println)
// Sample 50% with replacement
println(">>>>>>>>>>>>>>>>>> sample 50% with replacement <<<<<<<<<<<<<<<<<")
namesRDD.sample(true , 0.5).foreach(println)
// Sample 50% without replacement with a fixed seed, so the sample is reproducible
println(">>>>>>>>>>>>>>>>>> sample 50% without replacement, fixed seed <<<<<<<<<<<<<<<<<")
namesRDD.sample(false , 0.5 , 100).foreach(println)
// Sample 50% with replacement with a fixed seed, so the sample is reproducible
println(">>>>>>>>>>>>>>>>>> sample 50% with replacement, fixed seed <<<<<<<<<<<<<<<<<")
namesRDD.sample(true , 0.5 , 100).foreach(println)
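// Related sketch: sample returns a fraction, so the result size varies between runs. When an
// exact number of elements is needed, the takeSample action can be used instead.
println(">>>>>>>>>>>>>>>>>> takeSample: exactly 3 elements, without replacement <<<<<<<<<<<<<<<<<")
namesRDD.takeSample(false , 3 , 100).foreach(println)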
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by asus on 2018/7/11.
*/
object SortByKeyDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("SortByKeyDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" ,"E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val scores = List[(String , Integer)](("lao wang" , 10) , ("lao zhang" , 20) , ("lao li" , 30) , ("lao zhao" , 40))
val scoreRDD = sc.parallelize(scores , 2)
scoreRDD.foreach{
s => {
println("name -> " + s._1 + " , score -> " + s._2)
}
}
// Sort by key, ascending
println(">>>>>>>>>>>>>>>>>>>>>>>>>>> asc (ascending by key) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
val scoreSortByKeyAscRDD = scoreRDD.sortByKey(true)
scoreSortByKeyAscRDD.foreach{
s => {
println("name -> " + s._1 + " , score -> " + s._2)
}
}
// Sort by key, descending
println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc (descending by key) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
val scoreSortByKeyDescRDD = scoreRDD.sortByKey(false)
scoreSortByKeyDescRDD.foreach{
s => {
println("name -> " + s._1 + " , score -> " + s._2)
}
}
// Sort by score, ascending
println(">>>>>>>>>>>>>>>>>>>>>>>>>>> asc (by score, ascending) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
scoreRDD.map(s => (s._2 , s._1)).sortByKey(true).map(s => (s._2 , s._1)).foreach{
s => {
println("name -> " + s._1 + " , score -> " + s._2)
}
}
// Sort by score, descending
println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc (by score, descending) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
scoreRDD.map(s => (s._2 , s._1)).sortByKey(false).map(s => (s._2 , s._1)).foreach{
s => {
println("name -> " + s._1 + " , score -> " + s._2)
}
}
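// Simpler sketch for the same result: sortBy sorts by the value directly, without the
// map / sortByKey / map round trip.
println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc via sortBy <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
scoreRDD.sortBy(s => s._2 , ascending = false).foreach{
  s => {
    println("name -> " + s._1 + " , score -> " + s._2)
  }
}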
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* rdd_1.union(rdd_2)
* Simply concatenates the partitions of the two RDDs
* If rdd_1 and rdd_2 each have two partitions, the result has four partitions; union does not shuffle
* Created by asus on 2018/7/8.
*/
object UnionDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("UnionDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val oldMan = List("lao wang" , "lao zhang" , "lao li" , "lao zhao")
val youngMan = List("xiao wang" , "xiao zhang" , "xiao li" , "xiao zhao")
/**
* name : lao li with index 1
* name : lao zhao with index 1
* name : lao wang with index 0
* name : lao zhang with index 0
*/
// oldManRDD has two partitions
val oldManRDD = sc.parallelize(oldMan , 2)
val oldManWithIndex = oldManRDD.mapPartitionsWithIndex{
(index , names) => {
var nameWithIndex = List[String]()
while(names.hasNext) {
nameWithIndex = List("name : " + names.next() + " with index " + index) ::: nameWithIndex
}
nameWithIndex.iterator
}
}
oldManWithIndex.foreach(println)
/**
* name : xiao li with index 1
* name : xiao zhao with index 1
* name : xiao wang with index 0
* name : xiao zhang with index 0
*/
// youngManRDD has two partitions
val youngManRDD = sc.parallelize(youngMan , 2)
val youngManRDDWithIndex = youngManRDD.mapPartitionsWithIndex{
(index , names) => {
var nameWithIndex = List[String]()
while(names.hasNext) {
nameWithIndex = List("name : " + names.next() + " with index " + index) ::: nameWithIndex
}
nameWithIndex.iterator
}
}
youngManRDDWithIndex.foreach(println)
/**
* name : lao li with index 1
* name : lao zhao with index 1
* name : xiao wang with index 2
* name : xiao zhang with index 2
* name : xiao li with index 3
* name : xiao zhao with index 3
* name : lao wang with index 0
* name : lao zhang with index 0
*/
// Union oldManRDD and youngManRDD; the result has 4 partitions, each keeping the same contents as before the union
val unionOldAndYoungRDD = oldManRDD.union(youngManRDD)
val unionOldAndYoungRDDWithIndex = unionOldAndYoungRDD.mapPartitionsWithIndex{
(index , names) => {
var nameWithIndex = List[String]()
while(names.hasNext) {
nameWithIndex = List("name : " + names.next() + " with index " + index) ::: nameWithIndex
}
nameWithIndex.iterator
}
}
unionOldAndYoungRDDWithIndex.foreach(println)
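// Note: union keeps duplicates; for set-union semantics the usual sketch is a distinct()
// afterwards (which shuffles, unlike union itself).
val unionWithoutDuplicatesRDD = unionOldAndYoungRDD.distinct()
println("union count = " + unionOldAndYoungRDD.count() + " , distinct count = " + unionWithoutDuplicatesRDD.count())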
sc.stop()
}
}
package rddDemo.transformation
import org.apache.spark.{SparkConf, SparkContext}
/**
* saveAsTextFile writes the RDD out as text files (strictly an action, not a transformation)
* Created by asus on 2018/7/15.
*/
object SaveAsTextFileDemo {
def main(args : Array[String]) : Unit = {
val conf = new SparkConf()
conf.setAppName("SaveAsTextFileDemo")
conf.setMaster("local[2]")
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
val sc = new SparkContext(conf)
val numbers = Array(1 to 10 : _*)
val numberRDD = sc.parallelize(numbers , 2)
// Save to the local file system (the output directory must not already exist)
numberRDD.saveAsTextFile("src/main/scala/rddDemo/saveAsTextFilePath")
// Save to HDFS (the output directory must not already exist)
// numberRDD.saveAsTextFile("hdfs://ip:9000/saveAsTextFilePath")
sc.stop()
}
}