Spark常用transformation算子操作 —— Scala版

  • cartesian算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of the `cartesian` operator (Cartesian product of two RDDs).
  * Builds two small RDDs of names and prints every (first, second) pair.
  * Created by asus on 2018/7/15.
  */
object CartesianDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("CartesianDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // Two name lists, each distributed over 2 partitions.
    val firstClassRDD = sc.parallelize(List("lao wang" , "lao zhang" , "lao li" , "lao zhao") , 2)
    val secondClassRDD = sc.parallelize(List("xiao wang" , "xiao zhao" , "xiao zhang" , "xiao li") , 2)

    // Pair every element of the first RDD with every element of the second and print each pair.
    firstClassRDD.cartesian(secondClassRDD).foreach {
      case (first , second) => println(s"( $first , $second )")
    }

    sc.stop()
  }
}
  • coalesce算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/16.
  * Demo of the transformation `coalesce(partitionNum : Int , shuffle : Boolean = false)`.
  * With the default `shuffle = false`, partitions are merged without a shuffle
  * (per the original author's note: merged locally, without network transfer).
  */
object CoalesceDemo {
  def main(args : Array[String]):Unit = {
    val conf = new SparkConf()
      .setAppName("CoalesceDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    // Labels every number with the index of the partition that holds it.
    // Labels are prepended to the accumulator, so each partition's labels
    // come out in reverse iteration order.
    def labelWithPartition(index : Int , nums : Iterator[Int]) : Iterator[String] =
      nums.foldLeft(List.empty[String]) { (labels , num) =>
        s"number $num with partition index $index" :: labels
      }.iterator

    // The numbers 1..100 spread over 10 partitions.
    val numRdd = sc.parallelize((1 to 100).toArray , 10)

    // Show which partition each number belongs to before coalescing.
    numRdd.mapPartitionsWithIndex(labelWithPartition).collect().foreach(println)

    // Ask Spark to reduce the RDD to 5 partitions, without a shuffle.
    val coalescedNumRdd = numRdd.coalesce(5)

    // Show which partition each number belongs to after coalescing.
    coalescedNumRdd.mapPartitionsWithIndex(labelWithPartition).collect().foreach(println)

    sc.stop()
  }
}
  • cogroup算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of the `cogroup` operator on two key/value RDDs of (name, score) pairs.
  * Prints each element of the co-grouped RDD.
  * Created by asus on 2018/7/18.
  */
object CogroupDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("CogroupDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // The two score sheets share only the keys "lao wang" and "lao zhao".
    val firstScores = List(("lao wang" , 10) , ("lao zhang" , 20) , ("lao zhao" , 30) , ("lao li" , 40))
    val secondScores = List(("lao wang" , 10) , ("xiao zhang" , 20) , ("lao zhao" , 30) , ("xiao li" , 40))

    val firstScoreRDD = sc.parallelize(firstScores , 2)
    val secondScoreRDD = sc.parallelize(secondScores , 2)

    // Co-group both RDDs by key and print every resulting element.
    firstScoreRDD.cogroup(secondScoreRDD).foreach(println)

    sc.stop()
  }
}
  • distinct算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of the `distinct` operator, which removes duplicate elements from an RDD
  * (the original author notes that it involves a shuffle).
  * Prints the partition index of every name before and after de-duplication.
  * Created by asus on 2018/7/11.
  */
object DistinctDemo {
  def main(args :Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("DistinctDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // Tags each name with the index of its partition. Entries are prepended,
    // so each partition's output is in reverse iteration order.
    def tagWithIndex(index : Int , partitionNames : Iterator[String]) : Iterator[String] =
      partitionNames.foldLeft(List.empty[String]) { (tagged , name) =>
        s"name $name with index $index" :: tagged
      }.iterator

    // Every name appears twice.
    val names = List("lao wang" , "lao li" , "lao zhang" , "lao zhao" , "lao wang" , "lao li" , "lao zhang" , "lao zhao")
    val nameRDD = sc.parallelize(names , 4)

    // Partition layout before de-duplication.
    nameRDD.mapPartitionsWithIndex(tagWithIndex).foreach(println)

    // Partition layout after de-duplication.
    val distinctNameRDD = nameRDD.distinct()
    distinctNameRDD.mapPartitionsWithIndex(tagWithIndex).foreach(println)

    sc.stop()
  }
}
  • filter算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/16.
  * Demo of the transformation `filter(f : T => Boolean)`:
  * only elements for which `f` returns true are kept.
  * Author's note: `filter` is often followed by `coalesce` to merge partitions,
  * reducing the partition count and easing the data skew that filtering can cause.
  */
object FilterDemo {
  def main(args : Array[String]):Unit = {
    val conf = new SparkConf()
      .setAppName("FilterDemo")
      .setMaster("local[2]")
    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // Prints a single number with a fixed prefix.
    def printNumber(n : Int) : Unit = println(s"number : $n")

    val numRdd = sc.parallelize((1 to 10).toArray)

    // Keep only the numbers strictly greater than 5.
    val numLarger5 = numRdd.filter(_ > 5)

    // Print the unfiltered numbers first, then the filtered ones.
    numRdd.foreach(printNumber)
    numLarger5.foreach(printNumber)

    sc.stop()
  }
}
  • flatMap算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/16.
  * Demo of the transformation `flatMap`: the function maps each element to a
  * collection of results, and the collections are flattened into one RDD.
  * Here each sentence is split into its words.
  */
object FlatMapDemo {
  def main(args : Array[String]):Unit = {
    val conf = new SparkConf()
      .setAppName("FlatMapDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val sentenceRdd = sc.parallelize(Array("today is a nice day" , "i love you" , "who am i") , 2)
    // Print the sentences as they are.
    sentenceRdd.collect().foreach(println)

    // Split every sentence on single spaces and print the resulting words.
    val wordRdd = sentenceRdd.flatMap(_.split(" "))
    wordRdd.collect().foreach(println)

    sc.stop()
  }
}
  • groupByKey算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/17.
  * Demo of the transformation `groupByKey`: (k , v) => (k , Iterable(v1 , v2 , ...)).
  * The original author notes that it causes a shuffle.
  */
object GroupByKeyDemo {

  def main(args : Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("GroupByKeyDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    // Three scores for each of the users "zhang", "wang" and "li".
    val userInfo = List(
      ("zhang" , 100) , ("wang" , 90) , ("li" , 80) ,
      ("zhang" , 101) , ("wang" , 91) , ("li" , 81) ,
      ("zhang" , 102) , ("wang" , 92) , ("li" , 82))
    val userRdd = sc.parallelize(userInfo)
    userRdd.foreach(println)

    // Per-user totals computed directly with reduceByKey, for comparison.
    userRdd.reduceByKey(_ + _).collect().foreach(println)

    // Group the scores by user name.
    val userRddGroupByKey = userRdd.groupByKey()
    userRddGroupByKey.foreach(println)

    // Sum the grouped scores of every user.
    val userTotalScore = userRddGroupByKey.map {
      case (user , scores) => (user , scores.sum)
    }
    userTotalScore.collect().foreach(println)

    sc.stop()
  }

}
  • intersection算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of the `intersection` operator, which computes the elements two RDDs have in common.
  * Both inputs contain duplicates; the demo prints every element of the resulting RDD.
  * Created by asus on 2018/7/15.
  */
object IntersectionDemo {
  def main(args : Array[String]) : Unit = {
    // The app name previously read "IntersectionDemoJava", a leftover from the
    // Java version of this demo; it now matches this Scala object.
    val conf = new SparkConf()
      .setAppName("IntersectionDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    val classOne = List("lao wang" , "lao wang" , "lao zhang" , "lao li" , "lao zhao")
    val classTwo = List("lao wang" , "lao wang" , "lao zhao" , "lao zhao" , "xiao wang" , "xiao zhao")

    val classOneRDD = sc.parallelize(classOne , 2)
    val classTwoRDD = sc.parallelize(classTwo , 2)

    // Names present in both classes.
    val classOneAndTwoRDD = classOneRDD.intersection(classTwoRDD)
    classOneAndTwoRDD.foreach(name => println(name))

    sc.stop()
  }

}
  • join算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of the `join` operator on two key/value RDDs of (name, score) pairs.
  * Prints, for each joined element, the key and the value from each side.
  * Created by asus on 2018/7/18.
  */
object JoinDemo {
  def main(args : Array[String]) : Unit = {
    // The app name previously read "CogroupDemo" (copied from the cogroup demo),
    // which mislabelled this job in the Spark UI and logs.
    val conf = new SparkConf()
      .setAppName("JoinDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // The two score sheets share only the keys "lao wang" and "lao zhao".
    val firstScores = List(("lao wang" , 10) , ("lao zhang" , 20) , ("lao zhao" , 30) , ("lao li" , 40))
    val secondScores = List(("lao wang" , 10) , ("xiao zhang" , 20) , ("lao zhao" , 30) , ("xiao li" , 40))

    val scoreOneRDD = sc.parallelize(firstScores , 2)
    val scoreTwoRDD = sc.parallelize(secondScores , 2)

    val joinRDD = scoreOneRDD.join(scoreTwoRDD)

    // Each joined element has the shape (name , (scoreFromFirst , scoreFromSecond)).
    joinRDD.foreach {
      case (name , (firstScore , secondScore)) =>
        println(s"$name -> $firstScore , $secondScore")
    }

    sc.stop()
  }

}
  • mapPartitions算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/16.
  * Demo of the transformation `mapPartitions(f : Iterator[T] => Iterator[U])`.
  * `f` is applied once per partition rather than once per element.
  * Author's note: this can be faster than `map` when the RDD is small, but may
  * cause an out-of-memory error on large data. The function used in this demo
  * does collect a whole partition's results into a List before returning them.
  */
object MapPartitionsDemo {
  def main(args : Array[String]):Unit = {
    println("MapPartitionsDemoScala")

    val conf = new SparkConf()
      .setAppName("MapPartitionsDemoScala")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    val names = List[String]("xuruyun" , "liangjingru" , "wangfei")

    // Lookup table from user name to score; it covers every name above.
    val scoreMap = Map("xuruyun" -> 150 , "liangjingru" -> 100 , "wangfei" -> 90)

    val nameRdd = sc.parallelize(names)

    /**
      * Per-partition function: looks up the score of every name in the partition.
      * Scores are prepended, so they come out in reverse iteration order.
      * A name missing from `scoreMap` makes the lookup throw.
      * @param names the names held by one partition
      * @return the scores of those names
      */
    def getUserScore(names: Iterator[String]) : Iterator[Int] =
      names.foldLeft(List.empty[Int]) { (scores , name) =>
        scoreMap(name) :: scores
      }.iterator

    // Prints a single score with a fixed prefix.
    def printScore(score : Int) : Unit = println(s"score :$score")

    val scoreRdd = nameRdd.mapPartitions(getUserScore)
    scoreRdd.foreach(printScore)
    sc.stop()
  }

}
  • mapPartitionsWithIndex算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/16.
  * Demo of the transformation `mapPartitionsWithIndex(f : (Int , Iterator[T]) => Iterator[U])`.
  * It resembles `map` / `mapPartitions`, except that the function also receives
  * the index of the partition it is processing.
  */
object MapPartitionsWithIndexDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("MapPartitionsWithIndexDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    val nameRdd = sc.parallelize(Array("xuruyun" , "liangjingru" , "wangfei"))

    // Builds a greeting for each name that includes its partition index.
    // Greetings are prepended, so they come out in reverse iteration order.
    def getNamePartitionIndex(index: Int , names : Iterator[String]) : Iterator[String] =
      names.foldLeft(List.empty[String]) { (greetings , name) =>
        s"Hello $name with index $index" :: greetings
      }.iterator

    val nameRddWithPartitionsIndex = nameRdd.mapPartitionsWithIndex(getNamePartitionIndex)
    nameRddWithPartitionsIndex.foreach(println)
    sc.stop()
  }

}
  • map算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/5/20.
  * Demo of the transformation `map(f : T => U)`, which applies `f` to every
  * element of the RDD. Also sums the original numbers with `reduce`.
  */
object MapRddDemo {
  /** Prints a single number on its own line. */
  def printNum(num : Int) : Unit = println(num)

  def main(args: Array[String]): Unit = {
    // Print the command-line arguments themselves. `println(args)` would print
    // only the array's identity string (e.g. "[Ljava.lang.String;@1b2c3d").
    println(args.mkString("args: [" , ", " , "]"))

    val conf = new SparkConf().setAppName("map rdd demo").setMaster("local[2]")
    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    val numRdd = sc.parallelize(Array(1 , 2 , 3 , 4))

    // Add one to every element and print the results.
    numRdd.map(_ + 1).foreach(printNum)

    // Sum the original (un-incremented) numbers.
    val sum = numRdd.reduce(_ + _)
    println(s"sum = $sum")
    sc.stop()
  }
}
  • repartition算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by asus on 2018/6/16.
  * Demo of the transformation `repartition(num)`.
  * Author's notes: it is used to raise parallelism, always shuffles, and is
  * equivalent to `coalesce(num , true)`. Recommendation: use `coalesce` to reduce
  * the number of partitions and `repartition` to increase it.
  */
object RepartitionDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("RepartitionDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)

    // Labels every number with the index of the partition that holds it.
    // The original message lacked the space before "with" and printed
    // e.g. "number 5with partition index 0"; that is fixed here.
    // Labels are prepended, so each partition's labels come out in reverse order.
    def labelWithPartition(index : Int , nums : Iterator[Int]) : Iterator[String] =
      nums.foldLeft(List.empty[String]) { (labels , num) =>
        s"number $num with partition index $index" :: labels
      }.iterator

    // The numbers 1..100 spread over 10 partitions.
    val numbers = Array(1 to 100 : _*)
    val numRdd = sc.parallelize(numbers , 10)

    // Partition layout before repartitioning.
    val numRddWithPartitionIndex = numRdd.mapPartitionsWithIndex(labelWithPartition)
    numRddWithPartitionIndex.collect().foreach(println)

    // Redistribute the data over 5 partitions.
    val numRddRepartition = numRdd.repartition(5)

    // Partition layout after repartitioning.
    val numRddRepartitionWithPartitionIndex = numRddRepartition.mapPartitionsWithIndex(labelWithPartition)
    numRddRepartitionWithPartitionIndex.collect().foreach(println)

    sc.stop()
  }
}
  • sample算子

package rddDemo.transformation

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
  * Demo of the sampling operator `sample(withReplacement , fraction , seed)`.
  * Draws four samples of the same RDD: with and without replacement, each
  * once with an unspecified seed and once with the fixed seed 100.
  * Created by asus on 2018/7/8.
  */
object SampleDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("SampleDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    val namesRDD = sc.parallelize(
      List("lao wang" , "xiao wang" , "lao zhang" , "xiao zhang" , "lao li" , "xiao li") , 3)

    // 50% sample without replacement.
    println(">>>>>>>>>>>>>>>>>> 无放回取样50% <<<<<<<<<<<<<<<<<")
    namesRDD.sample(withReplacement = false , fraction = 0.5).foreach(println)

    // 50% sample with replacement.
    println(">>>>>>>>>>>>>>>>>> 有放回取样50% <<<<<<<<<<<<<<<<<")
    namesRDD.sample(withReplacement = true , fraction = 0.5).foreach(println)

    // 50% sample without replacement, fixed seed (author's note: the sample is then reproducible).
    println(">>>>>>>>>>>>>>>>>> 无放回取样50%,指定seed,取样唯一 <<<<<<<<<<<<<<<<<")
    namesRDD.sample(withReplacement = false , fraction = 0.5 , seed = 100).foreach(println)

    // 50% sample with replacement, fixed seed (author's note: the sample is then reproducible).
    println(">>>>>>>>>>>>>>>>>> 有放回取样50%,指定seed,取样唯一 <<<<<<<<<<<<<<<<<")
    namesRDD.sample(withReplacement = true , fraction = 0.5 , seed = 100).foreach(println)

    sc.stop()
  }
}
  • sortByKey算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of the `sortByKey` operator on an RDD of (name, score) pairs.
  * Sorts by name in ascending and descending order, then by score (by swapping
  * key and value, sorting, and swapping back) in ascending and descending order.
  * Created by asus on 2018/7/11.
  */
object SortByKeyDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("SortByKeyDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" ,"E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // Prints one (name, score) entry.
    def printScore(entry : (String , Integer)) : Unit =
      println(s"name -> ${entry._1} , score -> ${entry._2}")

    val scores = List[(String , Integer)](("lao wang" , 10) , ("lao zhang" , 20) , ("lao li" , 30) , ("lao zhao" , 40))
    val scoreRDD = sc.parallelize(scores , 2)
    // Unsorted input; print order across partitions is not significant here.
    scoreRDD.foreach(printScore)

    // Sorted results are collected to the driver before printing: a distributed
    // `foreach` runs partitions concurrently and does not guarantee print order.

    // Ascending by name.
    println(">>>>>>>>>>>>>>>>>>>>>>>>>>> asc(升序) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    val scoreSortByKeyAscRDD = scoreRDD.sortByKey(ascending = true)
    scoreSortByKeyAscRDD.collect().foreach(printScore)

    // Descending by name. This previously called sortByKey(true) and so sorted
    // ascending again.
    println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc(降序) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    val scoreSortByKeyDescRDD = scoreRDD.sortByKey(ascending = false)
    scoreSortByKeyDescRDD.collect().foreach(printScore)

    // Ascending by score: swap to (score, name), sort by key, swap back.
    println(">>>>>>>>>>>>>>>>>>>>>>>>>>> asc(按照分数升序排序) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    val scoreAscRDD = scoreRDD.map(_.swap).sortByKey(ascending = true).map(_.swap)
    scoreAscRDD.collect().foreach(printScore)

    // Descending by score: same approach with the sort direction reversed.
    println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc(按照分数降序排序) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    val scoreDescRDD = scoreRDD.map(_.swap).sortByKey(ascending = false).map(_.swap)
    scoreDescRDD.collect().foreach(printScore)

    sc.stop()
  }

}
  • union算子

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of `rdd_1.union(rdd_2)`.
  * Author's notes: union simply concatenates the partitions of both RDDs. If
  * rdd_1 and rdd_2 each have two partitions, the result has four, i.e. no
  * shuffle happens during the union.
  * Created by asus on 2018/7/8.
  */
object UnionDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("UnionDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    // Tags each name with the index of its partition. Entries are prepended,
    // so each partition's output is in reverse iteration order.
    def tagWithIndex(index : Int , names : Iterator[String]) : Iterator[String] =
      names.foldLeft(List.empty[String]) { (tagged , name) =>
        s"name : $name with index $index" :: tagged
      }.iterator

    val oldMan = List("lao wang" , "lao zhang" , "lao li" , "lao zhao")
    val youngMan = List("xiao wang" , "xiao zhang" , "xiao li" , "xiao zhao")

    /**
      * Sample output recorded by the original author (line order may differ between runs):
      * name : lao li with index 1
      * name : lao zhao with index 1
      * name : lao wang with index 0
      * name : lao zhang with index 0
      */
    // oldManRDD has two partitions.
    val oldManRDD = sc.parallelize(oldMan , 2)
    oldManRDD.mapPartitionsWithIndex(tagWithIndex).foreach(println)

    /**
      * Sample output recorded by the original author (line order may differ between runs):
      * name : xiao li with index 1
      * name : xiao zhao with index 1
      * name : xiao wang with index 0
      * name : xiao zhang with index 0
      */
    // youngManRDD has two partitions.
    val youngManRDD = sc.parallelize(youngMan , 2)
    youngManRDD.mapPartitionsWithIndex(tagWithIndex).foreach(println)

    /**
      * Sample output recorded by the original author (line order may differ between runs):
      * name : lao li with index 1
      * name : lao zhao with index 1
      * name : xiao wang with index 2
      * name : xiao zhang with index 2
      * name : xiao li with index 3
      * name : xiao zhao with index 3
      * name : lao wang with index 0
      * name : lao zhang with index 0
      */
    // Union of oldManRDD and youngManRDD: per the author, the result has 4 partitions
    // and each partition holds the same elements as before the union.
    val unionOldAndYoungRDD = oldManRDD.union(youngManRDD)
    unionOldAndYoungRDD.mapPartitionsWithIndex(tagWithIndex).foreach(println)

    sc.stop()
  }

}
  • saveAsTextFile算子(注意:saveAsTextFile 属于 action 算子,而非 transformation 算子)

package rddDemo.transformation

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demo of `saveAsTextFile`: writes the numbers 1..10, held in an RDD with two
  * partitions, to a directory as text.
  * Created by asus on 2018/7/15.
  */
object SaveAsTextFileDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
      .setAppName("SaveAsTextFileDemo")
      .setMaster("local[2]")

    // Hadoop home is hard-coded to a path on the original author's Windows machine.
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    val numberRDD = sc.parallelize((1 to 10).toArray , 2)

    // Save to the local file system (precondition: the target directory must not already exist).
    numberRDD.saveAsTextFile("src/main/scala/rddDemo/saveAsTextFilePath")
    // Save to HDFS instead (same precondition: the target directory must not already exist).
//    numberRDD.saveAsTextFile("hdfs://ip:9000/saveAsTextFilePath")
    sc.stop()
  }

}

 

你可能感兴趣的:(spark)