Common Spark transformation operators (map, flatMap, join, sample, etc.)

package com.meng.nan.day717

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

// Examples of common transformation operators,
// such as map, flatMap, join and sample.
object flatMapClass {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.spark_project").setLevel(Level.WARN)

    val conf = new SparkConf()
      .setAppName("flatMapClass")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    val list = List("safdasd")
    val rdd: RDD[String] = sc.parallelize(list)

    // flatMap: split each string into characters and emit (char, 1) pairs
    val chrdd: RDD[(Char, Int)] = rdd.flatMap(str => {
      val ab = ArrayBuffer[(Char, Int)]()
      for (ch <- str) {
        ab.append((ch, 1))
      }
      ab
    })
    // chrdd.foreach(ch => println(ch))
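    // A more concise equivalent (a sketch; chrdd2 is a hypothetical name):
    // map over the characters of each string directly, producing the same
    // (Char, 1) pairs without a mutable buffer.
    // val chrdd2: RDD[(Char, Int)] = rdd.flatMap(_.map(ch => (ch, 1)))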

    // map: one-to-one mapping over each element
    val list1 = 1 to 9
    val lsrdd: RDD[Int] = sc.parallelize(list1)
    // val mapRDD = lsrdd.map(num => num * 7)
    // println("partitions: " + mapRDD.getNumPartitions)
    // mapRDD.foreach(println)
    // val mapRDD = lsrdd.map(num => (num, 1))
    // val mapRDD = lsrdd.map((_, 1))
    // for (elem <- mapRDD) {
    //   println(elem)
    // }

    // filter: keep only the elements that satisfy the predicate
    // val filterRdd = lsrdd.filter(fl => fl % 2 == 0)
    // filterRdd.foreach(println)

    // union: concatenate two RDDs (duplicates are kept)
    val list2 = List(3, 1, 2, 2, 11, 22, 33, 44)
    val lr: RDD[Int] = sc.parallelize(list2)
    val unRDD: RDD[Int] = lsrdd.union(lr)
    unRDD.foreach(println)
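    // union does not deduplicate; a sketch to drop the repeated elements
    // (distinct is itself a transformation, so nothing runs until an action):
    // unRDD.distinct().foreach(println)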

    // sample: draw a random sample; arguments are (withReplacement, fraction)
    // val smRDD = lsrdd.sample(false, 0.2)
    // smRDD.foreach(println)
    // println(smRDD.count())
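    // A sketch with an explicit seed so the sample is reproducible across runs:
    // val smRDD = lsrdd.sample(withReplacement = false, fraction = 0.2, seed = 42L)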

  }

}

package com.meng.nan.day717

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object JionClass {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.spark_project").setLevel(Level.WARN)

    // The join transformations all require pair RDDs of type RDD[(K, V)],
    // where K is the join key.
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("JionClass")
    val sc = new SparkContext(conf)

    jionOps(sc)
  }

  def jionOps(sc: SparkContext): Unit = {
    val stu = List(
      "1 刘冯曼娜 22 bd-1901-bj",
      "2 常师 25 bd-1901-bj",
      "3 张熹 24 bd-1901-sz",
      "4 胡盼 18 bd-1901-wh"
    )
    val scores = List(
      "1 1 math 82",
      "2 1 english 0",
      "3 2 chinese 85.5",
      "4 3 PE 99",
      "5 10 math 99"
    )

    // Turn the raw strings into RDDs
    val stuRDD: RDD[String] = sc.parallelize(stu)
    val scoresRDD: RDD[String] = sc.parallelize(scores)

    // Map the student table to (sid, studentInfo) pairs
    val stuMapRDD: RDD[(String, String)] = stuRDD.map(stringLine => {
      val sid = stringLine.substring(0, 1)
      val stuInf = stringLine.substring(1).trim
      (sid, stuInf)
    })
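    // A more robust parse (a sketch; stuMapRDD2 is a hypothetical name): splitting
    // on whitespace also handles student ids longer than one character, which
    // substring(0, 1) does not.
    // val stuMapRDD2: RDD[(String, String)] = stuRDD.map { line =>
    //   val fields = line.split("\\s+")
    //   (fields(0), fields.drop(1).mkString(" "))
    // }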

    // Map the score table to (sid, scoreInfo) pairs; the second column is the student id
    val scoresMapRDD: RDD[(String, String)] = scoresRDD.map(scorLine => {
      val fields = scorLine.split("\\s+")
      val sid = fields(1)
      val scoreInfo = fields(2) + " " + fields(3)
      (sid, scoreInfo)
    })

    // join: inner join on the student id
    val joinRDD: RDD[(String, (String, String))] = stuMapRDD.join(scoresMapRDD)
    joinRDD.foreach { case (sid, (stuInf, scoreInfo)) =>
      println(s"sid:${sid}\tstuInf:${stuInf}\tscoreInfo:${scoreInfo}")
    }
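    // With this data the inner join keeps only ids present in both RDDs (1, 2, 3);
    // student 4 has no score and score row "5 10 math 99" has no matching student,
    // so both are dropped here.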

println("======================")

    // leftOuterJoin: keep every student, with the score wrapped in an Option
    val leftJoin: RDD[(String, (String, Option[String]))] = stuMapRDD.leftOuterJoin(scoresMapRDD)
    leftJoin.foreach { case (sid, (stuInf, scoreInfo)) =>
      println(s"sid:${sid}\tstuInf:${stuInf}\tscoreInfo:${scoreInfo}")
    }
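    // scoreInfo is an Option[String] here; a sketch to print a default instead of Some/None:
    // leftJoin.foreach { case (sid, (stuInf, scoreInfo)) =>
    //   println(s"sid:${sid}\tstuInf:${stuInf}\tscoreInfo:${scoreInfo.getOrElse("no score")}")
    // }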

    println("================================")

    // fullOuterJoin: keep every student and every score, both sides as Options
    val fullMapJoin: RDD[(String, (Option[String], Option[String]))] = stuMapRDD.fullOuterJoin(scoresMapRDD)
    fullMapJoin.foreach { case (sid, (stuInf, scoreInfo)) =>
      println(s"sid:${sid}\tstuInfo:${stuInf}\tscoreInfo:${scoreInfo}")
    }
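    // For completeness, a rightOuterJoin sketch (rightJoin is a hypothetical name):
    // it keeps every score row and wraps the student side in an Option instead.
    // val rightJoin: RDD[(String, (Option[String], String))] = stuMapRDD.rightOuterJoin(scoresMapRDD)
    // rightJoin.foreach { case (sid, (stuInf, scoreInfo)) =>
    //   println(s"sid:${sid}\tstuInf:${stuInf}\tscoreInfo:${scoreInfo}")
    // }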

  }

}
