1.SparkRDD join案例

1.本实例主要假设有两个数据源,一个是使用了SparkSQL的API读取JSON文件,另一个数据源来自于自己构造的数据信息(实例中的peopleInformation)。

2.实例演示了JSON文件的读取,和RDD的JOIN操作,以及Schema,RDD和ROW之间的有趣关系;

3.具体实例如下:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.RowFactory
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType

object SparkSQLWithJoin {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("SparkSQLWithJoin");
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    
    //1.从json文件中读取json并生成DataFrame
    val df = sqlContext.read.json("xxxxxx\\spark-1.6.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peoples.json")
    
    //2.将DataFrame注册成临时表
    df.registerTempTable("peopleScores")
    
    //3.执行SparkSQL查找出成绩>90的人的姓名和成绩
    val execellentStudentDF = sqlContext.sql("select name,score from peopleScores where score > 90")
    
    //4.将名字提取数组
    val execellentStudent = execellentStudentDF.rdd.map(row => row(0)).collect()
    
    //5.模拟jsons文件生成
    val peopleInformation = Array("{\"name\":\"Michael\", \"age\":20}","{\"name\":\"Andy\", \"age\":17}","{\"name\":\"Justin\", \"age\":19}","{\"name\":\"zhangsan\", \"age\":25}")
    
    for(item <- peopleInformation) println(item)
    
    //6.将json读进来的Array转换成RDD
    val peopleInformationRDD = sc.parallelize(peopleInformation)
    
    //7.将RDD转换成DataFrame
    val peopleInformationDF = sqlContext.read.json(peopleInformationRDD)
    
    //8.注册临时表
    peopleInformationDF.registerTempTable("peopleInformations")
    
    //9.拼接sql
    var sqltext = "select name,age from peopleInformations where name in ("
    for(i <- 0 until execellentStudent.size) {
      sqltext += "'" + execellentStudent(i) + "'"
      if(i < execellentStudent.size - 1) {
        sqltext += ","
      }
    }
    sqltext += ")"
    
    println(sqltext)
    
    //10.执行sql提取DataFrame
    val execellentNameAgeDF = sqlContext.sql(sqltext)
    
    //11.将execellentNameAgeDF转换成RDD并执行join
    
    val execellentNameAgeRDD = execellentNameAgeDF.rdd.map(row => (row.getString(0),row.getLong(1).toInt))
    
    val execellentNameScoresRDD = execellentStudentDF.rdd.map(row => (row.getString(0),row.getLong(1).toInt))
    
    val resultRDD = execellentNameAgeRDD join execellentNameScoresRDD
    
    val resultRowRDD = resultRDD.map(item => Row(item._1,item._2._1,item._2._1))
    
    import scala.collection.mutable.ArrayBuffer
    
    //12.构建schema
    val schema = StructType(StructField("name",StringType,true)::StructField("age",IntegerType,true)::StructField("score",IntegerType,true):: Nil)
    
    val personInfoDF = sqlContext.createDataFrame(resultRowRDD, schema)
    personInfoDF.show
    
    val resultList = personInfoDF.rdd.collect()
    
    for(item <- resultList) println(item)
    
  }
}


你可能感兴趣的:(Spark)