当前IDEA中使用的SPARK包是:spark-assembly-1.5.1-hadoop2.4.0
因此 SparkSQL 这里使用的是比较老的入口类 SQLContext;在 Spark 2.0 及以上版本中应改用 SparkSession。
为了使用方便直接在object的main方法中编写代码来实现,首先添加要引用的包
在主函数中,首先要初始化我们要用到的spark相关的变量信息
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions.{sum}
/** Entry point object for the Spark SQL score-aggregation example. */
object Xuan {

  /**
   * Sets up the Spark handles the example relies on: a local-mode SparkContext
   * (`ss`) and a SQLContext (`spark`) built on top of it.
   *
   * @param args command-line arguments (not read by this method)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "Xual" looks like a typo of the object name
    // "Xuan" — confirm before changing, since it is a runtime string.
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("Xual")
    val ss = new SparkContext(conf)
    // The original note suggests SparkSession.Builder() as an alternative entry point.
    val spark = new SQLContext(ss)

    // The aggregation code uses agg with symbol-based column references,
    // which requires the SQLContext's implicit conversions to be in scope.
    import spark.implicits._
  }
}
// Mock input for the statistics: one "class,subject,score" record per exam result.
// C1 = class 1, S1 = subject 1, and so on; each row below holds one class's records.
val scores: Array[String] = Array(
  "C1,S1,78", "C1,S2,59", "C1,S1,99", "C1,S2,70", "C1,S1,39", "C1,S2,71", "C1,S3,60", "C1,S3,90", "C1,S3,55",
  "C2,S2,90", "C2,S1,60", "C2,S1,70", "C2,S1,73", "C2,S2,80", "C2,S2,70", "C2,S3,48", "C2,S3,80", "C2,S3,70",
  "C3,S2,66", "C3,S1,68", "C3,S1,88", "C3,S1,72", "C3,S2,80", "C3,S2,80", "C3,S3,52", "C3,S3,80", "C3,S3,99"
)
完整代码如下
// Turn the sample records into an RDD spread over 3 partitions.
val scoresRDD = ss.parallelize(scores, 3)
// Convert each "class,subject,score" record into a Row so a DataFrame can be
// built from it. The record is split once and its fields reused, instead of
// re-splitting the same string for every field.
val scoresRowRDD = scoresRDD.map { record =>
  val fields = record.split(",")
  Row(fields(0), fields(1), fields(2).toInt)
}
// Schema for the scores DataFrame: field names and types, all declared nullable.
val scoresSchema = StructType(Seq(
  StructField("class", StringType, nullable = true),
  StructField("subject", StringType, nullable = true),
  StructField("score", IntegerType, nullable = true)
))
// Build the DataFrame from the Row RDD and the schema defined just above.
val scoresDF = spark.createDataFrame(scoresRowRDD, scoresSchema)
// Total score per (class, subject), using grouping plus the built-in sum function.
// groupBy keeps both grouping columns in the result (with Spark's default
// spark.sql.retainGroupColumns setting), so only the aggregate is passed to agg.
// Result columns: class (0), subject (1), sum of score (2).
val resDF = scoresDF.groupBy("class", "subject").agg(sum('score))
// Sort by total score, highest first, then rebuild the rows as Row objects.
// The sum is read as a Number and narrowed to Int so the rows match an
// IntegerType column regardless of the integral width Spark uses for the sum.
val resRowRDD = resDF
  .map(v => (v.getString(0), v.getString(1), v.getAs[Number](2).intValue))
  .sortBy(_._3, ascending = false)
  .map { case (cls, subject, total) => Row(cls, subject, total) }
// Schema for the result DataFrame: class, subject and the summed score ("totals").
val resultSchema = StructType(Seq(
  StructField("class", StringType, nullable = true),
  StructField("subject", StringType, nullable = true),
  StructField("totals", IntegerType, nullable = true)
))
// Build the result DataFrame from the result Row RDD and the result schema.
val resDFForShow = spark.createDataFrame(resRowRDD, resultSchema)
// Print the result table to the console.
resDFForShow.show()
+-----+-------+------+
|class|subject|totals|
+-----+-------+------+
| C2| S2| 240|
| C3| S3| 231|
| C3| S1| 228|
| C3| S2| 226|
| C1| S1| 216|
| C1| S3| 205|
| C2| S1| 203|
| C1| S2| 200|
| C2| S3| 198|
+-----+-------+------+