Big Data Spark "蘑菇云" Action, Lesson 47: Spark 2.0 in Action with Dataset (collect_list, collect_set, avg, sum, countDistinct, and More)
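The program at the end of this post reads the following sample records from people.json: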
{"name":"Michael", "age":16}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
{"name":"Justin", "age":29}
{"name":"Michael", "age":46}
Run results:
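The first table is produced by the multi-aggregate call below (excerpted from the full program at the end). agg applies several aggregate functions to each name group in one pass; mean is an alias for avg, which is why avg(age) appears twice in the header, and current_date() is an ordinary (non-aggregate) function that simply stamps each result row with today's date:

personDS.groupBy($"name")
  .agg(sum($"age"), avg($"age"), max($"age"), min($"age"),
    countDistinct($"age"), mean($"age"), current_date())
  .show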
+-------+--------+--------+--------+--------+-------------------+--------+--------------+
| name|sum(age)|avg(age)|max(age)|min(age)|count(DISTINCT age)|avg(age)|current_date()|
+-------+--------+--------+--------+--------+-------------------+--------+--------------+
|Michael| 62| 31.0| 46| 16| 2| 31.0| 2016-09-17|
| Andy| 30| 30.0| 30| 30| 1| 30.0| 2016-09-17|
| Justin| 48| 24.0| 29| 19| 2| 24.0| 2016-09-17|
+-------+--------+--------+--------+--------+-------------------+--------+--------------+
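The second table groups by the (name, age) pair, so every distinct pair forms its own group (here, every input row), and concat($"name", $"age") glues the two grouping columns into one string, with age cast to a string automatically:

personDS.groupBy($"name", $"age")
  .agg(concat($"name", $"age"))
  .show()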
+-------+---+-----------------+
| name|age|concat(name, age)|
+-------+---+-----------------+
| Justin| 29| Justin29|
| Andy| 30| Andy30|
|Michael| 16| Michael16|
|Michael| 46| Michael46|
| Justin| 19| Justin19|
+-------+---+-----------------+
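The third table contrasts collect_list with collect_set: both gather the name values of each group into an array, but collect_set deduplicates, which is why Michael and Justin appear twice in the list column and only once in the set column. The program also prints the same rows on the driver via collect().foreach(println) before rendering this table with show:

personDS.groupBy($"name")
  .agg(collect_list($"name"), collect_set($"name"))
  .show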
+-------+------------------+-----------------+
| name|collect_list(name)|collect_set(name)|
+-------+------------------+-----------------+
|Michael|[Michael, Michael]| [Michael]|
| Andy| [Andy]| [Andy]|
| Justin| [Justin, Justin]| [Justin]|
+-------+------------------+-----------------+
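The complete program: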
package com.dt.spark200

import org.apache.spark.sql.SparkSession

object DataSetsops {

  case class Person(name: String, age: Long)
  case class Score(n: String, score: Long)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("DatasetOps")
      .master("local")
      .config("spark.sql.warehouse.dir",
        "file:///G:/IMFBigDataSpark2016/IMFScalaWorkspace_spark200/Spark200/spark-warehouse")
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Read the JSON files as DataFrames, then convert them to typed Datasets.
    val personDF = spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\people.json")
    val personScoresDF = spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peopleScores.json")
    val personDS = personDF.as[Person]
    val personScoresDS = personScoresDF.as[Score] // defined here but not used further in this lesson

    // Several aggregate functions in a single agg call; mean is an alias
    // for avg, and current_date() stamps each result row with today's date.
    personDS.groupBy($"name")
      .agg(sum($"age"), avg($"age"), max($"age"), min($"age"),
        countDistinct($"age"), mean($"age"), current_date())
      .show

    // Group by the (name, age) pair and concatenate the two grouping columns.
    personDS.groupBy($"name", $"age")
      .agg(concat($"name", $"age"))
      .show()

    // collect_list keeps duplicates within each group; collect_set removes them.
    // First print the rows on the driver, then render them as a table.
    personDS.groupBy($"name")
      .agg(collect_list($"name"), collect_set($"name"))
      .collect().foreach(println)

    personDS.groupBy($"name")
      .agg(collect_list($"name"), collect_set($"name"))
      .show

    spark.stop()
  }
}
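For comparison, here is a minimal sketch, not part of the original lesson, of how the sum and average could be expressed through Spark 2.0's typed Dataset API (typed.sumLong and typed.avg from org.apache.spark.sql.expressions.scalalang.typed). It assumes it is dropped into the same main method, so that personDS and spark.implicits._ are already in scope:

import org.apache.spark.sql.expressions.scalalang.typed

// Sketch only: the typed counterpart of groupBy("name").agg(sum, avg).
// groupByKey yields a KeyValueGroupedDataset[String, Person], and agg
// takes TypedColumns built from plain Scala functions instead of column names.
personDS
  .groupByKey(_.name)
  .agg(typed.sumLong[Person](_.age), // sum(age) as Long
       typed.avg[Person](_.age))     // avg(age) as Double (Long is widened)
  .show()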