第45课 Spark 2.0实战之Dataset:map、flatMap、mapPartitions、dropDuplicates、coalesce、repartition等
package com.dt.spark200
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ArrayBuffer
object DataSetsops {
/** Typed record used to view `personDF` as a `Dataset[Person]` via `personDF.as[Person]`.
  *
  * NOTE(review): `age` is a primitive `Long`, so a `person.age == null` comparison
  * (as in the commented-out `map` example) can never be true. Whether the input
  * JSON always carries an `age` value is not visible here; confirm against the data.
  *
  * @param name the person's name
  * @param age  the person's age
  */
case class Person(
  name: String,
  age: Long
)
def main(args: Array[String]): Unit = {
val spark = SparkSession
.config("spark.sql.warehouse.dir", "file:///G:/IMFBigDataSpark2016/IMFScalaWorkspace_spark200/Spark200/spark-warehouse")
import spark.implicits._
import org.apache.spark.sql.functions._
val personDF= spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\people.json")
val personScoresDF= spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peopleScores.json")
val personDS = personDF.as[Person]
/* personDS.map{person=>
(person.name,if (person.age == null) 0 else person.age +100 )
personDS.mapPartitions{persons =>
val result = ArrayBuffer[(String,Long)]()
val person = persons.next()
result +=((person.name,person.age+10000))
val repartitionDs= personDS.repartition(4)
val coalesced= repartitionDs.coalesce(2)
// personDF.show()
// personDF.collect().foreach (println)
// println(personDF.count())
//val personDS = personDF.as[Person]
// personDS.show()
// personDS.printSchema()
//val dataframe=personDS.toDF()
/* personDF.createOrReplaceTempView("persons")
spark.sql("select * from persons where age > 20").show()
spark.sql("select * from persons where age > 20").explain()
// val personScoresDF= spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peopleScores.json")
// personDF.join(personScoresDF,$"name"===$"n").show()
/* personDF.filter("age > 20").join(personScoresDF,$"name"===$"n").show()
personDF.filter("age > 20")
while(true) {}
