Reposted from http://www.cnblogs.com/BYRans/p/5005342.html (a well-written post).
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
object sparkDataframe {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("anti_Join")
val sqlcontext = new SQLContext(new SparkContext(conf))
import sqlcontext.implicits._
val scoreDF = Seq((1, "sk", 99), (2, "jim", 72), (1, "sk", 99)).toDF("id", "name", "score")
// score is Option[Int] so a missing score becomes null (needed for the coalesce example below)
val stuDF = Seq((1, "sk12", Some(99)), (2, "9jim", None), (3, "jiem", Some(82))).toDF("id", "name", "score")
val df = Seq(
("1", "This is my country", Seq(1, 2), "2016-09-21"),
("2", "我们热爱自己的祖国", Seq(3, 4), "2016-09-21"),
("3", "劳动人民最可爱", Seq(4, 5), "2016-09-21"),
("4", "劳动人民最可爱", Seq(7, 9), "2016-09-21")
).toDF("id", "name", "agelist", "time")
// Pull the name column out of each partition
val sname = scoreDF.mapPartitions(rows => rows.map(_.getAs[String]("name")))
val names = sname.take(3).toSeq
// isin: test whether the value is contained in the given list
//stuDF.filter(!$"name".isin(names: _*)).show()
// like is not a fuzzy match by itself; without wildcards it matches the exact string
//stuDF.filter($"name".like("s")).show()
// rlike: filter with a regular expression
stuDF.filter($"name".rlike("""[A-Za-z]+$""")).show()
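// Sketch (assumption, not from the original post): for a contains-style match, like needs % wildcards
stuDF.filter($"name".like("%s%")).show()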
// Types supported by cast:
/*
 * Casts the column to a different data type, using the canonical string representation
 * of the type. The supported types are: `string`, `boolean`, `byte`, `short`, `int`, `long`,
 * `float`, `double`, `decimal`, `date`, `timestamp`.
 */
//scoreDF.selectExpr("cast(score as double)", "name").show()
//scoreDF.select($"score".cast("double").as("nscore"), $"name").show()
// Add a new column with a default (literal) value
//scoreDF.select($"name").withColumn("city", lit("ShangHai")).show()
// left join; coalesce supplies a default value when the column is null
scoreDF.as("a").join(stuDF.as("b"), $"a.id" === $"b.id", "left")
  .select($"a.name", coalesce($"b.score", lit(0)).as("score"))
  .show()
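// Sketch (assumption, not from the original post): when/otherwise is an alternative way to default null scores
scoreDF.as("a").join(stuDF.as("b"), $"a.id" === $"b.id", "left")
  .select($"a.name", when($"b.score".isNull, 0).otherwise($"b.score").as("score"))
  .show()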
// Read the field names from the schema, then drop the unwanted field with filterNot
val fieldNames = df.schema.fieldNames.filterNot(_ == "agelist")
// date_format formats a time column, e.g. date_format($"time", "yyyyMMdd"); coalesce only applies when the value is null
df.select(date_format($"time", "dd"),coalesce($"name",lit("劳动"))).show()
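// Sketch (assumption, not from the original post): to_date parses the yyyy-MM-dd string into a DateType column
df.select(to_date($"time").as("day")).show()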
// Concatenate several fields into one string
df.selectExpr(s"concat_ws('::::', ${fieldNames.mkString(",")}) line").show()
df.select(concat_ws("&", $"id", $"name").as("data")).show()
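// Sketch (assumption, not from the original post): concat joins the columns without any separator
df.select(concat($"id", $"name").as("data")).show()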
// map then collect the data back to the driver; watch out for serialization issues
val ids: Array[String] = df.map(_.getAs[String]("id")).collect()
// Broadcast the collected data to the executors
val broadcastNames = sqlcontext.sparkContext.broadcast(names)
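// Sketch (assumption, not from the original post): use the broadcast value inside a UDF, e.g. to keep only known names
val isKnownName = udf((n: String) => broadcastNames.value.contains(n))
stuDF.filter(isKnownName($"name")).show()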
}
}