Using SparkSQL Functions

 
Reposted from http://www.cnblogs.com/BYRans/p/5005342.html (a well-written article).
 
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

object sparkDataframe {


  def main(args: Array[String]): Unit = {


    val conf = new SparkConf().setMaster("local[*]").setAppName("anti_Join")

    val sqlcontext = new SQLContext(new SparkContext(conf))

    import sqlcontext.implicits._

    val scoreDF = Seq((1, "sk", 99), (2, "jim", 72), (1, "sk", 99)).toDF("id", "name", "score")

    // Use java.lang.Integer so the score column is nullable; the original mixed "" into
    // an Int column, which does not compile, so the missing score is modeled as null here
    val stuDF = Seq[(Int, String, Integer)]((1, "sk12", 99), (2, "9jim", null), (3, "jiem", 82)).toDF("id", "name", "score")
  
    val df = Seq(
      ("1", "This is my country", Seq(1, 2), "2016-09-21"),
      ("2", "我们热爱自己的祖国", Seq(3, 4), "2016-09-21"),
      ("3", "劳动人民最可爱", Seq(4, 5), "2016-09-21"),
      ("4", "劳动人民最可爱", Seq(7, 9), "2016-09-21")
    ).toDF("id", "name", "agelist", "time")

    // mapPartitions: extract the name column partition by partition
    val sname = scoreDF.mapPartitions(r => {
      r.map(ro => {
        ro.getAs[String]("name")
      })
    })

    val names = sname.take(3).toSeq

    // isin: test whether the column value appears in the given list
    //stuDF.filter(!$"name".isin(names: _*)).show()
    // like is NOT a fuzzy match by itself: without SQL wildcards it only matches exactly
    //stuDF.filter($"name".like("s")).show()
    // rlike: match against a regular expression
    stuDF.filter($"name".rlike("""[A-Za-z]+$""")).show()
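    // A small sketch (not in the original post) of the like/rlike difference:
    // like needs SQL wildcards for substring matching, rlike takes a regex.
    stuDF.filter($"name".like("%s%")).show()     // contains "s", e.g. "sk12"
    stuDF.filter($"name".rlike("^[0-9]")).show() // starts with a digit, e.g. "9jim"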


    // Types supported by cast:
    /*
     * Casts the column to a different data type, using the canonical string representation
     * of the type. The supported types are: `string`, `boolean`, `byte`, `short`, `int`, `long`,
     * `float`, `double`, `decimal`, `date`, `timestamp`.
     */
    //scoreDF.selectExpr("cast(score as double)", "name").show()
    //scoreDF.select($"score".cast("double").as("nscore"), $"name").show()

    // withColumn: add a new column with a default (literal) value
    //scoreDF.select($"name").withColumn("city", lit("ShangHai")).show()

    // left join + coalesce: substitute a default when the value is null
    // (note the alias belongs on the coalesce result, not on lit(0))
    scoreDF.as("a").join(stuDF.as("b"), $"a.id" === $"b.id", "left_outer")
      .select($"a.name", coalesce($"b.score", lit(0)).as("score"))
      .show()
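    // Side note (a sketch, not from the original post): casting a string that is not a
    // valid number yields null, which is exactly the case coalesce covers.
    scoreDF.select(coalesce($"name".cast("int"), lit(-1)).as("name_as_int")).show()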
 
 
    // Read the field names from the schema, then drop the unwanted column with filterNot
    val fieldNames = df.schema.fieldNames.filterNot(_ == "agelist")
    // date_format formats a time column, e.g. date_format($"time", "yyyyMMdd");
    // coalesce only takes effect when the value is null
    df.select(date_format($"time", "dd"), coalesce($"name", lit("劳动"))).show()
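    // A couple more date_format patterns (standard Java SimpleDateFormat symbols, added for illustration):
    df.select(
      date_format($"time", "yyyy-MM-dd").as("day"),
      date_format($"time", "yyyyMM").as("month")
    ).show()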
    // Concatenate several columns into one
    df.selectExpr(s"concat_ws('::::', ${fieldNames.mkString(",")}) line").show()

    df.select(concat_ws("&", $"id", $"name").as("data")).show()
    // map + collect brings the data back to the driver, avoiding serialization problems
    val ids: Array[String] = df.map(_.getAs[String]("id")).collect()
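    // One possible use of the collected ids (a sketch): filter another DataFrame with isin.
    // The cast to string is an assumption so the int column compares cleanly with the string ids.
    stuDF.filter($"id".cast("string").isin(ids: _*)).show()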

    // Broadcast the data so every executor gets one read-only copy
    val broadcastNames = sqlcontext.sparkContext.broadcast(fieldNames)
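    // A minimal sketch of reading the broadcast value on the executors:
    // access it through .value inside the closure rather than capturing the local variable.
    df.rdd
      .map(row => broadcastNames.value.map(f => row.getAs[Any](f)).mkString("|"))
      .collect()
      .foreach(println)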


  }
}
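The app is named anti_Join, and the commented-out !isin filter above is one way to emulate an anti join by hand. Since Spark 2.0 the DataFrame API supports this join type directly; a minimal sketch (assuming a Spark 2.x build, since the 1.x SQLContext API used above does not accept it):

    // rows of scoreDF whose id has no matching row in stuDF
    scoreDF.join(stuDF, Seq("id"), "left_anti").show()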
