SparkSQL DSL Syntax Guide

A runnable walkthrough of common Dataset/DataFrame DSL operations: select, filter, sort, aggregate, join, and column renaming.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions._

// Each input row is parsed into this case class: name,age,score
case class Student(name: String, age: Int, score: Int)

object DataSetDemo1 {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .master("local")
      .appName(this.getClass.getSimpleName)
      .getOrCreate()
    import session.implicits._

    val dataSet: Dataset[String] = session.read.textFile("D:\\abc\\person\\input\\person2.txt")
    val ds = dataSet.map(line => {
      val splits = line.split(",")
      Student(splits(0), splits(1).toInt, splits(2).toInt)
    })

    val rdd: RDD[String] = session.sparkContext.makeRDD(Array("aa","bb","cc"))
    val ds2: Dataset[String] = session.createDataset(rdd)
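
    // For reference: with session.implicits._ in scope, the RDD can also be
    // converted directly (equivalent to createDataset above).
    val ds3: Dataset[String] = rdd.toDS()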

    // Select columns
    ds.select("name","age").show()
    ds.select($"name",$"age").show()
    // Alias a column
    ds.selectExpr("name as newname").show()
    // Filter rows, e.g. peopleDs.where($"age" > 15)
    ds.where($"age" > 18).show()
    ds.filter($"age" > 18).show()

    // Add 1 to age, e.g. ds.select(expr("value + 1").as[Int])
    ds.select(expr("age + 1").as[Int]).show()
    ds.select($"age" + 1).show()

    // Sort
    //ds.sort("sortcol")
    //ds.sort($"sortcol")
    //ds.sort($"sortcol".asc)
    ds.sort($"age".desc,$"score").show()

    // Total row count
    println(ds.count())

    // Aggregations
    ds.groupBy("name").agg(sum("score")).show()
    ds.groupBy("name").count().show()

    val df1 = session.read.textFile("D:\\abc\\join\\student.txt")
      .map(line => {
        val splits = line.split(",")
        (splits(0).toInt, splits(1))
      }).toDF("id", "name")

    val df2 = session.read.textFile("D:\\abc\\join\\weigth.txt")
      .map(line => {
        val splits = line.split(",")
        (splits(0).toInt, splits(1).toFloat)
      }).toDF("uid", "weight")

    // Join when the key column has the same name on both sides
    //df1.join(df2, "user_id")
    //df1.join(df2,"id").show()

    // Join when the key column names differ
    //df1.join(df2, $"df1Key" === $"df2Key")
    //df1.join(df2).where($"df1Key" === $"df2Key")
    // Note: column equality uses the triple-equals operator (===)
    df1.join(df2, $"id" === $"uid").show()

    // The default join type is inner; pass a third argument to change it
    //df1.join(df2, $"df1Key" === $"df2Key", "outer")
    //`inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`,
    //                `right`, `right_outer`, `left_semi`, `left_anti`.
    df1.join(df2,$"id" === $"uid","left_outer").show()

    // Cartesian product
    df1.crossJoin(df2).show()

    // Rename a column
    df1.withColumnRenamed("id","newid").show()

    session.stop()
  }
}
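
For reference, the parsing code above implies that person2.txt holds one comma-separated name,age,score record per line; a hypothetical sample:

aaa,20,90
bbb,17,85

student.txt and weigth.txt are likewise comma-separated id,name and uid,weight files.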
