Spark SQL Examples

Spark SQL UDF and UDAF

This example registers a string-length UDF and a custom word-count UDAF, then calls both from SQL.

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @Author: Administrator and wind
 * @Version: 2019/11/21 & 1.0
 */
object SparkSQLUDFUDAF {

  def main(args: Array[String]): Unit = {

    //Reduce log output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    
    val conf = new SparkConf().setMaster("local").setAppName("SparkSQLUDFUDAF")

    val sc = new SparkContext(conf)

    val sqlContext = new SQLContext(sc)

    val bigData = Array("Spark","Spark","Spark","spark","spark","spark","spark","spark","Hadoop","Hadoop","Hadoop","Hadoop")

    //Build a DataFrame from the RDD
    val bigDataRDD = sc.parallelize(bigData)

    val bigDataRDDRow = bigDataRDD.map(item => Row(item))

    val structType = StructType(Array(
      StructField("word", StringType)
    ))

    val bigDataDF = sqlContext.createDataFrame(bigDataRDDRow,structType)

    bigDataDF.createOrReplaceTempView("bigDataTable")

    //UDF: a registered Scala function may take up to 22 input parameters
    sqlContext.udf.register("length",(input:String) => input.length())

    //UDAF: register the custom aggregate function defined below
    sqlContext.udf.register("wordcount",new MyUDAF)

    sqlContext.sql("select word,wordcount(word) as count from bigDataTable group by word").show()

    sqlContext.sql("select word,length(word) from bigDataTable").show()

    sc.stop()
  }

}

class MyUDAF extends UserDefinedAggregateFunction {

  /**
   * Specifies the input data type of the UDAF.
   * @return
   */
  override def inputSchema: StructType = StructType(Array(StructField("input", StringType, true)))

  /**
   * Schema of the aggregation buffer that holds intermediate results;
   * here it matches the final output type.
   * @return
   */
  override def bufferSchema: StructType = StructType(Array(StructField("count", IntegerType, true)))

  /**
   * The result type returned by the UDAF.
   * @return
   */
  override def dataType: DataType = IntegerType

  /**
   * Whether the function always returns the same output for the same input;
   * usually true.
   * @return
   */
  override def deterministic: Boolean = true

  /**
   * Initializes the aggregation buffer for each group before aggregation starts.
   * @param buffer
   */
  override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0 }

  /**
   * Called for each new input value within a group to update the buffer.
   * This is the partition-local step, analogous to the combiner in the
   * Hadoop MapReduce model.
   * @param buffer
   * @param input
   */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Int](0) + 1
  }

  /**
   * Merges partial buffers from different partitions after the local
   * reduce, producing the global aggregate.
   * @param buffer1
   * @param buffer2
   */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Int](0) + buffer2.getAs[Int](0)
  }

  /**
   * Returns the final result of the aggregation.
   * @param buffer
   * @return
   */
  override def evaluate(buffer: Row): Any = buffer.getAs[Int](0)
}
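Note that UserDefinedAggregateFunction is deprecated since Spark 3.0 in favor of the typed Aggregator API. A minimal sketch of the same word count, assuming Spark 3.x (the object name WordCount is illustrative):

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.udaf

//The same count as a typed Aggregator: input String, buffer Int, output Int
object WordCount extends Aggregator[String, Int, Int] {
  override def zero: Int = 0                                        //initial buffer value per group
  override def reduce(buffer: Int, input: String): Int = buffer + 1 //per-row update
  override def merge(b1: Int, b2: Int): Int = b1 + b2               //combine partial counts
  override def finish(reduction: Int): Int = reduction              //final result
  override def bufferEncoder: Encoder[Int] = Encoders.scalaInt
  override def outputEncoder: Encoder[Int] = Encoders.scalaInt
}

//Registered the same way as the UDAF above, e.g.:
//sqlContext.udf.register("wordcount", udaf(WordCount, Encoders.STRING))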

Common Spark SQL Queries

This example loads four CSV files into temp views (Student, Course, Score, Teacher) and walks through typical queries; most are left commented out so they can be run one at a time.

import java.text.SimpleDateFormat

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

/**
 * @Author: Administrator and wind
 * @Version: 2019/11/19 & 1.0
 *
 * Spark SQL example queries
 *
 * sno       student number
 * sname     name
 * ssex      sex
 * sbirthday birthday
 * sclass    class
 */
case class Student(sno:String,sname:String,ssex:String,sbirthday:String,sclass:String)

//tno: teacher number
case class Course(cno:String,cname:String,tno:String)

//sno: student number, cno: course number, degree: score
case class Score(sno:String,cno:String,degree:String)

//tprof: title, tdepart: teacher's department
case class Teacher(tno:String,tname:String,tsex:String,tbirthday:String,tprof:String,tdepart:String)
object SparkSQLExample {
  def main(args: Array[String]): Unit = {

    //Point hadoop.home.dir at a local Hadoop install so the job runs without errors on Windows
    //System.setProperty("hadoop.home.dir","E:\\hadoop-2.8.4")

    //Reduce log output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    //Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    /**
     * Formats the current time with the given pattern,
     * e.g. getDate("yyyy") returns the current year as a string.
     */
    def getDate(time: String) = {
      val now: Long = System.currentTimeMillis()
      val df: SimpleDateFormat = new SimpleDateFormat(time)
      df.format(now)
    }

    //Spark session
    val spark = SparkSession.builder()
      .master("local")
      .appName("SparkSQLExample")
      .getOrCreate()

    //Read the data and register temp views
    import spark.implicits._

    spark.sparkContext
      .textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Student.csv")
      .map(_.split(","))
      .map(x => Student(x(0),x(1),x(2),x(3),x(4)) )
      .toDF
      .createOrReplaceTempView("Student")

    spark.sparkContext
      .textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Course.csv")
      .map(_.split(","))
      .map(x => Course(x(0),x(1),x(2)))
      .toDF()
      .createOrReplaceTempView("Course")

    spark.sparkContext
      .textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Score.csv")
      .map(_.split(","))
      .map(x => Score(x(0),x(1),x(2)))
      .toDF()
      .createOrReplaceTempView("Score")

    spark.sparkContext
      .textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Teacher.csv")
      .map(_.split(","))
      .map(x => Teacher(x(0),x(1),x(2),x(3),x(4),x(5)))
      .toDF()
      .createOrReplaceTempView("Teacher")

    //spark.sql("select * from Teacher").show()

    //Query the sname, ssex, sclass columns for all records in the Student table
    //spark.sql("select sname,ssex,sclass from Student").show()

    //Query the distinct tdepart values in the Teacher table
    //spark.sql("select distinct tdepart from Teacher").show(false)
    //spark.sql("select tdepart from Teacher group by tdepart").show(false)

    //Query all Score records with degree between 60 and 80
    //spark.sql("select * from score where degree >=60 and degree <=80").show()
    //spark.sql("select * from score where degree between 60 and 80").show()

    //Query Score records where degree is 85, 86, or 88
    //spark.sql("select * from score where degree = '85' or degree = '86' OR degree = '88'").show()

    //Query students ordered by sclass, descending and then ascending
    //spark.sql("select * from student order by sclass desc").show()
    //spark.sql("select * from student order by sclass").show()

    //Query Score ordered by cno ascending and degree descending
    //spark.sql("select * from score t order by t.cno asc, t.degree desc").show()

    //Query the sno and cno of the highest-scoring record in Score;
    //degree is stored as a string, so cast it to int for a numeric sort
    //spark.sql("select * from score order by cast(degree as int) desc limit 1").show()
    //spark.sql("select * from score order by cast(degree as int) desc").show()

    //Average score per course; numeric functions implicitly cast the string column
    //spark.sql("select cno, avg(degree) from score group by cno").show()

    //Average score of courses numbered starting with 3 that have at least 5 enrolled students
    //spark.sql("select cno,avg(degree) from score where cno like '3%' group by cno having count(cno) >= 5").show()

    //Query sname, cname, degree across all students
//    spark.sql("select s.sname, c.cname, t.degree from score t " +
//      "join student s on t.sno = s.sno " +
//      "join course c on c.cno = t.cno").show(false)
    //spark.sql("select s.sname, c.cname, t.degree from score t, student s, course c where t.sno = s.sno and c.cno = t.cno").show(false)

    //For students taking more than one course, query the records whose degree is not the overall highest score
//    spark.sql("select * from score where " +
//    "sno in (select sno from score t group by t.sno having count(cno) > 1) " +
//    "and degree != (select max(degree) from score)").show()

    //Query sno, sname, sbirthday of all students born in the same year as student 108
//    spark.sql("select sno, sname, sbirthday from student where substring(sbirthday,0,4) = " +
//    "(select substring(sbirthday,0,4) from student where sno = '108')").show()

    //Query the names of teachers whose course has more than 5 enrolled students
//    spark.sql("select tname from teacher t, " +
//    "(select cno from score group by cno having count(cno) > 5) s, " +
//      "course c where " +
//    "t.tno = c.tno and c.cno = s.cno").show()
//    spark.sql("select tname from teacher t " +
//    "join course c on t.tno = c.tno " +
//    "join (select cno from score group by cno having count(cno) > 5) s on s.cno = c.cno").show()

    //Query score records whose degree is below the average for that course
    //spark.sql("select * from score s where s.degree < (select avg(degree) from score c where c.cno = s.cno)").show()

    //Query the tname and tdepart of all teachers who have not taught a course
    //spark.sql("select tname, tdepart from teacher t where t.tno not in (select tno from course c where c.cno in (select cno from score))").show(false)

    //Query class numbers that have at least two male students
    //spark.sql("select sclass from student t where ssex  = 'male' group by sclass having count(ssex) >= 2").show()

    //Query Student records whose surname is not Wang
    //spark.sql("select * from student t where sname not like 'Wang%'").show()

    //Query each student's name and age; the current year from getDate("yyyy")
    //is inlined into the SQL string before it is parsed
    //spark.sql("select sname, (cast(" + getDate("yyyy") + " as int) - cast(substring(sbirthday,0,4) as int)) as age from student s").show()

    spark.close()
  }

}
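For comparison, the same per-course average (avg of degree grouped by cno) can also be written with the DataFrame API instead of SQL. A minimal sketch, assuming the Score temp view registered above; the variable name avgPerCourse is illustrative:

import org.apache.spark.sql.functions.{avg, col}

//Average degree per course via the DataFrame API; degree is a string, so cast it first
val avgPerCourse = spark.table("Score")
  .groupBy("cno")
  .agg(avg(col("degree").cast("int")).as("avg_degree"))
avgPerCourse.show()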
