Spark SQL UDF and UDAF
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* @Author: Administrator and wind
* @Version: 2019/11/21 & 1.0
*
*/
object SparkSQLUDFUDAF {
def main(args: Array[String]): Unit = {
//Reduce log output
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
val conf = new SparkConf().setMaster("local").setAppName("SparkSQLUDFUDAF")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val bigData = Array("Spark","Spark","Spark","spark","spark","spark","spark","spark","Hadoop","Hadoop","Hadoop","Hadoop")
//Create a DataFrame from the word array
val bigDataRDD = sc.parallelize(bigData)
val bigDataRDDRow = bigDataRDD.map(item => Row(item))
val structType = StructType(Array(
StructField("word",StringType)
))
val bigDataDF = sqlContext.createDataFrame(bigDataRDDRow,structType)
bigDataDF.createOrReplaceTempView("bigDataTable")
//UDF: a registered function can take at most 22 input parameters
sqlContext.udf.register("length",(input:String) => input.length())
//UDAF: register the custom aggregate function
sqlContext.udf.register("wordcount",new MyUDAF)
sqlContext.sql("select word,wordcount(word) as count from bigDataTable group by word").show()
sqlContext.sql("select word,length(word) from bigDataTable").show()
sc.stop()
}
}
class MyUDAF extends UserDefinedAggregateFunction {
/**
* Specifies the data type(s) of the UDAF's input
* @return
*/
override def inputSchema: StructType = StructType(Array(StructField("input",StringType,true)))
/**
* Schema of the aggregation buffer that holds intermediate results;
* here it has the same type as the final output
* @return
*/
override def bufferSchema: StructType = StructType(Array(StructField("count",IntegerType,true)))
/**
* The result type returned by the UDAF
* @return
*/
override def dataType: DataType = IntegerType
/**
* Whether the function is deterministic (same input always yields the same output); usually true
* @return
*/
override def deterministic: Boolean = true
/**
* Initializes the aggregation buffer for each group before aggregation starts
* @param buffer
*/
override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = 0 }
/**
* Called for each new input value within a group to update the partial aggregate.
* This local aggregation plays the same role as the combiner in the Hadoop MapReduce model.
* @param buffer
* @param input
*/
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
buffer(0) = buffer.getAs[Int](0) + 1
}
/**
* After each node finishes its local reduce, merges the partial buffers into the global result
* @param buffer1
* @param buffer2
*/
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
buffer1(0) = buffer1.getAs[Int](0) + buffer2.getAs[Int](0)
}
/**
* Returns the final result of the UDAF
* @param buffer
* @return
*/
override def evaluate(buffer: Row): Any = buffer.getAs[Int](0)
}
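Note: starting with Spark 3.0, UserDefinedAggregateFunction is deprecated in favor of org.apache.spark.sql.expressions.Aggregator registered through functions.udaf. A minimal sketch of the same word-count aggregation with the newer API (assuming Spark 3.x; WordCountAgg is a name introduced here, not part of the original code):
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, functions}
object WordCountAgg extends Aggregator[String, Int, Int] {
  def zero: Int = 0                                        // initial buffer for each group
  def reduce(buffer: Int, word: String): Int = buffer + 1  // one more row seen in this group
  def merge(b1: Int, b2: Int): Int = b1 + b2               // combine partial counts across partitions
  def finish(reduction: Int): Int = reduction              // final result
  def bufferEncoder: Encoder[Int] = Encoders.scalaInt
  def outputEncoder: Encoder[Int] = Encoders.scalaInt
}
// Registration, analogous to the sqlContext.udf.register call above (spark is a SparkSession):
// spark.udf.register("wordcount", functions.udaf(WordCountAgg))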
Spark SQL Common Commands
import java.text.SimpleDateFormat
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
* @Author: Administrator and wind
* @Version: 2019/11/19 & 1.0
*
* Spark SQL example queries
*
* sno       student number
* sname     student name
* ssex      sex
* sbirthday birthday
* sclass    class
*/
case class Student(sno:String,sname:String,ssex:String,sbirthday:String,sclass:String)
//tno: teacher number
case class Course(cno:String,cname:String,tno:String)
//sno: student number, cno: course number, degree: score
case class Score(sno:String,cno:String,degree:String)
//tprof: title, tdepart: teacher's department
case class Teacher(tno:String,tname:String,tsex:String,tbirthday:String,tprof:String,tdepart:String)
object SparkSQLExample {
def main(args: Array[String]): Unit = {
//Hadoop environment variable so the job runs without errors on Windows
//System.setProperty("hadoop.home.dir","E:\\hadoop-2.8.4")
//Reduce log output
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
//Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
/**
* Returns the current time formatted with the given pattern
*/
def getDate(time:String) = {
val now : Long = System.currentTimeMillis()
val df : SimpleDateFormat = new SimpleDateFormat(time)
df.format(now)
}
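//For example, getDate("yyyy") returns the current year as a String; it is used in the age query at the end of main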
//Spark session
val spark = SparkSession.builder()
.master("local")
.appName("SparkSQLExample")
.getOrCreate()
//Read the data files and register them as temp views
import spark.implicits._
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Student.csv")
.map(_.split(","))
.map(x => Student(x(0),x(1),x(2),x(3),x(4)) )
.toDF()
.createOrReplaceTempView("Student")
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Course.csv")
.map(_.split(","))
.map(x => Course(x(0),x(1),x(2)))
.toDF()
.createOrReplaceTempView("Course")
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Score.csv")
.map(_.split(","))
.map(x => Score(x(0),x(1),x(2)))
.toDF()
.createOrReplaceTempView("Score")
spark.sparkContext
.textFile("E:\\test\\tmp_files\\spark_sql_test_data\\Teacher.csv")
.map(_.split(","))
.map(x => Teacher(x(0),x(1),x(2),x(3),x(4),x(5)))
.toDF()
.createOrReplaceTempView("Teacher")
//spark.sql("select * from Teacher").show()
//Query the sname, ssex and sclass columns for all records in the Student table
//spark.sql("select sname,ssex,sclass from Student").show()
//Query distinct tdepart values in the Teacher table
//spark.sql("select distinct tdepart from Teacher").show(false)
//spark.sql("select tdepart from Teacher group by tdepart").show(false)
//Query all Score records with degree between 60 and 80
//spark.sql("select * from score where degree >=60 and degree <=80").show()
//spark.sql("select * from score where degree between 60 and 80").show()
//Query Score records with degree 85, 86, or 88
//spark.sql("select * from score where degree = '85' or degree = '86' OR degree = '88'").show()
//Query students ordered by sclass, descending and then ascending
//spark.sql("select * from student order by sclass desc").show()
//spark.sql("select * from student order by sclass").show()
//Query Score ordered by cno ascending and degree descending
//spark.sql("select * from score t order by t.cno asc, t.degree desc").show()
//Query the sno and cno of the student with the highest score in the Score table
//spark.sql("select * from score order by cast(degree as int) desc limit 1").show()
//spark.sql("select * from score order by cast(degree as int) desc").show()
//Query the average degree of each course; numeric aggregate functions cast the string column automatically
//spark.sql("select cno, avg(degree) from score group by cno").show()
//Query the average degree of courses taken by at least 5 students and whose course number starts with '3'
//spark.sql("select cno,avg(degree) from score where cno like '3%' group by cno having count(cno) >= 5").show()
//Query sname, cname and degree for all students
// spark.sql("select s.sname, c.cname, t.degree from score t " +
// "join student s on t.sno = s.sno " +
// "join course c on c.cno = t.cno").show(false)
//spark.sql("select s.sname, c.cname, t.degree from score t, student s, course c where t.sno = s.sno and c.cno = t.cno").show(false)
//Query Score records of students enrolled in more than one course whose degree is not the overall highest score
// spark.sql("select * from score where " +
// "sno in (select sno from score t group by t.sno having count(cno) > 1) " +
// "and degree != (select max(degree) from score)").show()
//Query sno, sname and sbirthday of students born in the same year as student 108
// spark.sql("select sno, sname, sbirthday from student where substring(sbirthday,0,4) = " +
// "(select substring(sbirthday,0,4) from student where sno = '108')").show()
//Query the names of teachers whose course is taken by more than 5 students
// spark.sql("select tname from teacher t, " +
// "(select cno from score group by cno having count(cno) > 5) s, " +
// "course c where " +
// "t.tno = c.tno and c.cno = s.cno").show()
// spark.sql("select tname from teacher t " +
// "join course c on t.tno = c.tno " +
// "join (select cno from score group by cno having count(cno) > 5) s on s.cno = c.cno").show()
//Query Score records with a degree below the average for that course
//spark.sql("select * from score s where s.degree < (select avg(degree) from score c where c.cno = s.cno)").show()
//Query tname and tdepart of teachers who have no course with recorded scores
//spark.sql("select tname, tdepart from teacher t where t.tno not in (select tno from course c where c.cno in (select cno from score))").show(false)
//Query class numbers that have at least two male students
//spark.sql("select sclass from student t where ssex = 'male' group by sclass having count(ssex) >= 2").show()
//Query Student records whose sname does not start with 'Wang'
//spark.sql("select * from student t where sname not like 'Wang%'").show()
//Query each student's name and age in the Student table
//spark.sql("select sname, (cast(" + getDate("yyyy") + " as int) - cast(substring(sbirthday,0,4) as int)) as age from student s").show()
spark.close()
}
}