// Single-row query
var userDF = List((1, "张三", true, 18, 15000, 1))
.toDF("id", "name", "sex", "age", "salary", "dept")
userDF.createTempView("t_employee")
val sql = "select * from t_employee where name = '张三'"
spark.sql(sql)
.show()
+---+----+----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+----+---+------+----+
| 1|张三|true| 18| 15000| 1|
+---+----+----+---+------+----+
var userDF= List((1,"张三",true,18,15000,1))
.toDF("id","name","sex","age","salary","dept")
userDF.createTempView("t_employee")
val sql="select * from t_employee where name like '%三%'"
spark.sql(sql)
.show()
+---+----+----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+----+---+------+----+
| 1|张三|true| 18| 15000| 1|
+---+----+----+---+------+----+
var userDF = List((1, "张三", true, 18, 15000, 1), (2, "ls", false, 18, 12000, 1))
.toDF("id", "name", "sex", "age", "salary", "dept")
// register the view
userDF.createTempView("t_employee")
val sql =
"""
|select * from t_employee where salary > 10000 order by salary desc
"""
.stripMargin
spark.sql(sql)
.show()
+---+----+-----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+-----+---+------+----+
| 1|张三| true| 18| 15000| 1|
| 2|李四|false| 18| 12000| 1|
+---+----+-----+---+------+----+
var userDF= List( (1,"张三",true,18,15000,1),
(2,"李四",false,18,12000,1),
(3,"王五",false,18,16000,2)
) .toDF("id","name","sex","age","salary","dept")
// register the view
userDF.createTempView("t_employee")
val sql=
"""
|select * from t_employee where salary > 10000 order by salary desc limit 2
""".stripMargin
spark.sql(sql)
.show()
+---+----+-----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+-----+---+------+----+
| 3|王五|false| 18| 16000| 2|
| 1|张三| true| 18| 15000| 1|
+---+----+-----+---+------+----+
var userDF= List( (1,"张三",true,18,15000,1),
(2,"李四",false,18,12000,1),
(3,"王五",false,18,16000,2)
) .toDF("id","name","sex","age","salary","dept")
// register the view
userDF.createTempView("t_employee")
val sql=
"""
|select dept, avg(salary) as avg_salary from t_employee
|group by dept order by avg_salary desc
""".stripMargin
spark.sql(sql)
.show()
+----+----------+
|dept|avg_salary|
+----+----------+
|   2|   16000.0|
|   1|   13500.0|
+----+----------+
var userDF= List( (1,"张三",true,18,15000,1),
(2,"李四",false,18,12000,1),
(3,"王五",false,18,16000,2)
) .toDF("id","name","sex","age","salary","dept")
// register the view
userDF.createTempView("t_employee")
val sql=
"""
| select dept, avg(salary) as avg_salary
| from t_employee group by dept
| having avg_salary > 13500
| order by avg_salary desc
""".stripMargin
spark.sql(sql)
.show()
+----+----------+
|dept|avg_salary|
+----+----------+
|   2|   16000.0|
+----+----------+
var userDF= List( (1,"张三",true,18,15000,1),
(2,"李四",false,18,12000,1),
(3,"王五",false,18,16000,2)
) .toDF("id","name","sex","age","salary","dept")
// register the view
userDF.createTempView("t_employee")
val sql=
"""
|select id,name,case sex when true then '男' else '女' end as sex_alias
|from t_employee
""".stripMargin
spark.sql(sql)
.show()
+---+----+---------+
| id|name|sex_alias|
+---+----+---------+
| 1|张三| 男|
| 2|李四| 女|
| 3|王五| 女|
+---+----+---------+
// Pivot: rows to columns
var scoreDF = List(
(1, "语文", 100),
(1, "数学", 100),
(1, "英语", 100),
(2, "数学", 79),
(2, "语文", 80),
(2, "英语", 100)
).toDF("id", "course", "score")
scoreDF.createOrReplaceTempView("t_course")
val sql =
"""
| select id,
| max(case course when '数学' then score else 0 end) as math,
| max(case course when '英语' then score else 0 end) as english,
| max(case course when '语文' then score else 0 end) as chinese
| from t_course group by id
""".stripMargin
spark.sql(sql)
.show()
+---+----+-------+-------+
| id|math|english|chinese|
+---+----+-------+-------+
| 1| 100| 100| 100|
| 2| 79| 100| 80|
+---+----+-------+-------+
var scoreDF = List(
(1, "语文", 100),
(1, "数学", 100),
(1, "英语", 100),
(2, "数学", 79),
(2, "语文", 80),
(2, "英语", 100)
).toDF("id", "course", "score")
scoreDF.createOrReplaceTempView("t_course")
val sql =
"""
|select *
|from t_course
|pivot(max(score) for course in ('数学','语文','英语'))
""".stripMargin
spark.sql(sql)
.show()
+---+----+----+----+
| id|数学|语文|英语|
+---+----+----+----+
| 1| 100| 100| 100|
| 2| 79| 80| 100|
+---+----+----+----+
When writing this PIVOT query, every column other than the aggregated column and the pivot column (the one whose values become the output column names) is implicitly used as a GROUP BY key.
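A minimal sketch of that rule, reusing the t_course view above: projecting only the columns you need in a subquery before PIVOT is a simple way to control which columns become the implicit grouping keys.
val pivotSql =
  """
    |select * from (select id, course, score from t_course)
    |pivot(max(score) for course in ('数学','语文','英语'))
  """.stripMargin
spark.sql(pivotSql).show()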
// CUBE aggregation
val frame = List(
(110, 50, 80, 80),
(120, 60, 95, 75),
(120, 50, 96, 70)
) .toDF("height", "weight", "uiq", "ueq")
frame.createTempView("t_user")
val sql=
"""
|select height,weight,avg(uiq),avg(ueq)
|from t_user
|group by cube(height,weight)
""".stripMargin
spark.sql(sql)
.show()
+------+------+-----------------+--------+
|height|weight| avg(uiq)|avg(ueq)|
+------+------+-----------------+--------+
| 110| 50| 80.0| 80.0|
| 120| null| 95.5| 72.5|
| 120| 60| 95.0| 75.0|
| null| 60| 95.0| 75.0| // average uiq and ueq over all rows with weight = 60
| null| null|90.33333333333333| 75.0| // average uiq and ueq over all rows
| 120| 50| 96.0| 70.0|
| 110| null| 80.0| 80.0|
| null| 50| 88.0| 75.0|
+------+------+-----------------+--------+
// join
val userCategoryCostDF = List(
(1,"电脑配件",100),
(1,"母婴用品",100),
(1,"生活用品",100),
(2,"居家美食",79),
(2,"消费电子",80),
(2,"生活用品",100)
).toDF("uid","category","cost")
val usersDF= List(
(1,"张晓三",true,18,15000),
(2,"李晓四",true,18,18000),
(3,"王晓五",false,18,10000)
).toDF("id","name","sex","age","salary")
usersDF.createTempView("t_user")
userCategoryCostDF.createTempView("t_user_cost")
val sql =
"""
|select u.*,o.*
|from t_user u
|left join t_user_cost o
|on u.id=o.uid
|where uid is not null
""".stripMargin
spark.sql(sql)
.show()
+---+------+----+---+------+---+--------+----+
| id| name| sex|age|salary|uid|category|cost|
+---+------+----+---+------+---+--------+----+
| 1|张晓三|true| 18| 15000| 1|电脑配件| 100|
| 1|张晓三|true| 18| 15000| 1|母婴用品| 100|
| 1|张晓三|true| 18| 15000| 1|生活用品| 100|
| 2|李晓四|true| 18| 18000| 2|居家美食| 79|
| 2|李晓四|true| 18| 18000| 2|消费电子| 80|
| 2|李晓四|true| 18| 18000| 2|生活用品| 100|
+---+------+----+---+------+---+--------+----+
// Subquery
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.createTempView("t_employee")
val sql=
"""
|select id,name,salary,dept,
|(select avg(salary) from t_employee t2 where t1.dept=t2.dept) as avg_salary
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+------------------+
| id|name|salary|dept| avg_salary|
+---+----+------+----+------------------+
| 2| ls| 18000| 2| 16000.0|
| 3| ww| 14000| 2| 16000.0|
| 5|win7| 16000| 1|16333.333333333334|
| 1| zs| 15000| 1|16333.333333333334|
| 4| zl| 18000| 1|16333.333333333334|
+---+----+------+----+------------------+
Spark SQL does not allow non-equi join conditions inside a correlated subquery (MySQL and Oracle do support them).
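A hedged workaround sketch, assuming the t_employee view above: a comparison that would need a non-equi correlated subquery (for example, how many colleagues in the same department earn more than the current employee) can instead be written as an explicit self-join, where non-equi conditions are allowed.
val selfJoinSql =
  """
    |select t1.id, t1.name, t1.salary, t1.dept, count(t2.id) as higher_cnt
    |from t_employee t1
    |left join t_employee t2
    |on t1.dept = t2.dept and t2.salary > t1.salary
    |group by t1.id, t1.name, t1.salary, t1.dept
    |order by t1.dept desc
  """.stripMargin
spark.sql(selfJoinSql).show()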
In everyday statistical analysis, aggregate functions are the usual tool; their defining trait is that they collapse n rows into a single row. Databases also offer another kind of analysis, window statistics: a window function gives every row access to a whole window of related rows and computes a value per row without collapsing them. If each record returned by a query is one floor of a tall building, a window function opens a window on every floor so that each floor can see the whole building, or part of it.
// Window functions
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.createTempView("t_employee")
val sql=
"""
|select id,name,salary,dept,
|count(id) over(partition by dept order by salary desc) as rank,
|(count(id) over(partition by dept order by salary desc rows between current row and unbounded following) - 1) as low_than_me,
|avg(salary) over(partition by dept rows between unbounded preceding and unbounded following) as avg_salary,
|avg(salary) over() as all_avg_salary
|from t_employee t1 order by dept desc
""".stripMargin
spark.sql(sql)
.show()
spark.stop()
+---+----+------+----+----+-----------+------------------+--------------+
| id|name|salary|dept|rank|low_than_me| avg_salary|all_avg_salary|
+---+----+------+----+----+-----------+------------------+--------------+
| 2| ls| 18000| 2| 1| 1| 16000.0| 16200.0|
| 3| ww| 14000| 2| 2| 0| 16000.0| 16200.0|
| 4| zl| 18000| 1| 1| 2|16333.333333333334| 16200.0|
| 5|win7| 16000| 1| 2| 1|16333.333333333334| 16200.0|
| 1| zs| 15000| 1| 3| 0|16333.333333333334| 16200.0|
+---+----+------+----+----+-----------+------------------+--------------+
select id,name,salary,dept,
-- per department, ordered by salary descending: running count of ids from the first row of the partition to the current row (which is why it acts as a rank)
count(id) over(partition by dept order by salary desc) as rank,
-- per department, ordered by salary descending: count of ids from the current row to the last row, minus 1 (how many rows rank below the current one)
(count(id) over(partition by dept order by salary desc rows between current row and unbounded following) - 1) as low_than_me,
-- per department, average salary over the whole partition (first row to last row), e.g. dept 2 -> 16000.0, dept 1 -> 16333.333333333334
avg(salary) over(partition by dept rows between unbounded preceding and unbounded following) as avg_salary,
-- average salary over all employees
avg(salary) over() as all_avg_salary
from t_employee t1 order by dept desc
aggregate_function(column) over ( [ [partition by column] order by column [asc|desc] [rows between frame_start_offset and frame_end_offset] ] )
preceding: extends the frame to the N rows before the current row (within the partition); use unbounded preceding to start from the first row of the partition. N is the backward offset from the current row.
following: the opposite of preceding; extends the frame to the N rows after the current row (within the partition); use unbounded following to reach the last row of the partition. N is the forward offset from the current row.
current row: as the name suggests, the current row itself, i.e. an offset of 0.
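A small sketch with a numeric frame offset, assuming the t_employee view used above: add up each row's salary together with the salary of the row immediately above it within the same department.
val frameSql =
  """
    |select id, name, dept, salary,
    |sum(salary) over(partition by dept order by salary desc
    |  rows between 1 preceding and current row) as salary_plus_prev
    |from t_employee
  """.stripMargin
spark.sql(frameSql).show()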
ROW_NUMBER reports the row number of the current record within its window partition.
// ROW_NUMBER
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.createTempView("t_employee")
val sql=
"""
|select id,name,salary,dept,
|ROW_NUMBER() over(partition by dept order by salary desc) as rank
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+----+
| id|name|salary|dept|rank|
+---+----+------+----+----+
| 2| ls| 18000| 2| 1|
| 3| ww| 14000| 2| 2|
| 4| zl| 18000| 1| 1|
| 5|win7| 16000| 1| 2|
| 1| zs| 15000| 1| 3|
+---+----+------+----+----+
If several employees in a department share the same salary, ROW_NUMBER still only reflects each record's position within the window: tied rows get different numbers.
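A hedged sketch (same t_employee view, alias names are illustrative): a typical use of ROW_NUMBER is keeping exactly one row per group, for example the highest-paid employee of each department; when salaries tie, ROW_NUMBER simply breaks the tie by window position.
val topSql =
  """
    |select id, name, salary, dept from (
    |  select *, ROW_NUMBER() over(partition by dept order by salary desc) as rn
    |  from t_employee
    |) t
    |where rn = 1
  """.stripMargin
spark.sql(topSql).show()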
// RANK
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(6,"zl1",true,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.createTempView("t_employee")
val sql=
"""
|select id,name,salary,dept,
|RANK() over(partition by dept order by salary desc) as rank
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+----+
| id|name|salary|dept|rank|
+---+----+------+----+----+
| 2| ls| 18000| 2| 1|
| 3| ww| 14000| 2| 2|
| 4| zl| 18000| 1| 1|
| 6| zl1| 18000| 1| 1|
| 5|win7| 16000| 1| 3| // two rows tie at rank 1, so this row is ranked 3; the rank sequence is not consecutive
| 1| zs| 15000| 1| 4|
+---+----+------+----+----+
Compared with ROW_NUMBER, the rankings produced by RANK are not consecutive when ties occur.
// DENSE_RANK (dense ranking)
var df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.createTempView("t_employee")
val sql =
"""
|select id,name,salary,dept,
|DENSE_RANK() over(partition by dept order by salary desc) as rank
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+----+
| id|name|salary|dept|rank|
+---+----+------+----+----+
| 3| ww| 14000| 2| 2|
| 2| ls| 18000| 2| 1|
| 4| zl| 18000| 1| 1|
| 6| zl1| 18000| 1| 1|
| 1| zs| 15000| 1| 3|
| 5|win7| 16000| 1| 2|
+---+----+------+----+----+
// Custom scalar function (single-row UDF)
var df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.createTempView("t_employee")
spark.udf
.register("convertSex", (sex: Boolean) => {
sex match {
case true => "男"
case false => "女"
}
})
val sql =
"""
|select id,name,convertSex(sex) as usex
|from t_employee
""".stripMargin
spark.sql(sql)
.show()
+---+----+----+
| id|name|usex|
+---+----+----+
| 1| zs| 男|
| 2| ls| 女|
| 3| ww| 女|
| 4| zl| 女|
| 6| zl1| 男|
| 5|win7| 女|
+---+----+----+
To define a custom untyped aggregate function, you only need to write a class that extends UserDefinedAggregateFunction.
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{
MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{
DataType, DoubleType, StructType}
class CustomSum extends UserDefinedAggregateFunction {
//1. Schema of the input arguments (the field name itself does not matter)
override def inputSchema: StructType = {
new StructType().add("salary", DoubleType)
}
//2. Schema of the intermediate aggregation buffer
override def bufferSchema: StructType = {
new StructType().add("taotalsalary", DoubleType)
}
//3. Data type of the final result
override def dataType: DataType = DoubleType
//4. Whether the function is deterministic (the same input always yields the same result)
override def deterministic: Boolean = true
//5. Initialize the aggregation buffer
override def initialize(buffer: MutableAggregationBuffer): Unit = {
// the buffer element at position 0 starts at 0.0
buffer.update(0, 0.0)
}
//6. Add the incoming value into the buffer
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
val history = buffer.getAs[Double](0)
val current = input.getAs[Double](0)
buffer.update(0, history + current)
}
//7. Merge the partial result from buffer2 into buffer1
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
val result = buffer1.getAs[Double](0) + buffer2.getAs[Double](0)
buffer1.update(0, result)
}
//8. Return the final result
override def evaluate(buffer: Row): Any = {
buffer.getAs[Double](0)
}
}
// Custom aggregate function (untyped)
var df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.createTempView("t_employee")
spark.udf
.register("customSum", new CustomSum)
val sql =
"""
|select dept,customSum(salary)
|from t_employee
|group by dept
""".stripMargin
spark.sql(sql)
.show()
+----+---------------------------------+
|dept|customsum(CAST(salary AS DOUBLE))|
+----+---------------------------------+
| 1| 67000.0|
| 2| 32000.0|
+----+---------------------------------+
Parquet is a columnar storage format designed for analytical workloads. It was developed jointly by Twitter and Cloudera and graduated from the Apache Incubator in May 2015 to become an Apache top-level project.
http://parquet.apache.org/
// Parquet
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(6,"zl1",true,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.write
.format("parquet")
.save("file:Users/mashikang/IdeaProjects/spark_sql/src/main/resources/parquet")
spark.read
.parquet("file:Users/mashikang/IdeaProjects/spark_sql/src/main/resources/parquet")
.show()
+---+----+-----+----+------+
| id|name| sex|dept|salary|
+---+----+-----+----+------+
| 5|win7|false| 1| 16000|
| 6| zl1| true| 1| 18000|
| 4| zl|false| 1| 18000|
| 3| ww|false| 2| 14000|
| 1| zs| true| 1| 15000|
| 2| ls|false| 2| 18000|
+---+----+-----+----+------+
// json
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(6,"zl1",true,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.write
.format("json")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/json")
spark.read
.json("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/json")
.show()
+----+---+----+------+-----+
|dept| id|name|salary| sex|
+----+---+----+------+-----+
| 1| 5|win7| 16000|false|
| 2| 3| ww| 14000|false|
| 1| 4| zl| 18000|false|
| 2| 2| ls| 18000|false|
| 1| 6| zl1| 18000| true|
| 1| 1| zs| 15000| true|
+----+---+----+------+-----+
// ORC
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(6,"zl1",true,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.write
.format("orc")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/orc")
spark.read
.orc("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/orc")
.show()
+---+----+-----+----+------+
| id|name| sex|dept|salary|
+---+----+-----+----+------+
| 5|win7|false| 1| 16000|
| 4| zl|false| 1| 18000|
| 3| ww|false| 2| 14000|
| 6| zl1| true| 1| 18000|
| 1| zs| true| 1| 15000|
| 2| ls|false| 2| 18000|
+---+----+-----+----+------+
// CSV
var df=List(
(1,"zs",true,1,15000),
(2,"ls",false,2,18000),
(3,"ww",false,2,14000),
(4,"zl",false,1,18000),
(6,"zl1",true,1,18000),
(5,"win7",false,1,16000)
).toDF("id","name","sex","dept","salary")
df.write
.format("csv")
.option("sep", ",")
.option("inferSchema", "true")
.option("header", "true")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/csv")
spark.read
.option("sep", ",")
.option("inferSchema", "true")
.option("header", "true")
.csv("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/csv")
.show()
+---+----+-----+----+------+
| id|name| sex|dept|salary|
+---+----+-----+----+------+
| 5|win7|false| 1| 16000|
| 4| zl|false| 1| 18000|
| 3| ww|false| 2| 14000|
| 2| ls|false| 2| 18000|
| 6| zl1| true| 1| 18000|
| 1| zs| true| 1| 15000|
+---+----+-----+----+------+
// JDBC
val usersDF = List(
(1, "张晓三", 1, 15000),
(2, "李晓四", 1, 18000),
(3, "王晓五", 1, 10000)
).toDF("id", "name", "dept", "salary")
usersDF.write
.format("jdbc")
.mode(SaveMode.Overwrite)
.option("user", "root")
.option("password", "root")
.option("url", "jdbc:mysql://localhost:3306/test")
.option("dbtable", "t_user")
.save()
val props = new Properties()
props.put("user", "root")
props.put("password", "root")
spark.read
.jdbc("jdbc:mysql://localhost:3306/test", "t_user", props)
.show()
Or:
val usersDF = List(
(1, "张晓三", 1, 15000),
(2, "李晓四", 1, 18000),
(3, "王晓五", 1, 10000)
).toDF("id", "name", "dept", "salary")
usersDF.write
.format("jdbc")
.mode(SaveMode.Overwrite)
.option("user", "root")
.option("password", "root")
.option("url", "jdbc:mysql://localhost:3306/test")
.option("dbtable", "t_user")
.save()
spark.read.format("jdbc")
.option("user", "root")
.option("password", "root")
.option("url", "jdbc:mysql://localhost:3306/test")
.option("dbtable", "t_user")
.load()
.show()
val usersDF = List(
(1, "张晓三", 1, 15000.0),
(2, "李晓四", 1, 18000.0),
(3, "王晓五", 1, 10000.0)
).toDF("id", "name", "dept", "salary")
usersDF.rdd.foreachPartition(its => {
its.foreach(row => {
val id = row.getAs[Int]("id")
val name = row.getAs[String]("name")
val salary = row.getAs[Double]("salary")
println(s"$id,$name,$salary")
})
})
2,李晓四,18000.0
3,王晓五,10000.0
1,张晓三,15000.0
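A hedged sketch of the more common reason for foreachPartition: sharing one JDBC connection across all rows of a partition instead of opening one per row. The target table t_user_copy and the credentials here are illustrative assumptions.
usersDF.rdd.foreachPartition(its => {
  // one connection per partition, reused for every row in that partition
  val conn = java.sql.DriverManager.getConnection(
    "jdbc:mysql://localhost:3306/test", "root", "root")
  // t_user_copy is an assumed target table for illustration
  val ps = conn.prepareStatement(
    "insert into t_user_copy(id, name, salary) values(?, ?, ?)")
  its.foreach(row => {
    ps.setInt(1, row.getAs[Int]("id"))
    ps.setString(2, row.getAs[String]("name"))
    ps.setDouble(3, row.getAs[Double]("salary"))
    ps.executeUpdate()
  })
  ps.close()
  conn.close()
})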