Scenario 1: joining a large table with a small table

Broadcast the small table and cache it in memory, and add distribute by rand() to the large table so its rows are spread evenly across partitions.
Then pass one extra conf to spark-submit: spark.sql.autoBroadcastJoinThreshold=200000000. This setting caps, in bytes, the size of a table eligible for broadcasting; any small table under this value (here roughly 200 MB) that has also been cached with CACHE TABLE will use a broadcast hash join when joined.
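A minimal sketch of this setup, assuming hypothetical tables big_db.big_table and small_db.small_table with a join key id and a column name (the threshold can be set in code, as below, or via --conf on spark-submit):

import org.apache.spark.sql.SparkSession

object BroadcastJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("BroadcastJoinSketch")
      // Same effect as --conf spark.sql.autoBroadcastJoinThreshold=200000000 on spark-submit
      .config("spark.sql.autoBroadcastJoinThreshold", "200000000")
      .enableHiveSupport()
      .getOrCreate()

    // Cache the small table so Spark knows its exact in-memory size
    spark.sql("cache table small_db.small_table")

    // distribute by rand() redistributes the big table's rows evenly before the join;
    // with the cached small table under the threshold, Spark picks a broadcast hash join
    val sql =
      s"""
         |select b.id, s.name
         |from (select * from big_db.big_table distribute by rand()) b
         |join small_db.small_table s
         |  on b.id = s.id
       """.stripMargin
    spark.sql(sql).show()

    spark.stop()
  }
}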
---------------------
Scenario 2: data skew in groupBy / orderBy

Here the only options are to adjust the business logic or to drop down to the RDD API; at the Spark SQL level there is currently no remedy other than UDFs, and converting to an RDD is one choice. The UDF approach is a two-stage aggregation: add a random prefix to the skewed key, aggregate, strip the prefix, then aggregate again. The code is as follows:
import java.util.Random

import org.apache.spark.sql.SparkSession

object TestUDF {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TestUDF")
      .enableHiveSupport()
      .getOrCreate()

    // Register the two UDFs used below: random_prefix prepends a random number
    // in [0, num), e.g. "apple" -> "3_apple"; remove_random_prefix strips it off.
    // The value argument is typed String so the UDF works for any column that
    // Spark can cast to a string.
    spark.udf.register("random_prefix", (value: String, num: Int) => randomPrefixUDF(value, num))
    spark.udf.register("remove_random_prefix", (value: String) => removeRandomPrefixUDF(value))
    // Step 1: add a random prefix to the skewed group key
    val sql1 =
      s"""
         |select
         |  random_prefix(name, 6) product,
         |  id
         |from
         |  ggg.test
       """.stripMargin
    // Step 2: group by the prefixed key to compute partial sums
    val sql2 =
      s"""
         |select
         |  product,
         |  sum(id) click
         |from
         |  (
         |    select
         |      random_prefix(name, 6) product,
         |      id
         |    from
         |      ggg.test
         |  ) t1
         |group by
         |  product
       """.stripMargin
    // Step 3: strip the random prefix from the partial results
    val sql3 =
      s"""
         |select
         |  remove_random_prefix(product) product,
         |  click
         |from
         |  (
         |    select
         |      product,
         |      sum(id) click
         |    from
         |      (
         |        select
         |          random_prefix(name, 6) product,
         |          id
         |        from
         |          ggg.test
         |      ) t1
         |    group by
         |      product
         |  ) t2
       """.stripMargin
    // Step 4: group by the original key and merge the partial sums
    val sql4 =
      s"""
         |select
         |  product,
         |  sum(click) click
         |from
         |  (
         |    select
         |      remove_random_prefix(product) product,
         |      click
         |    from
         |      (
         |        select
         |          product,
         |          sum(id) click
         |        from
         |          (
         |            select
         |              random_prefix(name, 6) product,
         |              id
         |            from
         |              ggg.test
         |          ) t1
         |        group by
         |          product
         |      ) t2
         |  ) t3
         |group by
         |  product
       """.stripMargin
    // sql1, sql2, and sql3 show the intermediate steps; sql4 runs the full two-stage aggregation.
    // spark.sql(sql1).show()
    // spark.sql(sql2).show()
    // spark.sql(sql3).show()
    spark.sql(sql4).show()

    spark.stop()
  }
  // Prepends a random prefix in [0, num), e.g. "apple" -> "3_apple"
  def randomPrefixUDF(value: String, num: Int): String = {
    s"${new Random().nextInt(num)}_$value"
  }

  // Strips everything up to and including the first "_"; the limit of 2
  // preserves any underscores in the original value
  def removeRandomPrefixUDF(value: String): String = {
    value.split("_", 2)(1)
  }
}
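To make the flow concrete with an assumed skewed value: if most rows in ggg.test share name = "apple", random_prefix(name, 6) scatters them across up to six sub-keys (0_apple through 5_apple), so the group by in t1 computes six partial sums on different reducers instead of piling every row onto one; remove_random_prefix then maps the sub-keys back to "apple", and the outer group by in t3 merges the partial sums into the final total.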