The Optimizer's job is to take the resolved logical plan produced by the Analyzer and optimize it according to a set of optimization strategies grouped into Batches, rewriting both logical plan nodes (LogicalPlan) and expressions (Expression); it is the step that precedes conversion into a physical plan. It works the same way as the Analyzer: each Batch applies its Rule[LogicalPlan] instances to the tree.
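The driving loop lives in RuleExecutor, which Optimizer extends. A simplified sketch of that loop (not the exact source; tracing and error handling are omitted):
def execute(plan: LogicalPlan): LogicalPlan = {
  var curPlan = plan
  batches.foreach { batch =>
    var iteration = 1
    var lastPlan = curPlan
    var continue = true
    while (continue) {
      // Apply every Rule[LogicalPlan] of the batch, in order.
      curPlan = batch.rules.foldLeft(curPlan) { case (p, rule) => rule(p) }
      // FixedPoint(100): stop once the plan no longer changes, or after
      // the batch's maximum number of iterations.
      continue = !curPlan.fastEquals(lastPlan) &&
        iteration < batch.strategy.maxIterations
      lastPlan = curPlan
      iteration += 1
    }
  }
  curPlan
}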
object DefaultOptimizer extends Optimizer {
val batches =
// SubQueries are only needed for analysis and can be removed before execution.
Batch("Remove SubQueries", FixedPoint(100),
EliminateSubQueries) ::
Batch("Operator Reordering", FixedPoint(100),
UnionPushdown,
//Recursively merges two adjacent Filter conditions into one
CombineFilters,
//Rewrites a filter over a projection into a projection over a filter (filter
//first, then project), replacing any alias attributes used in the condition.
//Typical example: select * from (select a,b from table) where a=1
//is rewritten as select * from (select a,b from table where a=1)
PushPredicateThroughProject,
//e.g. select a,b from x join y on x.id = y.id and x.a > 0 and y.b > 0 can be rewritten as
//select a,b from (select * from x where x.a > 0) join (select * from y where y.b > 0) on x.id = y.id
PushPredicateThroughJoin,
PushPredicateThroughGenerate,
//Prunes columns that are not used
ColumnPruning,
ProjectCollapsing,
//Recursively merges two adjacent Limits, e.g.
/*
* select * from (select * from c_picrecord limit 100)a limit 10
* becomes: Limit if ((100 < 10)) 100 else 10
* */
CombineLimits) ::
Batch("ConstantFolding", FixedPoint(100),//常量合并
// 替换null值
// NullPropagation是一个能将Expression Expressions替换为等价的Literal值的优化,并且能够避免NULL值在SQL语法树的传播。
NullPropagation,
OptimizeIn,
//替换一些简单的常量表达式,比如 1 in (1,2) 直接返回一个true就可以了
//常量合并是属于Expression优化的一种,对于可以直接计算的常量,不用放到物理执行里去生成对象来计算了,直接可以在计划里就计算出来:
/*
* plan->
Project [(((1 + 2) + 3) + 4) AS c0#46]
Relation[id#0L,dev_id#1,...,compare_speed#45] org.apache.spark.sql.parquet.ParquetRelation2@6813b117
result->
Project [10 AS c0#46]
Relation[id#0L,dev_id#1,...,compare_speed#45] org.apache.spark.sql.parquet.ParquetRelation2@6813b117
rule->
org.apache.spark.sql.catalyst.optimizer.ConstantFolding$@5c99cab8
*
* */
ConstantFolding,
//Simplifies LIKE predicates so the pattern need not be evaluated as a full regex; currently handles '%demo%', '%demo', 'demo%' and 'demo'
LikeSimplification,
//Simplifies boolean expressions, e.g. true and score > 0 becomes score > 0
BooleanSimplification,
//Simplifies trivial filters: where 1=1 removes the Filter entirely, while where 1=2 means the query need not run at all
SimplifyFilters,
//Removes casts whose input is already of the target data type
SimplifyCasts,
//Simplifies nested case conversions, e.g. Upper(Upper('a')) becomes Upper('a')
SimplifyCaseConversionExpressions) ::
Batch("Decimal Optimizations", FixedPoint(100),
DecimalAggregates) ::
Batch("LocalRelation", FixedPoint(100),
ConvertToLocalRelation) :: Nil
}
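Every plan->/result-> log below can be reproduced without extra logging: in Spark 1.x a DataFrame exposes its QueryExecution, which holds the plan both before and after DefaultOptimizer runs. For example (table and column names follow the examples below):
val df = sqlContext.sql("SELECT id, dev_chnid FROM test")
println(df.queryExecution.analyzed)      // the resolved plan the Optimizer receives
println(df.queryExecution.optimizedPlan) // the plan after all batches have run
df.explain(true)                         // parsed/analyzed/optimized/physical plans in one go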
Next, let's walk through several representative Rule[LogicalPlan] implementations.
Eliminating subqueries (EliminateSubQueries)
object EliminateSubQueries extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case Subquery(_, child) => child
}
}
In other words, it simply removes the Subquery node and replaces it with its child.
For example, given the following SQL:
String sql = "SELECT id,dev_chnid,dev_chnname,car_num,car_speed,car_direct from test";
the log prints:
plan->
Project [id#0L,dev_chnid#26,dev_chnname#4,car_num#5,car_speed#8,car_direct#12]
Subquery test
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [id#0L,dev_chnid#26,dev_chnname#4,car_num#5,car_speed#8,car_direct#12]// the Subquery node has been removed
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.analysis.EliminateSubQueries$@7032a1b4
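Since a Rule[LogicalPlan] is simply a function from plan to plan, a single rule can also be applied by hand to an analyzed plan (a hedged sketch using the same test table):
import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
val analyzed = sqlContext.sql("SELECT id FROM test").queryExecution.analyzed
val noSubqueries = EliminateSubQueries(analyzed) // the Subquery wrapper is gone
println(noSubqueries)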
Pushing operators down to both sides of a Union (UnionPushdown)
object UnionPushdown extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
// Push down filter into union
case Filter(condition, u @ Union(left, right)) =>// push the Filter down to both sides of the Union
val rewrites = buildRewrites(u)
Union(
Filter(condition, left),
Filter(pushToRight(condition, rewrites), right))
// Push down projection into union
case Project(projectList, u @ Union(left, right)) =>// push the Project down to both sides of the Union
val rewrites = buildRewrites(u)
Union(
Project(projectList, left),
Project(projectList.map(pushToRight(_, rewrites)), right))
}
}
For example, given the following SQL:
String sql = "select * from (select id from test union all select id from test)aa";
the log prints:
plan->
Project [id#0L]
Union
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Union// the Project has been pushed down to both sides of the Union
Project [id#0L]
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Project [id#0L]
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.UnionPushdown$@1173be4f
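Note the buildRewrites/pushToRight helpers used above: the Union's output reuses the left child's attribute IDs, so before a condition or projection can be pushed to the right child, its attributes must be rewritten to the right child's IDs. A toy model of that rewrite in plain Scala (the attribute names are illustrative):
// Map each left-side attribute to the corresponding right-side attribute.
val rewrites = Map("id#0L" -> "id#43L")
def pushToRight(reference: String): String =
  rewrites.getOrElse(reference, reference)
println(pushToRight("id#0L")) // prints id#43L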
Merging two adjacent Filters (CombineFilters)
object CombineFilters extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case ff @ Filter(fc, nf @ Filter(nc, grandChild)) => Filter(And(nc, fc), grandChild)// merge the two Filters into one
}
}
For example, given the following SQL:
String sql = "select id from (select id from test where id >100)a where id > 80";
the log prints:
16-07-22 16:12:16,201 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project
Filter (id#0L > CAST(80, LongType))
Filter (id#0L > CAST(100, LongType))
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project
Filter ((id#0L > CAST(100, LongType)) && (id#0L > CAST(80, LongType)))
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.CombineFilters$@99569a7
Pushing a Filter below a Project (PushPredicateThroughProject): filter first, then project
object PushPredicateThroughProject extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case filter @ Filter(condition, project @ Project(fields, grandChild)) =>
val sourceAliases = fields.collect { case a @ Alias(c, _) =>
(a.toAttribute: Attribute) -> c
}.toMap
project.copy(child = filter.copy(// push the outer Filter below the Project
replaceAlias(condition, sourceAliases),
grandChild))
}
def replaceAlias(condition: Expression, sourceAliases: Map[Attribute, Expression]): Expression = {
condition transform {
case a: AttributeReference => sourceAliases.getOrElse(a, a)
}
}
}
For example, given the following SQL:
String sql = "select * from (select id,car_speed from test)aa where id=1";
the log prints:
16-07-22 16:28:16,850 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project [id#0L,car_speed#8]
Filter (id#0L = CAST(1, LongType))
Project [id#0L,car_speed#8]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [id#0L,car_speed#8]
Project [id#0L,car_speed#8]
Filter (id#0L = CAST(1, LongType))// the Filter now sits below the Project
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughProject$@7678c6b4
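If the outer condition refers to an alias defined by the Project, replaceAlias first substitutes the alias's underlying expression so the pushed-down condition stays valid below the Project. A hedged example against the same test table (the alias myid is made up for illustration):
// The condition myid > 0 is rewritten to id#0L > 0 before the Filter
// is pushed below the Project.
val df = sqlContext.sql(
  "select * from (select id as myid from test) t where myid > 0")
println(df.queryExecution.optimizedPlan)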
Pushing filters down to the left and right sides of a Join (PushPredicateThroughJoin)
object PushPredicateThroughJoin extends Rule[LogicalPlan] with PredicateHelper {
/**
* Splits join condition expressions into three categories based on the attributes required
* to evaluate them.
* @return (canEvaluateInLeft, canEvaluateInRight, haveToEvaluateInBoth)
*/
private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = {
val (leftEvaluateCondition, rest) =
condition.partition(_.references subsetOf left.outputSet)
val (rightEvaluateCondition, commonCondition) =
rest.partition(_.references subsetOf right.outputSet)
(leftEvaluateCondition, rightEvaluateCondition, commonCondition)
}
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
// push the where condition down into join filter
case f @ Join(left, right, joinType, joinCondition) =>
val (leftJoinConditions, rightJoinConditions, commonJoinCondition) =
split(joinCondition.map(splitConjunctivePredicates).getOrElse(Nil), left, right)
joinType match {
case _ @ (Inner | LeftSemi) =>
// push down the single side only join filter for both sides sub queries
val newLeft = leftJoinConditions.
reduceLeftOption(And).map(Filter(_, left)).getOrElse(left)
val newRight = rightJoinConditions.
reduceLeftOption(And).map(Filter(_, right)).getOrElse(right)
val newJoinCond = commonJoinCondition.reduceLeftOption(And)
Join(newLeft, newRight, joinType, newJoinCond)// single-side join conditions have been pushed down to the left and right children
……
}
}
}
For example, given the following SQL:
String sql = " select a.id , b.id from test a join test b on a.id = b.id and a.id > 0 and b.id > 0";
the log prints:
16-07-22 17:07:21,526 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project [id#0L,id#43L]
Join Inner, Some((((id#0L = id#43L) && (id#0L > CAST(0, LongType))) && (id#43L > CAST(0, LongType))))
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Relation[id#43L,dev_id#44,dev_chnnum#45L,dev_name#46,dev_chnname#47,car_num#48,car_numtype#49,car_numcolor#50,car_speed#51,car_type#52,car_color#53,car_length#54L,car_direct#55,car_way_code#56,cap_time#57L,cap_date#58L,inf_note#59,max_speed#60,min_speed#61,car_img_url#62,car_img1_url#63,car_img2_url#64,car_img3_url#65,car_img4_url#66,car_img5_url#67,rec_stat#68,dev_chnid#69,car_img_count#70,save_flag#71,dc_cleanflag#72,pic_id#73,car_img_plate_top#74L,car_img_plate_left#75L,car_img_plate_bottom#76L,car_img_plate_right#77L,car_brand#78L,issafetybelt#79,isvisor#80,bind_stat#81,car_num_pic#82,combined_pic_url#83,verify_memo#84,rec_stat_tmp#85] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Aggregate [COUNT(1) AS count#86L]
Project [id#0L,id#43L]
Join Inner, Some((id#0L = id#43L))
Filter (id#0L > CAST(0, LongType))// filter first, then join
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Filter (id#43L > CAST(0, LongType)) // filter first, then join
Relation[id#43L,dev_id#44,dev_chnnum#45L,dev_name#46,dev_chnname#47,car_num#48,car_numtype#49,car_numcolor#50,car_speed#51,car_type#52,car_color#53,car_length#54L,car_direct#55,car_way_code#56,cap_time#57L,cap_date#58L,inf_note#59,max_speed#60,min_speed#61,car_img_url#62,car_img1_url#63,car_img2_url#64,car_img3_url#65,car_img4_url#66,car_img5_url#67,rec_stat#68,dev_chnid#69,car_img_count#70,save_flag#71,dc_cleanflag#72,pic_id#73,car_img_plate_top#74L,car_img_plate_left#75L,car_img_plate_bottom#76L,car_img_plate_right#77L,car_brand#78L,issafetybelt#79,isvisor#80,bind_stat#81,car_num_pic#82,combined_pic_url#83,verify_memo#84,rec_stat_tmp#85] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughJoin$@6593bce2
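The split helper itself is just two partitions of the conjuncts by which side's output covers their references. A toy model with plain Scala sets (each predicate is represented by the set of attributes it references; the names are illustrative):
val leftOutput  = Set("a.id")
val rightOutput = Set("b.id")
val predicates = Seq(
  Set("a.id", "b.id"), // a.id = b.id -> must stay in the join condition
  Set("a.id"),         // a.id > 0   -> evaluable on the left child alone
  Set("b.id"))         // b.id > 0   -> evaluable on the right child alone
val (leftOnly, rest)    = predicates.partition(_.subsetOf(leftOutput))
val (rightOnly, common) = rest.partition(_.subsetOf(rightOutput))
// leftOnly covers a.id > 0, rightOnly covers b.id > 0, common keeps the equi-join key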
Pruning columns (ColumnPruning)
object ColumnPruning extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
// Eliminate attributes that are not needed to calculate the specified aggregates.
// select 1+1 as a1, dev_chnid from (select dev_chnid, id from test)a group by dev_chnid
case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty =>
a.copy(child = Project(a.references.toSeq, child))// the child outputs more columns than the aggregate references, so prune them
// select dev_chnid from (select dev_chnid,MAX(id) from test group by dev_chnid)aa
case p @ Project(projectList, a @ Aggregate(groupingExpressions, aggregateExpressions, child))
if (a.outputSet -- p.references).nonEmpty =>// keep only the aggregate expressions the Project needs
Project(
projectList,
Aggregate(
groupingExpressions,
aggregateExpressions.filter(e => p.references.contains(e)),
child))
// Eliminate unneeded attributes from either side of a Join.
//select ai,bi from (select a.id as ai,a.car_speed,b.id as bi,b.car_speed from test a join test b on a.id = b.id)aa
case Project(projectList, Join(left, right, joinType, condition)) =>// feed the join only the columns the Project or the condition needs
// Collect the list of all references required either above or to evaluate the condition.
val allReferences: AttributeSet =
AttributeSet(
projectList.flatMap(_.references.iterator)) ++
condition.map(_.references).getOrElse(AttributeSet(Seq.empty))
/** Applies a projection only when the child is producing unnecessary attributes */
def pruneJoinChild(c: LogicalPlan): LogicalPlan = prunedChild(c, allReferences)
Project(projectList, Join(pruneJoinChild(left), pruneJoinChild(right), joinType, condition))
// Eliminate unneeded attributes from right side of a LeftSemiJoin.
//select a.*,b.id from test a left join test b on a.id=b.id
case Join(left, right, LeftSemi, condition) =>// prune the right child
// Collect the list of all references required to evaluate the condition.
val allReferences: AttributeSet =
condition.map(_.references).getOrElse(AttributeSet(Seq.empty))
Join(left, prunedChild(right, allReferences), LeftSemi, condition)
//select id from (select * from test limit 10)aa
case Project(projectList, Limit(exp, child)) =>// swap Project and Limit (the Limit moves up) so adjacent Projects can be collapsed
Limit(exp, Project(projectList, child))
// push down project if possible when the child is sort
//select * from (select * from test order by id)aa
case p @ Project(projectList, s @ Sort(_, _, grandChild))// if the Sort only references the Project's output, move the Sort above the Project
if s.references.subsetOf(p.outputSet) =>
s.copy(child = Project(projectList, grandChild))
// Eliminate no-op Projects
case Project(projectList, child) if child.output == projectList => child// drop no-op Projects
}
For example, given the following SQL:
String sql = "SELECT 1+1 as a1, dev_chnid from (select dev_chnid, id from test)a group by dev_chnid";
the log prints:
16-07-23 13:56:17,721 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Aggregate [dev_chnid#26], [(1 + 1) AS a1#43,dev_chnid#26]
Project [dev_chnid#26,id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Aggregate [dev_chnid#26], [(1 + 1) AS a1#43,dev_chnid#26]
Project [dev_chnid#26]// only the needed column is projected
Project [dev_chnid#26,id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ColumnPruning$@58b18d61
Another example:
String sql = " select dev_chnid from (select dev_chnid,MAX(id) from test group by dev_chnid)aa";
the log prints:
16-07-23 14:26:37,430 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project [dev_chnid#26]
Aggregate [dev_chnid#26], [dev_chnid#26,MAX(id#0L) AS c1#46L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [dev_chnid#26]
Aggregate [dev_chnid#26], [dev_chnid#26]// only the columns the Project needs are aggregated
Project [dev_chnid#26]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ColumnPruning$@70b908c4
Another example:
String sql = "select ai,bi from (select a.id as ai,a.car_speed,b.id as bi,b.car_speed from test a join test b on a.id = b.id)aa";
the log prints:
16-07-23 14:51:05,757 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project [id#0L AS ai#49L,id#51L AS bi#50L]
Join Inner, Some((id#0L = id#51L))
Project [id#0L,car_speed#8]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Project [id#51L,car_speed#59]
Relation[id#51L,dev_id#52,dev_chnnum#53L,dev_name#54,dev_chnname#55,car_num#56,car_numtype#57,car_numcolor#58,car_speed#59,car_type#60,car_color#61,car_length#62L,car_direct#63,car_way_code#64,cap_time#65L,cap_date#66L,inf_note#67,max_speed#68,min_speed#69,car_img_url#70,car_img1_url#71,car_img2_url#72,car_img3_url#73,car_img4_url#74,car_img5_url#75,rec_stat#76,dev_chnid#77,car_img_count#78,save_flag#79,dc_cleanflag#80,pic_id#81,car_img_plate_top#82L,car_img_plate_left#83L,car_img_plate_bottom#84L,car_img_plate_right#85L,car_brand#86L,issafetybelt#87,isvisor#88,bind_stat#89,car_num_pic#90,combined_pic_url#91,verify_memo#92,rec_stat_tmp#93] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [id#0L AS ai#49L,id#51L AS bi#50L]
Join Inner, Some((id#0L = id#51L))
Project [id#0L] // only the needed column is projected
Project [id#0L,car_speed#8]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Project [id#51L] // only the needed column is projected
Project [id#51L,car_speed#59]
Relation[id#51L,dev_id#52,dev_chnnum#53L,dev_name#54,dev_chnname#55,car_num#56,car_numtype#57,car_numcolor#58,car_speed#59,car_type#60,car_color#61,car_length#62L,car_direct#63,car_way_code#64,cap_time#65L,cap_date#66L,inf_note#67,max_speed#68,min_speed#69,car_img_url#70,car_img1_url#71,car_img2_url#72,car_img3_url#73,car_img4_url#74,car_img5_url#75,rec_stat#76,dev_chnid#77,car_img_count#78,save_flag#79,dc_cleanflag#80,pic_id#81,car_img_plate_top#82L,car_img_plate_left#83L,car_img_plate_bottom#84L,car_img_plate_right#85L,car_brand#86L,issafetybelt#87,isvisor#88,bind_stat#89,car_num_pic#90,combined_pic_url#91,verify_memo#92,rec_stat_tmp#93] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ColumnPruning$@3b244fea
Another example:
String sql = " select a.*,b.id from test a left join test b on a.id=b.id";
the log prints:
16-07-23 15:13:23,208 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project [id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42,id#43L]
Join LeftOuter, Some((id#0L = id#43L))
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Relation[id#43L,dev_id#44,dev_chnnum#45L,dev_name#46,dev_chnname#47,car_num#48,car_numtype#49,car_numcolor#50,car_speed#51,car_type#52,car_color#53,car_length#54L,car_direct#55,car_way_code#56,cap_time#57L,cap_date#58L,inf_note#59,max_speed#60,min_speed#61,car_img_url#62,car_img1_url#63,car_img2_url#64,car_img3_url#65,car_img4_url#66,car_img5_url#67,rec_stat#68,dev_chnid#69,car_img_count#70,save_flag#71,dc_cleanflag#72,pic_id#73,car_img_plate_top#74L,car_img_plate_left#75L,car_img_plate_bottom#76L,car_img_plate_right#77L,car_brand#78L,issafetybelt#79,isvisor#80,bind_stat#81,car_num_pic#82,combined_pic_url#83,verify_memo#84,rec_stat_tmp#85] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42,id#43L]
Join LeftOuter, Some((id#0L = id#43L))
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
Project [id#43L] // the right child is pruned
Relation[id#43L,dev_id#44,dev_chnnum#45L,dev_name#46,dev_chnname#47,car_num#48,car_numtype#49,car_numcolor#50,car_speed#51,car_type#52,car_color#53,car_length#54L,car_direct#55,car_way_code#56,cap_time#57L,cap_date#58L,inf_note#59,max_speed#60,min_speed#61,car_img_url#62,car_img1_url#63,car_img2_url#64,car_img3_url#65,car_img4_url#66,car_img5_url#67,rec_stat#68,dev_chnid#69,car_img_count#70,save_flag#71,dc_cleanflag#72,pic_id#73,car_img_plate_top#74L,car_img_plate_left#75L,car_img_plate_bottom#76L,car_img_plate_right#77L,car_brand#78L,issafetybelt#79,isvisor#80,bind_stat#81,car_num_pic#82,combined_pic_url#83,verify_memo#84,rec_stat_tmp#85] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ColumnPruning$@354ebd72
Another example:
String sql = "select id from (select * from test limit 10)aa";
the log prints:
16-07-23 15:21:26,190 INFO org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer(Logging.scala:59) ##
plan->
Project [id#0L]
Limit 10
Project[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Limit 10// the Limit has moved above the Project
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ColumnPruning$@20a8022
Another example:
String sql = " select * from (select * from test order by id)aa";
the log prints:
plan->
Project [id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42]
Sort [id#0L ASC], true
Project [id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Sort [id#0L ASC], true// the Sort key id is part of the Project's output, so the Sort moves up
Project [id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ColumnPruning$@1aa7d923
Collapsing Projects (ProjectCollapsing): as the name suggests, merges two adjacent Projects into one
object ProjectCollapsing extends Rule[LogicalPlan] {
/** Returns true if any expression in projectList is non-deterministic. */
private def hasNondeterministic(projectList: Seq[NamedExpression]): Boolean = {
projectList.exists(expr => expr.find(!_.deterministic).isDefined)
}
def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
// We only collapse these two Projects if the child Project's expressions are all
// deterministic.
case Project(projectList1, Project(projectList2, child))
if !hasNondeterministic(projectList2) =>
// Create a map of Aliases to their values from the child projection.
// e.g., 'SELECT ... FROM (SELECT a + b AS c, d ...)' produces Map(c -> Alias(a + b, c)).
val aliasMap = AttributeMap(projectList2.collect {
case a @ Alias(e, _) => (a.toAttribute, a)
})
// Substitute any attributes that are produced by the child projection, so that we safely
// eliminate it.
// e.g., 'SELECT c + 1 FROM (SELECT a + b AS C ...' produces 'SELECT a + b + 1 ...'
// TODO: Fix TransformBase to avoid the cast below.
val substitutedProjection = projectList1.map(_.transform {
case a: Attribute if aliasMap.contains(a) => aliasMap(a)
}).asInstanceOf[Seq[NamedExpression]]
Project(substitutedProjection, child)// merge the two Projects into one
}
}
For example, given the following SQL:
String sql = "select id from (select id,dev_chnid from test)aa";
the log prints:
plan->
Project [id#0L]
Project [id#0L,dev_chnid#26]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [id#0L]// the two adjacent Projects are merged
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ProjectCollapsing$@6fe94a75
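The hasNondeterministic guard matters: if the inner Project contains a nondeterministic expression, substituting it into the outer expressions could change how many times it is evaluated. A hedged example (assuming the rand() function is available in this Spark version):
// r is nondeterministic, so these two Projects are NOT collapsed; inlining
// rand() into both r + 1 and r * 2 would draw two different random numbers.
val df = sqlContext.sql(
  "select r + 1, r * 2 from (select rand() as r from test) t")
println(df.queryExecution.optimizedPlan)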
Merging two adjacent Limits (CombineLimits)
object CombineLimits extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
// ll is the outer Limit with expression le; nl is its child Limit with expression ne
case ll @ Limit(le, nl @ Limit(ne, grandChild)) =>
Limit(If(LessThan(ne, le), ne, le), grandChild)// compare the two expressions: if ne is smaller the new limit is ne, otherwise le
}
}
For example, given the following SQL:
String sql = " select id from (select id,dev_chnid from test limit 100)aa limit 10";
the log prints:
plan->
Limit 10
Limit 100
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Limit if ((100 < 10)) 100 else 10// the two adjacent Limits are merged; the foldable If is later collapsed to 10 by ConstantFolding
Project [id#0L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.CombineLimits$@35fb4cf5
Replacing null expressions (NullPropagation)
object NullPropagation extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case q: LogicalPlan => q transformExpressionsUp {
case e @ Count(Literal(null, _)) => Cast(Literal(0L), e.dataType)// COUNT(null) is replaced by the constant 0
……
}
For example, given the following SQL:
String sql = " select COUNT(null) from test";
the log prints:
plan->
Aggregate [COUNT(null) AS c0#43L]
Project
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Aggregate [CAST(0, LongType) AS c0#43L]// COUNT(null) is folded to the constant 0
Project
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.NullPropagation$@44b3ac1f
Optimizing the In operator (OptimizeIn)
object OptimizeIn extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case q: LogicalPlan => q transformExpressionsDown {
case In(v, list) if !list.exists(!_.isInstanceOf[Literal]) =>
val hSet = list.map(e => e.eval(null))
InSet(v, HashSet() ++ hSet)
}
}
}
For example, given the following SQL:
String sql = "select * from test where id IN(1,2)";
the log prints:
plan->
Filter id#0L IN (1,2)
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Filter id#0L INSET (1,2)// IN is optimized to INSET
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.OptimizeIn$@33944f6b
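The payoff of InSet is the per-row membership test: In walks the candidate list linearly, while InSet does a hash lookup. A quick plain-Scala illustration of the difference:
import scala.collection.immutable.HashSet
val candidates = (1 to 100000).toList
val hSet = HashSet() ++ candidates // what OptimizeIn builds once per query
candidates.contains(99999)         // roughly what In does per row: an O(n) scan
hSet.contains(99999)               // roughly what InSet does per row: one hash lookup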
Folding constants (ConstantFolding)
object ConstantFolding extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {// first transform each plan node
case q: LogicalPlan => q transformExpressionsDown { // then transform each node's expressions top-down
// Skip redundant folding of literals. This rule is technically not necessary. Placing this
// here avoids running the next rule for Literal values, which would create a new Literal
// object and running eval unnecessarily.
case l: Literal => l
// Fold expressions that are foldable.
// [ret : 10 | e : (((1 + 2) + 3) + 4) | e.class : class org.apache.spark.sql.catalyst.expressions.Add | e.dataType : (((1 + 2) + 3) + 4).dataType]
// logInfo(s"[ret : $ret | e : $e | e.class : $eclass | e.dataType : $e.dataType]")
case e if e.foldable =>
Literal.create(e.eval(null), e.dataType)// call eval to compute the result
// Fold "literal in (item1, item2, ..., literal, ...)" into true directly.
case In(Literal(v, _), list) if list.exists {
case Literal(candidate, _) if candidate == v => true
case _ => false
} => Literal.create(true, BooleanType)
}
}
}
For example, given the following SQL:
String sql = " select 1+2+3+id from test";
the log prints:
plan->
Project [(CAST(((1 + 2) + 3), LongType) + id#0L) AS c0#43L]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [(6 + id#0L) AS c0#43L]// 1+2+3 is folded to 6
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.ConstantFolding$@4e8a45cd
Simplifying Like expressions (LikeSimplification)
object LikeSimplification extends Rule[LogicalPlan] {
// if guards below protect from escapes on trailing %.
// Cases like "something\%" are not optimized, but this does not affect correctness.
val startsWith = "([^_%]+)%".r
val endsWith = "%([^_%]+)".r
val contains = "%([^_%]+)%".r
val equalTo = "([^_%]*)".r
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
case Like(l, Literal(utf, StringType)) =>
utf.toString match {
case startsWith(pattern) if !pattern.endsWith("\\") =>
StartsWith(l, Literal(pattern))
case endsWith(pattern) =>
EndsWith(l, Literal(pattern))
case contains(pattern) if !pattern.endsWith("\\") =>
Contains(l, Literal(pattern))
case equalTo(pattern) =>
EqualTo(l, Literal(pattern))
case _ =>
Like(l, Literal.create(utf, StringType))
}
}
}
For example, given the following SQL:
String sql = " select * from test where car_num LIKE 'N%'";
the log prints:
plan->
Filter (car_num#5 LIKE N%)
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Filter StartsWith(car_num#5, N)// the pattern 'N%' is converted to StartsWith
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.LikeSimplification$@37638d4f
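The four regexes can be tried standalone to see which LIKE patterns each case captures (a simplified demo; the escape-handling guards on trailing backslashes are omitted):
val startsWith = "([^_%]+)%".r
val endsWith   = "%([^_%]+)".r
val contains   = "%([^_%]+)%".r
val equalTo    = "([^_%]*)".r
def classify(pattern: String): String = pattern match {
  case startsWith(p) => s"StartsWith($p)"
  case endsWith(p)   => s"EndsWith($p)"
  case contains(p)   => s"Contains($p)"
  case equalTo(p)    => s"EqualTo($p)"
  case _             => s"Like($pattern)" // '_' or an inner '%' stays a real Like
}
println(classify("N%"))     // StartsWith(N)
println(classify("%demo%")) // Contains(demo)
println(classify("de_mo"))  // Like(de_mo)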
Simplifying Boolean expressions (BooleanSimplification): if part of a Boolean expression's value can be determined in advance, the rest need not be evaluated
object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case q: LogicalPlan => q transformExpressionsUp {
case and @ And(left, right) => (left, right) match {
// true && r => r
case (Literal(true, BooleanType), r) => r// the left side is true, so only the right side remains
// l && true => l
case (l, Literal(true, BooleanType)) => l// the right side is true, so only the left side remains
// false && r => false
case (Literal(false, BooleanType), _) => Literal(false)// if either side is false, the whole AND is false
// l && false => false
case (_, Literal(false, BooleanType)) => Literal(false) // if either side is false, the whole AND is false
// a && a => a
case (l, r) if l fastEquals r => l// both sides are identical, so keep only one
// (a || b) && (a || c) => a || (b && c)
case _ =>
……
}
For example, given the following SQL:
String sql = " select * from test where id > 0 and id > 0 and 1 > 0";
the log prints:
plan->
Filter (((id#0L > 0) && (id#0L > 0)) && true)
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Filter (id#0L > 0)
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.BooleanSimplification$@48c9076b
Simplifying Filter expressions (SimplifyFilters)
object SimplifyFilters extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
// If the filter condition always evaluate to true, remove the filter.
case Filter(Literal(true, BooleanType), child) => child// if the condition is always true, drop the Filter
// If the filter condition always evaluate to null or false,
// replace the input with an empty relation.
case Filter(Literal(null, _), child) => LocalRelation(child.output, data = Seq.empty) // if the condition is always null or false, replace the subtree with an empty relation
case Filter(Literal(false, BooleanType), child) => LocalRelation(child.output, data = Seq.empty)
}
}
For example, given the following SQL:
String sql = " select * from test where 1 > 0";
the log prints:
plan->
Filter true
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->// the Filter has been removed
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.SimplifyFilters$@47fce61b
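The example above only exercises the true branch. For the null/false branch the whole subtree becomes an empty LocalRelation, so nothing is scanned at all. A hedged check (this log output is not from the original run):
// ConstantFolding first turns 1 = 2 into false, then SimplifyFilters
// replaces the Filter and its child with an empty LocalRelation.
val df = sqlContext.sql("select * from test where 1 = 2")
println(df.queryExecution.optimizedPlan)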
Simplifying Casts (SimplifyCasts): if the expression's data type already matches the target type, the Cast is removed
object SimplifyCasts extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
case Cast(e, dataType) if e.dataType == dataType => e// the types already match, so drop the Cast
}
}
For example, given the following SQL:
String sql = " select CAST(dev_chnid as String) from test";
the log prints:
plan->
Project [CAST(dev_chnid#26, StringType) AS c0#43]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [dev_chnid#26 AS c0#43]// dev_chnid is already a String, so the Cast is removed
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.SimplifyCasts$@2b287e24
Simplifying case-conversion expressions (SimplifyCaseConversionExpressions)
object SimplifyCaseConversionExpressions extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case q: LogicalPlan => q transformExpressionsUp {
case Upper(Upper(child)) => Upper(child)
case Upper(Lower(child)) => Upper(child)
case Lower(Upper(child)) => Lower(child)
case Lower(Lower(child)) => Lower(child)
}
}
}
For example, given the following SQL:
String sql = " select UPPER(LOWER(dev_chnid)) from test ";
the log prints:
plan->
Project [Upper(Lower(dev_chnid#26)) AS c0#43]
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
result->
Project [Upper(dev_chnid#26) AS c0#43]// the nested case conversion is simplified
Relation[id#0L,dev_id#1,dev_chnnum#2L,dev_name#3,dev_chnname#4,car_num#5,car_numtype#6,car_numcolor#7,car_speed#8,car_type#9,car_color#10,car_length#11L,car_direct#12,car_way_code#13,cap_time#14L,cap_date#15L,inf_note#16,max_speed#17,min_speed#18,car_img_url#19,car_img1_url#20,car_img2_url#21,car_img3_url#22,car_img4_url#23,car_img5_url#24,rec_stat#25,dev_chnid#26,car_img_count#27,save_flag#28,dc_cleanflag#29,pic_id#30,car_img_plate_top#31L,car_img_plate_left#32L,car_img_plate_bottom#33L,car_img_plate_right#34L,car_brand#35L,issafetybelt#36,isvisor#37,bind_stat#38,car_num_pic#39,combined_pic_url#40,verify_memo#41,rec_stat_tmp#42] org.apache.spark.sql.parquet.ParquetRelation2@2a400010
rule->org.apache.spark.sql.catalyst.optimizer.SimplifyCaseConversionExpressions$@1a09681