Hive 执行计划 & 性能调优

一、执行计划

  • explain,查看基本信息

  • explain dependency,查看依赖信息

  • explain authorization,查看权限信息

1.explain 用法

-- explain + 执行语句
例如:
explain
select 
	s2.s_id ,avg(s2.s_score) as avg_score
from edu.student s
join edu.score s2 on s.s_id  = s2.s_id 
where s.s_class  = 3
group by s2.s_id 
having avg(s2.s_score) >= 60
order by avg_score desc
limit 3;

-- MySQL: from -> on -> join -> where -> group by -> having -> window func -> select -> distinct -> union -> order by -> limit

-- Hive:from -> where -> on -> join -> group by -> 聚合函数 -> having -> select -> order by -> limit

STAGE DEPENDENCIES:
  Stage-6 is a root stage
  Stage-2 depends on stages: Stage-6
  Stage-3 depends on stages: Stage-2
  Stage-0 depends on stages: Stage-3

STAGE PLANS:
  Stage: Stage-6
    Map Reduce Local Work
      Alias -> Map Local Tables:
        $hdt$_0:s -- student表
          Fetch Operator
            limit: -1 -- 不限制行数, 不是limit 3
      Alias -> Map Local Operator Tree:
        $hdt$_0:s 
          TableScan -- 表扫描
            alias: s -- student表
            Statistics: Num rows: 58 Data size: 470 Basic stats: COMPLETE Column stats: NONE
            Filter Operator -- where筛选条件
              predicate: ((s_class = 3) and s_id is not null) (type: boolean)
              Statistics: Num rows: 29 Data size: 235 Basic stats: COMPLETE Column stats: NONE
              Select Operator -- on所需字段: s.s_id
                expressions: s_id (type: int)
                outputColumnNames: _col0
                Statistics: Num rows: 29 Data size: 235 Basic stats: COMPLETE Column stats: NONE
                HashTable Sink Operator
                  keys:
                    0 _col0 (type: int)
                    1 _col0 (type: int)

  Stage: Stage-2
    Map Reduce
      Map Operator Tree:
          TableScan -- 表扫描
            alias: sc -- score表
            Statistics: Num rows: 66 Data size: 531 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              -- on筛选条件
              predicate: s_id is not null (type: boolean)
              Statistics: Num rows: 66 Data size: 531 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                  -- on所需字段: sc.s_id
                  -- group by + avg所需字段: sc.s_id, sc.score
                expressions: s_id (type: int), s_score (type: int)
                outputColumnNames: _col0, _col1
                Statistics: Num rows: 66 Data size: 531 Basic stats: COMPLETE Column stats: NONE
                Map Join Operator
                  condition map:
                          -- inner join内连接操作
                       Inner Join 0 to 1
                  keys:
                    0 _col0 (type: int)
                    1 _col0 (type: int)
                  outputColumnNames: _col2, _col3
                  Statistics: Num rows: 72 Data size: 584 Basic stats: COMPLETE Column stats: NONE
                  Group By Operator -- group by 分组操作
                    aggregations: avg(_col3) -- avg聚合操作
                    keys: _col2 (type: int)
                    mode: hash
                    outputColumnNames: _col0, _col1
                    Statistics: Num rows: 72 Data size: 584 Basic stats: COMPLETE Column stats: NONE
                    Reduce Output Operator
                      key expressions: _col0 (type: int)
                      sort order: +  -- group by 升序操作, 非order by 降序
                      Map-reduce partition columns: _col0 (type: int)
                      Statistics: Num rows: 72 Data size: 584 Basic stats: COMPLETE Column stats: NONE
                      value expressions: _col1 (type: struct)
      Local Work:
        Map Reduce Local Work
      Reduce Operator Tree:
        Group By Operator
          aggregations: avg(VALUE._col0)
          keys: KEY._col0 (type: int)
          mode: mergepartial
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 36 Data size: 292 Basic stats: COMPLETE Column stats: NONE
          Filter Operator -- having筛选操作
            predicate: (_col1 > 60.0) (type: boolean)
            Statistics: Num rows: 12 Data size: 97 Basic stats: COMPLETE Column stats: NONE
            File Output Operator
              compressed: false  -- 不压缩
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe

  Stage: Stage-3
    Map Reduce
      Map Operator Tree:
          TableScan
            Reduce Output Operator
              key expressions: _col1 (type: double)
              sort order: -  -- order by 降序操作
              Statistics: Num rows: 12 Data size: 97 Basic stats: COMPLETE Column stats: NONE
              TopN Hash Memory Usage: 0.1
              value expressions: _col0 (type: int)
      Reduce Operator Tree:
        Select Operator
          expressions: VALUE._col0 (type: int), KEY.reducesinkkey0 (type: double)
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 12 Data size: 97 Basic stats: COMPLETE Column stats: NONE
          Limit -- limit操作
            Number of rows: 3
            Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE
            File Output Operator
              compressed: false
              Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: 3
      Processor Tree:
        ListSink

2.explain 场景

案例一:join会过滤null吗?

-- join会过滤null 

案例二:group by会排序吗?

-- group by 会排序且只能升序

案例三:哪条SQL执行效率高?

-- 以下两条SQL语句输出结果一致,但哪条执行效率高呢?
-- 以下两段SQL语句除了表别名和字段别名不一致, 其他执行计划完全一致
select s.s_id, s.s_name, sc.c_id, sc.s_score 
from student s
join score sc on sc.s_id=s.s_id 
where s.s_class=3;

select s.s_id, s.s_name, sc.c_id, sc.s_score 
from (select * from student where s_class=3) s
join score sc on sc.s_id=s.s_id;

两条语句的执行计划只有表别名与字段别名有区别,所以以上两段sql语句运行效率相同

3.explain dependency 用法

用于描述数据来源,输出json格式,包含以下两部分:

  • input_partitions:数据来源的表分区

  • input_tables:数据来源的表

explain dependency select * from edu.student
{"input_tables":[{"tablename":"edu@student","tabletype":"MANAGED_TABLE"}],"input_partitions":[]}
explain dependency select * from sm.sm_order_detail
{"input_tables":[{"tablename":"sm@sm_order_detail","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2017/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2018/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2019/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=01"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=02"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=03"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=04"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=05"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=06"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=07"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=08"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=09"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=10"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=11"},{"partitionName":"sm@sm_order_detail@p_year=2020/p_month=12"},{"partitionName":"sm@sm_order_detail@p_year=2021/p_month=01"}]}

4.explain dependency 场景

案例一:识别代码是否等价?

代码1

explain dependency
select * 
from sm.sm_order_info soi 
join sm.sm_region_info sri on sri.region_id=soi.region_id and soi.p_year='2020'


{"input_tables":[{"tablename":"sm@sm_order_info","tabletype":"MANAGED_TABLE"},{"tablename":"sm@sm_region_info","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"sm@sm_order_info@p_year=2020/p_month=01"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=02"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=03"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=04"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=05"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=06"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=07"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=08"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=09"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=10"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=11"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=12"}]}

代码2

explain dependency
select * 
from sm.sm_order_info soi 
join sm.sm_region_info sri on sri.region_id=soi.region_id
where soi.p_year='2020'

{"input_tables":[{"tablename":"sm@sm_order_info","tabletype":"MANAGED_TABLE"},{"tablename":"sm@sm_region_info","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"sm@sm_order_info@p_year=2020/p_month=01"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=02"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=03"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=04"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=05"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=06"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=07"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=08"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=09"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=10"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=11"},{"partitionName":"sm@sm_order_info@p_year=2020/p_month=12"}]}

5.explain authorization 用法

explain authorization
select s_id, s_name from edu.student


INPUTS: 
  edu@student
OUTPUTS: 
  hdfs://localhost:8020/tmp/hive/root/c7dcee71-89d7-457e-ad6f-16d1419b032a/hive_2022-10-13_20-20-21_296_5224225313108621685-3/-mr-10001
CURRENT_USER: 
  root
OPERATION: 
  QUERY

二、Hive性能调优

1.SQL语句优化

1.union all

-- 创建数据库
create database test;
-- 创建分区表,最高分插入到max分区,最低分插入到min分区
create table test.course_max_min (
	c_id int comment '课程id',
    s_score int comment '分数'
) comment '每个课程的最高分和最低分'
partitioned by (tp string comment '分数类型');

-- 开启动态分区,分两次运行
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- 优化前, 同一张表分组两次(运行前开启动态分区)
-- Total jobs = 5
insert into table test.course_max_min partition(tp)
select 
sc.c_id, max(sc.s_score) as s_score, 'max' as tp
from edu.score sc
group by sc.c_id
union all 
select 
sc.c_id, min(sc.s_score) as s_score, 'min' as tp
from edu.score sc
group by sc.c_id;
-- 优化后, 同一张表分组一次(运行前开启动态分区)
-- Total jobs = 1
from edu.score sc
insert into table test.course_max_min partition(tp)
select 
sc.c_id, max(sc.s_score) as s_score, 'max' as tp
group by sc.c_id
insert into table test.course_max_min partition(tp)
select 
sc.c_id, max(sc.s_score) as s_score, 'min' as tp
group by sc.c_id;

 2.distinct

-- 优化前
select count(1)
from(
  select s_id
  from edu.student
  group by s_id
) tb;

-- 优化后
select count(distinct s_id)
from edu.student;

3.left semi join

  • 通过退货表的订单编号筛选订单详情表的数据

-- 通过退货表的订单编号筛选订单详情表的数据
-- 优化前, in子查询
select * 
from sm.sm_order_detail_np a
where a.order_id in (
	select b.order_id 
	from sm.sm_return_info b);

-- 优化前, exists子查询
select * 
from sm.sm_order_detail_np a
where exists(
	select b.*
	from sm.sm_return_info b
	where a.order_id = b.order_id)
	
-- 优化后, left semi join左半连接(执行效率更佳)
-- select子句只能选择左表的字段
-- on子句的过滤条件只能是等于号
-- 右表的过滤条件只能在on子句设置
select a.* 
from sm.sm_order_detail_np a 
left semi join sm.sm_return_info b on b.order_id = a.order_id;

 2.并行执行优化

-- 启动并行执行
set hive.exec.parallel=true;
-- 设置并行执行的线程数量
set hive.exec.parallel.thread.number=16;

你可能感兴趣的:(数据分析,hive,java,mysql)