REDSHIFT性能优化-QUERYID-1301108

一、待优化SQL
-- Hourly roll-up of cap.cap_flows into cap.report_flows_hour for a single
-- ("date", "hour") slice.
-- The inner query groups by every selected column without aggregating, which
-- collapses duplicate (dimensions, tid, action_type) rows; the outer query
-- then counts the remaining rows per action_type code for each dimension group.
INSERT INTO cap.report_flows_hour (
    "date",
    "hour",
    adid,
    appid,
    ccode,
    kv,
    sv,
    isroot,
    send,
    recive,
    "show",
    download,
    successd,
    install,
    click,
    "restart",
    fail_download
)
SELECT
    tb."date",
    tb."hour",
    tb.adid,
    tb.appid,
    tb.ccode,
    tb.kv,
    tb.sv,
    tb.isroot,
    SUM(CASE WHEN tb.action_type = -1 THEN 1 ELSE 0 END) AS send,
    SUM(CASE WHEN tb.action_type = 0  THEN 1 ELSE 0 END) AS recive,
    SUM(CASE WHEN tb.action_type = 1  THEN 1 ELSE 0 END) AS "show",
    SUM(CASE WHEN tb.action_type = 2  THEN 1 ELSE 0 END) AS download,
    SUM(CASE WHEN tb.action_type = 3  THEN 1 ELSE 0 END) AS successd,
    SUM(CASE WHEN tb.action_type = 4  THEN 1 ELSE 0 END) AS install,
    SUM(CASE WHEN tb.action_type = 5  THEN 1 ELSE 0 END) AS click,
    SUM(CASE WHEN tb.action_type = 11 THEN 1 ELSE 0 END) AS "restart",
    SUM(CASE WHEN tb.action_type = 22 THEN 1 ELSE 0 END) AS "fail_download"
FROM (
    -- De-duplicate: one row per (dimensions, tid, action_type).
    SELECT
        ta."date",
        ta."hour",
        ta.adid,
        ta.appid,
        ta.ccode,
        ta.kv,
        ta.sv,
        ta.isroot,
        ta.tid,
        ta.action_type
    FROM cap.cap_flows AS ta
    WHERE ta."date" = '2016-04-18'
      AND ta."hour" = '00'
    GROUP BY
        ta."date",
        ta."hour",
        ta.adid,
        ta.appid,
        ta.ccode,
        ta.kv,
        ta.sv,
        ta.isroot,
        ta.tid,
        ta.action_type
) AS tb
GROUP BY
    tb."date",
    tb."hour",
    tb.adid,
    tb.appid,
    tb.ccode,
    tb.kv,
    tb.sv,
    tb.isroot




二、诊断过程
查看表定义:字段,字段类型,字段压缩类型,distkey类型,sortkey类型
select * from pg_table_def where tablename like '%report_flows_hour%';
cap=> select * from pg_table_def where tablename like '%cap_flows%';
 schemaname | tablename |    column    |            type             | encoding | distkey | sortkey | notnull 
------------+-----------+--------------+-----------------------------+----------+---------+---------+---------
 cap        | cap_flows | flowdate     | timestamp without time zone | delta    | f       |       0 | t
 cap        | cap_flows | date         | character varying(10)       | text255  | f       |       1 | t
 cap        | cap_flows | hour         | character varying(2)        | text255  | t       |       0 | t
 cap        | cap_flows | tid          | character varying(32)       | text255  | f       |       0 | t
 cap        | cap_flows | adid         | bigint                      | delta32k | f       |       0 | t
 cap        | cap_flows | appid        | bigint                      | delta32k | f       |       0 | t
 cap        | cap_flows | ccode        | character varying(10)       | lzo      | f       |       0 | t
 cap        | cap_flows | kv           | integer                     | delta32k | f       |       0 | t
 cap        | cap_flows | sv           | character varying(24)       | text255  | f       |       0 | t
 cap        | cap_flows | imei         | character varying(45)       | text255  | f       |       0 | t
 cap        | cap_flows | api          | smallint                    | delta    | f       |       0 | t
 cap        | cap_flows | isroot       | smallint                    | delta    | f       |       0 | t
 cap        | cap_flows | action_type  | smallint                    | delta    | f       |       0 | t
 cap        | cap_flows | action_value | smallint                    | delta    | f       |       0 | t
 cap        | cap_flows | androidid    | character varying(45)       | none     | f       |       0 | f
(15 rows)




查看distribute类型和字段,数据分布是否均衡,排序情况,压缩情况
cap=# \x
Expanded display is on.
cap=# select * from svv_table_info where "table"='cap_flows';
-[ RECORD 1 ]-+-----------
database      | cap
schema        | cap
table_id      | 128192
table         | cap_flows
encoded       | Y
diststyle     | KEY(hour)    #表示按hour字段做KEY分布(distkey为hour)
sortkey1      | date
max_varchar   | 45
sortkey1_enc  | text255
sortkey_num   | 1
size          | 1032926
pct_used      | 18.0760
empty         | 0
unsorted      | 99.57        #没有排序的行比例
stats_off     | 62.78
tbl_rows      | 9245636670   #表行数
skew_sortkey1 | 3.56
skew_rows     | 1.48         #slice包含最大的记录数/slice包含最小的记录数




分析表的压缩建议:
cap=> analyze compression cap_flows;
   Table   |    Column    | Encoding  
-----------+--------------+-----------
 cap_flows | flowdate     | runlength
 cap_flows | date         | lzo
 cap_flows | hour         | runlength
 cap_flows | tid          | lzo
 cap_flows | adid         | lzo
 cap_flows | appid        | lzo
 cap_flows | ccode        | lzo
 cap_flows | kv           | lzo
 cap_flows | sv           | runlength
 cap_flows | imei         | lzo
 cap_flows | api          | lzo
 cap_flows | isroot       | runlength
 cap_flows | action_type  | lzo
 cap_flows | action_value | lzo
 cap_flows | androidid    | lzo
(15 rows)






查看SQL历史,发现没有与cap_flows做关联(JOIN)查询的表;结合skew_rows(1.48,各slice数据分布基本均衡)和diststyle指标,分布键(distkey)不需要调整。
根据unsorted指标,发现99.57%的行没有排序;查询条件 ta."date" = '2016-04-18' AND ta."hour" = '00' 中的两个字段经常一起使用,建议使用date和hour作为联合排序键(compound sortkey)。
根据analyze compression给出的压缩建议,调整各列的压缩编码。








三、测试优化方案
因为源表数据量太大,我们抓取20160417一天的数据,分别创建两个表,一个和源表定义一样,另外一个是优化后的定义:


-- Baseline table for the 2016-04-17 sample: single-column sort key on "date",
-- distribution key on "hour", and the pre-optimization column encodings.
-- NOTE(review): the pg_table_def output for cap_flows shown earlier lists
-- androidid with encoding "none" and the other 14 columns as NOT NULL, whereas
-- this DDL uses text255 for androidid and declares no NOT NULL constraints --
-- confirm whether that difference from the source table is intended.
CREATE TABLE cap_flows_20160417_old (
    flowdate     TIMESTAMP   ENCODE delta,
    date         VARCHAR(10) ENCODE text255 SORTKEY,
    hour         VARCHAR(2)  ENCODE text255 DISTKEY,
    tid          VARCHAR(32) ENCODE text255,
    adid         BIGINT      ENCODE delta32k,
    appid        BIGINT      ENCODE delta32k,
    ccode        VARCHAR(10) ENCODE lzo,
    kv           INTEGER     ENCODE delta32k,
    sv           VARCHAR(24) ENCODE text255,
    imei         VARCHAR(45) ENCODE text255,
    api          SMALLINT    ENCODE delta,
    isroot       SMALLINT    ENCODE delta,
    action_type  SMALLINT    ENCODE delta,
    action_value SMALLINT    ENCODE delta,
    androidid    VARCHAR(45) ENCODE text255
);




-- Optimized table for the 2016-04-17 sample: column encodings follow the
-- ANALYZE COMPRESSION output shown earlier, the distribution key stays on
-- "hour", and the sort key becomes a compound key on (date, hour).
-- NOTE(review): the leading sort key column "date" is LZO-encoded here and
-- svv_table_info later reports skew_sortkey1 = 110.90 for this table; AWS
-- guidance generally favors leaving the first sort key column RAW -- consider
-- benchmarking that variant as well.
CREATE TABLE cap_flows_20160417_new (
    flowdate     TIMESTAMP   ENCODE runlength,
    date         VARCHAR(10) ENCODE lzo,
    hour         VARCHAR(2)  ENCODE runlength DISTKEY,
    tid          VARCHAR(32) ENCODE lzo,
    adid         BIGINT      ENCODE lzo,
    appid        BIGINT      ENCODE lzo,
    ccode        VARCHAR(10) ENCODE lzo,
    kv           INTEGER     ENCODE lzo,
    sv           VARCHAR(24) ENCODE runlength,
    imei         VARCHAR(45) ENCODE lzo,
    api          SMALLINT    ENCODE lzo,
    isroot       SMALLINT    ENCODE runlength,
    action_type  SMALLINT    ENCODE lzo,
    action_value SMALLINT    ENCODE lzo,
    androidid    VARCHAR(45) ENCODE lzo
)
COMPOUND SORTKEY (date, hour);




cap=> insert into cap_flows_20160417_old
cap-> select * from cap_flows 
cap-> where "date" = '2016-04-17';
INSERT 0 129018523
30分钟插入完成


cap=> insert into cap_flows_20160417_new
cap-> select * from cap_flows_20160417_old ;
INSERT 0 129018523






新建表过后,会重新应用新的压缩类型,重新排序,再次查看表的基本信息:
cap=# \x
Expanded display is on.
cap=# select * from svv_table_info where "table"='cap_flows_20160417_old';
-[ RECORD 1 ]-+-----------------------
database      | cap
schema        | cap
table_id      | 225386
table         | cap_flows_20160417_old
encoded       | Y
diststyle     | KEY(hour)
sortkey1      | date
max_varchar   | 45
sortkey1_enc  | text255
sortkey_num   | 1
size          | 17476
pct_used      | 0.3058
empty         | 0
unsorted      | 0.00
stats_off     | 99.99
tbl_rows      | 129018523
skew_sortkey1 | 3.55
skew_rows     | 1.75


cap=# select * from svv_table_info where "table"='cap_flows_20160417_new';
-[ RECORD 1 ]-+-----------------------
database      | cap
schema        | cap
table_id      | 225388
table         | cap_flows_20160417_new
encoded       | Y
diststyle     | KEY(hour)
sortkey1      | date
max_varchar   | 45
sortkey1_enc  | lzo
sortkey_num   | 2
size          | 8112
pct_used      | 0.1419
empty         | 0
unsorted      | 0.00
stats_off     | 99.99
tbl_rows      | 129018523
skew_sortkey1 | 110.90
skew_rows     | 1.75






执行下面两条测试SQL(SQL1查询cap_flows_20160417_old,SQL2查询cap_flows_20160417_new),结果SQL2比SQL1用时更少:
-- Benchmark query SQL1: the report aggregation run against the baseline table
-- cap_flows_20160417_old. The outer COUNT only returns the number of result
-- groups, so the full aggregation is executed without fetching its rows.
SELECT COUNT(*)
FROM (
    SELECT
        tb."date",
        tb."hour",
        tb.adid,
        tb.appid,
        tb.ccode,
        tb.kv,
        tb.sv,
        tb.isroot,
        SUM(CASE WHEN tb.action_type = -1 THEN 1 ELSE 0 END) AS send,
        SUM(CASE WHEN tb.action_type = 0  THEN 1 ELSE 0 END) AS recive,
        SUM(CASE WHEN tb.action_type = 1  THEN 1 ELSE 0 END) AS "show",
        SUM(CASE WHEN tb.action_type = 2  THEN 1 ELSE 0 END) AS download,
        SUM(CASE WHEN tb.action_type = 3  THEN 1 ELSE 0 END) AS successd,
        SUM(CASE WHEN tb.action_type = 4  THEN 1 ELSE 0 END) AS install,
        SUM(CASE WHEN tb.action_type = 5  THEN 1 ELSE 0 END) AS click,
        SUM(CASE WHEN tb.action_type = 11 THEN 1 ELSE 0 END) AS "restart",
        SUM(CASE WHEN tb.action_type = 22 THEN 1 ELSE 0 END) AS "fail_download"
    FROM (
        -- De-duplicate: one row per (dimensions, tid, action_type).
        SELECT
            ta."date",
            ta."hour",
            ta.adid,
            ta.appid,
            ta.ccode,
            ta.kv,
            ta.sv,
            ta.isroot,
            ta.tid,
            ta.action_type
        FROM cap.cap_flows_20160417_old AS ta
        WHERE ta."date" = '2016-04-17'
          AND ta."hour" = '00'
        GROUP BY
            ta."date",
            ta."hour",
            ta.adid,
            ta.appid,
            ta.ccode,
            ta.kv,
            ta.sv,
            ta.isroot,
            ta.tid,
            ta.action_type
    ) AS tb
    GROUP BY
        tb."date",
        tb."hour",
        tb.adid,
        tb.appid,
        tb.ccode,
        tb.kv,
        tb.sv,
        tb.isroot
) AS a;




-- Benchmark query SQL2: the same report aggregation run against the optimized
-- table cap_flows_20160417_new (compound sort key on date, hour). The outer
-- COUNT only returns the number of result groups.
SELECT COUNT(*)
FROM (
    SELECT
        tb."date",
        tb."hour",
        tb.adid,
        tb.appid,
        tb.ccode,
        tb.kv,
        tb.sv,
        tb.isroot,
        SUM(CASE WHEN tb.action_type = -1 THEN 1 ELSE 0 END) AS send,
        SUM(CASE WHEN tb.action_type = 0  THEN 1 ELSE 0 END) AS recive,
        SUM(CASE WHEN tb.action_type = 1  THEN 1 ELSE 0 END) AS "show",
        SUM(CASE WHEN tb.action_type = 2  THEN 1 ELSE 0 END) AS download,
        SUM(CASE WHEN tb.action_type = 3  THEN 1 ELSE 0 END) AS successd,
        SUM(CASE WHEN tb.action_type = 4  THEN 1 ELSE 0 END) AS install,
        SUM(CASE WHEN tb.action_type = 5  THEN 1 ELSE 0 END) AS click,
        SUM(CASE WHEN tb.action_type = 11 THEN 1 ELSE 0 END) AS "restart",
        SUM(CASE WHEN tb.action_type = 22 THEN 1 ELSE 0 END) AS "fail_download"
    FROM (
        -- De-duplicate: one row per (dimensions, tid, action_type).
        SELECT
            ta."date",
            ta."hour",
            ta.adid,
            ta.appid,
            ta.ccode,
            ta.kv,
            ta.sv,
            ta.isroot,
            ta.tid,
            ta.action_type
        FROM cap.cap_flows_20160417_new AS ta
        WHERE ta."date" = '2016-04-17'
          AND ta."hour" = '00'
        GROUP BY
            ta."date",
            ta."hour",
            ta.adid,
            ta.appid,
            ta.ccode,
            ta.kv,
            ta.sv,
            ta.isroot,
            ta.tid,
            ta.action_type
    ) AS tb
    GROUP BY
        tb."date",
        tb."hour",
        tb.adid,
        tb.appid,
        tb.ccode,
        tb.kv,
        tb.sv,
        tb.isroot
) AS a;




 




四、优化结论:
1、执行analyze compression查看列压缩建议,按照建议修改压缩类型
2、由于本SQL只是单表查询(没有JOIN),且数据分布基本均衡,不对分布键(distkey)进行调整
3、由于已排序的行比例太低(unsorted为99.57%),而date和hour字段在查询条件中使用较多,决定使用date和hour联合排序。可以对源表执行vacuum进行重新排序



你可能感兴趣的:(REDSHIFT)