一、待优化SQL
-- Loads one hour of aggregated counters into cap.report_flows_hour.
--   1. The derived table dedup_events reduces cap.cap_flows (one date + hour)
--      to one row per (dimension columns, tid, action_type) combination.
--   2. The outer query groups by the dimension columns and counts, per
--      action_type code, how many of those de-duplicated rows exist.
INSERT INTO cap.report_flows_hour (
    "date",
    "hour",
    adid,
    appid,
    ccode,
    kv,
    sv,
    isroot,
    send,
    recive,
    "show",
    download,
    successd,
    install,
    click,
    "restart",
    fail_download
)
SELECT
    dedup_events."date",
    dedup_events."hour",
    dedup_events.adid,
    dedup_events.appid,
    dedup_events.ccode,
    dedup_events.kv,
    dedup_events.sv,
    dedup_events.isroot,
    -- One counter per action_type code; each de-duplicated row adds 1 to
    -- exactly the counter whose code it carries (or to none of them).
    SUM(CASE dedup_events.action_type WHEN -1 THEN 1 ELSE 0 END) AS send,
    SUM(CASE dedup_events.action_type WHEN 0  THEN 1 ELSE 0 END) AS recive,
    SUM(CASE dedup_events.action_type WHEN 1  THEN 1 ELSE 0 END) AS "show",
    SUM(CASE dedup_events.action_type WHEN 2  THEN 1 ELSE 0 END) AS download,
    SUM(CASE dedup_events.action_type WHEN 3  THEN 1 ELSE 0 END) AS successd,
    SUM(CASE dedup_events.action_type WHEN 4  THEN 1 ELSE 0 END) AS install,
    SUM(CASE dedup_events.action_type WHEN 5  THEN 1 ELSE 0 END) AS click,
    SUM(CASE dedup_events.action_type WHEN 11 THEN 1 ELSE 0 END) AS "restart",
    SUM(CASE dedup_events.action_type WHEN 22 THEN 1 ELSE 0 END) AS "fail_download"
FROM (
    -- GROUP BY without aggregates: used purely to de-duplicate the rows.
    SELECT
        src."date",
        src."hour",
        src.adid,
        src.appid,
        src.ccode,
        src.kv,
        src.sv,
        src.isroot,
        src.tid,
        src.action_type
    FROM cap.cap_flows src
    WHERE src."date" = '2016-04-18'
      AND src."hour" = '00'
    GROUP BY
        src."date",
        src."hour",
        src.adid,
        src.appid,
        src.ccode,
        src.kv,
        src.sv,
        src.isroot,
        src.tid,
        src.action_type
) dedup_events
GROUP BY
    dedup_events."date",
    dedup_events."hour",
    dedup_events.adid,
    dedup_events.appid,
    dedup_events.ccode,
    dedup_events.kv,
    dedup_events.sv,
    dedup_events.isroot
二、诊断过程
查看表定义:字段,字段类型,字段压缩类型,distkey类型,sortkey类型
select * from pg_table_def where tablename like '%report_flows_hour%';
cap=> select * from pg_table_def where tablename like '%cap_flows%';
schemaname | tablename | column | type | encoding | distkey | sortkey | notnull
------------+-----------+--------------+-----------------------------+----------+---------+---------+---------
cap | cap_flows | flowdate | timestamp without time zone | delta | f | 0 | t
cap | cap_flows | date | character varying(10) | text255 | f | 1 | t
cap | cap_flows | hour | character varying(2) | text255 | t | 0 | t
cap | cap_flows | tid | character varying(32) | text255 | f | 0 | t
cap | cap_flows | adid | bigint | delta32k | f | 0 | t
cap | cap_flows | appid | bigint | delta32k | f | 0 | t
cap | cap_flows | ccode | character varying(10) | lzo | f | 0 | t
cap | cap_flows | kv | integer | delta32k | f | 0 | t
cap | cap_flows | sv | character varying(24) | text255 | f | 0 | t
cap | cap_flows | imei | character varying(45) | text255 | f | 0 | t
cap | cap_flows | api | smallint | delta | f | 0 | t
cap | cap_flows | isroot | smallint | delta | f | 0 | t
cap | cap_flows | action_type | smallint | delta | f | 0 | t
cap | cap_flows | action_value | smallint | delta | f | 0 | t
cap | cap_flows | androidid | character varying(45) | none | f | 0 | f
(15 rows)
查看distribute类型和字段,数据分布是否均衡,排序情况,压缩情况
cap=# \x
Expanded display is on.
cap=# select * from svv_table_info where "table"='cap_flows';
-[ RECORD 1 ]-+-----------
database | cap
schema | cap
table_id | 128192
table | cap_flows
encoded | Y
diststyle | KEY(hour) #表示用hour字段作为分布键(distkey),数据按hour分布到各个slice
sortkey1 | date
max_varchar | 45
sortkey1_enc | text255
sortkey_num | 1
size | 1032926
pct_used | 18.0760
empty | 0
unsorted | 99.57 #没有排序的行比例
stats_off | 62.78
tbl_rows | 9245636670 #表行数
skew_sortkey1 | 3.56
skew_rows | 1.48 #slice包含最大的记录数/slice包含最小的记录数
分析表的压缩建议:
cap=> analyze compression cap_flows;
Table | Column | Encoding
-----------+--------------+-----------
cap_flows | flowdate | runlength
cap_flows | date | lzo
cap_flows | hour | runlength
cap_flows | tid | lzo
cap_flows | adid | lzo
cap_flows | appid | lzo
cap_flows | ccode | lzo
cap_flows | kv | lzo
cap_flows | sv | runlength
cap_flows | imei | lzo
cap_flows | api | lzo
cap_flows | isroot | runlength
cap_flows | action_type | lzo
cap_flows | action_value | lzo
cap_flows | androidid | lzo
(15 rows)
查看SQL历史,发现没有和cap_flows关联查询的表;结合skew_rows(1.48,数据分布较均衡)和diststyle(KEY(hour))这两个指标,分布方式不需要调整。
根据unsorted指标,发现99.57%的行没有排序;查询条件 ta."date" = '2016-04-18' AND ta."hour" = '00' 中的两个字段经常一起使用,建议使用date和hour作为联合排序键(compound sortkey)
根据压缩建议进行改进
三、测试优化方案
因为源表数据量太大,我们抓取20160417一天的数据,分别创建两个表,一个和源表定义一样,另外一个是优化后的定义:
-- Baseline test table: must mirror the definition of the source table
-- cap_flows as reported by pg_table_def, so that it differs from the
-- optimized test table only in the settings under test.
--   * Encodings, distkey (hour) and sortkey (date) are copied from cap_flows.
--   * All columns except androidid are NOT NULL in the source.
--   * androidid is uncompressed in the source (pg_table_def shows encoding
--     "none", which is written as ENCODE RAW in DDL) and is nullable.
create table cap_flows_20160417_old(
flowdate timestamp encode delta not null,
date varchar(10) encode text255 sortkey not null,
hour varchar(2) encode text255 distkey not null,
tid varchar(32) encode text255 not null,
adid bigint encode delta32k not null,
appid bigint encode delta32k not null,
ccode varchar(10) encode lzo not null,
kv integer encode delta32k not null,
sv varchar(24) encode text255 not null,
imei varchar(45) encode text255 not null,
api smallint encode delta not null,
isroot smallint encode delta not null,
action_type smallint encode delta not null,
action_value smallint encode delta not null,
androidid varchar(45) encode raw
);
-- Optimized test table.
--   * Column encodings follow the ANALYZE COMPRESSION recommendations
--     obtained for cap_flows.
--   * Distribution key stays on hour (unchanged from the source table).
--   * Sort key becomes a compound key on (date, hour), the two columns the
--     report query filters on together.
-- NOTE(review): hour is both run-length encoded and part of the sort key;
--   Redshift guidance discourages highly compressed sort key columns --
--   confirm this is acceptable for range-restricted scans.
CREATE TABLE cap_flows_20160417_new (
    flowdate     TIMESTAMP   ENCODE RUNLENGTH,
    "date"       VARCHAR(10) ENCODE LZO,
    "hour"       VARCHAR(2)  ENCODE RUNLENGTH,
    tid          VARCHAR(32) ENCODE LZO,
    adid         BIGINT      ENCODE LZO,
    appid        BIGINT      ENCODE LZO,
    ccode        VARCHAR(10) ENCODE LZO,
    kv           INTEGER     ENCODE LZO,
    sv           VARCHAR(24) ENCODE RUNLENGTH,
    imei         VARCHAR(45) ENCODE LZO,
    api          SMALLINT    ENCODE LZO,
    isroot       SMALLINT    ENCODE RUNLENGTH,
    action_type  SMALLINT    ENCODE LZO,
    action_value SMALLINT    ENCODE LZO,
    androidid    VARCHAR(45) ENCODE LZO
)
DISTKEY ("hour")
COMPOUND SORTKEY ("date", "hour");
cap=> insert into cap_flows_20160417_old
cap-> select * from cap_flows
cap-> where "date" = '2016-04-17';
INSERT 0 129018523
30分钟插入完成
cap=> insert into cap_flows_20160417_new
cap-> select * from cap_flows_20160417_old ;
INSERT 0 129018523
新建表过后,会重新应用新的压缩类型,重新排序,再次查看表的基本信息:
cap=# \x
Expanded display is on.
cap=# select * from svv_table_info where "table"='cap_flows_20160417_old';
-[ RECORD 1 ]-+-----------------------
database | cap
schema | cap
table_id | 225386
table | cap_flows_20160417_old
encoded | Y
diststyle | KEY(hour)
sortkey1 | date
max_varchar | 45
sortkey1_enc | text255
sortkey_num | 1
size | 17476
pct_used | 0.3058
empty | 0
unsorted | 0.00
stats_off | 99.99
tbl_rows | 129018523
skew_sortkey1 | 3.55
skew_rows | 1.75
cap=# select * from svv_table_info where "table"='cap_flows_20160417_new';
-[ RECORD 1 ]-+-----------------------
database | cap
schema | cap
table_id | 225388
table | cap_flows_20160417_new
encoded | Y
diststyle | KEY(hour)
sortkey1 | date
max_varchar | 45
sortkey1_enc | lzo
sortkey_num | 2
size | 8112
pct_used | 0.1419
empty | 0
unsorted | 0.00
stats_off | 99.99
tbl_rows | 129018523
skew_sortkey1 | 110.90
skew_rows | 1.75
执行下面的测试SQL:SQL1查询按原定义创建的表(cap_flows_20160417_old),SQL2查询优化后的表(cap_flows_20160417_new),结果SQL2比SQL1用时更少
-- Test SQL1: runs the report aggregation against the baseline table
-- (cap_flows_20160417_old) and counts the resulting report rows, so the
-- whole aggregation executes while only a single value is returned.
SELECT COUNT(1)
FROM (
    SELECT
        dedup_events."date",
        dedup_events."hour",
        dedup_events.adid,
        dedup_events.appid,
        dedup_events.ccode,
        dedup_events.kv,
        dedup_events.sv,
        dedup_events.isroot,
        SUM(CASE dedup_events.action_type WHEN -1 THEN 1 ELSE 0 END) AS send,
        SUM(CASE dedup_events.action_type WHEN 0  THEN 1 ELSE 0 END) AS recive,
        SUM(CASE dedup_events.action_type WHEN 1  THEN 1 ELSE 0 END) AS "show",
        SUM(CASE dedup_events.action_type WHEN 2  THEN 1 ELSE 0 END) AS download,
        SUM(CASE dedup_events.action_type WHEN 3  THEN 1 ELSE 0 END) AS successd,
        SUM(CASE dedup_events.action_type WHEN 4  THEN 1 ELSE 0 END) AS install,
        SUM(CASE dedup_events.action_type WHEN 5  THEN 1 ELSE 0 END) AS click,
        SUM(CASE dedup_events.action_type WHEN 11 THEN 1 ELSE 0 END) AS "restart",
        SUM(CASE dedup_events.action_type WHEN 22 THEN 1 ELSE 0 END) AS "fail_download"
    FROM (
        -- GROUP BY without aggregates: de-duplicates the filtered rows.
        SELECT
            src."date",
            src."hour",
            src.adid,
            src.appid,
            src.ccode,
            src.kv,
            src.sv,
            src.isroot,
            src.tid,
            src.action_type
        FROM cap.cap_flows_20160417_old src
        WHERE src."date" = '2016-04-17'
          AND src."hour" = '00'
        GROUP BY
            src."date",
            src."hour",
            src.adid,
            src.appid,
            src.ccode,
            src.kv,
            src.sv,
            src.isroot,
            src.tid,
            src.action_type
    ) dedup_events
    GROUP BY
        dedup_events."date",
        dedup_events."hour",
        dedup_events.adid,
        dedup_events.appid,
        dedup_events.ccode,
        dedup_events.kv,
        dedup_events.sv,
        dedup_events.isroot
) hourly_report;
-- Test SQL2: runs the report aggregation against the optimized table
-- (cap_flows_20160417_new) and counts the resulting report rows, so the
-- whole aggregation executes while only a single value is returned.
SELECT COUNT(1)
FROM (
    SELECT
        dedup_events."date",
        dedup_events."hour",
        dedup_events.adid,
        dedup_events.appid,
        dedup_events.ccode,
        dedup_events.kv,
        dedup_events.sv,
        dedup_events.isroot,
        SUM(CASE dedup_events.action_type WHEN -1 THEN 1 ELSE 0 END) AS send,
        SUM(CASE dedup_events.action_type WHEN 0  THEN 1 ELSE 0 END) AS recive,
        SUM(CASE dedup_events.action_type WHEN 1  THEN 1 ELSE 0 END) AS "show",
        SUM(CASE dedup_events.action_type WHEN 2  THEN 1 ELSE 0 END) AS download,
        SUM(CASE dedup_events.action_type WHEN 3  THEN 1 ELSE 0 END) AS successd,
        SUM(CASE dedup_events.action_type WHEN 4  THEN 1 ELSE 0 END) AS install,
        SUM(CASE dedup_events.action_type WHEN 5  THEN 1 ELSE 0 END) AS click,
        SUM(CASE dedup_events.action_type WHEN 11 THEN 1 ELSE 0 END) AS "restart",
        SUM(CASE dedup_events.action_type WHEN 22 THEN 1 ELSE 0 END) AS "fail_download"
    FROM (
        -- GROUP BY without aggregates: de-duplicates the filtered rows.
        SELECT
            src."date",
            src."hour",
            src.adid,
            src.appid,
            src.ccode,
            src.kv,
            src.sv,
            src.isroot,
            src.tid,
            src.action_type
        FROM cap.cap_flows_20160417_new src
        WHERE src."date" = '2016-04-17'
          AND src."hour" = '00'
        GROUP BY
            src."date",
            src."hour",
            src.adid,
            src.appid,
            src.ccode,
            src.kv,
            src.sv,
            src.isroot,
            src.tid,
            src.action_type
    ) dedup_events
    GROUP BY
        dedup_events."date",
        dedup_events."hour",
        dedup_events.adid,
        dedup_events.appid,
        dedup_events.ccode,
        dedup_events.kv,
        dedup_events.sv,
        dedup_events.isroot
) hourly_report;
四、优化结论:
1、执行analyze compression查看列压缩建议,按照建议修改压缩类型
2、由于本SQL是单表的简单SELECT,没有关联查询,且数据分布较均衡,不对分布方式(distkey)进行调整
3、由于已排序的行比例太低(unsorted为99.57%),且date和hour字段经常一起作为过滤条件,决定使用date和hour联合排序。可以对源表执行vacuum进行重新排序