这几天工作中遇到了一个问题:一张千万级别的分区表,每秒钟都会产生大量数据,查询语句是 select count(id) from table where flag = 1 and create_time < '2023-11-07';。执行计划的访问类型已经是range,走的是create_time列上的索引,flag的值只有0和1。厂商的业务就卡死在这条sql语句上。还有什么手段可以继续优化吗?大家不妨想一想。
初看已经走了索引,range级别好像已经够优秀了,但是执行计划产生了回表,而查询的列并不是select *,只用到了id、flag和create_time。很好,我们可以通过创建联合索引(覆盖索引)来避免回表。
-- Composite index proposed for the vendor's table, range column first.
-- `table` is the same placeholder name used in the query quoted above; TABLE is
-- a reserved word in MySQL, so substitute the real table name before running.
create index inx_caf
    on table (create_time, flag);
-- Create a test table with a structure similar to the vendor's table.
-- name appears to play the role of create_time (the range-filtered column);
-- flag is meant to hold only the values 0 and 1.
create table lian (
    -- auto_increment is required: the test-data procedure relies on MySQL
    -- generating id and would fail on a plain "int primary key".
    id   int         primary key auto_increment,
    name varchar(10),
    -- NOTE(review): flag is a string column, yet the queries in this file
    -- compare it with the number 1. MySQL cannot use an index on a string
    -- column to look up a numeric value. The type is kept as varchar so the
    -- recorded execution plans stay reproducible.
    flag varchar(10),
    unique key inx_name (name),
    key inx_flag (flag)
);
-- Create a stored procedure that bulk-loads test rows into lian, then run it.
-- As written it loads rows with flag = '0'; presumably the flag literal and the
-- number range were edited and the procedure re-run to load the flag = '1' rows.

-- Dropped while the default ';' delimiter is still active, so it is sent as its
-- own statement.
drop procedure if exists addTestData;

-- `delimiter` is a mysql client command: switch the statement terminator so the
-- ';' characters inside the procedure body do not end the CREATE early.
delimiter $$
create procedure addTestData()
begin
    declare number int default 20001;

    -- insert one row per value of number in [20001, 30000]
    while number <= 30000 do
        -- concat() appends the suffix that keeps names distinct; in MySQL `+`
        -- is numeric addition, so number+'1' only added 1 to the number.
        -- id is omitted: assumes lian.id is auto_increment - verify the table.
        insert into lian (name, flag)
        values (concat(number, '1'), '0');

        set number = number + 1;
    end while;
end $$
-- Restore the default terminator; otherwise every later ';'-terminated
-- statement in this script would never be sent to the server.
delimiter ;

call addTestData();
-- Sanity-check the loaded data: total row count, then the count per flag value.
-- flag is a varchar column, so it is compared with string literals; comparing
-- it with the numbers 0/1 forces an implicit string-to-number conversion, which
-- prevents index lookups and would also match any non-numeric string against 0.
select count(*)
from lian;

select count(*)
from lian
where flag = '0';

select count(*)
from lian
where flag = '1';
-- Execution plan of the test query that mirrors the vendor's statement:
-- an equality filter on flag plus a range filter on name.
-- NOTE(review): flag is varchar but is compared with the number 1 here; the
-- query text is left unchanged because the recorded plans below depend on it.

-- tabular plan
explain
select count(id)
from lian
where flag = 1
  and name < '1000';

-- same plan as JSON, which also reports the cost estimates
explain format=json
select count(id)
from lian
where flag = 1
  and name < '1000';
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "5.21"
},
"table": {
"table_name": "lian",
"access_type": "range",
"possible_keys": [
"inx_name",
"inx_flag"
],
"key": "inx_name",
"used_key_parts": [
"name"
],
"key_length": "13",
"rows_examined_per_scan": 3,
"rows_produced_per_join": 0,
"filtered": "10.00",
"index_condition": "(`test2`.`lian`.`name` < '1000')",
"cost_info": {
"read_cost": "5.15",
"eval_cost": "0.06",
"prefix_cost": "5.21",
"data_read_per_join": "9"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "(`test2`.`lian`.`flag` = 1)"
}
}
}
可以看到,这条语句的执行计划和厂商卡住的那条sql一样:走单列索引的range扫描,并且需要回表;如果范围查询的范围过大,执行计划还会退化成全表扫描。此时的执行代价是5.21。
创建联合索引:
-- Composite index with the range column (name) first and flag second, so that
-- every column the count query touches is available from the index.
create index inx_naf
    on lian (name, flag);
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "2.22"
},
"table": {
"table_name": "lian",
"access_type": "range",
"possible_keys": [
"inx_name",
"inx_flag",
"inx_naf"
],
"key": "inx_naf",
"used_key_parts": [
"name"
],
"key_length": "13",
"rows_examined_per_scan": 3,
"rows_produced_per_join": 0,
"filtered": "10.00",
"using_index": true,
"cost_info": {
"read_cost": "2.16",
"eval_cost": "0.06",
"prefix_cost": "2.22",
"data_read_per_join": "9"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "((`test2`.`lian`.`flag` = 1) and (`test2`.`lian`.`name` < '1000'))"
}
}
}
2.22比之走单列索引的5.21小了很多(我后续做了实验,小这么多的原因是因为name列是唯一约束,或者说name列的唯一值特别多。后续也做了唯一值特别少的代价实验,请往后看)
众所周知,联合索引中谁在前谁在后是有考量的,规则就是谁的选择性好(相对来说唯一值多)谁就放在前面,所以我先入为主地把create_time(对应本实验中的name列)放在了前面。下面测试一下flag在前面的情况。
-- inx_naf has not been dropped yet at this point.
-- Add the reverse column order (flag first, then name) and look at which index
-- the optimizer picks for the same query.
create index inx_fan
    on lian (flag, name);

-- tabular plan
explain
select count(id)
from lian
where flag = 1
  and name < '1000';

-- JSON plan with cost estimates
explain format=json
select count(id)
from lian
where flag = 1
  and name < '1000';
删除naf(执行 drop index inx_naf on lian;)后,再看一次执行计划:
优化器没有选择fan,而是选择了最初的单列索引,产生回表。
让我们强制走fan,看看执行代价:
-- Force the flag-first composite index so its cost can be compared with the
-- plan the optimizer picked on its own.

-- JSON plan with cost estimates
explain format=json
select count(id)
from lian force index (inx_fan)
where flag = 1
  and name < '1000';

-- tabular plan
explain
select count(id)
from lian force index (inx_fan)
where flag = 1
  and name < '1000';
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "35755.00"
},
"table": {
"table_name": "lian",
"access_type": "index",
"possible_keys": [
"inx_fan"
],
"key": "inx_fan",
"used_key_parts": [
"flag",
"name"
],
"key_length": "26",
"rows_examined_per_scan": 29795,
"rows_produced_per_join": 993,
"filtered": "3.33",
"using_index": true,
"cost_info": {
"read_cost": "35556.39",
"eval_cost": "198.61",
"prefix_cost": "35755.00",
"data_read_per_join": "31K"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "((`test2`.`lian`.`flag` = 1) and (`test2`.`lian`.`name` < '1000'))"
}
}
}
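执行代价35755特别大,看起来联合索引不可以把flag放在最前面,因为它的索引选择性特别低。不过需要注意:flag列是varchar类型,而查询条件写的是flag = 1(数字),MySQL会做隐式类型转换,无法利用索引按flag定位,所以这里的access_type是index(全索引扫描)而不是range。如果把条件写成flag = '1',或者把flag列改成数值类型,(flag,name)这个索引就可以同时用上等值条件和范围条件,结论可能会不同。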
-- Second test table: same columns as lian but without a unique key on name, so
-- name can be loaded with very few distinct values (low selectivity).
create table lian3 (
    -- auto_increment is required: the test-data procedure relies on MySQL
    -- generating id and would fail on a plain "int primary key".
    id   int         primary key auto_increment,
    name varchar(10),
    -- NOTE(review): flag is a string column, yet the queries in this file
    -- compare it with the number 1. MySQL cannot use an index on a string
    -- column to look up a numeric value. The type is kept as varchar so the
    -- recorded execution plans stay reproducible.
    flag varchar(10),
    key inx_flag (flag)
);
-- Recreate the addTestData procedure so that it bulk-loads lian3, then run it.
-- Every row gets the same name ('h') and flag '0'; presumably the literals and
-- the number range were edited between runs to produce the other values.

-- Dropped while the default ';' delimiter is still active, so it is sent as its
-- own statement.
drop procedure if exists addTestData;

-- `delimiter` is a mysql client command: switch the statement terminator so the
-- ';' characters inside the procedure body do not end the CREATE early.
delimiter $$
create procedure addTestData()
begin
    -- number is only a loop counter here; it is not stored in the row
    declare number int default 40001;

    -- insert one row per value of number in [40001, 45000]
    while number <= 45000 do
        -- id is omitted: assumes lian3.id is auto_increment - verify the table.
        insert into lian3 (name, flag)
        values ('h', '0');

        set number = number + 1;
    end while;
end $$
-- Restore the default terminator; otherwise every later ';'-terminated
-- statement in this script would never be sent to the server.
delimiter ;

call addTestData();
分别看name和flag的索引选择度
-- Index selectivity of name: distinct values / total rows, plus both raw counts.
select
    count(distinct(name))/count(*),
    count(distinct(name)),
    count(*)
from lian3;

-- The same three figures for flag.
select
    count(distinct(flag))/count(*),
    count(distinct(flag)),
    count(*)
from lian3;
-- Baseline plan on lian3 before any index on name exists.
-- NOTE(review): flag is varchar but is compared with the number 1 here; the
-- query text is left unchanged because the recorded plans below depend on it.

-- tabular plan
explain
select count(id)
from lian3
where flag = 1
  and name < 'e';

-- JSON plan with cost estimates
explain format=json
select count(id)
from lian3
where flag = 1
  and name < 'e';
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "6035.00"
},
"table": {
"table_name": "lian3",
"access_type": "ALL",
"possible_keys": [
"inx_flag"
],
"rows_examined_per_scan": 29690,
"rows_produced_per_join": 989,
"filtered": "3.33",
"cost_info": {
"read_cost": "5837.09",
"eval_cost": "197.91",
"prefix_cost": "6035.00",
"data_read_per_join": "30K"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "((`test2`.`lian3`.`flag` = 1) and (`test2`.`lian3`.`name` < 'e'))"
}
}
}
-- Single-column, non-unique index on the range-filtered column.
create index inx_name
    on lian3 (name);
因为name的选择度特别低,所以必须强制走索引
-- Force the single-column name index to measure the cost of a range scan that
-- has to go back to the table rows for flag.

-- tabular plan
explain
select count(id)
from lian3 force index (inx_name)
where flag = 1
  and name < 'e';

-- JSON plan with cost estimates
explain format=json
select count(id)
from lian3 force index (inx_name)
where flag = 1
  and name < 'e';
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "20784.01"
},
"table": {
"table_name": "lian3",
"access_type": "range",
"possible_keys": [
"inx_name"
],
"key": "inx_name",
"used_key_parts": [
"name"
],
"key_length": "13",
"rows_examined_per_scan": 14845,
"rows_produced_per_join": 1484,
"filtered": "10.00",
"index_condition": "(`test2`.`lian3`.`name` < 'e')",
"cost_info": {
"read_cost": "20487.11",
"eval_cost": "296.90",
"prefix_cost": "20784.01",
"data_read_per_join": "46K"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "(`test2`.`lian3`.`flag` = 1)"
}
}
}
-- Composite index on lian3 with name first and flag second.
create index inx_naf
    on lian3 (name, flag);
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "5993.18"
},
"table": {
"table_name": "lian3",
"access_type": "range",
"possible_keys": [
"inx_flag",
"inx_name",
"inx_naf"
],
"key": "inx_naf",
"used_key_parts": [
"name"
],
"key_length": "13",
"rows_examined_per_scan": 14845,
"rows_produced_per_join": 1484,
"filtered": "10.00",
"using_index": true,
"cost_info": {
"read_cost": "5696.29",
"eval_cost": "296.90",
"prefix_cost": "5993.19",
"data_read_per_join": "46K"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "((`test2`.`lian3`.`flag` = 1) and (`test2`.`lian3`.`name` < 'e'))"
}
}
}
-- Add the flag-first composite index, then drop the name-first one
-- (presumably so the optimizer can no longer choose it for the next plan).
create index inx_fan
    on lian3 (flag, name);

drop index inx_naf
    on lian3;
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "6035.00"
},
"table": {
"table_name": "lian3",
"access_type": "index",
"possible_keys": [
"inx_flag",
"inx_name",
"inx_fan"
],
"key": "inx_fan",
"used_key_parts": [
"flag",
"name"
],
"key_length": "26",
"rows_examined_per_scan": 29690,
"rows_produced_per_join": 1484,
"filtered": "5.00",
"using_index": true,
"cost_info": {
"read_cost": "5738.10",
"eval_cost": "296.90",
"prefix_cost": "6035.00",
"data_read_per_join": "46K"
},
"used_columns": [
"id",
"name",
"flag"
],
"attached_condition": "((`test2`.`lian3`.`flag` = 1) and (`test2`.`lian3`.`name` < 'e'))"
}
}
}
全表扫描:6035
单列name索引:20784
naf:5993.18
fan:6035
可以看到naf和fan相差不大,这和name列的选择度低有关,但是就算选择度低,naf也是最优解,所以选择naf是最好的。
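其实从原理上讲,这甚至不用做实验:flag是等值查询,只要选择性还可以,就应当放在联合索引的左侧,范围查询的列放在右侧。上面的实验里fan没有体现出优势,是因为flag列是varchar类型而条件写成了flag = 1,隐式类型转换使索引无法按flag定位,只能做全索引扫描。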
综上所述,不管怎样,做实验才是硬道理,实践出真知。同时也要知道,即使执行计划已经是range级别,也还可以继续优化,其中一个思路就是创建联合索引(覆盖索引)来避免回表。