下面测试了,varchar字段类型的条件下,在数据完全一致的情况下,varchar长度长和短的情况下,group by的效率
| CREATE TABLE test
(
TABLE_CATALOG
varchar(512) NOT NULL DEFAULT ”,
TABLE_SCHEMA
varchar(64) NOT NULL DEFAULT ”,
TABLE_NAME
varchar(64) NOT NULL DEFAULT ”,
TABLE_TYPE
varchar(64) NOT NULL DEFAULT ”,
ENGINE
varchar(64) DEFAULT NULL,
VERSION
bigint(21) unsigned DEFAULT NULL,
ROW_FORMAT
varchar(10) DEFAULT NULL,
TABLE_ROWS
bigint(21) unsigned DEFAULT NULL,
AVG_ROW_LENGTH
bigint(21) unsigned DEFAULT NULL,
DATA_LENGTH
bigint(21) unsigned DEFAULT NULL,
MAX_DATA_LENGTH
bigint(21) unsigned DEFAULT NULL,
INDEX_LENGTH
bigint(21) unsigned DEFAULT NULL,
DATA_FREE
bigint(21) unsigned DEFAULT NULL,
AUTO_INCREMENT
bigint(21) unsigned DEFAULT NULL,
CREATE_TIME
datetime DEFAULT NULL,
UPDATE_TIME
datetime DEFAULT NULL,
CHECK_TIME
datetime DEFAULT NULL,
TABLE_COLLATION
varchar(32) DEFAULT NULL,
CHECKSUM
bigint(21) unsigned DEFAULT NULL,
CREATE_OPTIONS
varchar(255) DEFAULT NULL,
TABLE_COMMENT
varchar(2048) NOT NULL DEFAULT ”
) ENGINE=InnoDB DEFAULT CHARSET=utf8 |
| CREATE TABLE test2
(
TABLE_CATALOG
varchar(512) NOT NULL DEFAULT ”,
TABLE_SCHEMA
varchar(64) NOT NULL DEFAULT ”,
TABLE_NAME
varchar(64) NOT NULL DEFAULT ”,
TABLE_TYPE
varchar(64) NOT NULL DEFAULT ”,
ENGINE
varchar(64) DEFAULT NULL,
VERSION
bigint(21) unsigned DEFAULT NULL,
ROW_FORMAT
varchar(10) DEFAULT NULL,
TABLE_ROWS
bigint(21) unsigned DEFAULT NULL,
AVG_ROW_LENGTH
bigint(21) unsigned DEFAULT NULL,
DATA_LENGTH
bigint(21) unsigned DEFAULT NULL,
MAX_DATA_LENGTH
bigint(21) unsigned DEFAULT NULL,
INDEX_LENGTH
bigint(21) unsigned DEFAULT NULL,
DATA_FREE
bigint(21) unsigned DEFAULT NULL,
AUTO_INCREMENT
bigint(21) unsigned DEFAULT NULL,
CREATE_TIME
datetime DEFAULT NULL,
UPDATE_TIME
datetime DEFAULT NULL,
CHECK_TIME
datetime DEFAULT NULL,
TABLE_COLLATION
varchar(32) DEFAULT NULL,
CHECKSUM
bigint(21) unsigned DEFAULT NULL,
CREATE_OPTIONS
varchar(255) DEFAULT NULL,
table_comment
varchar(40) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8 |
select count(*) from test;
se+———-+
| count(*) |
+———-+
| 573440 |
+———-+
mysql> select count(*) from test2;
+———-+
| count(*) |
+———-+
| 573440 |
+———-+
1 row in set (0.54 sec)
mysql> select max(length(table_comment)) from test;
+—————————-+
| max(length(table_comment)) |
+—————————-+
| 39 |
+—————————-+
1 row in set (0.55 sec)
mysql> select max(length(table_comment)) from test2;
+—————————-+
| max(length(table_comment)) |
+—————————-+
| 39 |
+—————————-+
1 row in set (0.20 sec)
mysql> explain select table_comment,count(*) from test group by table_comment;
+—-+————-+——-+——+—————+——+———+——+——–+———————————+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+—-+————-+——-+——+—————+——+———+——+——–+———————————+
| 1 | SIMPLE | test | ALL | NULL | NULL | NULL | NULL | 563672 | Using temporary; Using filesort |
+—-+————-+——-+——+—————+——+———+——+——–+———————————+
1 row in set (0.00 sec)
mysql> explain select table_comment,count(*) from test2 group by table_comment;
+—-+————-+——-+——+—————+——+———+——+——–+———————————+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+—-+————-+——-+——+—————+——+———+——+——–+———————————+
| 1 | SIMPLE | test2 | ALL | NULL | NULL | NULL | NULL | 570173 | Using temporary; Using filesort |
+—-+————-+——-+——+—————+——+———+——+——–+———————————+
mysql> select table_comment,count(*) from test group by table_comment;
+—————————————–+———-+
| table_comment | count(*) |
+—————————————–+———-+
| | 471040 |
| Column privileges | 4096 |
| Database privileges | 4096 |
| Events | 4096 |
| General log | 4096 |
| help categories | 4096 |
| help keywords | 4096 |
| help topics | 4096 |
| keyword-topic relation | 4096 |
| Leap seconds information for time zones | 4096 |
| Master Information | 4096 |
| MySQL Foreign Servers table | 4096 |
| MySQL plugins | 4096 |
| Procedure privileges | 4096 |
| Relay Log Information | 4096 |
| Slow log | 4096 |
| Stored Procedures | 4096 |
| Table privileges | 4096 |
| Time zone names | 4096 |
| Time zone transition types | 4096 |
| Time zone transitions | 4096 |
| Time zones | 4096 |
| User defined functions | 4096 |
| User proxy privileges | 4096 |
| Users and global privileges | 4096 |
| Worker Information | 4096 |
+—————————————–+———-+
26 rows in set (3.84 sec)
mysql> select table_comment,count(*) from test2 group by table_comment;
+—————————————–+———-+
| table_comment | count(*) |
+—————————————–+———-+
| | 471040 |
| Column privileges | 4096 |
| Database privileges | 4096 |
| Events | 4096 |
| General log | 4096 |
| help categories | 4096 |
| help keywords | 4096 |
| help topics | 4096 |
| keyword-topic relation | 4096 |
| Leap seconds information for time zones | 4096 |
| Master Information | 4096 |
| MySQL Foreign Servers table | 4096 |
| MySQL plugins | 4096 |
| Procedure privileges | 4096 |
| Relay Log Information | 4096 |
| Slow log | 4096 |
| Stored Procedures | 4096 |
| Table privileges | 4096 |
| Time zone names | 4096 |
| Time zone transition types | 4096 |
| Time zone transitions | 4096 |
| Time zones | 4096 |
| User defined functions | 4096 |
| User proxy privileges | 4096 |
| Users and global privileges | 4096 |
| Worker Information | 4096 |
+—————————————–+———-+
26 rows in set (0.94 sec)
可见,varchar长度短的情况下,基于这个字段的group by效率更高。所以开发在设计表的时候要使该字段在满足业务需求的情况下尽可能的小。
ps:本想找源码看看代码那是怎么处理的,直接用vim看文件太累了,还是把家里的ubuntu电脑带来,下载源码用eclipse跟吧。。。。吐槽微软的vs,,,,,,,太麻烦,还tm要安装windows最新版本
因为group by是需要排序的,上面的测试中也是只有排序的地方可能走的方式不同,所以我就直接在filesort.cc中filesort方法设置断点跟踪。
走到了下面的代码处:
while (memory_available >= min_sort_memory)
{
ha_rows keys= memory_available / (param.rec_length + sizeof(char*));
param.max_keys_per_buffer= (uint) min(num_rows, keys);
table_sort.alloc_sort_buffer(param.max_keys_per_buffer, param.rec_length);
这段代码中先计算每个buffer上key的最大数量,然后根据每个buffer上最大key的数量和记录的长度分配排序buffer,很有可能是这块不一样导致的效率不一致。在测试环境中table_comment的定义长度是2048的时候,param.rec_length是1030,table_comment的长度是39的时候,rec_length87是
继续跟踪代码到了分配buffer的地方
if (m_idx_array.is_null())
{
uchar **sort_keys=
(uchar**) my_malloc(num_records * (record_length + sizeof(uchar*)),
MYF(0));
m_idx_array= Idx_array(sort_keys, num_records);
m_record_length= record_length;
uchar **start_of_data= m_idx_array.array() + m_idx_array.size();
m_start_of_data= reinterpret_cast(start_of_data);
}
看到my_malloc的分配是通过记录数乘以rec_length的长度分配的,下面是一个数据结构
class Sort_param {
public:
uint rec_length; // Length of sorted records.
uint sort_length; // Length of sorted columns.
uint ref_length; // Length of record ref.
uint addon_length; // Length of added packed fields.
uint res_length; // Length of records in final sorted file/buffer.
uint max_keys_per_buffer; // Max keys / buffer.
ha_rows max_rows; // Select limit, or HA_POS_ERROR if unlimited.
ha_rows examined_rows; // Number of examined rows.
TABLE *sort_form; // For quicker make_sortkey.
SORT_FIELD *local_sortorder;
SORT_FIELD *end;
SORT_ADDON_FIELD *addon_field; // Descriptors for companion fields.
uchar *unique_buff;
bool not_killable;
计算排序字段长度的算法如下:
uint
sortlength(THD *thd, SORT_FIELD *sortorder, uint s_length,
bool *multi_byte_charset)
{
uint total_length= 0;
const CHARSET_INFO *cs;
*multi_byte_charset= false;
for (; s_length-- ; sortorder++)
{
sortorder->need_strxnfrm= 0;
sortorder->suffix_length= 0;
if (sortorder->field)
{
cs= sortorder->field->sort_charset();
sortorder->length= sortorder->field->sort_length(); //从这来
if (use_strnxfrm((cs=sortorder->field->sort_charset())))
{
sortorder->need_strxnfrm= 1;
*multi_byte_charset= 1;
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
}
if (sortorder->field->maybe_null())
total_length++; // Place for NULL marker
if (sortorder->field->result_type() == STRING_RESULT &&
!sortorder->field->is_temporal())
{
set_if_smaller(sortorder->length, thd->variables.max_sort_length);
}
}
else
{
sortorder->result_type= sortorder->item->result_type();
if (sortorder->item->is_temporal())
sortorder->result_type= INT_RESULT;
switch (sortorder->result_type) {
case STRING_RESULT:
sortorder->length= sortorder->item->max_length;
set_if_smaller(sortorder->length, thd->variables.max_sort_length);
if (use_strnxfrm((cs=sortorder->item->collation.collation)))
{
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
sortorder->need_strxnfrm= 1;
*multi_byte_charset= 1;
}
else if (cs == &my_charset_bin)
{
/* Store length last to be able to sort blob/varbinary */
sortorder->suffix_length= suffix_length(sortorder->length);
sortorder->length+= sortorder->suffix_length;
}
break;
case INT_RESULT:
#if SIZEOF_LONG_LONG > 4
sortorder->length=8; // Size of intern longlong
#else
sortorder->length=4;
#endif
break;
case DECIMAL_RESULT:
sortorder->length=
my_decimal_get_binary_size(sortorder->item->max_length -
(sortorder->item->decimals ? 1 : 0),
sortorder->item->decimals);
break;
case REAL_RESULT:
sortorder->length=sizeof(double);
break;
case ROW_RESULT:
default:
// This case should never be choosen
DBUG_ASSERT(0);
break;
}
if (sortorder->item->maybe_null)
total_length++; // Place for NULL marker
}
total_length+= sortorder->length;
}
sortorder->field= NULL; // end marker
DBUG_PRINT("info",("sort_length: %u", total_length));
return total_length;
}
从上面看到,对于排序字段长度的不同导致申请排序空间不同,这样导致了2者效率上的不同