innodb对B树游标的定位过程以及对“小于(等于)B树最小记录”的特殊处理

 

innodbB树进行游标定位时,主要通过函数btr_cur_search_to_nth_level进行,该函数从根页开始向下层页迭代,直到指定的层级level,最终将B树游标定位在第一个大/小于(等于)tuple的位置,先不考虑页面latch、锁、自适应哈希索引、插入缓冲的影响,仅看B树游标定位:

UNIV_INTERN

void

btr_cur_search_to_nth_level(

/*========================*/

         dict_index_t*  index,       /*!< in: index */

         ulint          level,         /*!< in: the tree level of search */

         const dtuple_t*        tuple,        /*!< in: data tuple; NOTE: n_fields_cmp in

                                     tuple must be set so that it cannot get

                                     compared to the node ptr page number field! */

         ulint          mode,       /*!< in: PAGE_CUR_L, ...;

                                     Inserts should always be made using

                                     PAGE_CUR_LE to search the position! */

         ulint          latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with

                                     at most one of BTR_INSERT, BTR_DELETE_MARK,

                                     BTR_DELETE, or BTR_ESTIMATE;

                                     cursor->left_block is used to store a pointer

                                     to the left neighbor page, in the cases

                                     BTR_SEARCH_PREV and BTR_MODIFY_PREV;

                                     NOTE that if has_search_latch

                                     is != 0, we maybe do not have a latch set

                                     on the cursor page, we assume

                                     the caller uses his search latch

                                     to protect the record! */

         btr_cur_t*        cursor, /*!< in/out: tree cursor; the cursor page is

                                     s- or x-latched, but see also above! */

         ulint          has_search_latch,/*!< in: info on the latch mode the

                                     caller currently has on btr_search_latch:

                                     RW_S_LATCH, or 0 */

         const char*      file,  /*!< in: file name */

         ulint          line, /*!< in: line where called */

         mtr_t*               mtr) /*!< in: mtr */

{

 

         …

         //1.取得根页页号:

         page_cursor = btr_cur_get_page_cur(cursor);

         space = dict_index_get_space(index);

         page_no = dict_index_get_page(index);

         …

         //2.对于非叶子页,将游标定位模式标准化:

         switch (mode) {

         case PAGE_CUR_GE:

                   page_mode = PAGE_CUR_L;

                   break;

         case PAGE_CUR_G:

                   page_mode = PAGE_CUR_LE;

                   break;

         default:

                   page_mode = mode;

                   break;

         }

         …

         //3.开始B树迭代:

search_loop:            <----------------------------------------------------------------------------|

         buf_mode = BUF_GET;                                                                                  |

         rw_latch = RW_NO_LATCH;                                                                          |

retry_page_get:                                                                                                         |

         //3.1取得本层页面,首次为根页面                                                                |

         block = buf_page_get_gen(                                                                            |

                   space, zip_size, page_no, rw_latch, guess, buf_mode,                    |

                   file, line, mtr);                                                                                           |

         page = buf_block_get_frame(block);                                                             |

         …                                                                                                                        |

         if (height == 0) {                                                                                                 |

                   //叶子页需要恢复游标定位模式                                                            |

                   page_mode = mode;                                                                              |

         }                                                                                                                           |

         //3.2在本层页面进行游标定位:                                                                     |

         page_cur_search_with_match(                                                                       |

                   block, index, tuple, page_mode, &up_match, &up_bytes,                 |

                   &low_match, &low_bytes, page_cursor);                                             |

         //3.3若未到达指定level,则向下一层迭代:                                                 |

         if (level != height) {                                                                                             |

                   const rec_t*    node_ptr;                                                                          |

                   height--;                                                                                                     |

                   //去下层子页页号                                                                                     |

                   node_ptr = page_cur_get_rec(page_cursor);                                      |

                   offsets = rec_get_offsets(                                                                       |

                            node_ptr, index, offsets, ULINT_UNDEFINED, &heap);            |

                   page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);     |

                   goto search_loop;-------------------------------------------------------------------|

         }

         //3.4若到达指定level,则完成定位:

         if (level != 0) {

                   buf_block_t*    child_block = btr_block_get(

                            space, zip_size, page_no, RW_X_LATCH, index, mtr);

                   page = buf_block_get_frame(child_block);

                   btr_assert_not_corrupted(child_block, index);

         } else {

                   cursor->low_match = low_match;

                   cursor->low_bytes = low_bytes;

                   cursor->up_match = up_match;

                   cursor->up_bytes = up_bytes;

         }

}

值得注意的是,通过page_cur_search_with_match对非叶子页索引项进行二分搜索时,并不对得到的结果进行检查,而是直接将结果记录的最后一个字段作为下层页的页号。

 页面二分搜索函数page_cur_search_with_match根据传入tuple大小和比较模式,将页面游标定位在在第一个大/小于(等于)tuple的页面位置

UNIV_INTERN

void

page_cur_search_with_match(

/*=======================*/

         const buf_block_t*  block,        /*!< in: buffer block */

         const dict_index_t*          index,       /*!< in: record descriptor */

         const dtuple_t*                 tuple,        /*!< in: data tuple */

         ulint                    mode,       /*!< in: PAGE_CUR_L,

                                               PAGE_CUR_LE, PAGE_CUR_G, or

                                               PAGE_CUR_GE */

         ulint*                           iup_matched_fields,

                                               /*!< in/out: already matched

                                               fields in upper limit record */

         ulint*                           iup_matched_bytes,

                                               /*!< in/out: already matched

                                               bytes in a field not yet

                                               completely matched */

         ulint*                           ilow_matched_fields,

                                               /*!< in/out: already matched

                                               fields in lower limit record */

         ulint*                           ilow_matched_bytes,

                                               /*!< in/out: already matched

                                               bytes in a field not yet

                                               completely matched */

         page_cur_t*             cursor)     /*!< o

{

         //1.初始化搜索

         up_matched_fields  = *iup_matched_fields;

         up_matched_bytes   = *iup_matched_bytes;

         low_matched_fields = *ilow_matched_fields;

         low_matched_bytes  = *ilow_matched_bytes;

         //2.首先进行页面目录的二分搜索,lowinfimum记录的页面目录槽,而upsupremum记录的页面目录槽

         low = 0;

         up = page_dir_get_n_slots(page) - 1;

         //3.开始页面目录的二分搜索,这是一个非常直接的二分搜索过程

         while (up - low > 1) {

                   mid = (low + up) / 2;

                   slot = page_dir_get_nth_slot(page, mid);

                   mid_rec = page_dir_slot_get_rec(slot);

                   offsets = rec_get_offsets(mid_rec, index, offsets,

                                                 dtuple_get_n_fields_cmp(tuple),

                                                 &heap);

                   //用于记录与tuple比较的函数

                   cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,

                                                        &cur_matched_fields,

                                                        &cur_matched_bytes);

                   if (UNIV_LIKELY(cmp > 0)) {

low_slot_match:

                            low = mid;

                            low_matched_fields = cur_matched_fields;

                            low_matched_bytes = cur_matched_bytes;

                   } else if (UNIV_EXPECT(cmp, -1)) {

up_slot_match:

                            up = mid;

                            up_matched_fields = cur_matched_fields;

                            up_matched_bytes = cur_matched_bytes;

                   } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE) {

                            goto low_slot_match;

                   } else {

                            goto up_slot_match;

                   }

         }

         //4.此时low_recup_rec是两个相邻的页面目录槽指向的记录,然后在low_recup_rec之间进行线性搜索

         slot = page_dir_get_nth_slot(page, low);

         low_rec = page_dir_slot_get_rec(slot);

         slot = page_dir_get_nth_slot(page, up);

         up_rec = page_dir_slot_get_rec(slot);

         //5.low_recup_rec之间进行线性搜索

         while (page_rec_get_next_const(low_rec) != up_rec) {

                   mid_rec = page_rec_get_next_const(low_rec);

                   offsets = rec_get_offsets(mid_rec, index, offsets,

                                                 dtuple_get_n_fields_cmp(tuple),

                                                 &heap);

                   //用于记录与tuple比较的函数

                   cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,

                                                        &cur_matched_fields,

                                                        &cur_matched_bytes);

                   if (UNIV_LIKELY(cmp > 0)) {

low_rec_match:

                            low_rec = mid_rec;

                            low_matched_fields = cur_matched_fields;

                            low_matched_bytes = cur_matched_bytes;

                   } else if (UNIV_EXPECT(cmp, -1)) {

up_rec_match:

                            up_rec = mid_rec;

                            up_matched_fields = cur_matched_fields;

                            up_matched_bytes = cur_matched_bytes;

                   } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE) {

                            goto low_rec_match;

                   } else {

                            goto up_rec_match;

                   }

         }

         //6.将游标正确的定位到页面记录上,同时记录比较结果

         if (mode <= PAGE_CUR_GE) {

                   page_cur_position(up_rec, block, cursor);

         } else {

                   page_cur_position(low_rec, block, cursor);

         }

         *iup_matched_fields  = up_matched_fields;

         *iup_matched_bytes   = up_matched_bytes;

         *ilow_matched_fields = low_matched_fields;

         *ilow_matched_bytes  = low_matched_bytes;

}

 

通常,我们会认为记录比较函数cmp_dtuple_rec_with_match将直接比较tuple与页面记录的大小,并将结果返回:

tuple大于页面记录时返回1tuple等于页面记录时返回0tuple小于页面记录时返回-1,这样就造成了一个误解,若tuple 小于当前B树最小记录,如果使用PAGE_CUR_L|PAGE_CUR_LE定位游标,这回导致low_rec定位到页面的infimum,游标也将定位到这里,从而导致page_cur_search_with_matchbtr_cur_search_to_nth_level返回时,将游标定位到infimum,对于非叶子页,若未到达指定levelbtr_cur_search_to_nth_level之后会从这条infimum记录中取下一层页面的页号,但是infimum记录中显然不会有下层页号,所以这里有一个BUG

可是当真是这样吗?innodb开发成员不会注意不到这样一个基础性的细节,那么问题在哪里呢?

问题来源于我们的想当然,即比较函数cmp_dtuple_rec_with_match的处理过程,它并不是按照我们想象的进行的:

 

UNIV_INTERN

int

cmp_dtuple_rec_with_match_low(

/*==========================*/

         const dtuple_t*        dtuple,     /*!< in: data tuple */

         const rec_t*    rec,  /*!< in: physical record which differs from

                                     dtuple in some of the common fields, or which

                                     has an equal number or more fields than

                                     dtuple */

         const ulint*      offsets,/*!< in: array returned by rec_get_offsets() */

         ulint          n_cmp,     /*!< in: number of fields to compare */

         ulint*                 matched_fields, /*!< in/out: number of already completely

                                     matched fields; when function returns,

                                     contains the value for current comparison */

         ulint*                 matched_bytes) /*!< in/out: number of already matched

                                     bytes within the first field not completely

                                     matched; when function returns, contains the

                                     value for current comparison */

{

         //1.比较开始前的初始化

         cur_field = *matched_fields;

         cur_bytes = *matched_bytes;

         //2.进行一些特殊检查,也即前面问题的根源:

         if (cur_bytes == 0 && cur_field == 0) {

                   ulint rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets));

                   ulint tup_info = dtuple_get_info_bits(dtuple);

                   if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {

                            //注意此处,对于非叶子页的最左记录,其上有一个标志REC_INFO_MIN_REC_FLAG,而叶子页没有这个标志,若tuple也有这个标志,则tuple与页面记录相等,若没有这个标志,则即使tuple的实际值小于页面记录,该函数仍然会返回1,即tuple大于页面记录。这便会令btr_cur_search_to_nth_level完成对“小于(等于)B树最小记录”的定位,沿着每层B树最左页向下,最终到达叶子页,由于最左叶子页最左记录没有这个标志,对于插入等操作就可以正确定位到最左叶子页的infimum,插入动作也可以正确的进行了

                            ret = !(tup_info & REC_INFO_MIN_REC_FLAG);

                            goto order_resolved;

                   } else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {

                            ret = -1;

                            goto order_resolved;

                   }

         }

         //3.进行记录各个字段的比较,不详细分析:

         while (cur_field < n_cmp) {

                   ulint mtype;

                   ulint prtype;

                   dtuple_field = dtuple_get_nth_field(dtuple, cur_field);

                   {

                            const dtype_t*         type = dfield_get_type(dtuple_field);

                            mtype = type->mtype;

                            prtype = type->prtype;

                   }

                   dtuple_f_len = dfield_get_len(dtuple_field);

                   rec_b_ptr = rec_get_nth_field(rec, offsets, cur_field, &rec_f_len);

 

                   if (UNIV_LIKELY(cur_bytes == 0)) {

                            if (rec_offs_nth_extern(offsets, cur_field)) {

                                     //不处理外部存储字段

                                     ret = 0;

                                     goto order_resolved;

                            }

 

                            if (dtuple_f_len == UNIV_SQL_NULL) {

                                     //tupleNULL字段小于等于页面记录字段

                                     if (rec_f_len == UNIV_SQL_NULL) {

                                               goto next_field;

                                     }

                                     ret = -1;

                                     goto order_resolved;

                            } else if (rec_f_len == UNIV_SQL_NULL) {

                                     //页面记录的NULL字段小于等于tuple的字段

                                     ret = 1;

                                     goto order_resolved;

                            }

                   }

 

                   //FLOAT等类型的比较,安装类型进行比较

                   if (mtype >= DATA_FLOAT

                       || (mtype == DATA_BLOB

                            && 0 == (prtype & DATA_BINARY_TYPE)

                            && dtype_get_charset_coll(prtype)

                            != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {

                            ret = cmp_whole_field(

                                     mtype, prtype,

                                     static_cast<const byte*>(dfield_get_data(dtuple_field)),

                                              (unsigned) dtuple_f_len, rec_b_ptr, (unsigned) rec_f_len);

                            if (ret != 0) {

                                     cur_bytes = 0;

                                     goto order_resolved;

                            } else {

                                     goto next_field;

                            }

                   }

 

                   //其它类型的比较,遍历每一个字节比较

                   rec_b_ptr = rec_b_ptr + cur_bytes;

                   dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field)

                            + cur_bytes;

                   for (;;) {

                            if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) {

                                     if (dtuple_f_len <= cur_bytes) {

                                               goto next_field;

                                     }

                                     rec_byte = dtype_get_pad_char(mtype, prtype);

                                     if (rec_byte == ULINT_UNDEFINED) {

                                               ret = 1;

                                               goto order_resolved;

                                     }

                            } else {

                                     rec_byte = *rec_b_ptr;

                            }

                            if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) {

                                     dtuple_byte = dtype_get_pad_char(mtype,

                                                                            prtype);

 

                                     if (dtuple_byte == ULINT_UNDEFINED) {

                                               ret = -1;

 

                                               goto order_resolved;

                                     }

                            } else {

                                     dtuple_byte = *dtuple_b_ptr;

                            }

 

                            if (dtuple_byte == rec_byte) {

                                     /* If the bytes are equal, they will

                                     remain such even after the collation

                                     transformation below */

 

                                     goto next_byte;

                            }

 

                            if (mtype <= DATA_CHAR

                                || (mtype == DATA_BLOB

                                     && !(prtype & DATA_BINARY_TYPE))) {

 

                                     rec_byte = cmp_collate(rec_byte);

                                     dtuple_byte = cmp_collate(dtuple_byte);

                            }

 

                            ret = (int) (dtuple_byte - rec_byte);

                            if (UNIV_LIKELY(ret)) {

                                     if (ret < 0) {

                                               ret = -1;

                                               goto order_resolved;

                                     } else {

                                               ret = 1;

                                               goto order_resolved;

                                     }

                            }

next_byte:

                            /* Next byte */

                            cur_bytes++;

                            rec_b_ptr++;

                            dtuple_b_ptr++;

                   }

 

next_field:

                   cur_field++;

                   cur_bytes = 0;

         }

 

         ret = 0;

order_resolved:

         *matched_fields = cur_field;

         *matched_bytes = cur_bytes;

         return(ret);

}

 

综上所述,btr_cur_search_to_nth_level进行B树层次迭代,page_cur_search_with_match进行页面二分搜索(页面目录槽二分搜索,相邻槽间线性搜索),cmp_dtuple_rec_with_match_low进行记录比较,特殊之处在于对“小于(等于)B树最小记录”这种模式的定位,cmp_dtuple_rec_with_match_low会针对叶子页和非叶子页进行特殊处理。

 

 

 

 

 

 

 

你可能感兴趣的:(InnoDB)