从上一篇文章了解到InnoDB的查询最终都是通过row_search_mvcc执行的,接下来就略微详细地分析一下row_search_mvcc的执行流程。本文主要分析查询逻辑,忽略了部分细节,比如行锁、表锁等处理,以及数据库事务的快照查询等。由于本人是Java开发,并不擅长C++,只是阅读代码而没有进行调试,分析细节如有不对的地方还请指正~
1.1、基本参数定义
dict_index_t *index = prebuilt->index; //索引
const dtuple_t *search_tuple = prebuilt->search_tuple; //索引查询条件
btr_pcur_t *pcur = prebuilt->pcur; //当前游标
dict_index_t *clust_index; //聚集索引(主键索引)
Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql; //通过二级索引获取主键索引
1.2、如果是索引查询,从顶至下遍历B+Tree,查找指定的叶子节点
btr_pcur_open_with_no_init(index, search_tuple, mode, BTR_SEARCH_LEAF, pcur,0, &mtr);
->#define btr_pcur_open_with_no_init(i, t, md, l, p, has, m) (p)->open_no_init((i), (t), (md), (l), (has), (m), __FILE__, __LINE__)
->auto cur = get_btr_cur();
->btr_cur_search_to_nth_level(index, m_read_level, tuple, mode, latch_mode, cur, has_search_latch, file, line, mtr);
//获取游标当前指向的索引记录
rec = btr_pcur_get_rec(pcur);
->#define btr_pcur_get_rec(p) (p)->get_rec()
1.3、如果是全表扫描或者全索引扫描,从左至右或者从右至左遍历叶子节点(后面再详细介绍innodb是如何遍历B+Tree的)
//尚未放置cursor,将cursor置于B+Tree的一端
btr_pcur_open_at_index_side(mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF, pcur, false, 0, &mtr);
->#define btr_pcur_open_at_index_side(e, i, lm, p, ip, lv, m) (p)->open_at_side((e), (i), (lm), (ip), (lv), (m))
->m_search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
->btr_cur_open_at_index_side(from_left, index, latch_mode, get_btr_cur(), level, mtr); //level 0是叶子节点
->#define btr_cur_open_at_index_side(f, i, l, c, lv, m) btr_cur_open_at_index_side_func(f, i, l, c, lv, __FILE__, __LINE__, m)
1.4、查询出来的record可能并不满足条件,需要循环遍历,所以这里添加rec_loop标签(goto label),取到下一条record后跳回此处继续处理
rec_loop: //循环查找
rec = btr_pcur_get_rec(pcur); //获取record
->#define btr_pcur_get_rec(p) (p)->get_rec()
//如果是该页的第一条记录(infimum伪记录,不是真实数据),跳到下一条
if (page_rec_is_infimum(rec)) {
goto next_rec; //取下一条
}
//如果是该页的最后一条记录(supremum伪记录,同样不是真实数据),同样跳到next_rec,由move_to_next负责切换到下一页
if (page_rec_is_supremum(rec)) {
goto next_rec;
}
1.5、如果当前使用的索引不是聚簇索引(即二级索引),并且需要访问聚簇索引(need_to_access_clustered),则通过二级索引记录去查主键索引记录(和btr_cur_search_to_nth_level差不多)
if (index != clust_index && prebuilt->need_to_access_clustered) {
requires_clust_rec:
err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, thr, &clust_rec, &offsets, &heap, need_vrow ? &vrow : nullptr, &mtr, prebuilt->get_lob_undo());
result_rec = clust_rec;
}
1.6、如果满足条件则退出
idx_cond_failed:
goto normal_return;
1.7、上面取到的record如果不满足条件会跳到next_rec锚点,查找下一条记录并跳到rec_loop处理
next_rec:
move = btr_pcur_move_to_next(pcur, &mtr);
->#define btr_pcur_move_to_next(p, m) (p)->move_to_next(m)
goto rec_loop;
1.8、取本页的下一条记录;如果当前已是本页的最后一条,则跳到下一页(下面会具体分析Page是如何加载的)
inline bool btr_pcur_t::move_to_next(mtr_t *mtr) {
if (is_after_last_on_page()) {
move_to_next_page(mtr); //取下一页
return (true);
}
move_to_next_on_page(); //取本页下一条
->page_cur_move_to_next(get_page_cur());
->cur->rec = page_rec_get_next(cur->rec);
->return ((rec_t *)page_rec_get_next_low(rec, page_rec_is_comp(rec)));
->const page_t *page;
->page = page_align(rec);
->return ((page_t *)ut_align_down(ptr, UNIV_PAGE_SIZE));
->offs = rec_get_next_offs(rec, comp);
->ulint field_value;
->field_value = mach_read_from_2(rec - REC_NEXT);
->return (field_value);
->return (page + offs);
return (true);
}
2.1、innodb对B树进行游标定位,该函数从根页开始向下层页迭代,直到指定的层级level,最终将B树游标定位在第一个大/小于(等于)tuple的位置
void btr_cur_search_to_nth_level(dict_index_t *index, ulint level, const dtuple_t *tuple, page_cur_mode_t mode, ulint latch_mode, btr_cur_t *cursor, ulint has_search_latch, const char *file, ulint line, mtr_t *mtr){
page_cursor = btr_cur_get_page_cur(cursor);
->#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
const space_id_t space = dict_index_get_space(index);
page_id_t page_id(space, dict_index_get_page(index));
->return (index->page);
height = ULINT_UNDEFINED;
switch (mode) {
case PAGE_CUR_GE:
case PAGE_CUR_G:
}
search_loop:
retry_page_get:
//从磁盘或内存中获取page_id当前页
block = buf_page_get_gen(page_id, page_size, rw_latch, guess, fetch, file, line, mtr);
tree_blocks[n_blocks] = block;
page = buf_block_get_frame(block);
//刚进去没有初始化层级
if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
height = btr_page_get_level(page, mtr);//获取当前页的层级
root_height = height;
}
//通过二分查找法找到第一个匹配的record,存入page_cursor
page_cur_search_with_match(block, index, tuple, page_mode, &up_match, &low_match, page_cursor, need_path ? cursor->rtr_info : nullptr);
//未到达指定层
if (level != height) {
height--;
node_ptr = page_cur_get_rec(page_cursor);
->#define page_cur_get_rec(cur) (cur)->rec
offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
->#define rec_get_offsets(rec, index, offsets, n, heap) rec_get_offsets_func(rec, index, offsets, n, __FILE__, __LINE__, heap)
->rec_offs_set_n_fields(offsets, n);
->rec_init_offsets(rec, index, offsets);
->return (offsets);
//获取孩子节点页
page_id.reset(space, btr_node_ptr_get_child_page_no(node_ptr, offsets));
->page_no_t btr_node_ptr_get_child_page_no(const rec_t *rec, const ulint *offsets)
->field = rec_get_nth_field(rec, offsets, rec_offs_n_fields(offsets) - 1, &len); //最后一个field是孩子节点的地址
->page_no = mach_read_from_4(field);
->return (page_no);
n_blocks++;
goto search_loop;
}
}
2.2、在页内查询匹配的record。查找过程不会跨页,说明B+Tree的一个内节点对应一个页,其孩子节点的数量上限由一页的大小决定
//1、在一个页内查找第一个匹配值
//2、先通过二分查找法找到第一个匹配的slot
//3、再遍历slot里面的record,找到第一个匹配的record
//4、如果正序则是最小值,如果降序是最大值
//5、通过cursor返回结果
void page_cur_search_with_match(const buf_block_t *block, const dict_index_t *index, const dtuple_t *tuple, page_cur_mode_t mode, ulint *iup_matched_fields, ulint *ilow_matched_fields, page_cur_t *cursor, rtr_info_t *rtr_info) {
const page_t *page;
const page_dir_slot_t *slot;
const rec_t *up_rec;
const rec_t *low_rec;
const rec_t *mid_rec;
//首先进行页面目录的二分搜索,low为infimum记录的页面目录槽,而up为supremum记录的页面目录槽
low = 0;
up = page_dir_get_n_slots(page) - 1;
while (up - low > 1) {
mid = (low + up) / 2;
cmp = tuple->compare(mid_rec, index, offsets, &cur_matched_fields);
if (cmp > 0) {
low = mid;
} else if (cmp) {
up = mid;
}
}
//同一个槽,在low_rec与up_rec之间进行线性搜索
slot = page_dir_get_nth_slot(page, low);
low_rec = page_dir_slot_get_rec(slot);
slot = page_dir_get_nth_slot(page, up);
up_rec = page_dir_slot_get_rec(slot);
while (page_rec_get_next_const(low_rec) != up_rec) {
mid_rec = page_rec_get_next_const(low_rec);
offsets = rec_get_offsets(mid_rec, index, offsets, dtuple_get_n_fields_cmp(tuple), &heap);
cmp = tuple->compare(mid_rec, index, offsets, &cur_matched_fields);
->return (cmp_dtuple_rec_with_match_low(this, rec, index, offsets, n_fields_cmp, matched_fields));
->for (auto i = *matched_fields; i < n_cmp; ++i) //matched_fields是待查询字段
->const auto dtuple_field = dtuple_get_nth_field(dtuple, i);
->return ((dfield_t *)tuple->fields + n);
->const auto dtuple_b_ptr = static_cast<const byte *>(dfield_get_data(dtuple_field));
->#define dfield_get_data(field) ((field)->data)
->const auto rec_b_ptr = rec_get_nth_field(rec, offsets, i, &rec_f_len);
->#define rec_get_nth_field(rec, offsets, n, len) ((rec) + rec_get_nth_field_offs(offsets, n, len))
->ulint rec_get_nth_field_offs(const ulint *offsets,ulint n,ulint *len)
->offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
->return (offs);
->ret = cmp_data(type->mtype, type->prtype, dict_index_is_ibuf(index) || index->get_field(i)->is_ascending, dtuple_b_ptr, dtuple_f_len, rec_b_ptr, rec_f_len);
->do while (cmp == 0 && len < len1);
->cmp = static_cast<int>(mach_read_from_1(&data1[len]) - pad);
->++len;
->return (is_asc ? cmp : -cmp);
->return (ret);
if (cmp > 0) {
low_rec_match:
low_rec = mid_rec;
low_matched_fields = cur_matched_fields;
} else if (cmp) {
}
}
if (mode <= PAGE_CUR_GE) {
page_cur_position(up_rec, block, cursor);
} else {
page_cur_position(low_rec, block, cursor);
->cur->rec = (rec_t *)rec;
->cur->block = (buf_block_t *)block;
}
}
3.1、算出下一页的页号next_page_no,从buf_pool_ptr找下一页,有则返回,如果没有,从磁盘加载下一页,并放入buf_pool_ptr
void btr_pcur_t::move_to_next_page(mtr_t *mtr) {
auto page = get_page();
auto next_page_no = btr_page_get_next(page, mtr);
->return (mach_read_from_4(page + FIL_PAGE_NEXT)); //#define FIL_PAGE_NEXT 12
auto next_block = btr_block_get(page_id_t(block->page.id.space(), next_page_no), block->page.size, mode, get_btr_cur()->index, mtr);
->#define btr_block_get(page_id, page_size, mode, index, mtr) btr_block_get_func(page_id, page_size, mode, __FILE__, __LINE__, index, mtr)
->block = buf_page_get_gen(page_id, page_size, mode, nullptr, Page_fetch::NORMAL, file, line, mtr);
->const page_size_t &space_page_size = fil_space_get_page_size(page_id.space(), &found);
->Buf_fetch_normal fetch(page_id, page_size);
->m_buf_pool(buf_pool_get(m_page_id))
->return (&buf_pool_ptr[i]);
->fetch.m_file = file;
->fetch.m_mtr = mtr;
->return (fetch.single_page());
auto next_page = buf_block_get_frame(next_block);
->#define buf_block_get_frame(block) (block)->frame
}
3.2、调用lookup()从缓存中加载page,如果不存在调用read_page()从磁盘中加载
template <typename T> buf_block_t *Buf_fetch<T>::single_page() {
buf_block_t *block;
for (;;) {
static_cast<T *>(this)->get(block)
->dberr_t Buf_fetch_normal::get(buf_block_t *&block)
//
->block = lookup(); //template buf_block_t *Buf_fetch::lookup()
->if (block != nullptr)
->buf_block_fix(block);
->break;
->read_page();
break;
}
//页的加载是通过异步线程执行的,所以这里需要等待加载结果
buf_wait_for_read(block);
//将页(block)记入mtr的memo中,由mini-transaction记录本次持有的页
mtr_add_page(block);
->mtr_memo_push(m_mtr, block, fix_type);
->#define mtr_memo_push(m, o, t) (m)->memo_push(o, t)
return (block);
}
3.3、sync为真时调用buf_read_page同步加载,否则调用buf_read_page_low异步加载
template <typename T> void Buf_fetch<T>::read_page() {
if (sync) {
success = buf_read_page(m_page_id, m_page_size);
} else {
auto ret = buf_read_page_low(&err, false, 0, BUF_READ_ANY_PAGE, m_page_id, m_page_size, false);
}
buf_read_ahead_random(m_page_id, m_page_size, ibuf_inside(m_mtr));
}
3.4、假设通过同步方式加载page
1)磁盘中读取的页存储在buf_block_t的frame中
2)bpage->frame是buf_pool_ptr分配的
3)可以将页缓存强制类型转换成buf_block_t和buf_page_t(结构一致,只是关注点不一样)
bool buf_read_page(const page_id_t &page_id, const page_size_t &page_size) {
count = buf_read_page_low(&err, true, 0, BUF_READ_ANY_PAGE, page_id, page_size, false);
->buf_page_t *bpage;
->ibuf_bitmap_page(page_id, page_size)
->bpage = buf_page_init_for_read(err, mode, page_id, page_size, unzip);
->buf_block_t *block;
->buf_pool_t *buf_pool = buf_pool_get(page_id);
->return (&buf_pool_ptr[i]);
->buf_page_t *bpage = nullptr;
->bpage = buf_page_alloc_descriptor();
->bpage = (buf_page_t *)ut_zalloc_nokey(sizeof *bpage);
->data = buf_buddy_alloc(buf_pool, page_size.physical());
->return (static_cast<byte *>(buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size))));
->block = buf_LRU_get_free_only(buf_pool);
->block = reinterpret_cast<buf_block_t *>(UT_LIST_GET_FIRST(buf_pool->free));
->UT_LIST_REMOVE(buf_pool->free, &block->page);
->buf_buddy_block_register(block);
->buf_pool_t *buf_pool = buf_pool_from_block(block);
->HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
->cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));
->cell3333->node = DATA;
->buf_pool->buddy_n_frames++
->void *dst;
->dst = ((buf_block_t *)bpage)->frame;
->IORequest request(type | IORequest::READ);
->*err = fil_io(request, sync, page_id, page_size, 0, page_size.physical(), dst, bpage);
->auto shard = fil_system->shard_by_id(page_id.space());
->return (shard->do_io(type, sync, page_id, page_size, byte_offset, len, buf, message));
->if (sync) //如果是同步,暂停等待查询结果
->thd_wait_end(nullptr);
->#define thd_wait_end(_THD) thd_wait_service->thd_wait_end_func(_THD)
->buf_page_io_complete(bpage, false)
->
}
3.5、根据page_no 和page_size计算文件的偏移地址,将page读入buf缓存中
dberr_t Fil_shard::do_io(const IORequest &type, bool sync, const page_id_t &page_id, const page_size_t &page_size, ulint byte_offset, ulint len, void *buf, void *message) {
fil_space_t *space;
bool slot = mutex_acquire_and_get_space(page_id.space(), space);
fil_node_t *file;
auto page_no = page_id.page_no();
auto err = get_file_for_io(req_type, space, &page_no, file);
bool opened = prepare_file_for_io(file, false);
fil_io_set_encryption(req_type, page_id, space);
auto offset = (os_offset_t)page_no * page_size.physical(); //第page_no页的文件偏移地址
err = os_file_read(req_type, file->name, file->handle, buf, offset, len);
->#define os_file_read(type, file_name, file, buf, offset, n) os_file_read_pfs(type, file_name, file.m_file, buf, offset, n)
->#define os_file_read_pfs(type, file_name, file, buf, offset, n) os_file_read_func(type, file_name, file, buf, offset, n, __FILE__, __LINE__)
->return (os_file_read_page(type, file_name, file, buf, offset, n, nullptr, true));
err = os_aio(req_type, aio_mode, file->name, file->handle, buf, offset, len, fsp_is_system_temporary(page_id.space()) ? false : srv_read_only_mode, file, message);
->#define os_aio(type, mode, name, file, buf, offset, n, read_only, message1, message2) os_aio_func(type, mode, name, file, buf, offset, n, read_only, message1, message2)
->return (os_file_read_func(type, name, file.m_file, buf, offset, n));
->return (os_file_read_page(type, file_name, file, buf, offset, n, nullptr, true));
if (sync) {
complete_io(file, req_type);
}
}
static MY_ATTRIBUTE((warn_unused_result)) dberr_t os_file_read_page(IORequest &type, const char *file_name, os_file_t file, void *buf, os_offset_t offset, ulint n, ulint *o, bool exit_on_err) {
for (;;) {
n_bytes = os_file_pread(type, file, buf, n, offset, &err);
->ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
->SyncFileIO sync_file_io(file, buf, n, offset);
}
}
static MY_ATTRIBUTE((warn_unused_result)) ssize_t os_file_io(const IORequest &in_type, os_file_t file, void *buf, ulint n, os_offset_t offset, dberr_t *err) {
IORequest type = in_type;
SyncFileIO sync_file_io(file, buf, n, offset);
for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
ssize_t n_bytes = sync_file_io.execute(type);
->n_bytes = pread(m_fh, m_buf, m_n, m_offset);
sync_file_io.advance(n_bytes);
}
}