【数据库篇】MySQL源码分析之row_search_mvcc详细分析(Page加载及索引分析)

从上一篇文章了解到,InnoDB的查询最终都是通过row_search_mvcc执行的,接下来就较为详细地分析一下row_search_mvcc的执行流程。本文主要分析查询逻辑,忽略了部分细节,比如行锁、表锁等处理,以及数据库事务的快照查询等。由于本人是Java开发,并不擅长C++,只是阅读代码而没有进行调试,分析细节不对的地方还请指正哈~

一、先理清楚row_search_mvcc的主要流程

1.1、基本参数定义

dict_index_t *index = prebuilt->index; //索引
const dtuple_t *search_tuple = prebuilt->search_tuple; //索引查询条件
btr_pcur_t *pcur = prebuilt->pcur; //当前游标
dict_index_t *clust_index; //聚集索引(主键索引)
Row_sel_get_clust_rec_for_mysql row_sel_get_clust_rec_for_mysql; //通过二级索引获取主键索引

1.2、如果是索引查询,从顶至下遍历B+Tree,查找指定的叶子节点

btr_pcur_open_with_no_init(index, search_tuple, mode, BTR_SEARCH_LEAF, pcur,0, &mtr);
->#define btr_pcur_open_with_no_init(i, t, md, l, p, has, m) (p)->open_no_init((i), (t), (md), (l), (has), (m), __FILE__, __LINE__)
  ->auto cur = get_btr_cur();
  ->btr_cur_search_to_nth_level(index, m_read_level, tuple, mode, latch_mode, cur, has_search_latch, file, line, mtr);
//获取游标(pcur)当前指向的索引记录
rec = btr_pcur_get_rec(pcur);
->#define btr_pcur_get_rec(p) (p)->get_rec()

1.3、如果是全表扫描或者全索引扫描,从左至右或者从右至左遍历叶子节点(后面再详细介绍innodb是如何遍历B+Tree的)

//尚未放置cursor,将cursor置于B+Tree的一端
btr_pcur_open_at_index_side(mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF, pcur, false, 0, &mtr);
->#define btr_pcur_open_at_index_side(e, i, lm, p, ip, lv, m) (p)->open_at_side((e), (i), (lm), (ip), (lv), (m))
  ->m_search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
  ->btr_cur_open_at_index_side(from_left, index, latch_mode, get_btr_cur(), level, mtr);  //level 0是叶子节点
    ->#define btr_cur_open_at_index_side(f, i, l, c, lv, m) btr_cur_open_at_index_side_func(f, i, l, c, lv, __FILE__, __LINE__, m)

1.4、查询出来的record可能并不满足条件,需要循环遍历,所以这里添加rec_loop锚点,取到下一条record跳到此处

rec_loop:  //循环查找
rec = btr_pcur_get_rec(pcur);  //获取record
->#define btr_pcur_get_rec(p) (p)->get_rec()
//如果是该页的infimum记录(页内第一条伪记录,不是真实数据),跳到下一条
if (page_rec_is_infimum(rec)) {
    goto next_rec;  //取下一条
}
//如果是该页的supremum记录(页内最后一条伪记录,不是真实数据),同样跳到next_rec,由其负责移动到下一页
if (page_rec_is_supremum(rec)) {
	goto next_rec;
}

1.5、如果当前使用的索引不是聚簇索引(即二级索引),并且需要访问聚簇索引(need_to_access_clustered),则通过二级索引记录去查主键索引(回表,过程和btr_cur_search_to_nth_level差不多)

if (index != clust_index && prebuilt->need_to_access_clustered) {
requires_clust_rec:    
   err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, thr, &clust_rec, &offsets, &heap, need_vrow ? &vrow : nullptr, &mtr, prebuilt->get_lob_undo());
   result_rec = clust_rec;
}

1.6、如果满足条件则退出

idx_cond_failed:
   goto normal_return;

1.7、上面取到的record如果不满足条件会跳到next_rec锚点,查找下一条记录并跳到rec_loop处理

next_rec:
   move = btr_pcur_move_to_next(pcur, &mtr);
   ->#define btr_pcur_move_to_next(p, m) (p)->move_to_next(m)
   goto rec_loop;

1.8、取本页的下一条,如果是最后一条跳下一页(下面会具体分析Page是如何加载的)

inline bool btr_pcur_t::move_to_next(mtr_t *mtr) {
	if (is_after_last_on_page()) {
		move_to_next_page(mtr);  //取下一页
		return (true);
	}
	move_to_next_on_page(); //取本页下一条
    ->page_cur_move_to_next(get_page_cur());
      ->cur->rec = page_rec_get_next(cur->rec);
        ->return ((rec_t *)page_rec_get_next_low(rec, page_rec_is_comp(rec)));
          ->const page_t *page;
          ->page = page_align(rec);
            ->return ((page_t *)ut_align_down(ptr, UNIV_PAGE_SIZE));
          ->offs = rec_get_next_offs(rec, comp);
            ->ulint field_value; 
            ->field_value = mach_read_from_2(rec - REC_NEXT);
            ->return (field_value);
         ->return (page + offs); 
	return (true);
}
二、分析索引的查找逻辑

2.1、innodb对B树进行游标定位,该函数从根页开始向下层页迭代,直到指定的层级level,最终将B树游标定位在第一个大/小于(等于)tuple的位置

void btr_cur_search_to_nth_level(dict_index_t *index, ulint level, const dtuple_t *tuple, page_cur_mode_t mode, ulint latch_mode, btr_cur_t *cursor, ulint has_search_latch, const char *file, ulint line, mtr_t *mtr){
    page_cursor = btr_cur_get_page_cur(cursor);
    ->#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
    const space_id_t space = dict_index_get_space(index);
    page_id_t page_id(space, dict_index_get_page(index));
    ->return (index->page);
    
    height = ULINT_UNDEFINED;
    
    switch (mode) {
        case PAGE_CUR_GE:
        case PAGE_CUR_G:
    }
    
search_loop:
    
retry_page_get:
    //从磁盘或内存中获取page_id当前页
    block = buf_page_get_gen(page_id, page_size, rw_latch, guess, fetch, file, line, mtr);
    tree_blocks[n_blocks] = block;
    page = buf_block_get_frame(block);
    
    //刚进去没有初始化层级
    if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
        height = btr_page_get_level(page, mtr);//获取当前页的层级
        root_height = height;
    }
    
    //通过二分查找法找到第一个匹配的record,存入page_cursor
    page_cur_search_with_match(block, index, tuple, page_mode, &up_match, &low_match, page_cursor, need_path ? cursor->rtr_info : nullptr);
    
    //未到达指定层
    if (level != height) {
        height--;
        node_ptr = page_cur_get_rec(page_cursor);
        ->#define page_cur_get_rec(cur) (cur)->rec
        offsets = rec_get_offsets(node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
        ->#define rec_get_offsets(rec, index, offsets, n, heap) rec_get_offsets_func(rec, index, offsets, n, __FILE__, __LINE__, heap)
          ->rec_offs_set_n_fields(offsets, n);
          ->rec_init_offsets(rec, index, offsets);
          ->return (offsets);
        //获取孩子节点页 
        page_id.reset(space, btr_node_ptr_get_child_page_no(node_ptr, offsets));
        ->page_no_t btr_node_ptr_get_child_page_no(const rec_t *rec, const ulint *offsets)
          ->field = rec_get_nth_field(rec, offsets, rec_offs_n_fields(offsets) - 1, &len);  //最后一个field是孩子节点的地址
          ->page_no = mach_read_from_4(field);
          ->return (page_no);
        n_blocks++;
        goto search_loop;  
    }
}

2.2、在页内查询匹配的record。查找不会跨页进行,这说明B+Tree的一个内节点就是一页,其最多能容纳的孩子节点数受一页大小的限制

//1、在一个页内查找第一个匹配值
//2、先通过二分查找法找到第一个匹配的slot
//3、再遍历slot里面的record,找到第一个匹配的record
//4、如果正序则是最小值,如果降序是最大值
//5、通过cursor返回结果
void page_cur_search_with_match(const buf_block_t *block, const dict_index_t *index, const dtuple_t *tuple, page_cur_mode_t mode,  ulint *iup_matched_fields, ulint *ilow_matched_fields, page_cur_t *cursor, rtr_info_t *rtr_info) {
    const page_t *page;
    const page_dir_slot_t *slot;
    const rec_t *up_rec;
   const rec_t *low_rec;
   const rec_t *mid_rec;
    
    //首先进行页面目录的二分搜索,low为infimum记录的页面目录槽,而up为supremum记录的页面目录槽
   low = 0;
   up = page_dir_get_n_slots(page) - 1;
   while (up - low > 1) {
        mid = (low + up) / 2;
        cmp = tuple->compare(mid_rec, index, offsets, &cur_matched_fields);
        if (cmp > 0) {
            low = mid;
        } else if (cmp) {
            up = mid;
        }
    }
    
    //二分搜索结束后low与up是相邻的两个槽,在这两个槽对应的low_rec与up_rec之间进行线性搜索
    slot = page_dir_get_nth_slot(page, low);
    low_rec = page_dir_slot_get_rec(slot);
    slot = page_dir_get_nth_slot(page, up);
    up_rec = page_dir_slot_get_rec(slot);
    while (page_rec_get_next_const(low_rec) != up_rec) {
        mid_rec = page_rec_get_next_const(low_rec);
        offsets = rec_get_offsets(mid_rec, index, offsets, dtuple_get_n_fields_cmp(tuple), &heap);
        cmp = tuple->compare(mid_rec, index, offsets, &cur_matched_fields);
        ->return (cmp_dtuple_rec_with_match_low(this, rec, index, offsets, n_fields_cmp, matched_fields));
          ->for (auto i = *matched_fields; i < n_cmp; ++i)  //从*matched_fields指定的字段下标开始,逐个字段比较到n_cmp为止
            ->const auto dtuple_field = dtuple_get_nth_field(dtuple, i);
              ->return ((dfield_t *)tuple->fields + n);
            ->const auto dtuple_b_ptr = static_cast<const byte *>(dfield_get_data(dtuple_field));
              ->#define dfield_get_data(field) ((field)->data)
            ->const auto rec_b_ptr = rec_get_nth_field(rec, offsets, i, &rec_f_len);
              ->#define rec_get_nth_field(rec, offsets, n, len) ((rec) + rec_get_nth_field_offs(offsets, n, len))
                ->ulint rec_get_nth_field_offs(const ulint *offsets,ulint n,ulint *len) 
                  ->offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
                  ->return (offs);
            ->ret = cmp_data(type->mtype, type->prtype, dict_index_is_ibuf(index) || index->get_field(i)->is_ascending, dtuple_b_ptr, dtuple_f_len, rec_b_ptr, rec_f_len);
              ->do while (cmp == 0 && len < len1);
                ->cmp = static_cast<int>(mach_read_from_1(&data1[len]) - pad);
                ->++len;
              ->return (is_asc ? cmp : -cmp);  
            ->return (ret);
        if (cmp > 0) {
        low_rec_match:
            low_rec = mid_rec;
            low_matched_fields = cur_matched_fields;
        } else if (cmp) {
            
        }
    }
    
    if (mode <= PAGE_CUR_GE) {
      page_cur_position(up_rec, block, cursor);
    } else {
      page_cur_position(low_rec, block, cursor);
      ->cur->rec = (rec_t *)rec;
      ->cur->block = (buf_block_t *)block;
    }
}
三、分析Page的加载逻辑

3.1、算出下一页的页号next_page_no,从buf_pool_ptr找下一页,有则返回,如果没有,从磁盘加载下一页,并放入buf_pool_ptr

void btr_pcur_t::move_to_next_page(mtr_t *mtr) {
    auto page = get_page();
    auto next_page_no = btr_page_get_next(page, mtr);
    ->return (mach_read_from_4(page + FIL_PAGE_NEXT));  //#define FIL_PAGE_NEXT 12
    auto next_block = btr_block_get(page_id_t(block->page.id.space(), next_page_no), block->page.size, mode, get_btr_cur()->index, mtr);
    ->#define btr_block_get(page_id, page_size, mode, index, mtr) btr_block_get_func(page_id, page_size, mode, __FILE__, __LINE__, index, mtr)
      ->block = buf_page_get_gen(page_id, page_size, mode, nullptr, Page_fetch::NORMAL, file, line, mtr);
        ->const page_size_t &space_page_size = fil_space_get_page_size(page_id.space(), &found);
        ->Buf_fetch_normal fetch(page_id, page_size);
         ->m_buf_pool(buf_pool_get(m_page_id))
           ->return (&buf_pool_ptr[i]);
       ->fetch.m_file = file;
       ->fetch.m_mtr = mtr;
       ->return (fetch.single_page());
    auto next_page = buf_block_get_frame(next_block);
    ->#define buf_block_get_frame(block) (block)->frame
}

3.2、调用lookup()从缓存中加载page,如果不存在调用read_page()从磁盘中加载

template <typename T> buf_block_t *Buf_fetch<T>::single_page() {
    buf_block_t *block;
    for (;;) {
        static_cast<T *>(this)->get(block) 
        ->dberr_t Buf_fetch_normal::get(buf_block_t *&block) 
          //
          ->block = lookup();  //template  buf_block_t *Buf_fetch::lookup()
          ->if (block != nullptr)
            ->buf_block_fix(block);
            ->break;
          ->read_page();
        break;
    }
    //页的加载是通过异步线程执行的,所以这里需要等待加载结果
    buf_wait_for_read(block);
    //将页登记到当前mtr的memo中(见下面的mtr_memo_push)
    mtr_add_page(block);
    ->mtr_memo_push(m_mtr, block, fix_type);
      ->#define mtr_memo_push(m, o, t) (m)->memo_push(o, t)
    return (block);
}

3.3、调用buf_read_page同步加载,调用buf_read_page_low异步加载

template <typename T> void Buf_fetch<T>::read_page() {
    if (sync) {
      success = buf_read_page(m_page_id, m_page_size);
    } else {
      auto ret = buf_read_page_low(&err, false, 0, BUF_READ_ANY_PAGE, m_page_id, m_page_size, false);
    }
    buf_read_ahead_random(m_page_id, m_page_size, ibuf_inside(m_mtr));
}

3.4、假设通过同步方式加载page
1)磁盘中读取的页存储在buf_block_t的frame中
2)bpage->frame是buf_pool_ptr分配的
3)buf_page_t是内嵌在buf_block_t中的成员(即block->page),因此代码中会在buf_page_t*与buf_block_t*之间做强制类型转换(两者并非结构完全一致,buf_block_t额外包含frame等字段)

bool buf_read_page(const page_id_t &page_id, const page_size_t &page_size) {
    count = buf_read_page_low(&err, true, 0, BUF_READ_ANY_PAGE, page_id, page_size, false);
    ->buf_page_t *bpage; 
    ->ibuf_bitmap_page(page_id, page_size)
    ->bpage = buf_page_init_for_read(err, mode, page_id, page_size, unzip);
      ->buf_block_t *block;
      ->buf_pool_t *buf_pool = buf_pool_get(page_id);
        ->return (&buf_pool_ptr[i]);
      ->buf_page_t *bpage = nullptr;
      ->bpage = buf_page_alloc_descriptor();
       ->bpage = (buf_page_t *)ut_zalloc_nokey(sizeof *bpage);
     ->data = buf_buddy_alloc(buf_pool, page_size.physical());   
       ->return (static_cast<byte *>(buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size))));
         ->block = buf_LRU_get_free_only(buf_pool);
           ->block = reinterpret_cast<buf_block_t *>(UT_LIST_GET_FIRST(buf_pool->free));
           ->UT_LIST_REMOVE(buf_pool->free, &block->page);
         ->buf_buddy_block_register(block);
           ->buf_pool_t *buf_pool = buf_pool_from_block(block);
           ->HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
             ->cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));
             ->cell3333->node = DATA;   
           ->buf_pool->buddy_n_frames++
    ->void *dst;
    ->dst = ((buf_block_t *)bpage)->frame;
    ->IORequest request(type | IORequest::READ);
    ->*err = fil_io(request, sync, page_id, page_size, 0, page_size.physical(), dst, bpage);
      ->auto shard = fil_system->shard_by_id(page_id.space());
      ->return (shard->do_io(type, sync, page_id, page_size, byte_offset, len, buf, message));
    ->if (sync)   //如果是同步读,此时IO已经完成,调用thd_wait_end结束线程的等待状态
      ->thd_wait_end(nullptr);
        ->#define thd_wait_end(_THD) thd_wait_service->thd_wait_end_func(_THD)
    ->buf_page_io_complete(bpage, false)
      ->    
}

3.5、根据page_no 和page_size计算文件的偏移地址,将page读入buf缓存中

dberr_t Fil_shard::do_io(const IORequest &type, bool sync, const page_id_t &page_id, const page_size_t &page_size, ulint byte_offset, ulint len, void *buf, void *message) {
    fil_space_t *space;
    bool slot = mutex_acquire_and_get_space(page_id.space(), space);
    fil_node_t *file;
    auto page_no = page_id.page_no();
    auto err = get_file_for_io(req_type, space, &page_no, file);
    bool opened = prepare_file_for_io(file, false);
    fil_io_set_encryption(req_type, page_id, space);
    auto offset = (os_offset_t)page_no * page_size.physical();  //第page_no页的文件偏移地址
    err = os_file_read(req_type, file->name, file->handle, buf, offset, len);
    ->#define os_file_read(type, file_name, file, buf, offset, n) os_file_read_pfs(type, file_name, file.m_file, buf, offset, n)
      ->#define os_file_read_pfs(type, file_name, file, buf, offset, n) os_file_read_func(type, file_name, file, buf, offset, n, __FILE__, __LINE__)
        ->return (os_file_read_page(type, file_name, file, buf, offset, n, nullptr, true));
    err = os_aio(req_type, aio_mode, file->name, file->handle, buf, offset, len, fsp_is_system_temporary(page_id.space()) ? false : srv_read_only_mode, file, message);
    ->#define os_aio(type, mode, name, file, buf, offset, n, read_only, message1, message2) os_aio_func(type, mode, name, file, buf, offset, n, read_only, message1,  message2)
      ->return (os_file_read_func(type, name, file.m_file, buf, offset, n));
        ->return (os_file_read_page(type, file_name, file, buf, offset, n, nullptr, true));
    if (sync) {
        complete_io(file, req_type);
    }
}
static MY_ATTRIBUTE((warn_unused_result)) dberr_t os_file_read_page(IORequest &type, const char *file_name, os_file_t file, void *buf, os_offset_t offset, ulint n, ulint *o, bool exit_on_err) {
    for (;;) {
        n_bytes = os_file_pread(type, file, buf, n, offset, &err);
        ->ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
          ->SyncFileIO sync_file_io(file, buf, n, offset);
    }
}
static MY_ATTRIBUTE((warn_unused_result)) ssize_t os_file_io(const IORequest &in_type, os_file_t file, void *buf, ulint n, os_offset_t offset, dberr_t *err) {
    IORequest type = in_type;
    SyncFileIO sync_file_io(file, buf, n, offset);
    for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
        ssize_t n_bytes = sync_file_io.execute(type);
        ->n_bytes = pread(m_fh, m_buf, m_n, m_offset);
        sync_file_io.advance(n_bytes);
    }
}

你可能感兴趣的:(数据库,mysql,数据库,sql)