In the previous posts the source code for creating and using the various buffers has basically been worked through. But how is their use designed at the macro level? This post analyzes the overall application framework of the Buffer Pool; the remaining pieces will be filled in gradually as time allows. The priority is still to get the big picture of the MySQL architecture clear first, and then break the details down one by one.
In MySQL, whenever a database instance is created, a corresponding storage engine instance is created with it. So in the server startup flow analyzed earlier, the plugin mechanism is used to initialize the engine's data structures, parameters, and so on. For InnoDB this is the following plugin declaration:
mysql_declare_plugin(innobase){
MYSQL_STORAGE_ENGINE_PLUGIN,
&innobase_storage_engine,
innobase_hton_name,
PLUGIN_AUTHOR_ORACLE,
"Supports transactions, row-level locking, and foreign keys",
PLUGIN_LICENSE_GPL,
innodb_init, /* Plugin Init */
nullptr, /* Plugin Check uninstall */
innodb_deinit, /* Plugin Deinit */
INNODB_VERSION_SHORT,
innodb_status_variables_export, /* status variables */
innobase_system_variables, /* system variables */
nullptr, /* reserved */
0, /* flags */
},
i_s_innodb_trx, i_s_innodb_cmp, i_s_innodb_cmp_reset, i_s_innodb_cmpmem,
i_s_innodb_cmpmem_reset, i_s_innodb_cmp_per_index,
i_s_innodb_cmp_per_index_reset, i_s_innodb_buffer_page,
i_s_innodb_buffer_page_lru, i_s_innodb_buffer_stats,
i_s_innodb_temp_table_info, i_s_innodb_metrics,
i_s_innodb_ft_default_stopword, i_s_innodb_ft_deleted,
i_s_innodb_ft_being_deleted, i_s_innodb_ft_config,
i_s_innodb_ft_index_cache, i_s_innodb_ft_index_table, i_s_innodb_tables,
i_s_innodb_tablestats, i_s_innodb_indexes, i_s_innodb_tablespaces,
i_s_innodb_columns, i_s_innodb_virtual, i_s_innodb_cached_indexes,
i_s_innodb_session_temp_tablespaces
mysql_declare_plugin_end;
/** @brief Initialize the default value of innodb_commit_concurrency.
Once InnoDB is running, the innodb_commit_concurrency must not change
from zero to nonzero. (Bug #42101)
The initial default value is 0, and without this extra initialization,
SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
to 0, even if it was initially set to nonzero at the command line
or configuration file. */
static void innobase_commit_concurrency_init_default() {
MYSQL_SYSVAR_NAME(commit_concurrency).def_val = innobase_commit_concurrency;
}
In the plugin's init function innodb_init, innobase_ddse_dict_init is registered for the upper layer to call during initialization. That function calls innobase_init_files, which calls srv_start (srv0start.cc), and srv_start in turn calls buf_pool_init to complete the initialization of the buffer pool. As analyzed before, there can be multiple Buffer Pool instances, controlled by the configuration parameter innodb_buffer_pool_instances, which defaults to 1.
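Before diving into buf_pool_create, here is a minimal, self-contained sketch of the idea behind buf_pool_init: split the total buffer pool size across the configured number of instances and build each instance, roughly in parallel. This is not the MySQL source; FakeBufPool, fake_buf_pool_create and fake_buf_pool_init are invented names used only for illustration.

// Sketch only: not MySQL code. It mimics how buf_pool_init could divide
// the total size among instances and build each one in its own thread.
// Assumes n_instances > 0.
#include <cstddef>
#include <thread>
#include <vector>

struct FakeBufPool {        // stand-in for one buf_pool_t instance
  size_t size_in_bytes{0};
  unsigned instance_no{0};
  bool ok{false};
};

static void fake_buf_pool_create(FakeBufPool *pool, size_t size,
                                 unsigned instance_no) {
  // The real buf_pool_create builds chunks, lists and mutexes (shown below).
  pool->size_in_bytes = size;
  pool->instance_no = instance_no;
  pool->ok = true;
}

static std::vector<FakeBufPool> fake_buf_pool_init(size_t total_size,
                                                   unsigned n_instances) {
  std::vector<FakeBufPool> pools(n_instances);
  const size_t per_instance = total_size / n_instances;

  std::vector<std::thread> workers;
  for (unsigned i = 0; i < n_instances; ++i) {
    workers.emplace_back(fake_buf_pool_create, &pools[i], per_instance, i);
  }
  for (auto &t : workers) {
    t.join();
  }
  return pools;
}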
Good — that covers the initialization call framework from the upper layer. Now let's analyze the algorithms and the flow of using the pool, starting with what happens before the Buffer Pool finally forms the lists that are used externally:
static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size,
ulint instance_no, std::mutex *mutex,
dberr_t &err) {
ulint i;
ulint chunk_size;
buf_chunk_t *chunk;
#ifdef UNIV_LINUX
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
const long n_cores = sysconf(_SC_NPROCESSORS_ONLN);
CPU_SET(instance_no % n_cores, &cpuset);
buf_pool->stat.reset();
if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) == -1) {
ib::error(ER_IB_ERR_SCHED_SETAFFNINITY_FAILED)
<< "sched_setaffinity() failed!";
}
/* Linux might be able to set different setting for each thread
worth to try to set high priority for this thread. */
setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid), -20);
#endif /* UNIV_LINUX */
ut_ad(buf_pool_size % srv_buf_pool_chunk_unit == 0);
/* 1. Initialize general fields
------------------------------- */
mutex_create(LATCH_ID_BUF_POOL_CHUNKS, &buf_pool->chunks_mutex);
mutex_create(LATCH_ID_BUF_POOL_LRU_LIST, &buf_pool->LRU_list_mutex);
mutex_create(LATCH_ID_BUF_POOL_FREE_LIST, &buf_pool->free_list_mutex);
mutex_create(LATCH_ID_BUF_POOL_ZIP_FREE, &buf_pool->zip_free_mutex);
mutex_create(LATCH_ID_BUF_POOL_ZIP_HASH, &buf_pool->zip_hash_mutex);
mutex_create(LATCH_ID_BUF_POOL_ZIP, &buf_pool->zip_mutex);
mutex_create(LATCH_ID_BUF_POOL_FLUSH_STATE, &buf_pool->flush_state_mutex);
new (&buf_pool->allocator) ut_allocator<unsigned char>(mem_key_buf_buf_pool);
if (buf_pool_size > 0) {
mutex_enter(&buf_pool->chunks_mutex);
buf_pool->n_chunks = buf_pool_size / srv_buf_pool_chunk_unit;
chunk_size = srv_buf_pool_chunk_unit;
buf_pool->chunks = reinterpret_cast<buf_chunk_t *>(
ut_zalloc_nokey(buf_pool->n_chunks * sizeof(*chunk)));
buf_pool->chunks_old = nullptr;
UT_LIST_INIT(buf_pool->LRU, &buf_page_t::LRU);
UT_LIST_INIT(buf_pool->free, &buf_page_t::list);
UT_LIST_INIT(buf_pool->withdraw, &buf_page_t::list);
buf_pool->withdraw_target = 0;
UT_LIST_INIT(buf_pool->flush_list, &buf_page_t::list);
UT_LIST_INIT(buf_pool->unzip_LRU, &buf_block_t::unzip_LRU);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
UT_LIST_INIT(buf_pool->zip_clean, &buf_page_t::list);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
for (i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
UT_LIST_INIT(buf_pool->zip_free[i], &buf_buddy_free_t::list);
}
buf_pool->curr_size = 0;
chunk = buf_pool->chunks;
do {
if (!buf_chunk_init(buf_pool, chunk, chunk_size, mutex)) {
while (--chunk >= buf_pool->chunks) {
buf_block_t *block = chunk->blocks;
for (i = chunk->size; i--; block++) {
mutex_free(&block->mutex);
rw_lock_free(&block->lock);
ut_d(rw_lock_free(&block->debug_latch));
}
buf_pool->deallocate_chunk(chunk);
}
ut_free(buf_pool->chunks);
buf_pool->chunks = nullptr;
err = DB_ERROR;
mutex_exit(&buf_pool->chunks_mutex);
return;
}
buf_pool->curr_size += chunk->size;
} while (++chunk < buf_pool->chunks + buf_pool->n_chunks);
mutex_exit(&buf_pool->chunks_mutex);
buf_pool->instance_no = instance_no;
buf_pool->read_ahead_area = static_cast<page_no_t>(
ut_min(BUF_READ_AHEAD_PAGES,
ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION)));
buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
buf_pool->old_size = buf_pool->curr_size;
buf_pool->n_chunks_new = buf_pool->n_chunks;
/* Number of locks protecting page_hash must be a
power of two */
srv_n_page_hash_locks =
static_cast<ulong>(ut_2_power_up(srv_n_page_hash_locks));
ut_a(srv_n_page_hash_locks != 0);
ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
buf_pool->page_hash =
ib_create(2 * buf_pool->curr_size, LATCH_ID_HASH_TABLE_RW_LOCK,
srv_n_page_hash_locks, MEM_HEAP_FOR_PAGE_HASH);
buf_pool->page_hash_old = nullptr;
buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
buf_pool->last_printout_time = ut_time_monotonic();
}
/* 2. Initialize flushing fields
-------------------------------- */
mutex_create(LATCH_ID_FLUSH_LIST, &buf_pool->flush_list_mutex);
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
buf_pool->no_flush[i] = os_event_create();
}
buf_pool->watch = (buf_page_t *)ut_zalloc_nokey(sizeof(*buf_pool->watch) *
BUF_POOL_WATCH_SIZE);
for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
buf_pool->watch[i].buf_pool_index = buf_pool->instance_no;
}
/* All fields are initialized by ut_zalloc_nokey(). */
buf_pool->try_LRU_scan = TRUE;
/* Dirty Page Tracking is disabled by default. */
buf_pool->track_page_lsn = LSN_MAX;
buf_pool->max_lsn_io = 0;
/* Initialize the hazard pointer for flush_list batches */
new (&buf_pool->flush_hp) FlushHp(buf_pool, &buf_pool->flush_list_mutex);
/* Initialize the hazard pointer for the oldest page scan */
new (&buf_pool->oldest_hp) FlushHp(buf_pool, &buf_pool->flush_list_mutex);
/* Initialize the hazard pointer for LRU batches */
new (&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->LRU_list_mutex);
/* Initialize the iterator for LRU scan search */
new (&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->LRU_list_mutex);
/* Initialize the iterator for single page scan search */
new (&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->LRU_list_mutex);
err = DB_SUCCESS;
}
static buf_chunk_t *buf_chunk_init(
buf_pool_t *buf_pool, /*!< in: buffer pool instance */
buf_chunk_t *chunk, /*!< out: chunk of buffers */
ulonglong mem_size, /*!< in: requested size in bytes */
std::mutex *mutex) /*!< in,out: Mutex protecting chunk map. */
{
buf_block_t *block;
byte *frame;
ulint i;
mutex_own(&buf_pool->chunks_mutex);
/* Round down to a multiple of page size,
although it already should be. */
mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
/* Reserve space for the block descriptors. */
mem_size += ut_2pow_round(
(mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + (UNIV_PAGE_SIZE - 1),
UNIV_PAGE_SIZE);
DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return (nullptr););
if (!buf_pool->allocate_chunk(mem_size, chunk)) {
return (nullptr);
}
#ifdef HAVE_LIBNUMA
if (srv_numa_interleave) {
struct bitmask *numa_nodes = numa_get_mems_allowed();
int st = mbind(chunk->mem, chunk->mem_size(), MPOL_INTERLEAVE,
numa_nodes->maskp, numa_nodes->size, MPOL_MF_MOVE);
if (st != 0) {
ib::warn(ER_IB_MSG_54) << "Failed to set NUMA memory policy of"
" buffer pool page frames to MPOL_INTERLEAVE"
" (error: "
<< strerror(errno) << ").";
}
numa_bitmask_free(numa_nodes);
}
#endif /* HAVE_LIBNUMA */
/* Allocate the block descriptors from
the start of the memory block. */
chunk->blocks = (buf_block_t *)chunk->mem;
/* Align a pointer to the first frame. Note that when
os_large_page_size is smaller than UNIV_PAGE_SIZE,
we may allocate one fewer block than requested. When
it is bigger, we may allocate more blocks than requested. */
frame = (byte *)ut_align(chunk->mem, UNIV_PAGE_SIZE);
chunk->size = chunk->mem_pfx.m_size / UNIV_PAGE_SIZE - (frame != chunk->mem);
/* Subtract the space needed for block descriptors. */
{
ulint size = chunk->size;
while (frame < (byte *)(chunk->blocks + size)) {
frame += UNIV_PAGE_SIZE;
size--;
}
chunk->size = size;
}
/* Init block structs and assign frames for them. Then we
assign the frames to the first blocks (we already mapped the
memory above). */
block = chunk->blocks;
for (i = chunk->size; i--;) {
buf_block_init(buf_pool, block, frame);
UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
/* Add the block to the free list */
UT_LIST_ADD_LAST(buf_pool->free, &block->page);
ut_d(block->page.in_free_list = TRUE);
ut_ad(!block->page.someone_has_io_responsibility());
ut_ad(buf_pool_from_block(block) == buf_pool);
block++;
frame += UNIV_PAGE_SIZE;
}
if (mutex != nullptr) {
mutex->lock();
}
buf_pool_register_chunk(chunk);
if (mutex != nullptr) {
mutex->unlock();
}
#ifdef PFS_GROUP_BUFFER_SYNC
pfs_register_buffer_block(chunk);
#endif /* PFS_GROUP_BUFFER_SYNC */
return (chunk);
}
buf_pool_init first creates the Buffer Pool instances, then calls buf_pool_create to initialize the lists, which in turn calls buf_chunk_init to allocate memory, create the blocks and pages, and add them to the free list. Finally it creates the Buffer Pool's page_hash and zip_hash hash tables.
The code above walks the hierarchy from a Buffer Pool instance down to chunks, blocks, and pages. Within the Buffer Pool, page_hash makes it possible to locate a data page in memory quickly, avoiding a traversal of the LRU list as much as possible.
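To make the page_hash idea concrete, here is a small standalone sketch showing how a hash table keyed by (space_id, page_no) lets the pool find a cached page without walking the LRU list. This is not the InnoDB implementation; PageId, FakePage and fake_page_hash_get are invented names.

// Sketch only: a hash keyed by (space_id, page_no) plays the role of
// page_hash -- a hit returns the page descriptor directly, a miss means
// the page is not cached and would have to be read from disk.
#include <cstddef>
#include <cstdint>
#include <unordered_map>

struct PageId {
  uint32_t space_id;
  uint32_t page_no;
  bool operator==(const PageId &o) const {
    return space_id == o.space_id && page_no == o.page_no;
  }
};

struct PageIdHash {
  size_t operator()(const PageId &id) const {
    // Similar in spirit to InnoDB's fold: mix space_id and page_no.
    return (static_cast<size_t>(id.space_id) << 20) ^ id.page_no;
  }
};

struct FakePage {           // stand-in for buf_page_t
  PageId id;
  bool in_lru{true};
};

using FakePageHash = std::unordered_map<PageId, FakePage *, PageIdHash>;

static FakePage *fake_page_hash_get(const FakePageHash &page_hash,
                                    const PageId &id) {
  auto it = page_hash.find(id);
  return it == page_hash.end() ? nullptr : it->second;
}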
The page interface exposed to the outside:
//fsp0fsp.cc
bool fsp_header_init(space_id_t space_id, page_no_t size, mtr_t *mtr,
bool is_boot) {
auto space = fil_space_get(space_id);
ut_ad(space != nullptr);
mtr_x_lock_space(space, mtr);
const page_id_t page_id(space_id, 0);
const page_size_t page_size(space->flags);
auto block = buf_page_create(page_id, page_size, RW_SX_LATCH, mtr);
.......
}
buf_block_t *buf_page_create(const page_id_t &page_id,
const page_size_t &page_size,
rw_lock_type_t rw_latch, mtr_t *mtr) {
buf_frame_t *frame;
buf_block_t *block;
buf_block_t *free_block = nullptr;
buf_pool_t *buf_pool = buf_pool_get(page_id);
rw_lock_t *hash_lock;
ut_ad(mtr->is_active());
ut_ad(page_id.space() != 0 || !page_size.is_compressed());
free_block = buf_LRU_get_free_block(buf_pool);
for (;;) {
mutex_enter(&buf_pool->LRU_list_mutex);
hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
rw_lock_x_lock(hash_lock);
block = (buf_block_t *)buf_page_hash_get_low(buf_pool, page_id);
if (block && buf_page_in_file(&block->page) &&
!buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
if (block->page.was_stale()) {
/* We must release page hash latch. The LRU mutex protects the block
from being relocated or freed. */
rw_lock_x_unlock(hash_lock);
if (!buf_page_free_stale(buf_pool, &block->page)) {
/* The page is during IO and can't be released. We wait some to not go
into loop that would consume CPU. This is not something that will be
hit frequently. */
mutex_exit(&buf_pool->LRU_list_mutex);
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
/* The hash lock was released, we should try again lookup for the page
until it's gone - it should disappear eventually when the IO ends. */
continue;
}
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(page_id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */
ut_d(block->page.file_page_was_freed = FALSE);
ut_ad(!block->page.was_stale());
/* Page can be found in buf_pool */
mutex_exit(&buf_pool->LRU_list_mutex);
rw_lock_x_unlock(hash_lock);
buf_block_free(free_block);
return (buf_page_get(page_id, page_size, rw_latch, mtr));
}
break;
}
/* If we get here, the page was not in buf_pool: init it there */
DBUG_PRINT("ib_buf", ("create page " UINT32PF ":" UINT32PF, page_id.space(),
page_id.page_no()));
block = free_block;
buf_page_mutex_enter(block);
buf_page_init(buf_pool, page_id, page_size, block);
buf_block_buf_fix_inc(block, __FILE__, __LINE__);
buf_page_set_accessed(&block->page);
mutex_exit(&block->mutex);
/* Latch the page before releasing hash lock so that concurrent request for
this page doesn't see half initialized page. ALTER tablespace for encryption
and clone page copy can request page for any page id within tablespace
size limit. */
mtr_memo_type_t mtr_latch_type;
if (rw_latch == RW_X_LATCH) {
rw_lock_x_lock(&block->lock);
mtr_latch_type = MTR_MEMO_PAGE_X_FIX;
} else {
rw_lock_sx_lock(&block->lock);
mtr_latch_type = MTR_MEMO_PAGE_SX_FIX;
}
mtr_memo_push(mtr, block, mtr_latch_type);
rw_lock_x_unlock(hash_lock);
/* The block must be put to the LRU list */
buf_LRU_add_block(&block->page, FALSE);
buf_pool->stat.n_pages_created.fetch_add(1);
if (page_size.is_compressed()) {
mutex_exit(&buf_pool->LRU_list_mutex);
auto data = buf_buddy_alloc(buf_pool, page_size.physical());
mutex_enter(&buf_pool->LRU_list_mutex);
buf_page_mutex_enter(block);
block->page.zip.data = (page_zip_t *)data;
buf_page_mutex_exit(block);
/* To maintain the invariant
block->in_unzip_LRU_list
== buf_page_belongs_to_unzip_LRU(&block->page)
we have to add this block to unzip_LRU after
block->page.zip.data is set. */
ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
buf_unzip_LRU_add_block(block, FALSE);
}
mutex_exit(&buf_pool->LRU_list_mutex);
/* Change buffer will not contain entries for undo tablespaces or temporary
tablespaces. */
bool skip_ibuf = fsp_is_system_temporary(page_id.space()) ||
fsp_is_undo_tablespace(page_id.space());
if (!skip_ibuf) {
/* Delete possible entries for the page from the insert buffer:
such can exist if the page belonged to an index which was dropped */
ibuf_merge_or_delete_for_page(nullptr, page_id, &page_size, TRUE);
}
frame = block->frame;
memset(frame + FIL_PAGE_PREV, 0xff, 4);
memset(frame + FIL_PAGE_NEXT, 0xff, 4);
mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
/* These 8 bytes are also repurposed for PageIO compression and must
be reset when the frame is assigned to a new page id. See fil0fil.h.
FIL_PAGE_FILE_FLUSH_LSN is used on the following pages:
(1) The first page of the InnoDB system tablespace (page 0:0)
(2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages .
Therefore we don't transparently compress such pages. */
memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(++buf_dbg_counter % 5771 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(block->page.id) == 0);
#endif
return (block);
}
All rivers return to the sea — we are back to the creation of buffer pages in buf0buf.cc: a page first has to obtain a block, and that makes the picture clear. The file btr0btr.ic is where pages are requested and handled:
buf_block_t *btr_block_get_func(const page_id_t &page_id,
const page_size_t &page_size, ulint mode,
const char *file, ulint line,
#ifdef UNIV_DEBUG
const dict_index_t *index,
#endif /* UNIV_DEBUG */
mtr_t *mtr) {
buf_block_t *block;
block = buf_page_get_gen(page_id, page_size, mode, nullptr,
Page_fetch::NORMAL, file, line, mtr);
if (mode != RW_NO_LATCH) {
buf_block_dbg_add_level(block, index != nullptr && dict_index_is_ibuf(index)
? SYNC_IBUF_TREE_NODE
: SYNC_TREE_NODE);
}
return (block);
}
buf_block_t *buf_page_get_gen(const page_id_t &page_id,
const page_size_t &page_size, ulint rw_latch,
buf_block_t *guess, Page_fetch mode,
const char *file, ulint line, mtr_t *mtr,
bool dirty_with_no_latch) {
#ifdef UNIV_DEBUG
ut_ad(mtr->is_active());
ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH ||
rw_latch == RW_SX_LATCH || rw_latch == RW_NO_LATCH);
ut_ad(!ibuf_inside(mtr) ||
ibuf_page_low(page_id, page_size, false, file, line, nullptr));
switch (mode) {
case Page_fetch::NO_LATCH:
ut_ad(rw_latch == RW_NO_LATCH);
break;
case Page_fetch::NORMAL:
case Page_fetch::SCAN:
case Page_fetch::IF_IN_POOL:
case Page_fetch::PEEK_IF_IN_POOL:
case Page_fetch::IF_IN_POOL_OR_WATCH:
case Page_fetch::POSSIBLY_FREED:
break;
default:
ib::fatal(ER_IB_ERR_UNKNOWN_PAGE_FETCH_MODE)
<< "Unknown fetch mode: " << (int)mode;
ut_error;
}
bool found;
const page_size_t &space_page_size =
fil_space_get_page_size(page_id.space(), &found);
ut_ad(page_size.equals_to(space_page_size));
#endif /* UNIV_DEBUG */
if (mode == Page_fetch::NORMAL && !fsp_is_system_temporary(page_id.space())) {
Buf_fetch_normal fetch(page_id, page_size);
fetch.m_rw_latch = rw_latch;
fetch.m_guess = guess;
fetch.m_mode = mode;
fetch.m_file = file;
fetch.m_line = line;
fetch.m_mtr = mtr;
fetch.m_dirty_with_no_latch = dirty_with_no_latch;
return (fetch.single_page());
} else {
Buf_fetch_other fetch(page_id, page_size);
fetch.m_rw_latch = rw_latch;
fetch.m_guess = guess;
fetch.m_mode = mode;
fetch.m_file = file;
fetch.m_line = line;
fetch.m_mtr = mtr;
fetch.m_dirty_with_no_latch = dirty_with_no_latch;
return (fetch.single_page());
}
}
Starting from single_page at the end, Buf_fetch_normal::get is called, then lookup, to find the buf_page_t instance; if it is not found, the page is read from disk — read_page calls buf_read_page, then buf_read_page_low — and the page read in is placed on the LRU list.
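The loop below is a self-contained sketch of that fetch logic — look the page up, and on a miss read it in and retry until the lookup succeeds. It is not the real Buf_fetch code; fake_lookup, fake_read_page and fake_single_page are invented names.

// Sketch only: the skeleton of "lookup, read on miss, retry" that
// Buf_fetch_normal::single_page implements with far more detail.
#include <cstdint>
#include <map>
#include <utility>

struct FakeBlock {
  uint32_t space_id;
  uint32_t page_no;
};

// Toy "buffer pool" keyed by (space_id, page_no).
static std::map<std::pair<uint32_t, uint32_t>, FakeBlock> fake_pool;

// Stands in for the page_hash lookup: hit returns the block, miss nullptr.
static FakeBlock *fake_lookup(uint32_t space_id, uint32_t page_no) {
  auto it = fake_pool.find({space_id, page_no});
  return it == fake_pool.end() ? nullptr : &it->second;
}

// Stands in for buf_read_page: pretend the page was read into the pool
// (the real code issues the IO and links the page into the LRU list).
static bool fake_read_page(uint32_t space_id, uint32_t page_no) {
  fake_pool[{space_id, page_no}] = FakeBlock{space_id, page_no};
  return true;
}

static FakeBlock *fake_single_page(uint32_t space_id, uint32_t page_no) {
  for (;;) {
    if (FakeBlock *block = fake_lookup(space_id, page_no)) {
      return block;             // found in the pool
    }
    if (!fake_read_page(space_id, page_no)) {
      return nullptr;           // read failed, e.g. tablespace deleted
    }
    // Read succeeded: the next iteration will find the page in the pool.
  }
}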
Next comes the interaction with data persistence:
static void buf_flush_write_block_low(buf_page_t *bpage, buf_flush_t flush_type,
bool sync) {
page_t *frame = nullptr;
#ifdef UNIV_DEBUG
buf_pool_t *buf_pool = buf_pool_from_bpage(bpage);
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
#endif /* UNIV_DEBUG */
DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF,
sync ? "sync" : "async", (unsigned)flush_type,
bpage->id.space(), bpage->id.page_no()));
ut_ad(buf_page_in_file(bpage));
/* We are not holding block_mutex here. Nevertheless, it is safe to
access bpage, because it is io_fixed and oldest_modification != 0.
Thus, it cannot be relocated in the buffer pool or removed from
flush_list or LRU_list. */
ut_ad(!buf_flush_list_mutex_own(buf_pool));
ut_ad(!buf_page_get_mutex(bpage)->is_owned());
ut_ad(bpage->is_io_fix_write());
ut_ad(bpage->is_dirty());
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(bpage->id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */
ut_ad(recv_recovery_is_on() || bpage->get_newest_lsn() != 0);
/* Force the log to the disk before writing the modified block */
if (!srv_read_only_mode) {
const lsn_t flush_to_lsn = bpage->get_newest_lsn();
/* Do the check before calling log_write_up_to() because in most
cases it would allow to avoid call, and because of that we don't
want those calls because they would have bad impact on the counter
of calls, which is monitored to save CPU on spinning in log threads. */
if (log_sys->flushed_to_disk_lsn.load() < flush_to_lsn) {
Wait_stats wait_stats;
wait_stats = log_write_up_to(*log_sys, flush_to_lsn, true);
MONITOR_INC_WAIT_STATS_EX(MONITOR_ON_LOG_, _PAGE_WRITTEN, wait_stats);
}
}
DBUG_EXECUTE_IF("log_first_rec_group_test", {
recv_no_ibuf_operations = false;
const lsn_t end_lsn = mtr_commit_mlog_test(*log_sys);
log_write_up_to(*log_sys, end_lsn, true);
DBUG_SUICIDE();
});
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_POOL_WATCH:
case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_READY_FOR_USE:
case BUF_BLOCK_MEMORY:
case BUF_BLOCK_REMOVE_HASH:
ut_error;
break;
case BUF_BLOCK_ZIP_DIRTY: {
frame = bpage->zip.data;
BlockReporter reporter =
BlockReporter(false, frame, bpage->size,
fsp_is_checksum_disabled(bpage->id.space()));
mach_write_to_8(frame + FIL_PAGE_LSN, bpage->get_newest_lsn());
ut_a(reporter.verify_zip_checksum());
break;
}
case BUF_BLOCK_FILE_PAGE:
frame = bpage->zip.data;
if (!frame) {
frame = ((buf_block_t *)bpage)->frame;
}
buf_flush_init_for_writing(
reinterpret_cast<const buf_block_t *>(bpage),
reinterpret_cast<const buf_block_t *>(bpage)->frame,
bpage->zip.data ? &bpage->zip : nullptr, bpage->get_newest_lsn(),
fsp_is_checksum_disabled(bpage->id.space()),
false /* do not skip lsn check */);
break;
}
dberr_t err = dblwr::write(flush_type, bpage, sync);
ut_a(err == DB_SUCCESS || err == DB_TABLESPACE_DELETED);
/* Increment the counter of I/O operations used
for selecting LRU policy. */
buf_LRU_stat_inc_io();
}
dberr_t dblwr::write(buf_flush_t flush_type, buf_page_t *bpage,
bool sync) noexcept {
dberr_t err;
const space_id_t space_id = bpage->id.space();
ut_ad(bpage->current_thread_has_io_responsibility());
/* This is not required for correctness, but it aborts the processing early.
*/
if (bpage->was_stale()) {
/* Disable batch completion in write_complete(). */
bpage->set_dblwr_batch_id(std::numeric_limits<uint16_t>::max());
buf_page_free_stale_during_write(
bpage, buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
/* We don't hold io_responsibility here no matter which path through ifs and
elses we've got here, but we can't assert:
ut_ad(!bpage->current_thread_has_io_responsibility());
because bpage could be freed by the time we got here. */
return DB_SUCCESS;
}
if (srv_read_only_mode || fsp_is_system_temporary(space_id) ||
!dblwr::enabled || Double_write::s_instances == nullptr ||
mtr_t::s_logging.dblwr_disabled()) {
/* Skip the double-write buffer since it is not needed. Temporary
tablespaces are never recovered, therefore we don't care about
torn writes. */
bpage->set_dblwr_batch_id(std::numeric_limits<uint16_t>::max());
err = Double_write::write_to_datafile(bpage, sync, nullptr, 0);
if (err == DB_PAGE_IS_STALE || err == DB_TABLESPACE_DELETED) {
buf_page_free_stale_during_write(
bpage, buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
err = DB_SUCCESS;
} else if (sync) {
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_SINGLE_PAGE);
if (err == DB_SUCCESS) {
fil_flush(space_id);
}
/* true means we want to evict this page from the LRU list as well. */
buf_page_io_complete(bpage, true);
}
} else {
ut_d(auto page_id = bpage->id);
/* Encrypt the page here, so that the same encrypted contents are written
to the dblwr file and the data file. */
uint32_t e_len{};
file::Block *e_block = dblwr::get_encrypted_frame(bpage, e_len);
if (!sync && flush_type != BUF_FLUSH_SINGLE_PAGE) {
MONITOR_INC(MONITOR_DBLWR_ASYNC_REQUESTS);
ut_d(bpage->release_io_responsibility());
Double_write::submit(flush_type, bpage, e_block, e_len);
err = DB_SUCCESS;
#ifdef UNIV_DEBUG
if (dblwr::Force_crash == page_id) {
force_flush(flush_type, buf_pool_index(buf_pool_from_bpage(bpage)));
}
#endif /* UNIV_DEBUG */
} else {
MONITOR_INC(MONITOR_DBLWR_SYNC_REQUESTS);
/* Disable batch completion in write_complete(). */
bpage->set_dblwr_batch_id(std::numeric_limits<uint16_t>::max());
err = Double_write::sync_page_flush(bpage, e_block, e_len);
}
}
/* We don't hold io_responsibility here no matter which path through ifs and
elses we've got here, but we can't assert:
ut_ad(!bpage->current_thread_has_io_responsibility());
because bpage could be freed by the time we got here. */
return err;
}
When a block is allocated in buf0buf.cc, buf_block_alloc calls buf_LRU_get_free_block; when free blocks run short, it goes on to call buf_flush_single_page_from_LRU, which via buf_flush_page reaches buf_flush_write_block_low. That function calls four main functions: log_write_up_to, buf_flush_init_for_writing, dblwr::write and buf_LRU_stat_inc_io. The key part is in dblwr::write:
static dberr_t write_to_datafile(const buf_page_t *in_bpage, bool sync,
const file::Block* e_block, uint32_t e_len)
noexcept MY_ATTRIBUTE((warn_unused_result));
/** Force a flush of the page queue.
@param[in] flush_type FLUSH LIST or LRU LIST flush.
@param[in] buf_pool_index Buffer pool instance for which called. */
static void force_flush(buf_flush_t flush_type, uint32_t buf_pool_index)
noexcept {
if (s_instances == nullptr) {
return;
}
auto dblwr = instance(flush_type, buf_pool_index);
dblwr->force_flush(flush_type);
}
void fil_flush(space_id_t space_id) {
auto shard = fil_system->shard_by_id(space_id);
shard->mutex_acquire();
/* Note: Will release and reacquire the Fil_shard::mutex. */
shard->space_flush(space_id);
shard->mutex_release();
}
/** Flush any pending writes to disk for the redo log. */
void Fil_shard::flush_file_redo() {
/* We never evict the redo log tablespace. It's for all
practical purposes a read-only data structure. */
mutex_acquire();
redo_space_flush();
mutex_release();
}
/** Collect the tablespace IDs of unflushed tablespaces in space_ids.
@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_LOG,
can be ORred */
void Fil_shard::flush_file_spaces(uint8_t purpose) {
Space_ids space_ids;
ut_ad((purpose & FIL_TYPE_TABLESPACE) || (purpose & FIL_TYPE_LOG));
mutex_acquire();
for (auto space = UT_LIST_GET_FIRST(m_unflushed_spaces); space != nullptr;
space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
if ((to_int(space->purpose) & purpose) && !space->stop_new_ops) {
space_ids.push_back(space->id);
}
}
mutex_release();
/* Flush the spaces. It will not hurt to call fil_flush() on
a non-existing space id. */
for (auto space_id : space_ids) {
mutex_acquire();
space_flush(space_id);
mutex_release();
}
}
bool buf_page_io_complete(buf_page_t *bpage, bool evict) {
auto buf_pool = buf_pool_from_bpage(bpage);
const bool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
ut_a(buf_page_in_file(bpage));
/* We do not need protect io_fix here by mutex to read it because this is the
only function where we can change the value from BUF_IO_READ or BUF_IO_WRITE
to some other value, and our code ensures that this is the only thread that
handles the i/o for this block. There are other methods that reset the IO to
NONE, but they must do that before the IO is requested to OS and must be done
as a part of cleanup in thread that was trying to make such IO request. */
ut_ad(bpage->current_thread_has_io_responsibility());
const auto io_type =
bpage->is_io_fix_read_as_opposed_to_write() ? BUF_IO_READ : BUF_IO_WRITE;
const auto flush_type = buf_page_get_flush_type(bpage);
if (io_type == BUF_IO_READ) {
bool compressed_page;
byte *frame{};
page_no_t read_page_no;
space_id_t read_space_id;
bool is_wrong_page_id = false;
if (bpage->size.is_compressed()) {
frame = bpage->zip.data;
buf_pool->n_pend_unzip.fetch_add(1);
if (uncompressed && !buf_zip_decompress((buf_block_t *)bpage, FALSE)) {
buf_pool->n_pend_unzip.fetch_sub(1);
compressed_page = false;
goto corrupt;
}
buf_pool->n_pend_unzip.fetch_sub(1);
} else {
frame = reinterpret_cast<buf_block_t *>(bpage)->frame;
ut_a(uncompressed);
}
/* If this page is not uninitialized and not in the
doublewrite buffer, then the page number and space id
should be the same as in block. */
read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
read_space_id = mach_read_from_4(frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
if (bpage->id.space() == TRX_SYS_SPACE &&
dblwr::v1::is_inside(bpage->id.page_no())) {
ib::error(ER_IB_MSG_78) << "Reading page " << bpage->id
<< ", which is in the doublewrite buffer!";
} else if (read_space_id == 0 && read_page_no == 0) {
/* This is likely an uninitialized page. */
} else if ((bpage->id.space() != 0 && bpage->id.space() != read_space_id) ||
bpage->id.page_no() != read_page_no) {
/* We did not compare space_id to read_space_id
if bpage->space == 0, because the field on the
page may contain garbage in MySQL < 4.1.1,
which only supported bpage->space == 0. */
ib::error(ER_IB_MSG_79) << "Space id and page number stored in "
"the page read in are "
<< page_id_t(read_space_id, read_page_no)
<< ", should be " << bpage->id;
is_wrong_page_id = true;
}
compressed_page = Compression::is_compressed_page(frame);
/* If the decompress failed then the most likely case is
that we are reading in a page for which this instance doesn't
support the compression algorithm. */
if (compressed_page) {
Compression::meta_t meta;
Compression::deserialize_header(frame, &meta);
ib::error(ER_IB_MSG_80)
<< "Page " << bpage->id << " "
<< "compressed with " << Compression::to_string(meta) << " "
<< "that is not supported by this instance";
}
/* From version 3.23.38 up we store the page checksum
to the 4 first bytes of the page end lsn field */
bool is_corrupted;
{
BlockReporter reporter =
BlockReporter(true, frame, bpage->size,
fsp_is_checksum_disabled(bpage->id.space()));
is_corrupted = reporter.is_corrupted();
}
#ifdef UNIV_LINUX
/* A crash during extending file might cause the inconsistent contents.
No problem for the cases. Just fills with zero for them.
- The next log record to apply is initializing
- No redo log record for the page yet (brand new page) */
if (recv_recovery_is_on() && (is_corrupted || is_wrong_page_id) &&
recv_page_is_brand_new((buf_block_t *)bpage)) {
memset(frame, 0, bpage->size.logical());
is_corrupted = false;
}
#endif /* UNIV_LINUX */
if (compressed_page || is_corrupted) {
/* Not a real corruption if it was triggered by
error injection */
DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
goto page_not_corrupt;);
corrupt:
/* Compressed pages are basically gibberish avoid
printing the contents. */
if (!compressed_page) {
ib::error(ER_IB_MSG_81)
<< "Database page corruption on disk"
" or a failed file read of page "
<< bpage->id << ". You may have to recover from "
<< "a backup.";
buf_page_print(frame, bpage->size, BUF_PAGE_PRINT_NO_CRASH);
ib::info(ER_IB_MSG_82) << "It is also possible that your"
" operating system has corrupted"
" its own file cache and rebooting"
" your computer removes the error."
" If the corrupt page is an index page."
" You can also try to fix the"
" corruption by dumping, dropping,"
" and reimporting the corrupt table."
" You can use CHECK TABLE to scan"
" your table for corruption. "
<< FORCE_RECOVERY_MSG;
}
if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
/* We do not have to mark any index as
corrupted here, since we only know the space
id but not the exact index id. There could
be multiple tables/indexes in the same space,
so we will mark it later in upper layer */
buf_read_page_handle_error(bpage);
return (false);
}
}
DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", page_not_corrupt
: bpage = bpage;);
if (recv_recovery_is_on()) {
/* Pages must be uncompressed for crash recovery. */
ut_a(uncompressed);
recv_recover_page(true, (buf_block_t *)bpage);
}
if (uncompressed && !Compression::is_compressed_page(frame) &&
!recv_no_ibuf_operations &&
fil_page_get_type(frame) == FIL_PAGE_INDEX && page_is_leaf(frame) &&
!fsp_is_system_temporary(bpage->id.space()) &&
!fsp_is_undo_tablespace(bpage->id.space()) && !bpage->was_stale()) {
ibuf_merge_or_delete_for_page((buf_block_t *)bpage, bpage->id,
&bpage->size, TRUE);
}
}
bool has_LRU_mutex{};
auto block_mutex = buf_page_get_mutex(bpage);
if (io_type == BUF_IO_WRITE) {
has_LRU_mutex = buf_get_LRU_mutex(bpage);
} else {
mutex_enter(block_mutex);
}
#ifdef UNIV_IBUF_COUNT_DEBUG
if (io_type == BUF_IO_WRITE || uncompressed) {
/* For BUF_IO_READ of compressed-only blocks, the
buffered operations will be merged by buf_page_get_gen()
after the block has been uncompressed. */
ut_a(ibuf_count_get(bpage->id) == 0);
}
#endif /* UNIV_IBUF_COUNT_DEBUG */
/* Because this thread which does the unlocking is not the same that
did the locking, we use a pass value != 0 in unlock, which simply
removes the newest lock debug record, without checking the thread
id. */
buf_page_monitor(bpage, io_type);
switch (io_type) {
case BUF_IO_READ:
ut_ad(!has_LRU_mutex);
buf_page_set_io_fix(bpage, BUF_IO_NONE);
/* NOTE that the call to ibuf may have moved the ownership of
the x-latch to this OS thread: do not let this confuse you in
debugging! */
if (uncompressed) {
rw_lock_x_unlock_gen(&((buf_block_t *)bpage)->lock, BUF_IO_READ);
}
mutex_exit(block_mutex);
ut_ad(buf_pool->n_pend_reads > 0);
buf_pool->n_pend_reads.fetch_sub(1);
buf_pool->stat.n_pages_read.fetch_add(1);
break;
case BUF_IO_WRITE:
/* Write means a flush operation: call the completion
routine in the flush system */
buf_flush_write_complete(bpage);
if (uncompressed) {
rw_lock_sx_unlock_gen(&((buf_block_t *)bpage)->lock, BUF_IO_WRITE);
}
buf_pool->stat.n_pages_written.fetch_add(1);
/* We decide whether or not to evict the page from the
LRU list based on the flush_type.
* BUF_FLUSH_LIST: don't evict
* BUF_FLUSH_LRU: always evict
* BUF_FLUSH_SINGLE_PAGE: eviction preference is passed
by the caller explicitly. */
if (flush_type == BUF_FLUSH_LRU) {
evict = true;
ut_ad(has_LRU_mutex);
}
if (evict && buf_LRU_free_page(bpage, true)) {
has_LRU_mutex = false;
} else {
mutex_exit(block_mutex);
}
if (has_LRU_mutex) {
mutex_exit(&buf_pool->LRU_list_mutex);
}
break;
default:
ut_error;
}
DBUG_PRINT("ib_buf", ("%s page " UINT32PF ":" UINT32PF,
io_type == BUF_IO_READ ? "read" : "wrote",
bpage->id.space(), bpage->id.page_no()));
return (true);
}
The code above only covers the cases where the doublewrite buffer is skipped (read-only mode, temporary tablespaces, dblwr disabled, and so on); the other branch encrypts the page and then writes it to both the dblwr file and the data file:
file::Block *dblwr::get_encrypted_frame(buf_page_t *bpage,
uint32_t &e_len) noexcept {
space_id_t space_id = bpage->space();
page_no_t page_no = bpage->page_no();
if (page_no == 0) {
/* The first page of any tablespace is never encrypted.
So return early. */
return nullptr;
}
if (fsp_is_undo_tablespace(space_id) && !srv_undo_log_encrypt) {
/* It is an undo tablespace and undo encryption is not enabled. */
return nullptr;
}
fil_space_t *space = bpage->get_space();
if (space->encryption_op_in_progress == DECRYPTION ||
!space->is_encrypted()) {
return nullptr;
}
if (!space->can_encrypt()) {
/* Encryption key information is not available. */
return nullptr;
}
IORequest type(IORequest::WRITE);
void *frame{};
uint32_t len{};
fil_node_t *node = space->get_file_node(&page_no);
type.block_size(node->block_size);
Double_write::prepare(bpage, &frame, &len);
ulint n = len;
file::Block *compressed_block{};
/* Transparent page compression (TPC) is disabled if punch hole is not
supported. A similar check is done in Fil_shard::do_io(). */
const bool do_compression =
space->is_compressed() && !bpage->size.is_compressed() &&
IORequest::is_punch_hole_supported() && node->punch_hole;
if (do_compression) {
/* @note Compression needs to be done before encryption. */
/* The page size must be a multiple of the OS punch hole size. */
ut_ad(n % type.block_size() == 0);
type.compression_algorithm(space->compression_type);
compressed_block = os_file_compress_page(type, frame, &n);
}
space->get_encryption_info(type.get_encryption_info());
auto e_block = os_file_encrypt_page(type, frame, &n);
if (compressed_block != nullptr) {
file::Block::free(compressed_block);
}
e_len = n;
return e_block;
}
static void submit(buf_flush_t flush_type, buf_page_t *bpage,
const file::Block *e_block, uint32_t e_len) noexcept {
if (s_instances == nullptr) {
return;
}
auto dblwr = instance(flush_type, bpage);
dblwr->enqueue(flush_type, bpage, e_block, e_len);
}
dberr_t Double_write::sync_page_flush(buf_page_t *bpage, file::Block *e_block,
uint32_t e_len) noexcept {
#ifdef UNIV_DEBUG
ut_d(auto page_id = bpage->id);
if (dblwr::Force_crash == page_id) {
auto frame = reinterpret_cast<buf_block_t *>(bpage)->frame;
const auto p = reinterpret_cast<page_t *>(frame);
ut_ad(page_get_space_id(p) == dblwr::Force_crash.space());
ut_ad(page_get_page_no(p) == dblwr::Force_crash.page_no());
}
#endif /* UNIV_DEBUG */
Segment *segment{};
while (!s_single_segments->dequeue(segment)) {
std::this_thread::yield();
}
single_write(segment, bpage, e_block, e_len);
#ifndef _WIN32
if (is_fsync_required()) {
segment->flush();
}
#endif /* !_WIN32 */
#ifdef UNIV_DEBUG
if (dblwr::Force_crash == page_id) {
DBUG_SUICIDE();
}
#endif /* UNIV_DEBUG */
auto err = write_to_datafile(bpage, true, e_block, e_len);
if (err == DB_SUCCESS) {
fil_flush(bpage->id.space());
} else {
/* This block is not freed if the write_to_datafile doesn't succeed. */
if (e_block != nullptr) {
os_free_block(e_block);
}
}
while (!s_single_segments->enqueue(segment)) {
UT_RELAX_CPU();
}
/* true means we want to evict this page from the LRU list as well. */
buf_page_io_complete(bpage, true);
return DB_SUCCESS;
}
Finally, a look at the code that interacts with the LRU algorithm. As shown earlier, creating a page calls buf_page_init, and buf_LRU_get_free_block and buf_LRU_add_block are called along the same path:
/** Returns a free block from the buf_pool. The block is taken off the
free list. If free list is empty, blocks are moved from the end of the
LRU list to the free list.
This function is called from a user thread when it needs a clean
block to read in a page. Note that we only ever get a block from
the free list. Even when we flush a page or find a page in LRU scan
we put it to free list to be used.
* iteration 0:
* get a block from free list, success:done
* if buf_pool->try_LRU_scan is set
* scan LRU up to srv_LRU_scan_depth to find a clean block
* the above will put the block on free list
* success:retry the free list
* flush one dirty page from tail of LRU to disk
* the above will put the block on free list
* success: retry the free list
* iteration 1:
* same as iteration 0 except:
* scan whole LRU list
* scan LRU list even if buf_pool->try_LRU_scan is not set
* iteration > 1:
* same as iteration 1 but sleep 10ms
@param[in,out] buf_pool buffer pool instance
@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
buf_block_t *buf_LRU_get_free_block(buf_pool_t *buf_pool) {
buf_block_t *block = nullptr;
bool freed = false;
ulint n_iterations = 0;
ulint flush_failures = 0;
bool mon_value_was = false;
bool started_monitor = false;
ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
loop:
buf_LRU_check_size_of_non_data_objects(buf_pool);
/* If there is a block in the free list, take it */
block = buf_LRU_get_free_only(buf_pool);
if (block != nullptr) {
ut_ad(!block->page.someone_has_io_responsibility());
ut_ad(buf_pool_from_block(block) == buf_pool);
memset(&block->page.zip, 0, sizeof block->page.zip);
if (started_monitor) {
srv_print_innodb_monitor = static_cast<bool>(mon_value_was);
}
block->page.reset_flush_observer();
return block;
}
MONITOR_INC(MONITOR_LRU_GET_FREE_LOOPS);
freed = false;
os_rmb;
if (buf_pool->try_LRU_scan || n_iterations > 0) {
/* If no block was in the free list, search from the
end of the LRU list and try to free a block there.
If we are doing for the first time we'll scan only
tail of the LRU list otherwise we scan the whole LRU
list. */
freed = buf_LRU_scan_and_free_block(buf_pool, n_iterations > 0);
if (!freed && n_iterations == 0) {
/* Tell other threads that there is no point
in scanning the LRU list. This flag is set to
TRUE again when we flush a batch from this
buffer pool. */
buf_pool->try_LRU_scan = FALSE;
os_wmb;
}
}
if (freed) {
goto loop;
}
if (n_iterations > 20 && srv_buf_pool_old_size == srv_buf_pool_size) {
ib::warn(ER_IB_MSG_134)
<< "Difficult to find free blocks in the buffer pool"
" ("
<< n_iterations << " search iterations)! " << flush_failures
<< " failed attempts to"
" flush a page! Consider increasing the buffer pool"
" size. It is also possible that in your Unix version"
" fsync is very slow, or completely frozen inside"
" the OS kernel. Then upgrading to a newer version"
" of your operating system may help. Look at the"
" number of fsyncs in diagnostic info below."
" Pending flushes (fsync) log: "
<< fil_n_pending_log_flushes
<< "; buffer pool: " << fil_n_pending_tablespace_flushes << ". "
<< os_n_file_reads << " OS file reads, " << os_n_file_writes
<< " OS file writes, " << os_n_fsyncs
<< " OS fsyncs. Starting InnoDB Monitor to print"
" further diagnostics to the standard output.";
mon_value_was = srv_print_innodb_monitor;
started_monitor = true;
srv_print_innodb_monitor = true;
os_event_set(srv_monitor_event);
}
/* If we have scanned the whole LRU and still are unable to
find a free block then we should sleep here to let the
page_cleaner do an LRU batch for us. */
if (!srv_read_only_mode) {
os_event_set(buf_flush_event);
}
if (n_iterations > 1) {
MONITOR_INC(MONITOR_LRU_GET_FREE_WAITS);
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
/* No free block was found: try to flush the LRU list.
This call will flush one page from the LRU and put it on the
free list. That means that the free block is up for grabs for
all user threads.
TODO: A more elegant way would have been to return the freed
up block to the caller here but the code that deals with
removing the block from page_hash and LRU_list is fairly
involved (particularly in case of compressed pages). We
can do that in a separate patch sometime in future. */
if (!buf_flush_single_page_from_LRU(buf_pool)) {
MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
++flush_failures;
}
srv_stats.buf_pool_wait_free.add(n_iterations, 1);
n_iterations++;
goto loop;
}
/** Adds a block to the LRU list. Please make sure that the page_size is
already set when invoking the function, so that we can get correct
page_size from the buffer page when adding a block into LRU */
void buf_LRU_add_block(buf_page_t *bpage, /*!< in: control block */
ibool old) /*!< in: TRUE if should be put to the old
blocks in the LRU list, else put to the start;
if the LRU list is very short, the block is
added to the start, regardless of this
parameter */
{
buf_LRU_add_block_low(bpage, old);
}
/** Adds a block to the LRU list. Please make sure that the page_size is
already set when invoking the function, so that we can get correct
page_size from the buffer page when adding a block into LRU
@param[in] bpage control block
@param[in] old TRUE if should be put to the old blocks in the LRU list,
else put to the start; if the LRU list is very short,
the block is added to the start, regardless of this
parameter */
UNIV_INLINE
void buf_LRU_add_block_low(buf_page_t *bpage, ibool old) {
buf_pool_t *buf_pool = buf_pool_from_bpage(bpage);
ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
ut_a(buf_page_in_file(bpage));
ut_ad(!bpage->in_LRU_list);
if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
UT_LIST_ADD_FIRST(buf_pool->LRU, bpage);
bpage->freed_page_clock = buf_pool->freed_page_clock;
} else {
#ifdef UNIV_LRU_DEBUG
/* buf_pool->LRU_old must be the first item in the LRU list
whose "old" flag is set. */
ut_a(buf_pool->LRU_old->old);
ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) ||
!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) ||
UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
#endif /* UNIV_LRU_DEBUG */
UT_LIST_INSERT_AFTER(buf_pool->LRU, buf_pool->LRU_old, bpage);
buf_pool->LRU_old_len++;
}
ut_d(bpage->in_LRU_list = TRUE);
incr_LRU_size_in_bytes(bpage, buf_pool);
if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
ut_ad(buf_pool->LRU_old);
/* Adjust the length of the old block list if necessary */
buf_page_set_old(bpage, old);
buf_LRU_old_adjust_len(buf_pool);
} else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
/* The LRU list is now long enough for LRU_old to become
defined: init it */
buf_LRU_old_init(buf_pool);
} else {
buf_page_set_old(bpage, buf_pool->LRU_old != nullptr);
}
/* If this is a zipped block with decompressed frame as well
then put it on the unzip_LRU list */
if (buf_page_belongs_to_unzip_LRU(bpage)) {
buf_unzip_LRU_add_block((buf_block_t *)bpage, old);
}
}
The above only covers the LRU-related code involved when a page is created. In fact, the interactions with the upper layers and with persistence described earlier all involve the LRU managing the pages on its lists; follow the code down far enough and you will find the LRU holding the line at the very bottom.
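As a final illustration, InnoDB's LRU is a midpoint LRU: a tail "old" sublist (about 3/8 of the list by default, controlled by innodb_old_blocks_pct) receives newly read pages, and a page is only promoted to the head when it is accessed again — which is what the LRU_old handling in buf_LRU_add_block_low above implements. The sketch below shows the idea; it is not MySQL code, and FakeLRU, FakeLRUPage and their members are invented names.

// Sketch only: a midpoint LRU. New pages are inserted at the head of the
// "old" sublist; a second access promotes the page to the head of the list.
#include <cstdint>
#include <list>

struct FakeLRUPage {
  uint32_t page_no;
  bool old{true};
};

struct FakeLRU {
  std::list<FakeLRUPage> lru;                 // front = most recently used
  std::list<FakeLRUPage>::iterator old_head;  // start of the old sublist (midpoint)

  // A newly read page goes to the midpoint, i.e. the head of the old sublist.
  void add_block(uint32_t page_no) {
    if (lru.empty()) {
      lru.push_front(FakeLRUPage{page_no, true});
      old_head = lru.begin();
      return;
    }
    old_head = lru.insert(old_head, FakeLRUPage{page_no, true});
  }

  // A page accessed again is made "young": moved to the head of the list.
  void make_young(std::list<FakeLRUPage>::iterator it) {
    if (it == old_head) {
      ++old_head;               // keep old_head valid after the erase below
    }
    FakeLRUPage page = *it;
    page.old = false;
    lru.erase(it);
    lru.push_front(page);
  }
};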
From the functionality of a buffer, to the basic data structures, to how those data structures are controlled and applied, quite a few posts have been spent already. It still felt unfinished, so the Buffer Pool was pulled out separately to analyze the overall flow of how it is applied and used in MySQL as a whole. This gives a complete grasp of the Buffer Pool from the micro to the macro level, from the details to the whole — and only with that can the application of the whole cache in InnoDB be properly understood. As the saying goes, see the great in the small, and reason backwards from it.
With the theory and the practice in place, how far you can go depends on your own perseverance and determination.