MySQL 的 undo 是实现 MVCC 的主要环节。Undo 记录默认被记录到系统表空间内,也可以使用单独的 undo 表空间。
undo 使用 rollback segments(回滚段)管理;
手册中对其结构的描述:
InnoDB supports a maximum of 128 rollback segments, 32 of which are allocated to the temporary tablespace. This leaves 96 rollback segments that can be assigned to transactions that modify data in regular tables. The innodb_rollback_segments variable defines the number of rollback segments used by InnoDB
其中32个回滚段用于管理临时表事务,分配给临时表空间;其余96个回滚段用于修改普通表数据的事务。
undo 支持的事务数取决于回滚段中的 undo slot(撤消槽)数和每个事务所需的 undo log(撤消日志)数。undo slot 数与 innodb page size 相关
默认的是16k一个page
page size | number of undo slots |
---|---|
4k | 256 |
8k | 512 |
16k | 1024 |
32k | 2048 |
64k | 4096 |
从上表中可以看到,在默认 16k 页大小下,理论上普通事务支持的事务个数是 96 × 1024 个
MySQL 的 undo 主要分为两大类:
1.insert_undo
2.update_undo(update,delete)
由于 undo 分为这两类,一个既执行 insert 又执行 update/delete 的事务需要占用两个 undo slot,所以估计 undo 支持的并发事务量时需要除以 2,公式如下:
(innodb_page_size / 16 / 2) * (innodb_rollback_segments - 32)
当然,实际值还受很多因素影响;以上是在不考虑其他因素的情况下,undo 理论上能支持的并发事务数。
这部分主要阅读了《数据库内核月报 - 2015 / 04 --MySQL · 引擎特性 · InnoDB undo log 漫游》
相关的定义在mysql-5.7.25\storage\innobase\include\trx0sys.h
mysql-5.7.25\storage\innobase\include\trx0rseg.h等
主要的定义:
trx0sys.h:
// 值为 128,对应上面描述的128个回滚段
#define TRX_SYS_N_RSEGS
// UNIV_PAGE_SIZE 是页面的大小,默认为 16384 字节,因此默认情况下 TRX_RSEG_N_SLOTS 为 16384 / 16 = 1024
#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16)
trx_sys_t 里面的 rseg_array 数组长度是128,对应这128个回滚段;
数组的每个元素对应一个 trx_rseg_t(回滚段的内存结构)。再往下还有很多结构,其中 trx_undo_t 是 undo log 的内存结构。
从redo里面的一条insert中的btr_cur_ins_lock_and_undo 方法往下看,看能不能连起来。
trx_undo_report_row_operation ->方法
dberr_t
trx_undo_report_row_operation(
//上面忽略一些临时表,只读事务的判断。
.....................
switch (op_type) {
case TRX_UNDO_INSERT_OP:
undo = undo_ptr->insert_undo;
if (undo == NULL) {
err = trx_undo_assign_undo(
trx, undo_ptr, TRX_UNDO_INSERT);
undo = undo_ptr->insert_undo;
if (undo == NULL) {
/* Did not succeed */
ut_ad(err != DB_SUCCESS);
goto err_exit;
}
ut_ad(err == DB_SUCCESS);
}
break;
default:
ut_ad(op_type == TRX_UNDO_MODIFY_OP);
undo = undo_ptr->update_undo;
if (undo == NULL) {
err = trx_undo_assign_undo(
trx, undo_ptr, TRX_UNDO_UPDATE);
undo = undo_ptr->update_undo;
if (undo == NULL) {
/* Did not succeed */
ut_ad(err != DB_SUCCESS);
goto err_exit;
}
}
ut_ad(err == DB_SUCCESS);
}
page_no = undo->last_page_no;
undo_block = buf_page_get_gen(
page_id_t(undo->space, page_no), undo->page_size, RW_X_LATCH,
buf_pool_is_obsolete(undo->withdraw_clock)
? NULL : undo->guess_block, BUF_GET, __FILE__, __LINE__,
&mtr);
buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
do {
page_t* undo_page;
ulint offset;
undo_page = buf_block_get_frame(undo_block);
ut_ad(page_no == undo_block->page.id.page_no());
switch (op_type) {
case TRX_UNDO_INSERT_OP:
offset = trx_undo_page_report_insert(
undo_page, trx, index, clust_entry, &mtr);
break;
default:
ut_ad(op_type == TRX_UNDO_MODIFY_OP);
offset = trx_undo_page_report_modify(
undo_page, trx, index, rec, offsets, update,
cmpl_info, clust_entry, &mtr);
}
if (UNIV_UNLIKELY(offset == 0)) {
/* The record did not fit on the page. We erase the
end segment of the undo log page and write a log
record of it: this is to ensure that in the debug
version the replicate page constructed using the log
records stays identical to the original page */
if (!trx_undo_erase_page_end(undo_page, &mtr)) {
/* The record did not fit on an empty
undo page. Discard the freshly allocated
page and return an error. */
/* When we remove a page from an undo
log, this is analogous to a
pessimistic insert in a B-tree, and we
must reserve the counterpart of the
tree latch, which is the rseg
mutex. We must commit the mini-transaction
first, because it may be holding lower-level
latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
mtr_commit(&mtr);
mtr_start(&mtr);
dict_disable_redo_if_temporary(
index->table, &mtr);
mutex_enter(&undo_ptr->rseg->mutex);
trx_undo_free_last_page(trx, undo, &mtr);
mutex_exit(&undo_ptr->rseg->mutex);
err = DB_UNDO_RECORD_TOO_BIG;
goto err_exit;
}
mtr_commit(&mtr);
} else {
/* Success */
undo->withdraw_clock = buf_withdraw_clock;
mtr_commit(&mtr);
undo->empty = FALSE;
undo->top_page_no = page_no;
undo->top_offset = offset;
undo->top_undo_no = trx->undo_no;
undo->guess_block = undo_block;
trx->undo_no++;
trx->undo_rseg_space = undo_ptr->rseg->space;
mutex_exit(&trx->undo_mutex);
*roll_ptr = trx_undo_build_roll_ptr(
op_type == TRX_UNDO_INSERT_OP,
undo_ptr->rseg->id, page_no, offset);
return(DB_SUCCESS);
}
ut_ad(page_no == undo->last_page_no);
/* We have to extend the undo log by one page */
ut_ad(++loop_count < 2);
mtr_start(&mtr);
dict_disable_redo_if_temporary(index->table, &mtr);
/* When we add a page to an undo log, this is analogous to
a pessimistic insert in a B-tree, and we must reserve the
counterpart of the tree latch, which is the rseg mutex. */
mutex_enter(&undo_ptr->rseg->mutex);
undo_block = trx_undo_add_page(trx, undo, undo_ptr, &mtr);
mutex_exit(&undo_ptr->rseg->mutex);
page_no = undo->last_page_no;
DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
undo_block = NULL;);
} while (undo_block != NULL);
ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
ER_INNODB_UNDO_LOG_FULL,
"No more space left over in %s tablespace for allocating UNDO"
" log pages. Please add new data file to the tablespace or"
" check if filesystem is full or enable auto-extension for"
" the tablespace",
((undo->space == srv_sys_space.space_id())
? "system" :
((fsp_is_system_temporary(undo->space))
? "temporary" : "undo")));
/* Did not succeed: out of space */
err = DB_OUT_OF_FILE_SPACE;
err_exit:
mutex_exit(&trx->undo_mutex);
mtr_commit(&mtr);
return(err);
}
switch (op_type)开始分了上面提到的两种: TRX_UNDO_INSERT_OP,TRX_UNDO_MODIFY_OP
trx_undo_report_row_operation:
undo_ptr = is_temp_table ? &trx->rsegs.m_noredo : &trx->rsegs.m_redo;
switch (op_type)
err = trx_undo_assign_undo(
trx, undo_ptr, TRX_UNDO_INSERT);
undo = undo_ptr->insert_undo;
主要就是在上面判断一些后面需要作为参数传递的情况,比如是否是临时表、是否 read-only 等
临时表操作记录undo时不写redo log
调用trx_undo_assign_undo
trx_undo_assign_undo:
undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, trx->xid,
&mtr);
if (undo == NULL) {
err = trx_undo_create(trx, rseg, type, trx->id, trx->xid,
&undo, &mtr);
if (err != DB_SUCCESS) {
goto func_exit;
}
}
分配 undo log:先通过 trx_undo_reuse_cached 尝试复用缓存的 undo log,如果没有可复用的,则通过 trx_undo_create 新申请一个
trx_undo_create:
trx_rsegf_get:
获取回滚段的头信息,里面会通过一个通用的方法:buf_page_get_gen来获取一个page
buf_page_get:buf_page_get_gen:
从 buffer pool 中获取一个 page(如果该 page 不在内存中,则从磁盘读入 buffer pool)
trx_undo_seg_create:
trx_undo_seg_create:创建undo segment
slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr) 查找空闲的 slot;如果 slot 已经用完(通常这几乎不会发生),会返回DB_TOO_MANY_CONCURRENT_TRXS
回到trx_undo_assign_undo:
if (type == TRX_UNDO_INSERT) {
UT_LIST_ADD_FIRST(rseg->insert_undo_list, undo);
ut_ad(undo_ptr->insert_undo == NULL);
undo_ptr->insert_undo = undo;
} else {
UT_LIST_ADD_FIRST(rseg->update_undo_list, undo);
ut_ad(undo_ptr->update_undo == NULL);
undo_ptr->update_undo = undo;
}
UT_LIST_ADD_FIRST:
加到trx_rseg_t结构体的list上
回到trx_undo_report_row_operation:开始写入
switch (op_type) {
case TRX_UNDO_INSERT_OP:
offset = trx_undo_page_report_insert(
undo_page, trx, index, clust_entry, &mtr);
break;
default:
ut_ad(op_type == TRX_UNDO_MODIFY_OP);
offset = trx_undo_page_report_modify(
undo_page, trx, index, rec, offsets, update,
cmpl_info, clust_entry, &mtr);
}
insert 调用:
trx_undo_page_report_insert:
dict_index_get_n_unique 获取的是索引中能唯一确定一条记录的字段(列)个数,而不是行数;然后依次获取这些字段在记录中的内存位置,
通过 ut_memcpy 把每个字段的数据拷贝到 ptr 指向的位置,ptr 指针再加上对应字段的长度,指向下一个写入位置。
这样数据就已经拷贝到 buffer pool 里面的 undo page 中了。
这里面的mtr 是对undo的保护
undo的数据落盘,应该和data buffer一致。
我的理解,应该是内存结构和物理结构是一一对应的,这点需要有人解释下。