PostgreSQL中的WAL保留策略

一、背景描述

在PostgreSQL中,我们熟知对于wal日志的保留,有wal_keep_segments这个参数去配置。当主库日志刷新比较快时,我们一般会将其调整为一个比较大的值,来保证从库稳定地进行流复制。

但有时,我们会发现主库保留的日志远远大于这个参数配置,甚至会出现日志暴增,存在写满磁盘的风险。

这就涉及到了另外一个机制——复制槽。从官方文档描述来看,复制槽提供了一种自动化的方法,来确保主库在所有的从库收到 WAL 段之前不会移除它们。因此wal_keep_segments参数和复制槽之间肯定存在着某种关系,来确定日志保留数量。

二、原理解析

1.场景制造

以我自己制造的场景为例来探究其中的原理,
wal_keep_segments设置为16,主库创建一个复制槽standby_repl_slot给从库使用,进行流复制。

主从正常连接的情况下,停止从库;在主库进行批量dml操作,可以观察到主库保留的wal越来越多,目前已经保留179个,并还在增加中。

[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ ll  00000001*| wc -l
179
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ ll  00000001*| head -5
-rw-------. 1 postgres postgres 16777216 Oct  8 19:06 0000000100000000000000F8
-rw-------. 1 postgres postgres 16777216 Oct  8 19:06 0000000100000000000000F9
-rw-------. 1 postgres postgres 16777216 Oct  8 19:06 0000000100000000000000FA
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 0000000100000000000000FB
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 0000000100000000000000FC
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$

查看复制槽信息如下:
复制槽目前是非活跃状态

psql (11.5)
Type "help" for help.

postgres=# select * from pg_replication_slots ;
     slot_name     | plugin | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_lsn
-------------------+--------+-----------+--------+----------+-----------+--------+------------+------+--------------+-------------+------------------

 standby_repl_slot |        | physical  |        |          | f         | f      |            |      |              | 0/F8000140  |
(1 row)

postgres=#

2.代码走读

wal的保留策略是由checkpointer进程来执行的,在CreateCheckPoint或者CreateRestartPoint时,会计算需要从什么位置开始保留,然后对之前的日志进行Recycle和remove。

以CreateCheckPoint为例:
在创建检查点时,KeepLogSeg函数会计算需要保留的日志段,RemoveOldXlogFiles将不需要保留的都处理掉。

/*
 * CreateCheckPoint (abridged excerpt).
 *
 * Only the parts relevant to WAL retention are shown; the elided regions are
 * marked "... code omitted ...".  After the checkpoint record is inserted and
 * flushed, the function seeds _logSegNo from RedoRecPtr, lets KeepLogSeg()
 * lower it if older segments must be retained, and then hands everything
 * before that segment to RemoveOldXlogFiles().
 */
void
CreateCheckPoint(int flags)
{
	bool		shutdown;
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogSegNo	_logSegNo;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint32		freespace;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	curInsert;
	XLogRecPtr	last_important_lsn;
	VirtualTransactionId *vxids;
	int			nvxids;
	/* ... code omitted ... */
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE);

	XLogFlush(recptr);
	/* ... code omitted ... */
	/*
	 * Update the average distance between checkpoints if the prior checkpoint
	 * exists.
	 */
	/* i.e. feed the distance between this redo pointer and the prior one into the estimate */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last checkpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
	/* compute the oldest WAL segment number that must be kept */
	KeepLogSeg(recptr, &_logSegNo);
	/* step back one segment: that segment and everything before it is no longer needed */
	_logSegNo--;
	/* recycle or remove the segments that are no longer needed */
	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);

	/* ... code omitted ... */
}

来看保留日志的计算过程:
通过两种策略,分别计算出需要保留的wal序列号,选择其中更小者

/*
 * KeepLogSeg (PostgreSQL 11 version).
 *
 * Computes the oldest WAL segment that must be retained and lowers *logSegNo
 * to it when it is older than the value the caller passed in.  Two limits are
 * evaluated -- wal_keep_segments and the minimum LSN held by replication
 * slots -- and the smaller (older) segment number wins.
 *
 * recptr:   WAL position used as "current"; CreateCheckPoint passes the
 *           pointer returned by XLogInsert() for the checkpoint record.
 * logSegNo: in/out; on entry the segment derived from the redo pointer, on
 *           exit possibly lowered to an older segment.
 */
static void
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
{
	XLogSegNo	segno;
	XLogRecPtr	keep;
	/* segno is a WAL segment number; wal_segment_size is 16MB in this article's setup */
	/* XLByteToSeg maps recptr to the segment containing it, normally the newest WAL segment */
	XLByteToSeg(recptr, segno, wal_segment_size);
	/* keep is the LSN currently held by replication slots, */
	/* i.e. XLogCtl->replicationSlotMinLSN */
	keep = XLogGetReplicationSlotMinimumLSN();

	/* strategy 1 -- wal_keep_segments: compute the segment number to keep from */
	/* compute limit for wal_keep_segments first */
	if (wal_keep_segments > 0)
	{
		/* avoid underflow, don't go below 1 */
		if (segno <= wal_keep_segments)
			segno = 1;
		else
			/* segment to keep from = newest segment number - wal_keep_segments (16 in this article) */
			segno = segno - wal_keep_segments;
	}
	/* strategy 2 -- replication slots: compute the segment number to keep from */
	/* then check whether slots limit removal further */
	/* only applies when slots are enabled and a slot actually holds an LSN */
	if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
	{
		XLogSegNo	slotSegNo;
		/* convert the slot LSN obtained above (keep) into its segment number slotSegNo */
		XLByteToSeg(keep, slotSegNo, wal_segment_size);

		if (slotSegNo <= 0)
			segno = 1;
		/* compare both strategies and take the smaller value, i.e. the older WAL segment */
		else if (slotSegNo < segno)
			segno = slotSegNo;
	}
	/* only when the computed segment is older than the caller's value is *logSegNo lowered */
	/* don't delete WAL segments newer than the calculated segment */
	if (segno < *logSegNo)
		*logSegNo = segno;
}

再来看清理日志的逻辑:

/*
 * RemoveOldXlogFiles.
 *
 * Scans the WAL directory and, for every WAL segment file whose name (with
 * the timeline prefix skipped) sorts at or before segment "segno", calls
 * RemoveXlogFile() -- provided XLogArchiveCheckDone() accepts the file.
 *
 * segno:      last segment that may be recycled/removed (the "recycle point").
 * RedoRecPtr: passed through to RemoveXlogFile().
 * endptr:     passed through to RemoveXlogFile().
 */
static void
RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[MAXFNAMELEN];

	/*
	 * Construct a filename of the last segment to be kept. The timeline ID
	 * doesn't matter, we ignore that in the comparison. (During recovery,
	 * ThisTimeLineID isn't set, so we can't use that.)
	 */
	/* build the WAL file name for segno; the article calls this the recycle point */
	XLogFileName(lastoff, 0, segno, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",		 lastoff);

	xldir = AllocateDir(XLOGDIR);
	/* iterate over every entry in the pg_wal directory */
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments */
		if (!IsXLogFileName(xlde->d_name) &&
			!IsPartialXLogFileName(xlde->d_name))
			continue;

		/*
		 * We ignore the timeline part of the XLOG segment identifiers in
		 * deciding whether a segment is still needed.  This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
		 *
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
		 */
		/*
		 * strcmp on the names with the 8-character timeline prefix skipped:
		 * a segment at or before the recycle point is a candidate, and it is
		 * recycled/removed only if the archive check below also passes.
		 */
		if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
		{
			/*
			 * Check that archiving of this segment is done.  NOTE(review):
			 * per the article this corresponds to a matching .done file under
			 * pg_wal/archive_status when archiving is enabled -- confirm
			 * against XLogArchiveCheckDone().
			 */
			if (XLogArchiveCheckDone(xlde->d_name))
			{
				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);
				/* the actual recycle-or-remove routine */
				RemoveXlogFile(xlde->d_name, RedoRecPtr, endptr);
			}
		}
	}

	FreeDir(xldir);
}

RemoveXlogFile里边进行日志回收以及清理,回收就是从不需要保留的日志中选择一部分来给未来使用(回收数量和两次checkpoint间产生wal量有关系),其余的会被清理掉。

/*
 * Recycle or remove a log file that's no longer needed.
 *
 * endptr is current (or recent) end of xlog, and RedoRecPtr is the
 * redo pointer of the last checkpoint. These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 * If RedoRecPtr is not known, pass invalid, and the function will recycle,
 * somewhat arbitrarily, 10 future segments.
 *
 * segname is the bare file name of the segment inside XLOGDIR.  On every
 * successful recycle or removal, XLogArchiveCleanup(segname) is called and
 * the matching CheckpointStats counter is incremented; on a failed rename
 * (WIN32) or unlink the function returns early without doing either.
 */
static void
RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
{
	char		path[MAXPGPATH];
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
	struct stat statbuf;
	XLogSegNo	endlogSegNo;
	XLogSegNo	recycleSegNo;

	/*
	 * Initialize info about where to try to recycle to.
	 */
	/* segment number containing endptr, i.e. the current newest WAL segment */
	XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
	/* key step: compute the highest segment number to recycle up to, recycleSegNo */
	/* when RedoRecPtr is invalid (not known), the limit is the current segment number + 10, */
	/* i.e. up to 10 future segments are recycled */
	if (RedoRecPtr == InvalidXLogRecPtr)
		recycleSegNo = endlogSegNo + 10;
	else /* RedoRecPtr is known: XLOGfileslop() computes the recycle limit */
		recycleSegNo = XLOGfileslop(RedoRecPtr);

	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);

	/*
	 * Before deleting the file, see if it can be recycled as a future log
	 * segment. Only recycle normal files, pg_standby for example can create
	 * symbolic links pointing to a separate archive directory.
	 */
	/*
	 * Recycle via InstallXLogFileSegment() when the current segment number is
	 * less than or equal to the recycle limit, the path is a regular file,
	 * and a free future slot is found.
	 */
	if (endlogSegNo <= recycleSegNo &&
		lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
		InstallXLogFileSegment(&endlogSegNo, path,
							   true, recycleSegNo, true))
	{	/* at DEBUG2 log level, report that this WAL file was recycled */
		ereport(DEBUG2,
				(errmsg("recycled write-ahead log file \"%s\"",
						segname)));
		CheckpointStats.ckpt_segs_recycled++;
		/* Needn't recheck that slot on future iterations */
		endlogSegNo++;
	}
	else /* otherwise remove the WAL file */
	{
		/* No need for any more future segments... */
		int			rc;

		ereport(DEBUG2,
				(errmsg("removing write-ahead log file \"%s\"",
						segname)));

#ifdef WIN32

		/*
		 * On Windows, if another process (e.g another backend) holds the file
		 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
		 * will still show up in directory listing until the last handle is
		 * closed. To avoid confusing the lingering deleted file for a live
		 * WAL file that needs to be archived, rename it before deleting it.
		 *
		 * If another process holds the file open without FILE_SHARE_DELETE
		 * flag, rename will fail. We'll try again at the next checkpoint.
		 */
		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
		if (rename(path, newpath) != 0)
		{
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not rename old write-ahead log file \"%s\": %m",
							path)));
			return;
		}
		rc = durable_unlink(newpath, LOG);
#else   /* not WIN32: delete the WAL file directly with durable_unlink */
		rc = durable_unlink(path, LOG);
#endif
		if (rc != 0)
		{
			/* Message already logged by durable_unlink() */
			return;
		}
		CheckpointStats.ckpt_segs_removed++;
	}

	XLogArchiveCleanup(segname);
}

InstallXLogFileSegment函数进行回收重用,回收至recycleSegNo返回false

/*
 * InstallXLogFileSegment.
 *
 * Moves the file at tmppath into place as WAL segment *segno.
 *
 * segno:     in/out; desired segment number.  With find_free it is advanced
 *            to the first segment number whose file does not yet exist.
 * tmppath:   path of the file to install.
 * find_free: if false, any existing file at the target is unlinked first;
 *            if true, search upward from *segno for a free slot.
 * max_segno: upper bound for the find_free search (RemoveXlogFile passes its
 *            recycleSegNo here).
 * use_lock:  if true, ControlFileLock is held exclusively for the duration.
 *
 * Returns true on success; false if no free slot exists up to max_segno or
 * if durable_link_or_rename() fails.
 */
static bool
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
					   bool find_free, XLogSegNo max_segno,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	struct stat stat_buf;

	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		durable_unlink(path, DEBUG1);
	}
	else
	{
		/* Find a free slot to put it in */
		/* max_segno is the caller's recycleSegNo */
		while (stat(path, &stat_buf) == 0)
		{	/* once the segment number has reached max_segno, return false; the caller RemoveXlogFile then takes its removal branch */
			if ((*segno) >= max_segno)
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			/* advance the segment number by one, up to max_segno */
			(*segno)++;
			/* rebuild the target file name (including the pg_wal path) for the new segno */
			XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
		}
	}

	/*
	 * Perform the rename using link if available, paranoidly trying to avoid
	 * overwriting an existing file (there shouldn't be one).
	 */
	/* durable_link_or_rename moves the old file (tmppath) to the new name (path) */
	if (durable_link_or_rename(tmppath, path, LOG) != 0)
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		/* durable_link_or_rename already emitted log message */
		return false;
	}

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}

3.跟踪验证

以上已经初步的分析了下wal保留计算和回收清理的过程,下面我们跟踪checkpointer进程验证下。

attach checkpointer后,手动触发一次checkpoint

给两个函数设置断点

(gdb) bt
#0  0x00007f815452f0d3 in __select_nocancel () at ../sysdeps/unix/syscall-template.S:81
#1  0x000000000085fbca in pg_usleep (microsec=microsec@entry=100000) at pgsleep.c:56
#2  0x00000000006b3601 in CheckpointWriteDelay (flags=flags@entry=128, progress=progress@entry=0.79970104633781758) at checkpointer.c:718
#3  0x00000000006fd749 in BufferSync (flags=flags@entry=128) at bufmgr.c:2014
#4  0x00000000006fd826 in CheckPointBuffers (flags=flags@entry=128) at bufmgr.c:2578
#5  0x00000000004f33f1 in CheckPointGuts (checkPointRedo=5472965584, flags=flags@entry=128) at xlog.c:9149
#6  0x00000000004f99d7 in CreateCheckPoint (flags=flags@entry=128) at xlog.c:8937
#7  0x00000000006b3272 in CheckpointerMain () at checkpointer.c:491
#8  0x0000000000508715 in AuxiliaryProcessMain (argc=argc@entry=2, argv=argv@entry=0x7ffcb8ac85c0) at bootstrap.c:451
#9  0x00000000006bc7e9 in StartChildProcess (type=CheckpointerProcess) at postmaster.c:5337
#10 0x00000000006bda8a in reaper (postgres_signal_arg=<optimized out>) at postmaster.c:2867
#11 <signal handler called>
#12 0x00007f815452f0d3 in __select_nocancel () at ../sysdeps/unix/syscall-template.S:81
#13 0x000000000047a7f5 in ServerLoop () at postmaster.c:1671
#14 0x00000000006bee79 in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x1d45c50) at postmaster.c:1380
#15 0x000000000047bb91 in main (argc=3, argv=0x1d45c50) at main.c:228
(gdb) b KeepLogSeg
Breakpoint 1 at 0x4f0fa0: file xlog.c, line 9463.
(gdb) b RemoveOldXlogFiles
Breakpoint 2 at 0x4f3410: file xlog.c, line 3896.
(gdb) c
Continuing.

Breakpoint 1, KeepLogSeg (recptr=recptr@entry=5472965752, logSegNo=logSegNo@entry=0x7ffcb8ac83a8) at xlog.c:9463
9463            XLByteToSeg(recptr, segno, wal_segment_size);
(gdb) bt
#0  KeepLogSeg (recptr=recptr@entry=5472965752, logSegNo=logSegNo@entry=0x7ffcb8ac83a8) at xlog.c:9463
#1  0x00000000004f9bf9 in CreateCheckPoint (flags=flags@entry=128) at xlog.c:9046
#2  0x00000000006b3272 in CheckpointerMain () at checkpointer.c:491
#3  0x0000000000508715 in AuxiliaryProcessMain (argc=argc@entry=2, argv=argv@entry=0x7ffcb8ac85c0) at bootstrap.c:451
#4  0x00000000006bc7e9 in StartChildProcess (type=CheckpointerProcess) at postmaster.c:5337
#5  0x00000000006bda8a in reaper (postgres_signal_arg=<optimized out>) at postmaster.c:2867
#6  <signal handler called>
#7  0x00007f815452f0d3 in __select_nocancel () at ../sysdeps/unix/syscall-template.S:81
#8  0x000000000047a7f5 in ServerLoop () at postmaster.c:1671
#9  0x00000000006bee79 in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x1d45c50) at postmaster.c:1380
#10 0x000000000047bb91 in main (argc=3, argv=0x1d45c50) at main.c:228

跟踪保留wal序列号计算过程:

(gdb) list
9459    {
9460            XLogSegNo       segno;
9461            XLogRecPtr      keep;
9462
9463            XLByteToSeg(recptr, segno, wal_segment_size);
9464            keep = XLogGetReplicationSlotMinimumLSN();
9465
9466            /* compute limit for wal_keep_segments first */
9467            if (wal_keep_segments > 0)
9468            {
/*wal_keep_segments为16*/
(gdb) p wal_keep_segments
$13 = 16
(gdb) n
9463            XLByteToSeg(recptr, segno, wal_segment_size);
(gdb)/*当前最新wal序列号为326*/
(gdb) p segno
$14 = 326
9464            keep = XLogGetReplicationSlotMinimumLSN();
(gdb)
9467            if (wal_keep_segments > 0)
(gdb)
9473                            segno = segno - wal_keep_segments;
(gdb)/*根据wal_keep_segments策略计算的wal序列号为310*/
(gdb) p segno
$15 = 310
9477            if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
/*复制槽保存的位点为4160749888*/
(gdb) p keep
$16 = 4160749888
(gdb) p  XLogCtl->replicationSlotMinLSN
$17 = 4160749888
(gdb) n
9481                    XLByteToSeg(keep, slotSegNo, wal_segment_size);
/*根据复制槽策略计算的wal序列号为248*/
(gdb) p slotSegNo
$18 = 248
9483                    if (slotSegNo <= 0)
9485				else if (slotSegNo < segno)
9486					segno = slotSegNo;
/*两种策略比较,选择小值(复制槽计算更小)*/
(gdb) p segno 
$18 = 248
9490            if (segno < *logSegNo)
(gdb)
9491                    *logSegNo = segno;
(gdb)
9492    }
/*最终确定wal序列号为248*/
(gdb) p *logSegNo
$19 = 248

根据策略计算出序列号为248,这个值为十进制,转换为十六进制为F8,即对应wal文件为:0000000100000000000000F8,可以看到确实是从这个wal开始保留的

[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ ll  00000001*| head -5
-rw-------. 1 postgres postgres 16777216 Oct  8 19:06 0000000100000000000000F8
-rw-------. 1 postgres postgres 16777216 Oct  8 19:06 0000000100000000000000F9
-rw-------. 1 postgres postgres 16777216 Oct  8 19:06 0000000100000000000000FA
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 0000000100000000000000FB
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 0000000100000000000000FC
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$

同样也可以通过系统视图查询,确定复制槽保存点目前对应的wal为0000000100000000000000F8

pg_replication_slots 视图的restart_lsn字段是每个复制槽各自保存的位点;XLogCtl->replicationSlotMinLSN则是所有复制槽restart_lsn中的最小值。本例只有一个复制槽,因此两者相同。

psql (11.5)
Type "help" for help.

postgres=# select * from pg_replication_slots ;
     slot_name     | plugin | slot_type | datoid | database | temporary | active | active_pid | xmin | catalog_xmin | restart_lsn | confirmed_flush_l
sn
-------------------+--------+-----------+--------+----------+-----------+--------+------------+------+--------------+-------------+------------------
---
 standby_repl_slot |        | physical  |        |          | f         | f      |            |      |              | 0/F8000140  |
(1 row)

postgres=# select pg_walfile_name('0/F8000140');
     pg_walfile_name
--------------------------
 0000000100000000000000F8
(1 row)

postgres=#

接着来看remove过程:

/* 将序列号减一 */
9047            _logSegNo--;
(gdb)
9048            RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
(gdb)
/* 可以看到wal序列号已经为247 */
Breakpoint 2, RemoveOldXlogFiles (segno=247, RedoRecPtr=5472965584, endptr=endptr@entry=5472965752) at xlog.c:3896
3896    {
(gdb) list
3891     * redo pointer of the last checkpoint. These are used to determine
3892     * whether we want to recycle rather than delete no-longer-wanted log files.
3893     */
3894    static void
3895    RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
3896    {
3897            DIR                *xldir;
3898            struct dirent *xlde;
3899            char            lastoff[MAXFNAMELEN];
3900
(gdb) n
3906            XLogFileName(lastoff, 0, segno, wal_segment_size);
(gdb)
3908            elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
/*247对应为F7*/
(gdb) p lastoff
$30 = '0' <repeats 22 times>, "F7\000\362o\000\000\000\000\000x\324\066F\001\000\000\000\200\245WE\201\177\000\000\000\000\000\000\000\000\000\000x\324\066F\001\000\000"
(gdb)
(gdb) /* 然后就开始遍历pg_wal */
3913            while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
(gdb)/* 如果文件名小于等于F7,那么就进入删除逻辑*/
	 /* 这里对文件名+8,是为了去除时间线信息,文件名使用数组表示*/
	 /* wal文件的前8位表示时间线,下标移动8就去除时间线信息了*/
3931                    if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
     /*之后就是RemoveXlogFile重用/删除日志,因为现在lastoff为F7这个wal日志 */
     /*但是已经在上次checkpoint被清理了,目前pg_wal目录下的日志序列号*/
     /*都大于F7都不满足if条件,因此本次跟踪不到RemoveXlogFile函数了*/

可以看到目前最老的wal日志正是F8,说明上次或者很早之前的一次checkpoint,已经把F7及之前的wal日志重用并清理了。

篇幅问题,这里不展示wal重用和删除的跟踪过程了,在下边恢复方案中,会结合两次checkpoint,简述wal重用数量的计算过程。

4.恢复方案

我这里制造的场景是主从通过复制槽进行同步,从库异常停机,导致复制槽位点停止更新,wal保留越来越多。

解决方案可以为,启动从库,从库追赶wal日志,很快就可以恢复了。

启动从库后,查询流复制信息

/*刚启动时,从库在追赶wal,state为catchup*/
postgres=# select * from pg_stat_replication ;
-[ RECORD 1 ]----+------------------------------
pid              | 15615
usesysid         | 16397
usename          | repuser
application_name | walreceiver
client_addr      | 192.168.92.128
client_hostname  |
client_port      | 56990
backend_start    | 2020-10-09 10:58:16.054867+08
backend_xmin     |
state            | catchup
sent_lsn         | 1/1F800000
write_lsn        | 1/1F7C0000
flush_lsn        | 1/1F7C0000
replay_lsn       | 1/1F77FFE0
write_lag        | 00:00:27.524233
flush_lag        | 00:00:27.524233
replay_lag       | 00:00:27.524233
sync_priority    | 0
sync_state       | async
/*经过一段时间后,已经追到最新的wal日志了,state为streaming */
postgres=# select * from pg_stat_replication ;
-[ RECORD 1 ]----+------------------------------
pid              | 15615
usesysid         | 16397
usename          | repuser
application_name | walreceiver
client_addr      | 192.168.92.128
client_hostname  |
client_port      | 56990
backend_start    | 2020-10-09 10:58:16.054867+08
backend_xmin     |
state            | streaming
sent_lsn         | 1/4636D590
write_lsn        | 1/4636D590
flush_lsn        | 1/4636D590
replay_lsn       | 1/4636D590
write_lag        |
flush_lag        |
replay_lag       |
sync_priority    | 0
sync_state       | async

postgres=# select pg_walfile_name('1/4636D590');
-[ RECORD 1 ]---+-------------------------
pg_walfile_name | 000000010000000100000046

同时验证主库wal保留情况:

手动触发几次checkpoint后,可以看到wal的数量在减少

[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ psql
psql (11.5)
Type "help" for help.

postgres=# checkpoint;
CHECKPOINT
postgres=# exit
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ ll  00000001*| wc -l
175
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ ll  00000001*| wc -l
48
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]$ ll | tail -5
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 000000010000000100000063
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 000000010000000100000064
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 000000010000000100000065
drwx------. 2 postgres postgres     4096 Sep 18 20:23 archive_status
-rw-rw-r--. 1 postgres postgres     1861 Oct  5 22:58 waldump.log
[postgres@postgres_primary:pg11.5:6548 /opt/postgres/postgresql-11.5/pg11debug/data/pg_wal]

但是最终保留的个数为48个,目前主库已经停了业务,从库也在复制最新的日志,按照wal_keep_segments和复制槽计算保留17个,那么多出来的31个应该就是重用的日志了。

目前最新的日志为000000010000000100000046,文件时间为Oct 9 14:47,而从47开始一直到65时间都是前一天的,都是被回收重用的

-rw-------. 1 postgres postgres 16777216 Oct  9 14:47 000000010000000100000046
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 000000010000000100000047
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 000000010000000100000048
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 000000010000000100000049
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 00000001000000010000004A
-rw-------. 1 postgres postgres 16777216 Oct  8 19:09 00000001000000010000004B

在跟踪过程中我保留了两次checkpoint信息,一起来看看,old wal重用数量是怎么确定的:

1.计算两次checkpoint的偏移量CheckPointDistanceEstimate

UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr)函数中计算
CheckPointDistanceEstimate = 5472965584 - 5159934961
                           = 313030623 bytes

2.计算未来最大重用wal序列号recycleSegNo

XLOGfileslop(RedoRecPtr)函数中计算:
/* 根据min_wal_size参数计算minSegNo */
minSegNo = RedoRecPtr / wal_segment_size +
		ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1
         = 5472965584 / 16777216 + 80 / 16 - 1
         = 330
/* 根据max_wal_size参数计算maxSegNo */
maxSegNo = RedoRecPtr / wal_segment_size +
		ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
		= 5472965584 / 16777216 + 1024 / 16 - 1
		= 389
/*估算两次checkpoint间的wal量*/
distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;   
	/* add 10% for good measure. */
	distance *= 1.10;
		= (1.0 + 0.5 ) * 313030623  * 1.10
		= 516500527.95
/* 计算recycleSegNo */
recycleSegNo = (XLogSegNo) ceil(((double) RedoRecPtr + distance) /wal_segment_size);
			= ceil((5472965584 + 516500527.95 ) / 16777216)
			= 5989466112 / 16777216
			= 357

	if (recycleSegNo < minSegNo)
		recycleSegNo = minSegNo;
	if (recycleSegNo > maxSegNo)
		recycleSegNo = maxSegNo;
/*recycleSegNo < maxSegNo && recycleSegNo > minSegNo,因此直接返回计算值357*/
	return recycleSegNo;

经过计算,未来最大重用号recycleSegNo 为357,对应为最后一个wal 000000010000000100000065

当前最新wal 000000010000000100000046 对应为326,
357 - 326刚好为31

刚才恢复是在从库可以正常启动的情况下,那么如果从库出现异常,无法启动呢?这种情况下,如果比较紧急,主库业务量还未减小,除了紧急扩容,可以考虑将复制槽删除掉,先规避风险。

三、总结反思

通过前边的分析,我们可以将wal保留策略用简单的公式来表达:

1.计算保留数量keep1

/*根据wal_keep_segments计算 */
segno = 最新walsegNo - wal_keep_segments;
/* 根据复制槽计算,如果有多个复制槽,取其中最小值 */
slotSegNo = XLogCtl->replicationSlotMinLSN / wal_segment_size;
/*取两者间最小值*/
keep_walsegno = min {segno,slotSegNo};
/*计算keep1*/
keep1 = 最新walsegNo - keep_walsegno + 1    /* keep_walsegno和最新wal本身都会保留,如 326 - 310 + 1 = 17 */

2.计算回收数量keep2

/*计算recycleSegNo*/
recycleSegNo = (XLogSegNo) ceil(((double) RedoRecPtr + (RedoRecPtr - PriorRedoPtr ) * 1.5 * 1.10) /wal_segment_size)

/*计算keep2*/
keep2 = recycleSegNo - 最新walsegNo

3.pg_wal目录下总数量keep

keep = keep1 + keep2
/*当两次checkpoint间的wal不多时,keep2的值会比较小*/
		

当然这个keep是一个参考值,无法做到精确。

结合之前的代码走读,我们可以概括出几种主库wal日志暴增的场景:

1.wal_keep_segments被调大;
2.复制槽长时间处于非活跃状态(查询pg_replication_slots视图active字段为f);
3.archiver进程异常(如果开启),archiver异常情况下,不会进入remove函数清理日志;
4.checkpointer进程异常,这个可能性很小

其中见到最多的就是复制槽长时间处于非活跃状态这种场景,因为除了物理复制,复制槽还常用来做逻辑订阅,很容易出现订阅端故障,导致restart_lsn不刷新。所以我们可以增加复制槽状态的告警,一旦发现非活跃复制槽,及时告警,及时处理。

是不是感觉这个复制槽有点奇葩?初衷是为了保留足够的日志,让从库,或者订阅端稳定地进行复制。但是没有考虑磁盘暴增的风险?

不用担心,PostgreSQL13已经做了调整,新增参数:max_slot_wal_keep_size ,来控制复制槽保留的最大日志量,配置这个参数后不会无限增长

参数描述如下:
max_slot_wal_keep_size (integer)

Specify the maximum size of WAL files that replication slots are allowed to retain in the pg_wal directory at checkpoint time. If max_slot_wal_keep_size is -1 (the default), replication slots may retain an unlimited amount of WAL files. Otherwise, if restart_lsn of a replication slot falls behind the current LSN by more than the given size, the standby using the slot may no longer be able to continue replication due to removal of required WAL files. You can see the WAL availability of replication slots in pg_replication_slots.

代码如下:
不详细分析了,可以看到加入了max_slot_wal_keep_size_mb对应的控制逻辑,和wal_keep_size参数一起决策wal的保留数量。

/*
 * KeepLogSeg (PostgreSQL 13 version).
 *
 * Computes the oldest WAL segment that must be retained and lowers *logSegNo
 * to it when it is older than the caller's value.  The slot-held position is
 * evaluated first and capped by max_slot_wal_keep_size_mb (when >= 0); then
 * wal_keep_size_mb, if set, guarantees a minimum number of retained segments.
 *
 * recptr:   WAL position used as "current".
 * logSegNo: in/out; possibly lowered to an older segment number.
 */
static void
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
{
	XLogSegNo	currSegNo;
	XLogSegNo	segno;
	XLogRecPtr	keep;

	/* start from the segment that contains recptr */
	XLByteToSeg(recptr, currSegNo, wal_segment_size);
	segno = currSegNo;

	/*
	 * Calculate how many segments are kept by slots first, adjusting for
	 * max_slot_wal_keep_size.
	 */
	keep = XLogGetReplicationSlotMinimumLSN();
	if (keep != InvalidXLogRecPtr)
	{
		XLByteToSeg(keep, segno, wal_segment_size);

		/* Cap by max_slot_wal_keep_size ... */
		/* control logic for the new parameter: slots may hold back at most slot_keep_segs segments */
		if (max_slot_wal_keep_size_mb >= 0)
		{
			uint64		slot_keep_segs;

			slot_keep_segs =
				ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);

			if (currSegNo - segno > slot_keep_segs)
				segno = currSegNo - slot_keep_segs;
		}
	}

	/* but, keep at least wal_keep_size if that's set */
	if (wal_keep_size_mb > 0)
	{
		uint64		keep_segs;

		keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
		/* only widen retention when fewer than keep_segs segments would be kept */
		if (currSegNo - segno < keep_segs)
		{
			/* avoid underflow, don't go below 1 */
			if (currSegNo <= keep_segs)
				segno = 1;
			else
				segno = currSegNo - keep_segs;
		}
	}

	/* don't delete WAL segments newer than the calculated segment */
	if (segno < *logSegNo)
		*logSegNo = segno;
}

你可能感兴趣的:(PostgreSQL,postgresql,运维)