pg 初始化完 shmem ,给其加上索引 "ShmemIndex" 后,接着就在 shmem 里初始化 xlog 。
1 先上个图,看一下函数调用过程梗概,中间略过部分细节
初始化 xlog 方法调用流程图
2 初始化 xlog 相关结构
话说 main()-> … ->PostmasterMain()-> … ->reset_shared() -> CreateSharedMemoryAndSemaphores()> … ->XLOGSHmemInit() ,初始化控制文件 data/global/pg_control 相关数据结构及事务日志 xlog 相关数据结构,相关结构定义在下面。
typedef struct ControlFileData
{
/*
* Unique system identifier --- to ensure we match up xlog files with the
* installation that produced them.
*/
uint64 system_identifier;
/*
* Version identifier information. Keep these fields at the same offset,
* especially pg_control_version; they won't be real useful if they move
* around. (For historical reasons they must be 8 bytes into the file
* rather than immediately at the front.)
*
* pg_control_version identifies the format of pg_control itself.
* catalog_version_no identifies the format of the system catalogs.
*
* There are additional version identifiers in individual files; for
* example, WAL logs contain per-page magic numbers that can serve as
* version cues for the WAL log.
*/
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */
/*
* System status data
*/
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint checkPointCopy; /* copy of last check point record */
/*
* These two values determine the minimum point we must recover up to
* before starting up:
*
* minRecoveryPoint is updated to the latest replayed LSN whenever we
* flush a data change during archive recovery. That guards against
* starting archive recovery, aborting it, and restarting with an earlier
* stop location. If we've already flushed data changes from WAL record X
* to disk, we mustn't start up until we reach X again. Zero when not
* doing archive recovery.
*
* backupStartPoint is the redo pointer of the backup start checkpoint, if
* we are recovering from an online backup and haven't reached the end of
* backup yet. It is reset to zero when the end of backup is reached, and
* we mustn't start up before that. A boolean would suffice otherwise, but
* we use the redo pointer as a cross-check when we see an end-of-backup
* record, to make sure the end-of-backup record corresponds the base
* backup we're recovering from.
*/
XLogRecPtr minRecoveryPoint;
XLogRecPtr backupStartPoint;
/*
* Parameter settings that determine if the WAL can be used for archival
* or hot standby.
*/
int wal_level;
int MaxConnections;
int max_prepared_xacts;
int max_locks_per_xact;
/*
* This data is used to check for hardware-architecture compatibility of
* the database and the backend executable. We need not check endianness
* explicitly, since the pg_control version will surely look wrong to a
* machine of different endianness, but we do need to worry about MAXALIGN
* and floating-point format. (Note: storage layout nominally also
* depends on SHORTALIGN and INTALIGN, but in practice these are the same
* on all architectures of interest.)
*
* Testing just one double value is not a very bulletproof test for
* floating-point compatibility, but it will catch most cases.
*/
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
#define FLOATFORMAT_VALUE 1234567.0
/*
* This data is used to make sure that configuration of this database is
* compatible with the backend executable.
*/
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */
uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */
uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
/* flag indicating internal format of timestamp, interval, time */
bool enableIntTimes; /* int64 storage enabled? */
/* flags indicating pass-by-value status of various types */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* CRC of all above ... MUST BE LAST! */
pg_crc32 crc;
} ControlFileData;
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
* a copy of the latest one in pg_control for possible disaster recovery.
* Changing this struct requires a PG_CONTROL_VERSION bump.
*/
typedef struct CheckPoint
{
XLogRecPtr redo; /* next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
TimeLineID ThisTimeLineID; /* current TLI */
uint32 nextXidEpoch; /* higher-order bits of nextXid */
TransactionId nextXid; /* next free XID */
Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
Oid oldestXidDB; /* database with minimum datfrozenxid */
pg_time_t time; /* time stamp of checkpoint */
/*
* Oldest XID still running. This is only needed to initialize hot standby
* mode from an online checkpoint, so we only bother calculating this for
* online checkpoints and only when wal_level is hot_standby. Otherwise
* it's set to InvalidTransactionId.
*/
TransactionId oldestActiveXid;
} CheckPoint;
/*
* Total shared-memory state for XLOG.
*/
typedef struct XLogCtlData
{
/* Protected by WALInsertLock: */
XLogCtlInsert Insert;
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
XLogwrtResult LogwrtResult;
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
uint32 lastRemovedLog; /* latest removed/recycled XLOG segment */
uint32 lastRemovedSeg;
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
/*
* These values do not change after startup, although the pointed-to pages
* and xlblocks values certainly do. Permission to read/write the pages
* and xlblocks values depends on WALInsertLock and WALWriteLock.
*/
char *pages; /* buffers for unwritten XLOG pages */
XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
int XLogCacheBlck; /* highest allocated xlog buffer index */
TimeLineID ThisTimeLineID;
TimeLineID RecoveryTargetTLI;
/*
* archiveCleanupCommand is read from recovery.conf but needs to be in
* shared memory so that the bgwriter process can access it.
*/
char archiveCleanupCommand[MAXPGPATH];
/*
* SharedRecoveryInProgress indicates if we're still in crash or archive
* recovery. Protected by info_lck.
*/
bool SharedRecoveryInProgress;
/*
* SharedHotStandbyActive indicates if we're still in crash or archive
* recovery. Protected by info_lck.
*/
bool SharedHotStandbyActive;
/*
* recoveryWakeupLatch is used to wake up the startup process to continue
* WAL replay, if it is waiting for WAL to arrive or failover trigger file
* to appear.
*/
Latch recoveryWakeupLatch;
/*
* During recovery, we keep a copy of the latest checkpoint record here.
* Used by the background writer when it wants to create a restartpoint.
*
* Protected by info_lck.
*/
XLogRecPtr lastCheckPointRecPtr;
CheckPoint lastCheckPoint;
/* end+1 of the last record replayed (or being replayed) */
XLogRecPtr replayEndRecPtr;
/* end+1 of the last record replayed */
XLogRecPtr recoveryLastRecPtr;
/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
TimestampTz recoveryLastXTime;
/* Are we requested to pause recovery? */
bool recoveryPause;
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
/*
* Shared state data for XLogInsert.
*/
typedef struct XLogCtlInsert
{
XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
XLogRecPtr PrevRecord; /* start of previously-inserted record */
int curridx; /* current block index in cache */
XLogPageHeader currpage; /* points to header of block in cache */
char *currpos; /* current insertion point in cache */
XLogRecPtr RedoRecPtr; /* current redo point for insertions */
bool forcePageWrites; /* forcing full-page writes for PITR? */
/*
* exclusiveBackup is true if a backup started with pg_start_backup() is
* in progress, and nonExclusiveBackups is a counter indicating the number
* of streaming base backups currently in progress. forcePageWrites is set
* to true when either of these is non-zero. lastBackupStart is the latest
* checkpoint redo location used as a starting point for an online backup.
*/
bool exclusiveBackup;
int nonExclusiveBackups;
XLogRecPtr lastBackupStart;
} XLogCtlInsert;
在 XLOGSHmemInit() 函数里,首先在 shmem 的哈希表索引 "ShmemIndex" 上给控制文件 pg_control 增加一个 HashElement 和 ShmemIndexEnt ( entry ), 在 shmem 里根据 ControlFileData 大小调用 ShmemAlloc() 分配内存空间,使 ShmemIndexEnt 的成员 location 指向该空间, size 成员记录该空间大小 。
XLOGSHmemInit() 调用 ShmemInitStruct() , 在其中 调用 hash_search() 在哈希表索引 "ShmemIndex" 中查找 "XLOG Ctl" ,如果没有,就在 shmemIndex 中给 "XLOG Ctl" 分一个 HashElement 和 ShmemIndexEnt ( entry ) ,在其中的 Entry 中写上 "XLOG Ctl" 。返回 ShmemInitStruct() ,再调用ShmemAlloc() 在共享内存上给"XLOG Ctl" 相关结构(见下面“ XLog 相关结构图” )分配空间,设置 entry (在这儿及ShmemIndexEnt 类型变量)的成员 location 指向该空间, size 成员记录该空间大小 , 最后返回 XLOGShmemInit() ,让 XLogCtlData * 类型静态 全局变量 XLogCtl 指向在shmem 里给"XLOG Ctl" 相关结构分配的内存地址,设置其中XLogCtlData 结构类型的成员值。 初始化完成后数据结构如下图。
初始化完 xlog 的内存结构图
为了精简上图,把创建 shmem 的哈希表索引 "ShmemIndex" 时创建的 HCTL 结构删掉了,这个结构的作用是记录创建可扩展哈希表的相关信息。增加了左边灰色底的部分,描述 共享内存 /shmem 里各变量物理布局概览,由下往上,由低地址到高地址。其中的 "Control File" 即 ControlFileDate 和 "XLOG Ctl" 即 xlog 的相关结构图下面分别给出,要不上面的图太大了。
控制文件结构图
上图中 ControlFileData 结构中的 XLogRecPtr 和 CheckPoint 不是指针,因此应该用右边的相应结构图代替,把这两个合进去有点费劲,将就着看吧。
XLog 相关结构图