PostgreSQL 在何处真正开始写数据

基本关系是:

在 BackgroundWriterMain 的主循环中会调用 BgBufferSync(),完整的调用链为:BgBufferSync() --> SyncOneBuffer() --> FlushBuffer() --> smgrwrite()

看代码:

/*

 * Main entry point for bgwriter process

 *

 * This is invoked from AuxiliaryProcessMain, which has already created the

 * basic execution environment, but not enabled signals yet.

 *

 * NOTE(review): this listing is an excerpt; the "……" lines appear to mark

 * code that was elided from the function body.

 */

void

BackgroundWriterMain(void)

{

    ……

    /*

     * Loop forever

     */

    for (;;)

    {

        ……



        /*

         * Do one cycle of dirty-buffer writing.  The result is kept in

         * can_hibernate: per BgBufferSync's header comment, it is true when

         * it is appropriate for the bgwriter to go into low-power

         * hibernation mode.

         */

        can_hibernate = BgBufferSync();

        ……

    }

}

再看:

/*

 * BgBufferSync -- Write out some dirty buffers in the pool.

 *

 * This is called periodically by the background writer process.

 *

 * Returns true if it's appropriate for the bgwriter process to go into

 * low-power hibernation mode.  (This happens if the strategy clock sweep

 * has been "lapped" and no buffer allocations have occurred recently,

 * or if the bgwriter has been effectively disabled by setting

 * bgwriter_lru_maxpages to 0.)

 *

 * NOTE(review): this listing is an excerpt; the "……" lines appear to mark

 * elided code, including the declarations and setup of the variables used

 * by the loop below.

 */

bool

BgBufferSync(void)

{

    ……

    /*

     * Execute the LRU scan: one buffer is examined per iteration, starting

     * at next_to_clean, until the scan budget (num_to_scan) is used up or

     * the count of reusable buffers reaches upcoming_alloc_est.

     */

    while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)

    {

        /*

         * The second argument is skip_recently_used = true, so a buffer that

         * is pinned or has a nonzero usage count is not written.

         */

        int    buffer_state = SyncOneBuffer(next_to_clean, true);



        /* Advance the scan position; wrap to 0 and count the completed pass */

        if (++next_to_clean >= NBuffers)

        {

            next_to_clean = 0;

            next_passes++;

        }

        num_to_scan--;



        /*

         * A buffer that was written is counted as reusable.  Stop the scan

         * once this call has written bgwriter_lru_maxpages buffers, and

         * record that the limit was hit in m_maxwritten_clean.

         */

        if (buffer_state & BUF_WRITTEN)

        {

            reusable_buffers++;

            if (++num_written >= bgwriter_lru_maxpages)

            {

                BgWriterStats.m_maxwritten_clean++;

                break;

            }

        }

        /* Not written but already replaceable: still counts as reusable */

        else if (buffer_state & BUF_REUSABLE)

            reusable_buffers++;

    }

    ……

}

再看:

/*
 * SyncOneBuffer -- examine one buffer during a sync pass, writing it if needed.
 *
 * buf_id indexes BufferDescriptors.  When skip_recently_used is true, buffers
 * that are currently pinned or marked recently used are left alone, since
 * they are not replacement candidates.
 *
 * The result is a bitmask built from these flags:
 *    BUF_WRITTEN: the buffer was written.
 *    BUF_REUSABLE: the buffer can be replaced, ie, its pin count and usage
 *        count are both 0.
 *
 * (BUF_WRITTEN may be reported even though FlushBuffer found the buffer
 * already clean once it was locked; that inaccuracy is tolerated.)
 *
 * Note: the caller must already have called ResourceOwnerEnlargeBuffers.
 */
static int
SyncOneBuffer(int buf_id, bool skip_recently_used)
{
    volatile BufferDesc *desc = &BufferDescriptors[buf_id];
    int         status = 0;

    /*
     * Decide whether the buffer needs writing.
     *
     * The buffer content lock is not needed for this test, provided access
     * methods mark pages dirty *before* logging the change with XLogInsert():
     * a buffer dirtied right after we look can be ignored, because our
     * checkpoint.redo points before the log record of that upcoming change,
     * so writing such a buffer is not required of us.
     */
    LockBufHdr(desc);

    if (desc->refcount != 0 || desc->usage_count != 0)
    {
        /* Pinned or recently used: skip it if the caller asked us to */
        if (skip_recently_used)
        {
            UnlockBufHdr(desc);
            return status;
        }
    }
    else
        status |= BUF_REUSABLE;

    if ((desc->flags & BM_VALID) && (desc->flags & BM_DIRTY))
    {
        /*
         * Pin, share-lock, write.  FlushBuffer does nothing if the buffer
         * has become clean by the time it is locked.
         *
         * NOTE(review): there is no UnlockBufHdr on this path, so
         * PinBuffer_Locked presumably releases the header lock -- confirm.
         */
        PinBuffer_Locked(desc);
        LWLockAcquire(desc->content_lock, LW_SHARED);

        FlushBuffer(desc, NULL);

        LWLockRelease(desc->content_lock);
        UnpinBuffer(desc, true);

        return status | BUF_WRITTEN;
    }

    /* Invalid or not dirty, so there is nothing to write */
    UnlockBufHdr(desc);
    return status;
}

再看:

/*
 * FlushBuffer
 *        Hand one shared buffer's contents to the storage manager for writing.
 *
 * NOTE: "written" here only means the page has been passed to the kernel;
 * when it actually reaches disk is up to the kernel.  That is acceptable
 * because the changes can be redone from WAL, but before WAL can be
 * checkpointed the changes must still be forced to disk via fsync.
 *
 * Preconditions: the caller holds a pin on the buffer and a share-lock on
 * its contents.  (A share-lock does not stop hint-bit updates, so the page
 * may change while the write is in progress; that is assumed not to
 * invalidate the data written.)
 *
 * Parameters:
 *    buf  - header of the buffer to write.
 *    reln - the caller's smgr reference for the buffer's relation, or NULL.
 *           With NULL, the relation is opened here and marked "transient"
 *           so that the corresponding kernel-level file descriptors are
 *           closed when the current transaction, if any, ends.
 */
static void
FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
{
    ErrorContextCallback write_errctx;
    XLogRecPtr  flush_upto;
    instr_time  write_began;
    instr_time  write_elapsed;

    /*
     * Take the buffer's io_in_progress lock.  A false result from
     * StartBufferIO means someone else flushed the buffer first, which
     * leaves nothing for us to do.
     */
    if (!StartBufferIO(buf, false))
        return;

    /* Push an error-context entry so ereport() can give a traceback */
    write_errctx.previous = error_context_stack;
    write_errctx.callback = shared_buffer_write_error_callback;
    write_errctx.arg = (void *) buf;
    error_context_stack = &write_errctx;

    /* No smgr reference supplied: open the relation and mark it transient */
    if (!reln)
    {
        reln = smgropen(buf->tag.rnode, InvalidBackendId);
        smgrsettransient(reln);
    }

    TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
                                        buf->tag.blockNum,
                                        reln->smgr_rnode.node.spcNode,
                                        reln->smgr_rnode.node.dbNode,
                                        reln->smgr_rnode.node.relNode);

    /*
     * Basic WAL rule: log records must reach disk before the data-file
     * changes they describe.  So flush XLOG up to this buffer's LSN first.
     */
    flush_upto = BufferGetLSN(buf);
    XLogFlush(flush_upto);

    /*
     * The buffer can now safely be written.  Nobody else should have been
     * able to write it during the log flush, since we hold the
     * io_in_progress lock.
     */

    /*
     * Clear BM_JUST_DIRTIED under the header lock, so that a change to the
     * block contents made while the flush is under way can be detected.
     */
    LockBufHdr(buf);
    buf->flags &= ~BM_JUST_DIRTIED;
    UnlockBufHdr(buf);

    /* Timestamp the start of the write only when I/O timing is tracked */
    if (track_io_timing)
    {
        INSTR_TIME_SET_CURRENT(write_began);
    }

    smgrwrite(reln,
              buf->tag.forkNum,
              buf->tag.blockNum,
              (char *) BufHdrGetBlock(buf),
              false);

    /* Report the elapsed write time and add it to the usage counters */
    if (track_io_timing)
    {
        INSTR_TIME_SET_CURRENT(write_elapsed);
        INSTR_TIME_SUBTRACT(write_elapsed, write_began);
        pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(write_elapsed));
        INSTR_TIME_ADD(pgBufferUsage.blk_write_time, write_elapsed);
    }

    pgBufferUsage.shared_blks_written++;

    /*
     * End the io_in_progress state and mark the buffer clean, unless
     * BM_JUST_DIRTIED was set again in the meantime.
     */
    TerminateBufferIO(buf, true, 0);

    TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
                                       buf->tag.blockNum,
                                       reln->smgr_rnode.node.spcNode,
                                       reln->smgr_rnode.node.dbNode,
                                       reln->smgr_rnode.node.relNode);

    /* Pop our entry off the error context stack */
    error_context_stack = write_errctx.previous;
}

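可以看到,LRU 扫描循环的每次迭代最多只写出一个 buffer,乍看有些怪异。不过这应当是设计者有意为之:循环会一直进行,直到扫描额度(num_to_scan)用完、可复用的 buffer 数达到预估的分配需求(upcoming_alloc_est),或者本轮写出的页数达到 bgwriter_lru_maxpages 为止;而且按照 FlushBuffer 的注释,smgrwrite 只是把页面内容交给内核,真正落盘的时机由内核决定。也就是说,后台写进程是"有一点就写一点",以小步、持续的方式把脏页写出去。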

你可能感兴趣的:(PostgreSQL)