PostgreSQL程序员

PostgreSQL 源码解读(113)- WAL#9(Inse

2018-12-29  本文已影响13人  EthanHe

本节重点跟踪分析了ReserveXLogInsertLocation和CopyXLogRecordToWAL函数的实现逻辑,ReserveXLogInsertLocation函数为XLOG Record预留合适的空间,CopyXLogRecordToWAL则负责拷贝XLOG Record到WAL buffer的保留空间中。

一、数据结构

全局变量


/* flags for the in-progress insertion */
static uint8 curinsert_flags = 0;

/*
 * These are used to hold the record header while constructing a record.
 * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization,
 * because we want it to be MAXALIGNed and padding bytes zeroed.
 *
 * For simplicity, it's allocated large enough to hold the headers for any
 * WAL record.
 */
static XLogRecData hdr_rdt;
static char *hdr_scratch = NULL;

#define SizeOfXlogOrigin    (sizeof(RepOriginId) + sizeof(char))

#define HEADER_SCRATCH_SIZE \
    (SizeOfXLogRecord + \
     MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \
     SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin)
/*
 * An array of XLogRecData structs, to hold registered data.
 */
static XLogRecData *rdatas;
static int  num_rdatas;         /* entries currently used */
static int  max_rdatas;         /* allocated size */
/* whether XLogBeginInsert has been called */
static bool begininsert_called = false;

static XLogCtlData *XLogCtl = NULL;

/*
 * flags for the in-progress insertion
 * NOTE(review): this repeats the curinsert_flags declaration at the top of
 * this listing; the listing appears to combine excerpts from more than one
 * source file, so it is not meant to be compiled as a single unit.
 */
static uint8 curinsert_flags = 0;

/*
 * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered
 * with XLogRegisterData(...).
 */
static XLogRecData *mainrdata_head;
static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head;
/* total size of the whole chain, not of a single element */
static uint32 mainrdata_len; /* total # of bytes in chain */

/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
 *
 * While in parallel mode, this may not be fully up to date.  When committing,
 * a transaction can assume this covers all xlog records written either by the
 * user backend or by any parallel worker which was present at any point during
 * the transaction.  But when aborting, or when still in parallel mode, other
 * parallel backends may have written WAL records at later LSNs than the value
 * stored here.  The parallel leader advances its own copy, when necessary,
 * in WaitForParallelWorkersToFinish.
 */
XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;

/* For WALInsertLockAcquire/Release functions */
static int  MyLockNo = 0;
static bool holdingAllLocks = false;

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {0, 0};

/*
 * The number of bytes in a WAL segment usable for WAL data
 * (i.e. page headers are not counted).
 */
static int UsableBytesInSegment;

宏定义
XLogRegisterBuffer函数使用的flags

/* flags for XLogRegisterBuffer */
#define REGBUF_FORCE_IMAGE  0x01    /* force a full-page image */
#define REGBUF_NO_IMAGE     0x02    /* don't take a full-page image */
#define REGBUF_WILL_INIT    (0x04 | 0x02)   /* page will be re-initialized at
                                             * replay (implies NO_IMAGE) */
#define REGBUF_STANDARD     0x08    /* page follows "standard" page layout,
                                     * (data between pd_lower and pd_upper
                                     * will be skipped) */
#define REGBUF_KEEP_DATA    0x10    /* include data even if a full-page image
                                      * is taken */
/*
 * Flag bits for the record being inserted, set using XLogSetRecordFlags().
 */
#define XLOG_INCLUDE_ORIGIN     0x01    /* include the replication origin */
#define XLOG_MARK_UNIMPORTANT   0x02    /* record not important for durability */


/*
 * Byte offset of 'xlogptr' within its WAL segment.  Implemented as a mask
 * with (wal_segsz_bytes - 1), which equals the remainder only when
 * wal_segsz_bytes is a power of two.
 */
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes) \
    ((xlogptr) & ((wal_segsz_bytes) - 1))
/*
 * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!
 *
 * Yields 0 (not XLOG_BLCKSZ) when 'endptr' sits exactly on a page boundary.
 */
#define INSERT_FREESPACE(endptr)    \
    (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

XLogRecData
xloginsert.c中的函数构造一个XLogRecData结构体链,用于表示最终的WAL记录

/*
 * The functions in xloginsert.c construct a chain of XLogRecData structs
 * to represent the final WAL record.
 *
 * CopyXLogRecordToWAL walks this chain through 'next' and, for each element,
 * copies 'len' bytes starting at 'data' into the WAL buffer.
 */
typedef struct XLogRecData
{
    struct XLogRecData *next;   /* next struct in chain, or NULL */
    char       *data;           /* start of rmgr data to include */
    uint32      len;            /* length of rmgr data to include */
} XLogRecData;

二、源码解读

ReserveXLogInsertLocation
在WAL(buffer)中为给定大小的记录预留合适的空间。*StartPos设置为预留部分的开头,*EndPos设置为其结尾+1。*PrevPtr设置为前一记录的开头;它用于设置该记录的xl_prev字段。

/*
 * Reserves the right amount of space for a record of given size from the WAL.
 * *StartPos is set to the beginning of the reserved section, *EndPos to
 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
 * used to set the xl_prev of this record.
 *
 * This is the performance critical part of XLogInsert that must be serialized
 * across backends. The rest can happen mostly in parallel. Try to keep this
 * section as short as possible, insertpos_lck can be heavily contended on a
 * busy system.
 *
 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
 * where we actually copy the record to the reserved space.
 */
static void
ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                          XLogRecPtr *PrevPtr)
{
    XLogCtlInsert *Insert = &XLogCtl->Insert;//shared insert-control state
    uint64      startbytepos;//start of the reservation, as a usable-byte position
    uint64      endbytepos;//end of the reservation, as a usable-byte position
    uint64      prevbytepos;//start of the previous record, as a usable-byte position

    size = MAXALIGN(size);//round the size up to the MAXALIGN boundary

    /* All (non xlog-switch) records should contain data. */
    Assert(size > SizeOfXLogRecord);

    /*
     * The duration the spinlock needs to be held is minimized by minimizing
     * the calculations that have to be done while holding the lock. The
     * current tip of reserved WAL is kept in CurrBytePos, as a byte position
     * that only counts "usable" bytes in WAL, that is, it excludes all WAL
     * page headers. The mapping between "usable" byte positions and physical
     * positions (XLogRecPtrs) can be done outside the locked region, and
     * because the usable byte position doesn't include any headers, reserving
     * X bytes from WAL is almost as simple as "CurrBytePos += X".
     */
    SpinLockAcquire(&Insert->insertpos_lck);//enter the critical section
    //our record starts at the current tip of reserved WAL
    startbytepos = Insert->CurrBytePos;
    //and ends 'size' usable bytes later
    endbytepos = startbytepos + size;
    //remember where the previous record started (for xl_prev)
    prevbytepos = Insert->PrevBytePos;
    //publish the new tip; our start becomes the next record's "previous"
    Insert->CurrBytePos = endbytepos;
    Insert->PrevBytePos = startbytepos;
    //leave the critical section
    SpinLockRelease(&Insert->insertpos_lck);
    //outside the lock: map usable-byte positions to XLogRecPtrs for the
    //out-parameters (note the end position uses the ...EndRecPtr variant)
    *StartPos = XLogBytePosToRecPtr(startbytepos);
    *EndPos = XLogBytePosToEndRecPtr(endbytepos);
    *PrevPtr = XLogBytePosToRecPtr(prevbytepos);

    /*
     * Check that the conversions between "usable byte positions" and
     * XLogRecPtrs work consistently in both directions.
     */
    Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
    Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
    Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
}
 
/*
 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
 * is the position starting from the beginning of WAL, excluding all WAL
 * page headers.
 *
 * The position is split into whole segments plus a remainder; the remainder
 * is then turned into an in-segment offset by adding back the page headers
 * that precede it (one long header on the segment's first page, one short
 * header on every later page).
 */
static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)
{
    uint64      fullsegs;
    uint64      fullpages;
    uint64      bytesleft;
    uint32      seg_offset;
    XLogRecPtr  result;

    fullsegs = bytepos / UsableBytesInSegment;
    bytesleft = bytepos % UsableBytesInSegment;

    if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
    {
        //remaining bytes < XLOG_BLCKSZ - SizeOfXLogLongPHD
        /* fits on first page of segment */
        //only the long page header precedes the data
        seg_offset = bytesleft + SizeOfXLogLongPHD;
    }
    else
    {
        //remaining bytes >= XLOG_BLCKSZ - SizeOfXLogLongPHD
        /* account for the first page on segment with long header */
        //consume the whole first page (long header + its usable bytes)
        seg_offset = XLOG_BLCKSZ;
        bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;

        fullpages = bytesleft / UsableBytesInPage;
        bytesleft = bytesleft % UsableBytesInPage;

        //skip the full pages, then the short header of the target page,
        //then the bytes that remain on that page
        seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
    }

    XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);

    return result;
}

/*
 * The number of bytes in a WAL segment usable for WAL data
 * (i.e. page headers are not counted).
 */
static int UsableBytesInSegment;

CopyXLogRecordToWAL
CopyXLogRecordToWAL是XLogInsertRecord中的子过程,用于拷贝XLOG Record到WAL中的保留区域.

/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 *
 * write_len   - total number of bytes in the rdata chain
 * isLogSwitch - true if the record is an xlog-switch record
 * rdata       - head of the XLogRecData chain to copy
 * StartPos    - start of the reserved area
 * EndPos      - end+1 of the reserved area
 *
 * PANICs if the position reached after copying does not equal EndPos.
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
                    XLogRecPtr StartPos, XLogRecPtr EndPos)
{
    char       *currpos;//current write pointer inside the WAL buffer
    int         freespace;//bytes left on the current page
    int         written;//bytes copied so far
    XLogRecPtr  CurrPos;//current WAL position
    XLogPageHeader pagehdr;//header of the page a record continues onto

    /*
     * Get a pointer to the right place in the right WAL buffer to start
     * inserting to.
     */
    CurrPos = StartPos;//begin at the start of the reserved area
    currpos = GetXLogBuffer(CurrPos);//buffer address for that position
    freespace = INSERT_FREESPACE(CurrPos);//space left on this page

    /*
     * there should be enough space for at least the first field (xl_tot_len)
     * on this page.
     */
    Assert(freespace >= sizeof(uint32));

    /* Copy record data */
    written = 0;
    while (rdata != NULL)//walk the XLogRecData chain
    {
        char       *rdata_data = rdata->data;//source pointer for this chunk
        int         rdata_len = rdata->len;//bytes of this chunk still to copy

        while (rdata_len > freespace)//chunk does not fit on the current page
        {
            /*
             * Write what fits on this page, and continue on the next page.
             */
            //CurrPos must already be past the page header, unless the page is full
            Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
            //fill the rest of the page
            memcpy(currpos, rdata_data, freespace);
            //advance the source pointer
            rdata_data += freespace;
            //less of this chunk remains
            rdata_len -= freespace;
            //account for the bytes just copied
            written += freespace;
            //CurrPos now sits on the next page boundary
            CurrPos += freespace;

            /*
             * Get pointer to beginning of next page, and set the xlp_rem_len
             * in the page header. Set XLP_FIRST_IS_CONTRECORD.
             *
             * It's safe to set the contrecord flag and xlp_rem_len without a
             * lock on the page. All the other flags were already set when the
             * page was initialized, in AdvanceXLInsertBuffer, and we're the
             * only backend that needs to set the contrecord flag.
             */
            currpos = GetXLogBuffer(CurrPos);//start of the next page
            pagehdr = (XLogPageHeader) currpos;//its page header
            pagehdr->xlp_rem_len = write_len - written;//record bytes still to come
            pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;//page starts with a continuation

            /* skip over the page header */
            if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)//first page of a segment
            {
                CurrPos += SizeOfXLogLongPHD;//long header
                currpos += SizeOfXLogLongPHD;
            }
            else
            {
                CurrPos += SizeOfXLogShortPHD;//any other page: short header
                currpos += SizeOfXLogShortPHD;
            }
            freespace = INSERT_FREESPACE(CurrPos);//space left on the new page
        }
        //same invariant as above: past the page header, or nothing left to copy
        Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
        //the remainder fits (rdata_len <= freespace here)
        memcpy(currpos, rdata_data, rdata_len);
        currpos += rdata_len;//advance the buffer pointer
        CurrPos += rdata_len;//advance the WAL position
        freespace -= rdata_len;//less room on this page
        written += rdata_len;//account for the bytes just copied

        rdata = rdata->next;//next chunk in the chain
    }
    Assert(written == write_len);//everything in the chain was copied

    /*
     * If this was an xlog-switch, it's not enough to write the switch record,
     * we also have to consume all the remaining space in the WAL segment.  We
     * have already reserved that space, but we need to actually fill it.
     */
    if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
    {
        /* An xlog-switch record doesn't contain any data besides the header */
        Assert(write_len == SizeOfXLogRecord);

        /* Assert that we did reserve the right amount of space */
        Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);

        /* Use up all the remaining space on the current page */
        CurrPos += freespace;

        /*
         * Cause all remaining pages in the segment to be flushed, leaving the
         * XLog position where it should be, at the start of the next segment.
         * We do this one page at a time, to make sure we don't deadlock
         * against ourselves if wal_buffers < wal_segment_size.
         */
        while (CurrPos < EndPos)//one iteration per remaining page
        {
            /*
             * The minimal action to flush the page would be to call
             * WALInsertLockUpdateInsertingAt(CurrPos) followed by
             * AdvanceXLInsertBuffer(...).  The page would be left initialized
             * mostly to zeros, except for the page header (always the short
             * variant, as this is never a segment's first page).
             *
             * The large vistas of zeros are good for compressibility, but the
             * headers interrupting them every XLOG_BLCKSZ (with values that
             * differ from page to page) are not.  The effect varies with
             * compression tool, but bzip2 for instance compresses about an
             * order of magnitude worse if those headers are left in place.
             *
             * Rather than complicating AdvanceXLInsertBuffer itself (which is
             * called in heavily-loaded circumstances as well as this lightly-
             * loaded one) with variant behavior, we just use GetXLogBuffer
             * (which itself calls the two methods we need) to get the pointer
             * and zero most of the page.  Then we just zero the page header.
             */
            currpos = GetXLogBuffer(CurrPos);//start of this page in the buffer
            MemSet(currpos, 0, SizeOfXLogShortPHD);//zero the short page header

            CurrPos += XLOG_BLCKSZ;//move on to the next page
        }
    }
    else
    {
        /* Align the end position, so that the next record starts aligned */
        CurrPos = MAXALIGN64(CurrPos);
    }

    if (CurrPos != EndPos)//what was written must match what was reserved
        elog(PANIC, "space reserved for WAL record does not match what was written");
}

三、跟踪分析

测试脚本如下:

drop table t_wal_longtext;
create table t_wal_longtext(c1 int not null,c2  varchar(3000),c3 varchar(3000),c4 varchar(3000));
insert into t_wal_longtext(c1,c2,c3,c4) 
select i,rpad('C2-'||i,3000,'2'),rpad('C3-'||i,3000,'3'),rpad('C4-'||i,3000,'4') 
from generate_series(1,7) as i;

ReserveXLogInsertLocation
插入数据:

insert into t_wal_longtext(c1,c2,c3,c4) VALUES(8,'C2-8','C3-8','C4-8');

设置断点,进入ReserveXLogInsertLocation

(gdb) b ReserveXLogInsertLocation
Breakpoint 1 at 0x54d574: file xlog.c, line 1244.
(gdb) c
Continuing.

Breakpoint 1, ReserveXLogInsertLocation (size=74, StartPos=0x7ffebea9d768, EndPos=0x7ffebea9d760, PrevPtr=0x244f4c8)
    at xlog.c:1244
1244        XLogCtlInsert *Insert = &XLogCtl->Insert;
(gdb) 

输入参数:
size=74, 这是待插入XLOG Record的大小,其他三个为待设置的值.
继续执行.
对齐,74->80(要求为8的整数倍,uint64占用8bytes,因此要求8的倍数)

(gdb) n
1249        size = MAXALIGN(size);
(gdb) 
1252        Assert(size > SizeOfXLogRecord);
(gdb) p size
$1 = 80
(gdb) 

查看插入控制器的信息,其中:
CurrBytePos = 5498377520,十六进制为0x147BA9530
PrevBytePos = 5498377464,十六进制为0x147BA94F8
RedoRecPtr = 5514382312,十六进制为0x148AECBE8 --> 对应pg_control中的Latest checkpoint's REDO location

(gdb) n
1264        SpinLockAcquire(&Insert->insertpos_lck);
(gdb) 
1266        startbytepos = Insert->CurrBytePos;
(gdb) p *Insert
$2 = {insertpos_lck = 1 '\001', CurrBytePos = 5498377520, PrevBytePos = 5498377464, pad = '\000' <repeats 127 times>, 
  RedoRecPtr = 5514382312, forcePageWrites = false, fullPageWrites = true, exclusiveBackupState = EXCLUSIVE_BACKUP_NONE, 
  nonExclusiveBackups = 0, lastBackupStart = 0, WALInsertLocks = 0x7f97d1eeb100}
(gdb) 

设置相应的值.
值得注意的是插入控制器Insert中的位置信息是不包括page header等信息,是纯粹可用的日志数据,因此数值要比WAL segment file的数值小.

(gdb) n
1267        endbytepos = startbytepos + size;
(gdb) 
1268        prevbytepos = Insert->PrevBytePos;
(gdb) 
1269        Insert->CurrBytePos = endbytepos;
(gdb) 
1270        Insert->PrevBytePos = startbytepos;
(gdb) 
1272        SpinLockRelease(&Insert->insertpos_lck);
(gdb) 

如前所述,需要将“可用字节位置”转换为XLogRecPtr。
计算实际的开始/结束/上一位置.
StartPos = 5514538672,0x148B12EB0
EndPos = 5514538752,0x148B12F00
PrevPtr = 5514538616,0x148B12E78

(gdb) n
1274        *StartPos = XLogBytePosToRecPtr(startbytepos);
(gdb) 
1275        *EndPos = XLogBytePosToEndRecPtr(endbytepos);
(gdb) 
1276        *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
(gdb) 
1282        Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
(gdb) p *StartPos
$4 = 5514538672
(gdb) p *EndPos
$5 = 5514538752
(gdb) p *PrevPtr
$6 = 5514538616
(gdb) 

验证相互转换是没有问题的.

(gdb) n
1283        Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
(gdb) 
1284        Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
(gdb) 
1285    }
(gdb) 
XLogInsertRecord (rdata=0xf9cc70 <hdr_rdt>, fpw_lsn=5514538520, flags=1 '\001') at xlog.c:1072
1072            inserted = true;
(gdb) 

DONE!

CopyXLogRecordToWAL-场景1:不跨WAL page
测试脚本如下:

insert into t_wal_longtext(c1,c2,c3,c4) VALUES(8,'C2-8','C3-8','C4-8');

继续上一条SQL的跟踪.
设置断点,进入CopyXLogRecordToWAL

(gdb) b CopyXLogRecordToWAL
Breakpoint 3 at 0x54dcdf: file xlog.c, line 1479.
(gdb) c
Continuing.

Breakpoint 3, CopyXLogRecordToWAL (write_len=74, isLogSwitch=false, rdata=0xf9cc70 <hdr_rdt>, StartPos=5514538672, 
    EndPos=5514538752) at xlog.c:1479
1479        CurrPos = StartPos;
(gdb) 

输入参数:
write_len=74, --> 待写入大小
isLogSwitch=false, --> 是否日志切换(不需要)
rdata=0xf9cc70 <hdr_rdt>, --> 需写入的数据地址
StartPos=5514538672, --> 开始位置
EndPos=5514538752 --> 结束位置

(gdb) n
1480        currpos = GetXLogBuffer(CurrPos);
(gdb) 

在合适的WAL buffer中获取指针用于确定插入的位置.
进入函数GetXLogBuffer,输入参数ptr为5514538672,即开始位置.

(gdb) step
GetXLogBuffer (ptr=5514538672) at xlog.c:1854
1854        if (ptr / XLOG_BLCKSZ == cachedPage)
(gdb) p ptr / 8192 --> 整除(得到页号)
$7 = 673161
(gdb) 
(gdb) p cachedPage
$8 = 673161
(gdb) 

GetXLogBuffer->ptr / XLOG_BLCKSZ == cachedPage,进入相应的处理逻辑
注意:cachedPage是静态变量,具体在哪个地方赋值,后续需再行分析

(gdb) n
1856            Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
(gdb) 
1857            Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
(gdb) 
1858            return cachedPos + ptr % XLOG_BLCKSZ;

GetXLogBuffer->cachedPos开头是XLogPageHeader结构体

(gdb) p *((XLogPageHeader) cachedPos)
$14 = {xlp_magic = 53400, xlp_info = 5, xlp_tli = 1, xlp_pageaddr = 5514534912, xlp_rem_len = 71}
(gdb) 
(gdb) x/24bx (0x7f97d29fe000)
0x7f97d29fe000: 0x98    0xd0    0x05    0x00    0x01    0x00    0x00    0x00
0x7f97d29fe008: 0x00    0x20    0xb1    0x48    0x01    0x00    0x00    0x00
0x7f97d29fe010: 0x47    0x00    0x00    0x00    0x00    0x00    0x00    0x00

回到CopyXLogRecordToWAL,buffer的地址为0x7f97d29feeb0

(gdb) n
1945    }
(gdb) 
CopyXLogRecordToWAL (write_len=74, isLogSwitch=false, rdata=0xf9cc70 <hdr_rdt>, StartPos=5514538672, EndPos=5514538752)
    at xlog.c:1481
1481        freespace = INSERT_FREESPACE(CurrPos);
(gdb) 
(gdb) p currpos
$16 = 0x7f97d29feeb0 ""
(gdb) 

计算空闲空间,确保在该页上最起码有第一个字段(xl_tot_len)的存储空间(4字节).

(gdb) n
1487        Assert(freespace >= sizeof(uint32));
(gdb) p freespace
$21 = 4432
(gdb) 

开始拷贝记录数据.

(gdb) n
1490        written = 0; --> 记录已写入的大小
(gdb) 
1491        while (rdata != NULL)

rdata的分析详见第四部分,继续执行

(gdb) n
1493            char       *rdata_data = rdata->data;
(gdb) 
1494            int         rdata_len = rdata->len;
(gdb) 
1496            while (rdata_len > freespace)
(gdb) p rdata_len
$34 = 46
(gdb) p freespace
$35 = 4432
(gdb) 

rdata_len < freespace,无需进入子循环.
再次进行验证没有问题,执行内存拷贝.

(gdb) n
1536            Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
(gdb) 
1537            memcpy(currpos, rdata_data, rdata_len);
(gdb) 
1538            currpos += rdata_len;
(gdb) 
1539            CurrPos += rdata_len;
(gdb) 
1540            freespace -= rdata_len;
(gdb) 
1541            written += rdata_len;
(gdb) 
1543            rdata = rdata->next;
(gdb) 
1491        while (rdata != NULL)
(gdb) p currpos
$36 = 0x7f97d29feede ""
(gdb) p CurrPos
$37 = 5514538718
(gdb) p freespace
$38 = 4386
(gdb) p written
$39 = 46
(gdb) 

rdata共有四部分,继续写入第二/三/四部分.

...
1491        while (rdata != NULL)
(gdb) 
1493            char       *rdata_data = rdata->data;
(gdb) 
1494            int         rdata_len = rdata->len;
(gdb) 
1496            while (rdata_len > freespace)
(gdb) 
1536            Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
(gdb) 
1537            memcpy(currpos, rdata_data, rdata_len);
(gdb) 
1538            currpos += rdata_len;
(gdb) 
1539            CurrPos += rdata_len;
(gdb) 
1540            freespace -= rdata_len;
(gdb) 
1541            written += rdata_len;
(gdb) 
1543            rdata = rdata->next;
(gdb) 
1491        while (rdata != NULL)
(gdb) 

完成写入74bytes

(gdb) 
1545        Assert(written == write_len);
(gdb) p written
$40 = 74
(gdb) 

无需执行日志切换的相关操作.
对齐CurrPos

(gdb) n
1552        if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
(gdb) 
1599            CurrPos = MAXALIGN64(CurrPos);
(gdb) p CurrPos
$41 = 5514538746
(gdb) n
1602        if (CurrPos != EndPos)
(gdb) p CurrPos
$42 = 5514538752
(gdb) 
(gdb) p 5514538746 % 8
$44 = 2 --> 需补6个字节,5514538746 --> 5514538752

对齐后,CurrPos == EndPos,否则报错!

(gdb) p EndPos
$45 = 5514538752

结束调用

(gdb) n
1604    }
(gdb) 
XLogInsertRecord (rdata=0xf9cc70 <hdr_rdt>, fpw_lsn=5514538520, flags=1 '\001') at xlog.c:1098
1098            if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
(gdb) 

DONE!

CopyXLogRecordToWAL-场景2:跨WAL page 后续再行分析

四、再论WAL Record

在内存中,WAL Record通过rdata存储,该变量其实是全局静态变量hdr_rdt,类型为XLogRecData,XLOG Record通过XLogRecData链表组织起来(这个设计很赞,写入无需理会结构,按链表逐个写数据即可).
rdata由4部分组成:
第一部分是XLogRecord(24字节) + XLogRecordBlockHeader(4字节) + RelFileNode(12字节) + BlockNumber(4字节) + XLogRecordDataHeaderShort(2字节),共46字节
第二部分是xl_heap_header,5个字节
第三部分是tuple data,20个字节
第四部分是xl_heap_insert,3个字节

------------------------------------------------------------------- 1
(gdb) p *rdata 
$22 = {next = 0x244f2c0, data = 0x244f4c0 "J", len = 46} 
(gdb) p *(XLogRecord *)rdata->data --> XLogRecord
$27 = {xl_tot_len = 74, xl_xid = 2268, xl_prev = 5514538616, xl_info = 0 '\000', xl_rmid = 10 '\n', xl_crc = 1158677949}
(gdb) p *(XLogRecordBlockHeader *)(0x244f4c0+24) --> XLogRecordBlockHeader
$29 = {id = 0 '\000', fork_flags = 32 ' ', data_length = 25}
(gdb) x/2bx (0x244f4c0+44) --> XLogRecordDataHeaderShort
0x244f4ec:  0xff    0x03
------------------------------------------------------------------- 2 
(gdb) p *rdata->next
$23 = {next = 0x244f2d8, data = 0x7ffebea9d830 "\004", len = 5}
(gdb) p *(xl_heap_header *)rdata->next->data
$32 = {t_infomask2 = 4, t_infomask = 2050, t_hoff = 24 '\030'}
------------------------------------------------------------------- 3
(gdb) p *rdata->next->next
$24 = {next = 0x244f2a8, data = 0x24e6a2f "", len = 20}
(gdb) x/20bc  0x24e6a2f
0x24e6a2f:  0 '\000'    8 '\b'  0 '\000'    0 '\000'    0 '\000'    11 '\v' 67 'C'  50 '2'
0x24e6a37:  45 '-'  56 '8'  11 '\v' 67 'C'  51 '3'  45 '-'  56 '8'  11 '\v'
0x24e6a3f:  67 'C'  52 '4'  45 '-'  56 '8'
(gdb) 
------------------------------------------------------------------- 4
(gdb) p *rdata->next->next->next
$25 = {next = 0x0, data = 0x7ffebea9d840 "\b", len = 3}
(gdb) 
(gdb) p *(xl_heap_insert *)rdata->next->next->next->data
$33 = {offnum = 8, flags = 0 '\000'}

五、参考资料

PostgreSQL 源码解读(4)- 插入数据#3(heap_insert)
PostgreSQL 事务日志WAL结构浅析
PostgreSQL 源码解读(110)- WAL#6(Insert&WAL - XLogRecordAssemble记录组装函数)
PostgreSQL 源码解读(111)- WAL#7(Insert&WAL - XLogRecordAssemble-FPW)
PostgreSQL 源码解读(112)- WAL#8(XLogCtrl数据结构)
PG Source Code

上一篇下一篇

猜你喜欢

热点阅读