redis ziplist

2020-05-04 本文已影响0人 stanley_shen

ziplist在redis中的使用

redis数据结构hash, zset, list都会使用到ziplist。

hash的enties<hash_max_ziplist_entries配置时，使用ziplist。entries >= hash_max_ziplist_enties时，会将ziplist的数据转存到hashtable, 同时新的数据设置存储编码格式为HT，反正更换会ziplist。hash中ziplist的存储是把field和value拆成了一对entry相邻存储。例如<entry.field1><entry.value1><entry.filed2><entry.value2>
list使用quicklist底层存储使用了ziplist。
zset在enties > zset_max_ziplist_entries时，会将数据转存为skiplist + hashtable的存储结构。 Enties <= zset_max_ziplist_entries时，又会将数据转存为ziplist来存储。zset在使用ziplist时，是将value与score组成一对，例如<entry.value1><entry.score1><entry.value2><entry.score2>

存储数据结构

      <zlbytes> <zltail> <zllen> < entry> <entry>... <entry> <zlend>

all fields are stored in little endian, if not specified otherwise.
zlbytes : ziplist占用的字节数
zltail : ziplist尾部entry所在的字节位置
zllen : ziplist entry 数量
zlend : 固定ff

entry存储结构

      <prevlen> <encoding> <entry-data>

prevlen : 之前一个entry占多少个字节. 字节数<254时占1个字节(无符号int)，字节数>=254时占5个字节，第一个字节固定为FE，其他4字节表示长度
encoding : 编码格式
entry-data : 真实数据

entry编码格式

字符串

长度位

bit位00开头，占1byte 表示<63(2^7 - 1)byte长度的字符串(big endian)
bit为01开头，占2byte 表示<16383(2^14 - 1)byte长度的字符串(big endian)
bit位10开头，占5byte表示>=16383byte长度的字符串(big endian)

有符号int

1byte       2byte       3byte       4byte       5byte       6byte       7byte       8byte       9byte
11000000   xxxxxxxx    xxxxxxxx
11010000   xxxxxxxx    xxxxxxxx    xxxxxxxx    xxxxxxxx
11100000   xxxxxxxx    xxxxxxxx    xxxxxxxx    xxxxxxxx    xxxxxxxx    xxxxxxxx    xxxxxxxx    xxxxxxxx
11110000   xxxxxxxx    xxxxxxxx    xxxxxxxx
11111110   xxxxxxxx
x表示0/1

举例

ziplist存入字符串2, 5
    zlbytes      zltail     zllen  entry   entry   zlend  
[0f 00 00 00] [0c 00 00 00] [02 00] [00 f3] [02 f6] [ff]
zlbytes = 15
zltail  = 13
zllen = 2
entry.1 :  00f3,  00表示前一个entry不存在，f3为11110011表示2
entry.2 :  02f6,  02表示前一个entry长度为2字节, f6 为11110110表示5
在2，5后面再加一个Hello world
对应的entry
prevlen   encoding                 entry-data
  [02]          [0b]          [48 65 6c 6c 6f 20 57 6f 72 6c 64]
prevlen表示前一个entry(entry.2)占用2字节
encoding表示11000000(big endian)表示0b
entry-data为Hello world对应的ascii码

运行时数据结构

typedef struct zlentry {
    unsigned int prevrawlensize; /* 前驱节点的长度prevrawlen所需要的字节大小 */
    unsigned int prevrawlen;     /* 前驱节点的长度 */
    unsigned int lensize;        /* 编码当前节点长度len所需的字节数 */
    unsigned int len;            /* 当前节点值长度 */
    unsigned int headersize;     /* 当前节点header的大小 = lensize + prevrawlensize */
    unsigned char encoding;      /* 当前节点的编码格式 */
    unsigned char *p;            /* 指向当前节点的指针 */
} zlentry;

级联更新

    当插入或者删除某个entry造成了其后的多个entry的prev_entry_length域由1个字节扩展成5个字节且其这些entry的长度本来是介于249-253之间，那么新插入或者删除的entry之后的多个连续entry每个都需要扩展prev_entry_length以存储前一个entry实际占用的字节数

/**
 * p为发生变更的点
 */
unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p) {
    size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), rawlen, rawlensize;
    size_t offset, noffset, extra;
    unsigned char *np;
    zlentry cur, next;

    while (p[0] != ZIP_END) {
        //zipEntry会解压存储数据结构生成运行时数据结构zlentry
        //获取当前的zlentry
        zipEntry(p, &cur);
        rawlen = cur.headersize + cur.len;
        rawlensize = zipStorePrevEntryLength(NULL,rawlen);

        /* 没有后继，结束级联更新*/
        if (p[rawlen] == ZIP_END) break;
        //获取当前下一个zlentry
        zipEntry(p+rawlen, &next);

        /* 后继zlentry的prevrawlen == rawlen，结束级联更新*/
        if (next.prevrawlen == rawlen) break;
        /* 当前节点长度所占字节数 > 后继zlentry中前驱节点长度所占字节数 */
        /* 更新这些节点的字节数
        if (next.prevrawlensize < rawlensize) {
            /* The "prevlen" field of "next" needs more bytes to hold
             * the raw length of "cur". */
            offset = p-zl;
            extra = rawlensize-next.prevrawlensize;
            zl = ziplistResize(zl,curlen+extra);
            p = zl+offset;

            /* Current pointer and offset for next element. */
            np = p+rawlen;
            noffset = np-zl;

            /* Update tail offset when next element is not the tail element. */
            if ((zl+intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))) != np) {
                ZIPLIST_TAIL_OFFSET(zl) =
                    intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra);
            }

            /* 将后继及之后的数据copy到新的位置np + rawlensize */
            memmove(np+rawlensize,
                np+next.prevrawlensize,
                curlen-noffset-next.prevrawlensize-1);
            zipStorePrevEntryLength(np,rawlen);

            /* 移动p的指针*/
            p += rawlen;
            curlen += extra;
        } else {
            if (next.prevrawlensize > rawlensize) {
                /* This would result in shrinking, which we want to avoid.
                 * So, set "rawlen" in the available bytes. */
                zipStorePrevEntryLengthLarge(p+rawlen,rawlen);
            } else {
                zipStorePrevEntryLength(p+rawlen,rawlen);
            }

            /* Stop here, as the raw length of "next" has not changed. */
            break;
        }
    }
    return zl;
}

插入操作

获取插入位置的前一个entry的长度
计算要插入的entry占用的字节数；
计算prev_entry_length的差值；
为ziplist重新分配内存空间；
如果不是在末尾插入，移动ziplist在p之后的字节为p的插入腾出足够的空间，并更新zltail的值；
如果prev_entry_length域的差值不为0，进行必要的连锁更新；
将新插入的entry三部分的值拷贝到对应的内存空间；

/* 在p位置插入 */
unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
    size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen;
    unsigned int prevlensize, prevlen = 0;
    size_t offset;
    int nextdiff = 0;
    unsigned char encoding = 0;
    long long value = 123456789; /* initialized to avoid warning. Using a value
                                    that is easy to see if for some reason
                                    we use it uninitialized. */
    zlentry tail;

    /* Find out prevlen for the entry that is inserted. */
    if (p[0] != ZIP_END) {
        ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
    } else {
        unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
        if (ptail[0] != ZIP_END) {
            prevlen = zipRawEntryLength(ptail);
        }
    }

    /* See if the entry can be encoded */
    if (zipTryEncoding(s,slen,&value,&encoding)) {
        /* 'encoding' is set to the appropriate integer encoding */
        reqlen = zipIntSize(encoding);
    } else {
        /* 'encoding' is untouched, however zipStoreEntryEncoding will use the
         * string length to figure out how to encode it. */
        reqlen = slen;
    }
    /* We need space for both the length of the previous entry and
     * the length of the payload. */
    reqlen += zipStorePrevEntryLength(NULL,prevlen);
    reqlen += zipStoreEntryEncoding(NULL,encoding,slen);

    /* When the insert position is not equal to the tail, we need to
     * make sure that the next entry can hold this entry's length in
     * its prevlen field. */
    int forcelarge = 0;
    nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;
    if (nextdiff == -4 && reqlen < 4) {
        nextdiff = 0;
        forcelarge = 1;
    }

    /* Store offset because a realloc may change the address of zl. */
    offset = p-zl;
    zl = ziplistResize(zl,curlen+reqlen+nextdiff);
    p = zl+offset;

    /* Apply memory move when necessary and update tail offset. */
    if (p[0] != ZIP_END) {
        /* Subtract one because of the ZIP_END bytes */
        memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);

        /* Encode this entry's raw length in the next entry. */
        if (forcelarge)
            zipStorePrevEntryLengthLarge(p+reqlen,reqlen);
        else
            zipStorePrevEntryLength(p+reqlen,reqlen);

        /* Update offset for tail */
        ZIPLIST_TAIL_OFFSET(zl) =
            intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);

        /* When the tail contains more than one entry, we need to take
         * "nextdiff" in account as well. Otherwise, a change in the
         * size of prevlen doesn't have an effect on the *tail* offset. */
        zipEntry(p+reqlen, &tail);
        if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
            ZIPLIST_TAIL_OFFSET(zl) =
                intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
        }
    } else {
        /* This element will be the new tail. */
        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
    }

    /* When nextdiff != 0, the raw length of the next entry has changed, so
     * we need to cascade the update throughout the ziplist */
    if (nextdiff != 0) {
        offset = p-zl;
        zl = __ziplistCascadeUpdate(zl,p+reqlen);
        p = zl+offset;
    }

    /* Write the entry */
    p += zipStorePrevEntryLength(p,prevlen);
    p += zipStoreEntryEncoding(p,encoding,slen);
    if (ZIP_IS_STR(encoding)) {
        memcpy(p,s,slen);
    } else {
        zipSaveInteger(p,value,encoding);
    }
    ZIPLIST_INCR_LENGTH(zl,1);
    return zl;
}

删除操作

获取要要删除的起始位置的entry(为first),然后根据要删除的元素的数目以及ziplist中的zlend的限制,确定要删除的位置的结束位置p,计算出总的要删除的字节数totlen=p-first.p
如果totlen>0，进行删除的操作，否则不执行任何的操作;
如果结束的位置p是zlend的位置，只需更新ziplist的zltail
如果结束的位置p不是zlend的位置，计算nextdiff = first.prevrawlen - p.prev_entry_len, 根据nextdiff计算新后继的prev_entry_len和新的zltail
计算新的zlbytes，重新分配内存空间;
更新ziplist中的zllen
如果nextdiff != 0，从删除结束位置p开始进行级联更新

unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsigned int num) {
    unsigned int i, totlen, deleted = 0;
    size_t offset;
    int nextdiff = 0;
    zlentry first, tail;

    zipEntry(p, &first);
    for (i = 0; p[0] != ZIP_END && i < num; i++) {
        p += zipRawEntryLength(p);
        deleted++;
    }

    totlen = p-first.p; /* Bytes taken by the element(s) to delete. */
    if (totlen > 0) {
        if (p[0] != ZIP_END) {
            /* Storing `prevrawlen` in this entry may increase or decrease the
             * number of bytes required compare to the current `prevrawlen`.
             * There always is room to store this, because it was previously
             * stored by an entry that is now being deleted. */
            nextdiff = zipPrevLenByteDiff(p,first.prevrawlen);

            /* Note that there is always space when p jumps backward: if
             * the new previous entry is large, one of the deleted elements
             * had a 5 bytes prevlen header, so there is for sure at least
             * 5 bytes free and we need just 4. */
            p -= nextdiff;
            zipStorePrevEntryLength(p,first.prevrawlen);

            /* Update offset for tail */
            ZIPLIST_TAIL_OFFSET(zl) =
                intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))-totlen);

            /* When the tail contains more than one entry, we need to take
             * "nextdiff" in account as well. Otherwise, a change in the
             * size of prevlen doesn't have an effect on the *tail* offset. */
            zipEntry(p, &tail);
            if (p[tail.headersize+tail.len] != ZIP_END) {
                ZIPLIST_TAIL_OFFSET(zl) =
                   intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
            }

            /* Move tail to the front of the ziplist */
            memmove(first.p,p,
                intrev32ifbe(ZIPLIST_BYTES(zl))-(p-zl)-1);
        } else {
            /* The entire tail was deleted. No need to move memory. */
            ZIPLIST_TAIL_OFFSET(zl) =
                intrev32ifbe((first.p-zl)-first.prevrawlen);
        }

        /* Resize and update length */
        offset = first.p-zl;
        zl = ziplistResize(zl, intrev32ifbe(ZIPLIST_BYTES(zl))-totlen+nextdiff);
        ZIPLIST_INCR_LENGTH(zl,-deleted);
        p = zl+offset;

        /* When nextdiff != 0, the raw length of the next entry has changed, so
         * we need to cascade the update throughout the ziplist */
        if (nextdiff != 0)
            zl = __ziplistCascadeUpdate(zl,p);
    }
    return zl;
}

redis ziplist

ziplist在redis中的使用

存储数据结构

entry存储结构

entry编码格式

字符串

有符号int

运行时数据结构

级联更新

插入操作

删除操作

猜你喜欢

热点阅读