100行PHP解析LevelDB SSTable文件

打开LevelDB的SSTable文件,解析并打印了里面的key-value数据,需要了解SSTable格式的可以看看。

大致解析步骤:

  1. 先读取文件尾部的 Footer 信息,Footer 固定为 48 字节,从 Footer 可以知道 Index Block 在文件中的偏移和大小
  2. 解析 Index Block,从 Index Block 可以知道每个 Data Block 在文件中的位置。Index Block 的每一项是个 key-value 结构,value 存的就是对应 Data Block 的位置和大小,而 key 是比对应 Data Block 中最大的 key 还大一丁点的 key,比如 Data Block 中最大 key 为 hello,那 Index Block 中的 key 就是 helloa
  3. 解析每个 Data Block,得到每个 key-value 对

// Load the whole SSTable into memory; every decode* helper and readBlock() read
// from this global buffer by absolute file offset.
$data = file_get_contents('testdb/000009.ldb');
if ($data === false) {
    throw new RuntimeException('cannot read SSTable file: testdb/000009.ldb');
}
$size = strlen($data);
// footer = meta block handle + index block handle + padding + 8-byte magic number
// footer is fixed 48 byte and always sits at the very end of the file
if ($size < 48) {
    throw new RuntimeException('file is too small to contain the 48-byte SSTable footer');
}
// 0xdb4775248b80fb57 stored little-endian; anything else is not a LevelDB table file
if (substr($data, -8) !== "\x57\xfb\x80\x8b\x24\x75\x47\xdb") {
    throw new RuntimeException('bad magic number: not a LevelDB SSTable file');
}
$offset = $size - 48;

// the meta block handle is decoded only to advance $offset to the index block
// handle; the meta block itself is not parsed by this script
list($metaBlockOffset,$metaBlockSize,$offset) = decodeBlockHandle($offset);
list($idxBlockOffset,$idxBlockSize,$offset) = decodeBlockHandle($offset);

// runs for every key-value entry of a data block: prints "key = value".
// $data is captured by value (copy-on-write, so no real copy is made).
$printDataBlockItem = function ($key, $seq, $kType, $valBegin, $valLen) use ($data) {
    $value = substr($data, $valBegin, $valLen);
    echo $key . ' = ' . $value . PHP_EOL;
};

// runs for every entry of the index block: the entry's value is the block handle
// (offset + size) of one data block, which is then parsed in turn
$handleIndexBlockItem = function($key, $seq, $kType, $valBegin, $valLen) use ($printDataBlockItem) {
    list($datablockOffset,$datablockSize,) = decodeBlockHandle($valBegin);
    readBlock($datablockOffset, $datablockSize, $printDataBlockItem);
};
readBlock($idxBlockOffset, $idxBlockSize, $handleIndexBlockItem);

/**
 * Parse one block (index block or data block) of the SSTable held in the global
 * $data buffer and invoke a callback for every entry found in it.
 *
 * Block contents: entries | restart offsets (4 bytes each) | restart count (4 bytes).
 * The contents are followed by a 5-byte trailer (1-byte compression type +
 * 4-byte CRC32C) that is NOT included in $size.
 *
 * @param int      $offset       file offset of the first byte of the block contents
 * @param int      $size         length of the block contents, trailer excluded
 * @param callable $itemCallback function($key, $seq, $kType, $valBegin, $valLen);
 *                               $key is the user key, $valBegin/$valLen locate the
 *                               value inside the global $data buffer
 */
function readBlock($offset, $size, $itemCallback) {
    global $data;
    // the trailer comes right after the block contents, not at the start of the block
    $blockType = ord($data[$offset+$size]);
    $blockChecksum = decodeInt32($offset+$size+1);
    // the last 4 bytes of the contents hold the number of restart points
    $restartNum = decodeInt32($offset+$size-4);
    printf("Block: offset=%s, size=%d, type=%d, num_of_restarts=%d, crc32c=%d\n", 
        dechex($offset), $size, $blockType, $restartNum, $blockChecksum);
    if ($blockType !== 0) {
        // type 0 = no compression; a compressed block cannot be parsed from its raw bytes
        echo 'Block is compressed (type=' . $blockType . '), skip parsing' . PHP_EOL;
        return;
    }

    // entries stop where the restart-offset array begins
    $entriesEnd = $offset + $size - 4 - $restartNum * 4;
    $cur = $offset;
    $prevInternalKey = '';
    // "<" instead of "!=" so that a malformed block cannot make the loop run forever
    while ($cur < $entriesEnd) {
        // format: shared key length | unshared key length | value length | unshared key | value
        list($shareLen, $len) = decodeVarInt($cur);
        $cur += $len;
        list($unshareLen, $len) = decodeVarInt($cur);
        $cur += $len;
        list($valLen, $len) = decodeVarInt($cur);
        $cur += $len;

        // keys are prefix-compressed against the PREVIOUS entry's full internal key
        // (at a restart point $shareLen is 0, so the key is stored in full)
        $internalKey = substr($prevInternalKey, 0, $shareLen) . substr($data, $cur, $unshareLen);
        $prevInternalKey = $internalKey;

        // key format: user's binary key | 8-byte little-endian tag = (sequence number << 8) | type code
        $userKeyLen = max(0, strlen($internalKey) - 8);
        $key = (string) substr($internalKey, 0, $userKeyLen);
        $tag = (string) substr($internalKey, $userKeyLen);
        // decode the tag from the rebuilt key: part of it may live in the shared prefix
        $int = 0;
        for ($i = 0, $n = strlen($tag); $i < $n; $i++) {
            $int |= ord($tag[$i]) << (8 * $i);
        }
        // mask off sign extension in case the top tag byte has its high bit set
        $seq = ($int >> 8) & 0x00FFFFFFFFFFFFFF;
        $kType = $int & 0xff; // enum ValueType { kTypeDeletion = 0x0, kTypeValue = 0x1 };

        $cur += $unshareLen;
        $valBegin = $cur;
        if (is_callable($itemCallback)) {
            $itemCallback($key, $seq, $kType, $valBegin, $valLen);
        }
        $cur += $valLen;
    }
}

/**
 * Decode a block handle: two consecutive varints (block offset, then block size)
 * read via decodeVarInt() starting at $offset.
 *
 * @param int $offset position of the handle's first byte
 * @return array [block offset, block size, position of the first byte after the handle]
 */
function decodeBlockHandle($offset) {
    $cursor = $offset;

    list($blockOffset, $consumed) = decodeVarInt($cursor);
    $cursor += $consumed;

    list($blockSize, $consumed) = decodeVarInt($cursor);
    $cursor += $consumed;

    return [$blockOffset, $blockSize, $cursor];
}

/**
 * Decode a variable-length integer from the global $data buffer.
 *
 * Each byte contributes its low 7 bits, least-significant group first; a set
 * high bit (0x80) means another byte follows.
 *
 * @param int $offset position of the varint's first byte in $data
 * @return array [decoded value, number of bytes consumed]
 */
function decodeVarInt($offset) {
    global $data;
    $value = 0;
    $consumed = 0;
    do {
        $current = ord($data[$offset + $consumed]);
        $value |= ($current & 0x7f) << (7 * $consumed);
        $consumed++;
        // keep going while the continuation bit is set
    } while (($current & 0x80) !== 0);
    return [$value, $consumed];
}

/**
 * Read a 32-bit fixed-width integer starting at $offset (delegates to decodeFixedInt).
 */
function decodeInt32($offset) {
    $bitWidth = 32;
    return decodeFixedInt($offset, $bitWidth);
}
/**
 * Read a 64-bit fixed-width integer starting at $offset (delegates to decodeFixedInt).
 */
function decodeInt64($offset) {
    $bitWidth = 64;
    return decodeFixedInt($offset, $bitWidth);
}
/**
 * Decode a little-endian fixed-width integer from the global $data buffer.
 *
 * @param int $offset position of the least-significant byte in $data
 * @param int $bitLen width in bits; must be a non-negative multiple of 8
 * @return int the decoded value (byte i is placed at bit position 8*i)
 * @throws \InvalidArgumentException when $bitLen is negative or not a multiple of 8
 */
function decodeFixedInt($offset, $bitLen) {
    global $data;
    // with the former "$i != $bitLen / 8" loop such widths never terminated
    if ($bitLen < 0 || $bitLen % 8 !== 0) {
        throw new \InvalidArgumentException('bitLen must be a non-negative multiple of 8');
    }
    $byteCount = (int) ($bitLen / 8);
    $result = 0;
    for ($i = 0; $i < $byteCount; $i++) {
        $byte = ord($data[$offset+$i]);
        $result |= $byte << (8*$i);
    }
    return $result;
}

你可能感兴趣的:(php)