遍历百万级Redis的键值的大结局

背景

上次改完利用条件变量的形式来进行rdbtool和socket接收的数据联合分析之后,我在想能不能通过协程来实现,规避条件变量这种需要系统调用的方式。当然如果算一下,因为每一次接收的socket数据都尽量大的话,调用条件变量的次数或许在整个性能消耗里面占比比较小,这个方式只是想自己探索一下。

协程的改造之路

greenlet的基本使用
from greenlet import greenlet

def test1():
    # Prints 12, hands control to gr2, and resumes here (printing 34)
    # when another greenlet switches back into gr1.
    print(12)
    gr2.switch()
    print(34)

def test2():
    print(56)
    gr1.switch()
    # Never reached: when test1 returns, gr1 dies and control goes to
    # gr1's parent (the main greenlet), so gr2 is never resumed here.
    print(78)

gr1 = greenlet(test1)
gr2 = greenlet(test2)
gr1.switch()  # start test1; output order is 12, 56, 34

这是greenlet官网提供的示例,输出的结果大家可自行运行查看。从该示例代码可以看出,greenlet保存的是执行函数的上下文信息,在调度的过程中会还原已经保存的信息;greenlet底层其实就是通过汇编代码来保存上下文信息,大家有兴趣可自行查看。

模拟读写过程

首先编写一个server脚本,代码如下;

import socket


def run_server():
    """Minimal TCP server standing in for a Redis server.

    Serves one client at a time: for every chunk received it replies
    with a fixed 36-byte payload. Runs until accept() fails.
    """
    sock = socket.socket()
    # Allow quick restarts of the script without "Address already in use".
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind(("127.0.0.1", 6000))
    sock.listen(5)

    while True:
        try:
            conn, addr = sock.accept()
        except OSError as e:
            print(e)
            return

        # Ensure each client socket is closed when its loop exits
        # (the original leaked one socket per client).
        with conn:
            while True:
                try:
                    data = conn.recv(1024)
                except OSError as e:
                    print(e)
                    break

                # BUG FIX: recv() returning b"" means the peer closed the
                # connection; the original spun forever printing b"".
                if not data:
                    break

                print("recv  ", data)
                try:
                    conn.send(b"1234567890qwertyuiopasdfghjklzxcvbnm")
                except OSError as e:
                    print(e)
                    break


if __name__ == '__main__':
    run_server()

server代码相对简单,并未做鉴权等处理,仅是模拟一下redis的server。

利用greenlet来进行数据的读写;

import socket

from greenlet import greenlet


def read_data(data=None):
    """Consumer greenlet: tally bytes handed over by the socket-reading
    greenlet and stop once at least 20 bytes have been received.

    data: optional first chunk passed on the initial switch().
    """
    total_len = 0
    if data:
        print("read_data  ", data)
        total_len += len(data)

    while True:
        # Suspend until read_socket switches back with the next chunk.
        recv_data = gr_read.switch()
        print("current recv ", recv_data)
        if recv_data:
            # BUG FIX: the original added len(data) — the initial argument,
            # possibly None — instead of the chunk just received.
            total_len += len(recv_data)

        if total_len >= 20:
            # Enough data consumed: finish up (the "close connection" step
            # in a real parser would go here).
            return


def read_socket(host="127.0.0.1", port=6000):
    """Producer greenlet: read chunks from the socket and hand each one
    to the consumer greenlet (read_data) via switch()."""
    conn = socket.socket()
    conn.connect((host, port))

    conn.send(b"1")
    while True:
        r = conn.recv(4)
        print(r)
        # Hand the chunk to the consumer; resumes when it switches back.
        # (Removed the unused local `total` from the original.)
        gr_consumer.switch(r)
  
gr_read = greenlet(read_socket)      # producer: pulls bytes off the socket
gr_consumer = greenlet(read_data)    # consumer: tallies/parses the bytes
gr_read.switch()  # kick off the producer; the two then ping-pong via switch()

运行结果如下;

b'1234'
read_data   b'1234'
b'5678'
current recv  b'5678'
b'90qw'
current recv  b'90qw'
b'erty'
current recv  b'erty'
b'uiop'
current recv  b'uiop'

因为server默认传回的内容是长度大于20的数据,所以client会主动停止。如上的思路大致跑通之后,由于greenlet只能够对当前执行的函数栈进行恢复与调度,如果使用yield来进行操作的话,只能够先通过greenlet调度,再在greenlet的流程中包含yield的流程,修改代码如下;

import socket

from greenlet import greenlet


class Buff(object):
    """Byte buffer whose parse() generator cooperates with the caller:
    when fewer bytes are buffered than requested it yields (suspends),
    and start() resumes it after more data has been add()-ed."""

    def __init__(self):
        self.read_length = 0   # byte count the parser is currently waiting for
        self.buff = b""        # accumulated, not-yet-consumed bytes
        self.flag = True       # parse() keeps looping while this stays True
        self.parse_func = self.parse()
        next(self.parse_func)  # prime the generator up to its first yield

    def add(self, data):
        # Append newly received bytes.
        self.buff += data

    def start(self):
        # Resume the suspended parse() generator so it retries read_n().
        self.parse_func.send(None)

    def parse(self):
        """Generator that repeatedly tries to consume 5-byte chunks."""
        print("start parse")
        while self.flag:
            read_three = self.read_n(5)
            print("parse read    ", read_three)
            if isinstance(read_three, bytes):
                # Got a full chunk; immediately try for the next one.
                continue
            # Not enough data buffered: suspend until start() resumes us.
            yield read_three

    def wait_read(self):
        # NOTE(review): the generator produced here is yielded by parse()
        # but never iterated by anyone — read_n simply retries on the next
        # start() — so this loop appears to be dead code; confirm before
        # relying on it.
        n = self.read_length
        while True:
            yield
            if len(self.buff) >= n:
                r = self.buff[:n]
                self.buff = self.buff[n:]
                return r

    def read_n(self, n):
        """Return n bytes from the buffer, or (when data is short) a
        wait_read() generator that parse() treats as a "not ready" sentinel."""
        print("reand n  ", len(self.buff), n)
        if len(self.buff) >= n:
            r = self.buff[:n]
            self.buff = self.buff[n:]
            return r
        else:
            self.read_length = n
            return self.wait_read()


# Module-level buffer shared by the producer/consumer greenlets below.
buff = Buff()


def read_data(data=None):
    """Consumer greenlet: feed received chunks into `buff` and drive its
    parse() coroutine; returns after at least 20 bytes were received.

    data: optional first chunk passed on the initial switch().
    """
    total_len = 0
    if data:
        print("read_data  ", data)
        total_len += len(data)
        buff.add(data)

    while True:
        # Suspend until read_socket switches back with the next chunk.
        recv_data = gr_read.switch()
        print("current recv ", recv_data)
        if recv_data:
            # BUG FIX: the original counted len(data) — the initial
            # argument, possibly None — instead of the chunk just received.
            total_len += len(recv_data)
            buff.add(recv_data)
            # Resume the parse() generator so it can consume the new data.
            buff.start()

        if total_len >= 20:
            # Enough data consumed: finish up.
            return


def read_socket(host="127.0.0.1", port=6000):
    """Producer greenlet: read chunks from the socket and hand each one
    to the consumer greenlet (read_data) via switch()."""
    conn = socket.socket()
    conn.connect((host, port))

    conn.send(b"1")
    while True:
        r = conn.recv(4)
        print(r)
        # Hand the chunk to the consumer; resumes when it switches back.
        # (Removed the unused local `total` from the original.)
        gr_consumer.switch(r)


if __name__ == '__main__':
    # Wire up the producer/consumer greenlets and start the producer.
    gr_read = greenlet(read_socket)
    gr_consumer = greenlet(read_data)
    gr_read.switch()

通过调用greenlet中的buff实现的parse的协程,从而完成当解析的数据不够的时候,则切换到接受数据的协程,然后再接收到数据之后再切换到解析的函数过程中执行(解析仅仅就是读出数据而已,具体业务可能是具体的场景),从而完成了两个协程交替执行读数据解析的任务。

rdb分析脚本改造
import socket
import logging
import time

from greenlet import greenlet
from rdbtools import RdbParser, KeyValsOnlyCallback
from rdbtools.encodehelpers import ESCAPE_CHOICES

logger = logging.getLogger(__package__)


start = time.time()


redis_ip = "192.168.10.202"
redis_port = 6371
key_size = 412


def encode_command(*args, buf=None):
    """Encode *args as a RESP (Redis protocol) command.

    Each argument may be str (encoded as UTF-8) or a bytes-like object.
    Returns `buf` (a fresh bytearray when not supplied) with the encoded
    command appended. Raises TypeError for arguments len() cannot handle.
    """
    if buf is None:
        buf = bytearray()
    buf.extend(b'*%d\r\n' % len(args))

    try:
        for arg in args:
            if isinstance(arg, str):
                arg = arg.encode("utf-8")
            buf.extend(b'$%d\r\n%s\r\n' % (len(arg), arg))
    except TypeError:
        # BUG FIX: len(arg) raises TypeError (not KeyError) for
        # unsupported types such as int/float, so the original
        # `except KeyError` handler could never fire.
        raise TypeError("Argument {!r} expected to be of bytearray, bytes,"
                        " float, int, or str type".format(arg))
    return buf


class RecvBuff(object):
    """Byte buffer shared by the socket-reading greenlet (gr_read) and
    the RDB-parsing greenlet (gr_consumer).

    `total_length` is the RDB payload size declared by the server;
    `is_done` is set once the number of consumed bytes reaches it.
    """

    def __init__(self):
        self.buff = b""          # received, not-yet-consumed bytes
        self.length = 0          # total bytes consumed so far
        self.total_length = 0    # declared size of the full RDB payload
        self.is_done = False     # True once the whole payload is consumed
        self.read_length = 0     # bytes the consumer is currently waiting for
        self.gr_read = None      # producer greenlet (socket reader)
        self.gr_consumer = None  # consumer greenlet (RDB parser)

    def add(self, data):
        # Append a freshly received chunk.
        self.buff += data

    def wait_from_socket(self):
        """Cooperatively wait until `read_length` bytes are buffered.

        Switches to the reader greenlet until enough data has arrived,
        then consumes and returns it. Returns None when the final byte
        of the payload is consumed.
        """
        n = self.read_length
        while True:
            # NOTE(review): this 1-second sleep throttles every wait
            # iteration — presumably a debugging aid; confirm intentional.
            time.sleep(1)
            self.gr_read.switch()
            if len(self.buff) >= n:
                r = self.buff[:n]
                self.buff = self.buff[n:]
                self.length += n
                if self.length == self.total_length:
                    self.is_done = True
                    return
                return r

    def consumer_length(self, n):
        """Consume and return exactly n bytes, switching to the reader
        greenlet when the buffer does not yet hold enough."""
        if len(self.buff) >= n:
            r = self.buff[:n]
            self.buff = self.buff[n:]
            self.length += n
            if self.length == self.total_length:
                self.is_done = True
                # NOTE(review): a bare `raise` with no active exception
                # raises RuntimeError — apparently used to abort the
                # rdbtools parser once the payload is exhausted; verify
                # this termination path is intended.
                raise
            return r
        else:
            self.read_length = n
            r = self.wait_from_socket()
            print("consumer length return  ", r)
            return r


# Shared buffer connecting the replication socket to the RDB parser.
recv_buff = RecvBuff()


def rdb_work():
    # Greenlet body: parse the RDB stream pulled from recv_buff with
    # rdbtools, printing key/value lines whose value part is >= key_size.
    class Writer(object):
        # File-like sink for KeyValsOnlyCallback: receives b"key value"
        # lines; prints those whose value (after the first space) is long.

        def write(self, value):
            if b" " in value:
                index = value.index(b" ")
                length = len(value)
                if length - index - 1 >= key_size:
                    print(value, index, length)

    out_file_obj = Writer()
    callback = {
        'justkeyvals': lambda f: KeyValsOnlyCallback(f, string_escape=ESCAPE_CHOICES[0]),
    }["justkeyvals"](out_file_obj)
    parser = RdbParser(callback)

    def parse(self, filename=None):
        # Replacement for RdbParser.parse: read from the shared recv_buff
        # instead of an RDB file on disk.
        class Reader(object):
            # File-like reader backed by the shared RecvBuff.
            def __init__(self, buff):
                self.buff = buff

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc_val, exc_tb):
                pass

            def read(self, n):
                if n <= 0:
                    return
                # res = self.buff.consumer_length(n)
                # Cooperatively wait for n bytes: switch to the
                # socket-reading greenlet while the buffer is short.
                while True:
                    if len(self.buff.buff) < n:
                        self.buff.gr_read.switch()
                    else:
                        break
                res = self.buff.consumer_length(n)
                return res

            def close(self):
                pass
        f = Reader(recv_buff)
        self.parse_fd(f)
    setattr(parser, "parse", parse)

    print("start rdb work")
    # NOTE(review): parse is attached to the instance, not the class, so
    # it is unbound — hence the parser is passed explicitly as `self`.
    parser.parse(parser)
    recv_buff.is_done = True
    print("finish rdb work")


class RedisServer(object):
    """Poses as a Redis replica: sends SYNC and feeds the RDB payload
    streamed back by the master into the shared recv_buff."""

    def __init__(self, host=None, port=None):
        self.host = host or "127.0.0.1"
        self.port = port or 6379
        self.conn = None
        self.recv_buff = recv_buff

    def init(self):
        """Open the TCP connection; leaves self.conn as None on failure."""
        try:
            self.conn = socket.socket()
            self.conn.connect((self.host, self.port))
        except Exception as e:
            logger.exception(e)
            self.conn = None
            return

    @staticmethod
    def _parse_bulk_header(data):
        """Parse a RESP bulk header b'$<len>\\r\\n<payload...>'.

        Returns (declared_length, payload_bytes_after_header).
        Raises ValueError when no CRLF terminator is present.
        """
        i = data.index(b"\r\n")
        # BUG FIX: the original sliced data[1:(i-2)], dropping the last
        # two digits of the declared RDB length (e.g. 9337614 -> 93376).
        return int(data[1:i].decode()), data[(i + 2):]

    def slave_sync(self):
        """Drive the SYNC handshake, then pump received chunks into
        recv_buff, switching to the RDB-parsing greenlet after each one."""
        self.send_sync()
        total_read_length = 0
        # First strip the bulk-length header, e.g. b'$9337614\r\n'.
        while True:
            data = self.conn.recv(1024 * 1)
            if b"$" == data[:1]:
                length, left_data = self._parse_bulk_header(data)
                self.recv_buff.total_length = length
                total_read_length += len(left_data)
                print("recv  length  ", len(left_data))
                if left_data:
                    self.recv_buff.add(left_data)
                # Switch to the greenlet that consumes/parses the RDB data.
                rdb_green.switch()
                print("stop  first  rdb work")
                break
            if b"\n" == data:
                # Presumably a keepalive newline sent while the master
                # prepares the RDB dump — TODO confirm against Redis docs.
                continue

        while True:
            try:
                data = self.conn.recv(1024 * 8)
            except Exception as e:
                print("recv error : {0}".format(e))
                return
            if data:
                self.recv_buff.add(data)
                # Hand the new chunk to the parsing greenlet.
                rdb_green.switch()
                if self.recv_buff.is_done:
                    print("recv buff done")
                    return

    def send_sync(self):
        """Send the SYNC command; failures are swallowed (best effort)."""
        data = encode_command("SYNC")
        try:
            self.conn.send(data)
        except Exception as e:
            return


def main():
    """Wire up the replica greenlet and the RDB-parsing greenlet,
    then run until the whole payload has been parsed."""
    global rdb_green

    server = RedisServer(redis_ip, redis_port)
    rdb_green = greenlet(rdb_work)
    recv_buff.gr_read = greenlet(server.slave_sync)

    server.init()
    recv_buff.gr_read.switch()

    elapsed = time.time() - start
    print("finish use time {0}  second  ".format(elapsed))


if __name__ == '__main__':
    # Run under cProfile to produce the call statistics shown below.
    import cProfile
    cProfile.run("main()")

该脚本的改造过程中,一定要将rdb_work和rs.slave_sync的协程切换过程放在函数中,因为该函数记录了当前rdbtool解析时的上下文信息;如果在recv_buff中新开一个协程并在该类中切换,就失去了rdb_work中的上下文调用栈的信息,从而导致失败。

首先查看一下运行的性能数据;

         16281244 function calls (16278933 primitive calls) in 7.568 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      2/1    0.000    0.000    7.563    7.563 :1()
        1    0.000    0.000    0.000    0.000 callbacks.py:179(__init__)
   100012    0.103    0.000    0.218    0.000 callbacks.py:188(_start_key)
       12    0.000    0.000    0.000    0.000 callbacks.py:196(_end_key)
      468    0.000    0.000    0.001    0.000 callbacks.py:199(_write_comma)
   100000    0.182    0.000    2.944    0.000 callbacks.py:204(set)
       12    0.000    0.000    0.000    0.000 callbacks.py:208(start_hash)
      468    0.001    0.000    0.024    0.000 callbacks.py:212(hset)
       12    0.000    0.000    0.000    0.000 callbacks.py:216(end_hash)
   200948    0.100    0.000    0.165    0.000 compat.py:16(isnumber)
   200948    0.225    0.000    2.292    0.000 encodehelpers.py:126(apply_escape_bytes)
  4118558    1.141    0.000    1.466    0.000 encodehelpers.py:142()
  4018078    0.325    0.000    0.325    0.000 encodehelpers.py:20(bval)
        2    0.000    0.000    0.000    0.000 enum.py:284(__call__)
        2    0.000    0.000    0.000    0.000 enum.py:526(__new__)
        1    0.000    0.000    0.000    0.000 enum.py:836(__and__)
      469    0.000    0.000    0.002    0.000 parser.py:1019(lzf_decompress)
        8    0.000    0.000    0.000    0.000 parser.py:103(aux_field)
        3    0.000    0.000    0.000    0.000 parser.py:1069(read_signed_char)
   502905    0.308    0.000    1.516    0.000 parser.py:1072(read_unsigned_char)
        3    0.000    0.000    0.000    0.000 parser.py:1081(read_signed_int)
   100013    0.063    0.000    0.297    0.000 parser.py:1087(read_unsigned_int_be)
        1    0.000    0.000    0.000    0.000 parser.py:112(start_database)
        1    0.000    0.000    0.000    0.000 parser.py:141(db_size)
        1    0.000    0.000    0.000    0.000 parser.py:342(end_database)
        1    0.000    0.000    0.000    0.000 parser.py:354(end_rdb)
        1    0.000    0.000    0.000    0.000 parser.py:377(__init__)
        1    0.334    0.334    7.032    7.032 parser.py:396(parse_fd)
   301929    0.324    0.000    1.555    0.000 parser.py:468(read_length_with_encoding)
   100965    0.046    0.000    0.789    0.000 parser.py:490(read_length)
   200964    0.151    0.000    1.759    0.000 parser.py:493(read_string)
   100012    0.148    0.000    3.845    0.000 parser.py:531(read_object)
        1    0.000    0.000    0.000    0.000 parser.py:78(__init__)
   100480    0.051    0.000    2.183    0.000 parser.py:84(encode_key)
   100468    0.045    0.000    0.205    0.000 parser.py:92(encode_value)
        1    0.000    0.000    0.000    0.000 parser.py:954(verify_magic_string)
        1    0.000    0.000    0.000    0.000 parser.py:958(verify_version)
        1    0.000    0.000    0.000    0.000 parser.py:96(start_rdb)
        1    0.000    0.000    0.000    0.000 parser.py:964(init_filter)
   200024    0.259    0.000    0.426    0.000 parser.py:996(matches_filter)
        1    0.000    0.000    0.000    0.000 re.py:232(compile)
        1    0.000    0.000    0.000    0.000 re.py:271(_compile)
        1    0.000    0.000    0.000    0.000 socket.py:139(__init__)
        1    0.000    0.000    0.000    0.000 sre_compile.py:423(_simple)
        1    0.000    0.000    0.000    0.000 sre_compile.py:536(_compile_info)
        2    0.000    0.000    0.000    0.000 sre_compile.py:595(isstring)
        1    0.000    0.000    0.000    0.000 sre_compile.py:598(_code)
      2/1    0.000    0.000    0.000    0.000 sre_compile.py:71(_compile)
        1    0.000    0.000    0.000    0.000 sre_compile.py:759(compile)
        2    0.000    0.000    0.000    0.000 sre_parse.py:111(__init__)
        4    0.000    0.000    0.000    0.000 sre_parse.py:160(__len__)
        8    0.000    0.000    0.000    0.000 sre_parse.py:164(__getitem__)
        1    0.000    0.000    0.000    0.000 sre_parse.py:168(__setitem__)
        1    0.000    0.000    0.000    0.000 sre_parse.py:172(append)
      2/1    0.000    0.000    0.000    0.000 sre_parse.py:174(getwidth)
        1    0.000    0.000    0.000    0.000 sre_parse.py:224(__init__)
        3    0.000    0.000    0.000    0.000 sre_parse.py:233(__next)
        2    0.000    0.000    0.000    0.000 sre_parse.py:249(match)
        2    0.000    0.000    0.000    0.000 sre_parse.py:254(get)
        2    0.000    0.000    0.000    0.000 sre_parse.py:286(tell)
        1    0.000    0.000    0.000    0.000 sre_parse.py:417(_parse_sub)
        1    0.000    0.000    0.000    0.000 sre_parse.py:475(_parse)
        1    0.000    0.000    0.000    0.000 sre_parse.py:76(__init__)
        2    0.000    0.000    0.000    0.000 sre_parse.py:81(groups)
        1    0.000    0.000    0.000    0.000 sre_parse.py:903(fix_flags)
        1    0.000    0.000    0.000    0.000 sre_parse.py:919(parse)
        1    0.000    0.000    7.032    7.032 t.py:103(parse)
        1    0.000    0.000    0.000    0.000 t.py:104(Reader)
        1    0.000    0.000    0.000    0.000 t.py:105(__init__)
        1    0.000    0.000    0.000    0.000 t.py:108(__enter__)
        1    0.000    0.000    0.000    0.000 t.py:111(__exit__)
   803885    0.564    0.000    2.067    0.000 t.py:114(read)
        1    0.000    0.000    0.000    0.000 t.py:140(__init__)
        1    0.000    0.000    0.005    0.005 t.py:146(init)
        1    0.000    0.000    0.001    0.001 t.py:194(send_sync)
        1    0.000    0.000    7.563    7.563 t.py:202(main)
        1    0.000    0.000    0.000    0.000 t.py:24(encode_command)
     1152    0.002    0.000    0.002    0.000 t.py:51(add)
   803885    0.971    0.000    1.041    0.000 t.py:68(consumer_length)
        1    0.000    0.000    7.032    7.032 t.py:87(rdb_work)
        1    0.000    0.000    0.000    0.000 t.py:88(Writer)
   300959    0.236    0.000    0.294    0.000 t.py:90(write)
        1    0.000    0.000    0.000    0.000 t.py:99()
        1    0.000    0.000    0.000    0.000 {built-in method _sre.compile}
   602924    0.164    0.000    0.164    0.000 {built-in method _struct.unpack}
        2    0.000    0.000    0.000    0.000 {built-in method builtins.__build_class__}
   100480    0.413    0.000    1.879    0.000 {built-in method builtins.all}
      2/1    0.000    0.000    7.568    7.568 {built-in method builtins.exec}
   902896    0.137    0.000    0.137    0.000 {built-in method builtins.isinstance}
1709421/1709419    0.170    0.000    0.170    0.000 {built-in method builtins.len}
        4    0.000    0.000    0.000    0.000 {built-in method builtins.min}
      473    0.010    0.000    0.010    0.000 {built-in method builtins.print}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.setattr}
      469    0.001    0.000    0.001    0.000 {built-in method lzf.decompress}
        1    0.000    0.000    0.000    0.000 {built-in method time.time}
   302879    0.033    0.000    0.033    0.000 {method 'append' of 'list' objects}
        1    0.005    0.005    0.005    0.005 {method 'connect' of '_socket.socket' objects}
        1    0.000    0.000    0.000    0.000 {method 'decode' of 'bytes' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
   100013    0.033    0.000    0.033    0.000 {method 'encode' of 'str' objects}
        2    0.000    0.000    0.000    0.000 {method 'extend' of 'bytearray' objects}
        1    0.000    0.000    0.000    0.000 {method 'extend' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'format' of 'str' objects}
   100480    0.037    0.000    0.037    0.000 {method 'index' of 'bytes' objects}
        1    0.000    0.000    0.000    0.000 {method 'items' of 'dict' objects}
   100012    0.085    0.000    0.085    0.000 {method 'match' of 're.Pattern' objects}
     1154    0.898    0.001    0.898    0.001 {method 'recv' of '_socket.socket' objects}
        1    0.000    0.000    0.000    0.000 {method 'send' of '_socket.socket' objects}
   2305/0    0.003    0.000    0.000          {method 'switch' of 'greenlet.greenlet' objects}

从指标上来看,性能耗时较大的是rdbtool的encodehelpers中的转换函数,这里耗时大约1.14秒,占总耗时7.56秒的15%;read的耗时大约是0.56秒,接收数据recv的耗时大约是0.898秒。从数据来看,大部分的性能消耗都发生在rdbtool的代码解析过程中,所以本次性能消耗的主要来源还是rdbtool工具本身。

在运行的过程中,测试的机器还是那个虚拟机,尝试用strace来进行跟踪查看一下;

....
recvfrom(3, "0c6996(257_ee359ea8-e52b-490a-9f"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "bd(368_d1fbef4b-39b2-4c1f-8626-3"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "06_8457c6f1-c445-4683-ace0-3d4ea"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "05a3572-2662-431b-8e1a-cd931519c"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "02f-1182-4cef-bc34-721a476b12e9\370"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "3e2f-437a-b03f-afebc0a4ae9d\370\200\0\0358"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "-46d8-9f1c-1f75417cc880\370\200\0\0357@\0(3"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "1-bc1e-9aa6fd70e167\370\200\0\03579\0(383_b"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "97-e199ee70654c\370\200\0\0357\276\0(347_f44b1"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "0d901db4647\370\200\0\0358\330\0(404_2de2885b-"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "6db3a48\370\200\0\0357\35\0(452_431a7d99-9adc"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "73e\370\200\0\0357I\0(491_b3813e18-fc8b-46c"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "\200\0\0358<\0(329_ae4dd41b-0aef-4e1f-ab"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "D\0(393_25c61115-e9d4-4d77-a5d4-f"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "61_b8c0bcf0-a605-4837-9716-73ae8"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "6ab5b5d-fa14-45ed-b567-b6687fee9"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "0d0-1464-4e43-98bc-202bbc42e722("..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "cef1-4509-83e6-30a57dcf7aa7(486_"..., 8192, 0, NULL, NULL) = 8192
recvfrom(3, "-4288-ae49-a5895974c77e(387_2f36"..., 8192, 0, NULL, NULL) = 6940
write(1, "finish rdb work\n", 16finish rdb work
)       = 16
write(1, "finish use time 7.93073630332946"..., 44finish use time 7.930736303329468  second
) = 44
rt_sigaction(SIGINT, {SIG_DFL, [], SA_RESTORER, 0x7f8a94bc35d0}, {0x57a4c0, [], SA_RESTORER, 0x7f8a94bc35d0}, 8) = 0
sigaltstack(NULL, {ss_sp=0x10fb460, ss_flags=0, ss_size=8192}) = 0
sigaltstack({ss_sp=NULL, ss_flags=SS_DISABLE, ss_size=0}, NULL) = 0
exit_group(0)                           = ?
+++ exited with 0 +++


从strace的跟踪来看,脚本的运行过程中发生的主要系统调用确实都是recvfrom和write这类系统调用,如果还需要再优化的话,一个比较好的方向就是去优化rdbtool的解析过程。

总结

本文将条件变量的方式改为了协程实现的方式,总体上规避了部分获取条件变量时的系统调用,但是对整个脚本的性能提升相对有限:在测试520万key的遍历的时候,其实并没有太大的性能改善,只是自己在思考这个方向时做的一个探索吧。由于本人才疏学浅,如有错误请批评指正。

你可能感兴趣的:(python)