自动熔断 python

# encoding=utf8
import time
import json
import logging
import threading
from random import random

import redis

class AutoKnightError(Exception):
    pass

class AutoKnight(object):
    """
    judge处理方式:
    1:判断服务qps
    2:更新catch_present
    3:按比例降级服务,当前服务级百分比为100%时,仍然保持max(0.1%, 100)流量进行尝试,以便更改需降级比例。

    save_status 处理方式
    1:将当前status,delay 存储到redis中
    2:如果当前正常状态,且status = False or delay > tolerate_max_delay, 计算降级比例
    3: 如果当前为降级状态,计算降级比例

    calculate_present 计算降级比例
    1:如果当前为正常状态,获取全部队列错误和delay,计算需降级的比例,存储至redis
    2:如果当前为部分降级状态,计算降级比例应为前present * granularity 个(保证试错的实时响应),存储至redis
    3:如果当前为全部降级状态,有一次成功后,将redis错误队列清空,qps释放至20%,存储至redis

    """

    KNIGHT_ERROR_DELAY_PREFIX = 'auto_knight/granularity/error_delay/{consul_name}'  # 时间窗口内的服务情况
    CONSUL_KNIGHT_PRESENT_PREFIX = 'auto_knight/present/{consul_name}'  # float 需要降级的比例
    CONSUL_QPS_PREFIX = 'auto_knight/qps/{consul_name}/{time_stamp}'  # 服务qps

    def __init__(self, redis_client, consul_name,
                 granularity=100,
                 tolerate_max_delay=3000,
                 tolerate_max_error=0.1,
                 tolerate_present=0.1,
                 care_min_qps=None,
                 qps_window=1,
                 min_retry_qps_present=0.01,
                 fuse_recover_step=0.01,
                 fuse_recover_delay=1000,
                 **kwargs):
        """
        自动熔断类
        在监控粒度内,发生错误 或 延迟时,将会计算此时服务延迟率或错误率
        当平均延迟大于tolerate_max_delay(ms)或错误率大于tolerate_max_error,此时的熔断比例将产生,公式为max(错误比例, 延迟比例)
        当产生的熔断比例大于tolerate_present时,开启熔断
        当熔断到100%时,仍然会开启min_retry_qps_present流量尝试恢复,最小0.001
        当计算熔断比例小于当前熔断比例时,熔断比例下降,下降步长=fuse_recover_step,下降间隔=fuse_recover_delay(ms)
        用法:
        import redis
        redis_client = redis.StrictRedis(host,port)
        autoknight = AutoKnight(redis_client, 'text_consul_name')
        # 用法1
        try:
            with autoknight:
                you_method()
        except AutoKnightError:
            print 'consul was fused'
        except Exception as e:
            print e
        # 用法2
        try:
            autoknight.judge()
            start_time = time.time()
            you_method()
            autoknight.save_status(True, int((time.time() - start_time) * 1000))
        except AutoKnightError:
            print 'consul was fused'
            # 注意,当熔断时不要将此状态存入autoknight.save_status,这样会导致恢复时的异常。
        except Exception as e:
            autoknight.save_status(False, int((time.time() - start_time) * 1000))
            print e

        :param redis_client: redis client,存储窗口和服务信息
        :param granularity: 监控粒度,次
        :param tolerate_max_delay: 监控窗口内平均延迟熔断标志 毫秒
        :param tolerate_max_error: 监控窗口内错误量熔断标志 占监控窗口中的比例
        :param care_min_qps: 熔断起作用的最小qps
        :param qps_window: qps 统计窗口 秒
        :param min_retry_qps_present: 完全不可用时的测试qps
        :param fuse_recover_step: 熔断恢复比例步长,用于熔断恢复时,防止流量突然暴涨 百分比
        :param fuse_recover_delay: 熔断恢复步长之间的延迟,用于熔断恢复时,防止流量突然暴涨 毫秒
        :param kwargs:
        :except AutoKnightError: 熔断error
        """
        assert isinstance(redis_client, redis.StrictRedis), 'redis_client should be kv/pyredis.redis_client.RedisClient'
        self.redis_client = redis_client
        self.consul_name = consul_name
        self.granularity = granularity
        self.tolerate_max_delay = tolerate_max_delay
        self.tolerate_max_error = tolerate_max_error
        self.tolerate_present = tolerate_present
        self.care_min_qps = care_min_qps
        self.qps_window = max(qps_window, 1)
        self.min_retry_qps_present = 1 - float(max(min_retry_qps_present, 0.001))
        self.fuse_recover_step = min(fuse_recover_step, 1)
        self.fuse_recover_delay = max(fuse_recover_delay, 0)
        self.kwargs = kwargs

        self.local_param = threading.local()
        self.cache_present = 0  # 缓存上次获取的降级比例
        self._last_recover_time_stemp = time.time()  # 缓存上次解除熔断时间

        self.knight_error_delay_key = AutoKnight.KNIGHT_ERROR_DELAY_PREFIX.format(consul_name=self.consul_name)
        self.knight_present_key = AutoKnight.CONSUL_KNIGHT_PRESENT_PREFIX.format(consul_name=self.consul_name)

    def _get_qps(self, auto_increase=True, check_size=3):
        """
        获取前2个window 的平均qps
        :param auto_increase: 是否当前访问加一
        :param check_size: 检查窗口数量,不要超过6个
        :return:
        """
        now = int(int(time.time()) / self.qps_window) * self.qps_window
        keys = [AutoKnight.CONSUL_QPS_PREFIX.format(consul_name=self.consul_name,
                                                    time_stamp=str(now - (i * self.qps_window))) for i in range(check_size)]
        if auto_increase:
            value = self.redis_client.incrbyfloat(keys[0], 1.0)
            if value < 2:
                # 第一次加入这个key,设置过期时间
                self.redis_client.expire(keys[0], 60)
        values = self.redis_client.mget(keys[1:])
        value_len, sum_value = 0, 0
        for v in values:
            if not v:
                continue
            value_len, sum_value = value_len + self.qps_window, sum_value + float(v)
        return 0 if value_len <= 0 else sum_value * 1.0 / value_len

    def _present_return(self, present):
        # 全部降级时,也要留1%流量重试
        if random() < min(present, self.min_retry_qps_present):
            raise AutoKnightError('%s fuse error, present=%s' % (self.consul_name, present))
        return True

    def judge(self):
        """
        判断consul 是否可用
        :return: True or raise AutoKnightError
        """
        if self.care_min_qps:
            # 判断当前qps
            qps = self._get_qps()
            if self.cache_present == 0 and qps < self.care_min_qps:
                return True
        # 获取当前降级比例
        redis_present = self.redis_client.get(self.knight_present_key)
        if not redis_present:
            redis_present = 0
        present = float(redis_present)
        if present < self.tolerate_present:
            present = 0.0
        self.cache_present = present
        return self._present_return(present)

    def save_status(self, status, delay):
        """
        保存当前访问状态和延迟
        :param status: bool True or False
        :param delay: 延迟时间 毫秒
        :return: None or present
        """
        status = int(bool(status))
        delay = int(delay)
        self._save2redis(status, delay)
        # 有问题,需要进行
        if status == 0 or delay > self.tolerate_max_delay:
            present = self.calculate_present(status, delay)
            if present > self.tolerate_present:
                return present
        # 已经在熔断状态,必须进行熔断检查,方便后续恢复
        if self.cache_present:
            present = self.calculate_present(status, delay)
            if present > self.tolerate_present:
                return present
        return None

    def calculate_present(self, status=None, delay=None):
        """
        计算当前降级比例
        :return: present
        """
        present = 0
        range_len = self.granularity
        if self.cache_present:
            # 如果当前处于降级状态,则只取前cache_present 数据进行计算, 降级比例为100%时,取最近访问的一次结果
            range_len = max(int(range_len * (1 - self.cache_present)), 1)
        error_delay_list = self.redis_client.lrange(self.knight_error_delay_key, 0, range_len)
        if not error_delay_list:
            error_delay_list = []
        error_list, delay_list = [], []
        for i in error_delay_list:
            try:
                error, delay = json.loads(i)
                error_list.append(int(error))
                delay_list.append(float(delay))
            except:
                continue
        # 计算延迟比例
        if delay:
            delay_list.append(delay)
        delay_list_len = len(delay_list)
        avg_delay = sum(delay_list) * 1.0 / delay_list_len
        if avg_delay > self.tolerate_max_delay:
            present = len(map(lambda x: x > self.tolerate_max_delay, delay_list)) * 1.0 / delay_list_len
        else:
            pass

        # 计算错误比例
        if status is not None:
            error_list.append(int(bool(not status)))
        error_list_len = len(error_list)
        avg_error = sum(error_list) * 1.0 / error_list_len
        if avg_error > self.tolerate_max_error:
            present = max(present, avg_error)
        else:
            pass

        # 当从完全熔断恢复时,清空所有error,delay队列
        if self.cache_present >= 1 and present == 0:
            self.redis_client.ltrim(self.knight_error_delay_key, 0, 1)
            # 计算恢复熔断step
            if (self.cache_present - present) > self.fuse_recover_step:
                time_stemp = time.time()
                if (time_stemp - self._last_recover_time_stemp) * 1000 > self.fuse_recover_delay:
                    present = max(0.0, (self.cache_present - self.fuse_recover_step))
                    self._last_recover_time_stemp = time_stemp
                else:
                    present = self.cache_present
            else:
                # 直接存储present就可以了
                pass
        elif self.cache_present > 0 and present > 0:
            # 计算熔断上涨
            present = min(1.0, self.cache_present + present)
        elif self.cache_present > 0.0 and present == 0:
            # 当熔断恢复时,控制恢复流量比例
            if (self.cache_present - present) > self.fuse_recover_step:
                time_stemp = time.time()
                if (time_stemp - self._last_recover_time_stemp) * 1000 > self.fuse_recover_delay:
                    present = max(0.0, (self.cache_present - self.fuse_recover_step))
                    self._last_recover_time_stemp = time_stemp
                else:
                    present = self.cache_present
            else:
                pass

        # 存储降级比例到redis
        self.redis_client.set(self.knight_present_key, present)
        return present

    def _save2redis(self, status, delay):
        """
        存储当前状态
        :param status: 调用是否成功
        :param delay: 调用延迟
        :return: None
        """
        try:
            if not status:
                logging.debug('%s save error' % self.consul_name)
            if delay > self.tolerate_max_delay:
                logging.debug('%s save delay' % self.consul_name)
            status = 0 if status else 1
            error_delay = json.dumps([status, delay])
            self.redis_client.lpush(self.knight_error_delay_key, error_delay)
            self.redis_client.ltrim(self.knight_error_delay_key, 0, self.granularity)
        except Exception as e:
            logging.error('%s save_status error(%s)' % (self.consul_name, e))

    def __enter__(self):
        self.judge()
        self.local_param.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.save_status(exc_type is None, (time.time() - self.local_param.start_time) * 1000)
        return False

工作中遇到自动降级的需求,花半天写了一个,效果还可以

你可能感兴趣的:(Python实用工具)