redis基础监控

redis监控redis基础监控_第1张图片

redis server 监控:
redis存活判断:ping判断,如果指定时间返回PONG表示存活,否则redis不能响应请求,可能阻塞或死亡
机器端口检查:nc 判断端口是否正常。
连接数:connected_clients >5000 时告警
连接数使用率:connected_clients/maxclients >90% 告警
被 list 阻塞调用(如 BLPOP/BRPOP)阻塞的连接个数:blocked_clients >0 时告警
redis内存使用率:used_memory / maxmemory(未设置 maxmemory 时以 total_system_memory 为分母)> 80% 时告警
因最大内存容量限制而被驱逐(evict)的键数量:evicted_keys >0 时告警,说明内存使用已达到设置的最大内存
因为最大客户端数量限制而被拒绝的连接请求数量: rejected_connections >0
请求键的命中率:keyspace_hits(查找数据库键成功的次数)/ (keyspace_hits + keyspace_misses)(总查询次数,失败+成功)<50% 时告警
redis_cluster 监控
集群健康状态:cluster_state不为OK则告警
集群的节点数 :cluster_known_nodes 集群中redis节点的个数。
检测下线的数据槽slots个数:集群正常运行时,cluster_slots_fail 应该为0. 如果大于0说明集群有slot存在故障

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 功能说明: redis监控
# 创建者: zhouwj
# 创建时间: 2019/12/03
# 修改历史: 

import redis
import sys
import subprocess
import json
import  logging
import time
import requests
import logging.handlers
# Module-level scratch list; the main loop appends cluster-info
# key/value pairs to it.
data = list()

# File the rotating log handler writes to.
log_filename = '/home/zhouwj/zhouwj/bin/redis_monitor/redis_monitor.log'

# Timestamp captured once at start-up; it prefixes every alarm message.
localtime = time.strftime("[%H:%M:%S]", time.localtime())

# HTTP headers handed to send_alarm() for the JSON webhook request.
headers = {"Content-Type": "application/json"}

# Separator placed between the timestamp and the alarm text.
dl = "\n-------------------------------------\n"
#logging
def loghandler(name):
    """Return the logger called ``name``, backed by a daily-rotated log file.

    The file handler writes to the module-level ``log_filename``, rotates
    with ``when='D'`` / ``interval=1`` and is configured to keep 7 backups.

    Args:
        name: Logger name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger`` (level ``INFO``).
    """
    import re  # local import so the block does not rely on a new file-level import

    # basicConfig() installs a default handler on the root logger, so records
    # are also echoed to the console in addition to the file.
    logging.basicConfig()
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Attach the file handler only once: calling this function again for the
    # same name would otherwise write every record several times.
    for existing in logger.handlers:
        if isinstance(existing, logging.handlers.TimedRotatingFileHandler):
            return logger

    timefilehandler = logging.handlers.TimedRotatingFileHandler(
        filename=log_filename, when='D', interval=1, backupCount=7)
    timefilehandler.suffix = "%Y-%m-%d.log"
    # The handler selects backups to delete by matching their suffix against
    # extMatch. Keep it in step with the custom suffix above; a pattern that
    # ends right after the date would never match "<date>.log", and the
    # backupCount limit would then not be enforced.
    timefilehandler.extMatch = re.compile(r"^\d{4}-\d{2}-\d{2}\.log$")
    # NOTE(review): with when='D' the next rollover is computed relative to
    # the file's last modification, so a frequently re-run script may never
    # rotate; when='midnight' may be what is wanted - confirm before changing.

    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(name)s %(message)s')
    timefilehandler.setFormatter(formatter)

    logger.addHandler(timefilehandler)
    return logger

# Build the module-wide 'redis' logger that the checks and alarms write to.
log=loghandler('redis')
def nodes_ip(path='/home/zhouwj/zhouwj/bin/redis_monitor/nodes_ip.txt'):
    """Yield the whitespace-separated fields of each node line in ``path``.

    The file is read completely and closed before the first row is yielded.
    Blank lines are skipped so that callers unpacking a fixed number of
    fields are not handed an empty list.

    Args:
        path: Node list file; defaults to the script's ``nodes_ip.txt``.

    Yields:
        A list of strings, one list per non-blank line.
    """
    with open(path, 'rt') as f:
        rows = [line.split() for line in f]
    for row in rows:
        if row:
            yield row
 
        #获取连接数,>5000 时告警
def redis_connections():
    """Return ``connected_clients`` from the module-level ``info`` mapping.

    Returns:
        The value stored under ``info['connected_clients']``, or ``0`` when
        it cannot be read (``info`` not set yet, key missing, ...).
    """
    try:
        return info['connected_clients']
    except Exception:  # best effort: any lookup failure is reported as 0
        return 0
    #redis的连接使用率
def redis_connections_usage():
    """Return the share of ``maxclients`` currently in use, as a percentage.

    The figure is ``redis_connections() / parse_config('maxclients') * 100``.

    Returns:
        A string with two decimals (e.g. ``"25.00"``) on success; the integer
        ``0`` when either input is unavailable or the division fails. Callers
        must therefore cope with both ``str`` and ``int`` results.
    """
    try:
        curr_connections = redis_connections()
        max_clients = parse_config('maxclients')
        rate = float(curr_connections) / float(max_clients)
        return "%.2f" % (rate * 100)
    except Exception:  # e.g. maxclients is None or zero
        return 0
    #redis内存使用量
def redis_used_memory():
    """Return ``used_memory`` from the module-level ``info`` mapping.

    Returns:
        The value stored under ``info['used_memory']``, or ``0`` when it
        cannot be read.
    """
    try:
        return info['used_memory']
    except Exception:  # best effort: any lookup failure is reported as 0
        return 0
    #redis内存使用率
def redis_memory_usage():
    """Return Redis memory usage as a percentage of the available memory.

    The denominator is ``info['maxmemory']`` when it is set (truthy) and
    ``info['total_system_memory']`` otherwise.

    Returns:
        A string with two decimals (e.g. ``"25.00"``) on success; the integer
        ``0`` when the required fields are missing or the division fails.
    """
    try:
        used_memory = info['used_memory']
        # Read total_system_memory only when it is actually needed: a server
        # that does not report the field must not break the maxmemory path.
        limit = info['maxmemory'] or info['total_system_memory']
        rate = float(used_memory) / float(limit)
        return "%.2f" % (rate * 100)
    except Exception:  # missing field, zero limit, info not set, ...
        return 0
    #拒绝连接数
def rejected_connections():
    """Return ``rejected_connections`` from the module-level ``info`` mapping.

    Returns:
        The value stored under ``info['rejected_connections']``, or the
        sentinel ``999`` when it cannot be read. The sentinel is positive,
        so a "> 0" check treats an unreadable value as a problem.
    """
    try:
        return info['rejected_connections']
    except Exception:  # unreadable value -> positive sentinel
        return 999
    #运行以来删除过的key的数量
def evicted_keys():
    """Return ``evicted_keys`` from the module-level ``info`` mapping.

    Returns:
        The value stored under ``info['evicted_keys']``, or the sentinel
        ``999`` when it cannot be read. The sentinel is positive, so a
        "> 0" check treats an unreadable value as a problem.
    """
    try:
        return info['evicted_keys']
    except Exception:  # unreadable value -> positive sentinel
        return 999
    #正在等待阻塞客户端数量
def blocked_clients():
    """Return ``blocked_clients`` from the module-level ``info`` mapping.

    Returns:
        The value stored under ``info['blocked_clients']``, or ``0`` when it
        cannot be read.
    """
    try:
        return info['blocked_clients']
    except Exception:  # best effort: any lookup failure is reported as 0
        return 0
    #redis的OPS,redis内部较实时的每秒执行的命令数
def ops(self=None):
    """Return ``instantaneous_ops_per_sec`` from the module-level ``info``.

    Args:
        self: Unused. Kept (now optional) so existing ``ops(x)`` calls keep
            working although this is a plain module-level function.

    Returns:
        The value stored under ``info['instantaneous_ops_per_sec']``, or
        ``0`` when it cannot be read.
    """
    try:
        return info['instantaneous_ops_per_sec']
    except Exception:  # best effort: any lookup failure is reported as 0
        return 0
    #请求键的命中率,命中率低于50%告警
def hitRate():
    """Return the key-lookup hit rate as a percentage.

    Computed as ``keyspace_hits / (keyspace_hits + keyspace_misses) * 100``
    from the module-level ``info`` mapping.

    Returns:
        A string with two decimals (e.g. ``"75.00"``) on success; the integer
        ``0`` when the fields are missing or no lookups were recorded yet
        (division by zero).
    """
    try:
        misses = info['keyspace_misses']
        hits = info['keyspace_hits']
        rate = float(hits) / float(int(hits) + int(misses))
        return "%.2f" % (rate * 100)
    except Exception:  # missing fields or hits + misses == 0
        return 0
    #获取最大连接数
def parse_config(type):
    """Return one configuration value read through ``redisconn.config_get``.

    Args:
        type: Name of the configuration parameter (e.g. ``'maxclients'``).
            The parameter name shadows the builtin but is kept so keyword
            callers continue to work.

    Returns:
        ``redisconn.config_get(type)[type]``, or ``None`` when the lookup
        fails for any reason.
    """
    try:
        return redisconn.config_get(type)[type]
    except Exception:  # connection problem, missing key, redisconn not set
        return None
def send_alarm(localtime,dl,headers,param,ip):
    """Log an alarm and forward it through the ``msalarm`` command.

    Args:
        localtime: Timestamp text placed at the start of the message.
        dl: Separator inserted between the timestamp and the alarm text.
        headers: HTTP headers for the webhook request; unused while the
            webhook POST below stays disabled.
        param: Alarm detail text.
        ip: Host handed to ``msalarm -h``.
    """
    MSG = localtime + dl + "DCR-db_error:" + param

    # Webhook delivery is prepared but not sent (the POST is disabled).
    # SECURITY NOTE(review): the webhook key is hard-coded in source; it
    # should come from configuration or the environment.
    Secret = '3e15a344-f620-47a6-aa7a-afde087a8104'
    url = 'https://qyapi.weixin.qq.com'
    # json.dumps escapes the newlines and quotes contained in MSG, which a
    # hand-formatted JSON string would leave raw (invalid JSON).
    send_msg = json.dumps({"msgtype": "text", "text": {"content": MSG}})
    send_url = '%s/cgi-bin/webhook/send?key=%s' % (url, Secret)
    # p_post = requests.post(url=send_url, headers=headers, data=send_msg)

    alarm_name = 'DCR_db_error'
    # Pass the arguments as a list (no shell) so spaces or shell
    # metacharacters in ``param`` / ``ip`` cannot split or alter the command.
    alarm_cmd = ['msalarm', '-h', ip, '-n', alarm_name, '-p', param]
    log.error(MSG)
    log.error(' '.join(alarm_cmd))

    try:
        subprocess.call(alarm_cmd)
    except OSError as exc:
        # Without a shell a missing executable raises instead of returning
        # an exit status; keep alarm delivery best-effort.
        log.error("msalarm could not be executed: %s", exc)
def _report_check(ip, name, failed, value):
    """Log the outcome of one check on ``ip``; raise an alarm when it failed."""
    if failed:
        param = ip + ":" + name + ":" + str(value)
        send_alarm(localtime, dl, headers, param, ip)
        log.error(param)
    else:
        log.info("%s %s: normal", ip, name)


def alarm(ip, port=12201):
    """Run every single-node check for ``ip`` and alarm on each failure.

    Checks, in order: PING reply, TCP port reachability, connection count
    (> 5000), blocked clients (> 0), connection usage (> 90 %), memory usage
    (> 80 %), evicted keys (> 0) and rejected connections (> 0). Each metric
    is read once so the logged value is the one that was compared.

    The key hit-rate check (< 50 %) was commented out in the original code
    and remains disabled.

    Args:
        ip: Address of the node; used for the port probe and in messages.
        port: TCP port probed by ``check_alive`` (defaults to 12201).
    """
    # A failed PING may surface as an exception rather than a falsy reply;
    # treat both as "not alive" instead of aborting the whole run.
    try:
        ping_result = redisconn.ping()
    except Exception as exc:
        log.error("%s redis_ping raised: %s", ip, exc)
        ping_result = False
    _report_check(ip, 'redis_ping', str(ping_result) != 'True', ping_result)

    _report_check(ip, 'check_alive', check_alive(ip, port) != 0, 'port_fail')

    connections = redis_connections()
    _report_check(ip, 'redis_connections', connections > 5000, connections)

    blocked = blocked_clients()
    _report_check(ip, 'blocked_clients', blocked > 0, blocked)

    # redis_connections_usage() returns a "%.2f" string on success but the
    # int 0 on failure; str() makes the strip() safe for both.
    conn_usage = redis_connections_usage()
    _report_check(ip, 'redis_connections_usage',
                  float(str(conn_usage).strip("%")) > 90, conn_usage)

    mem_usage = redis_memory_usage()
    _report_check(ip, 'redis_memory_usage', float(mem_usage) > 80, mem_usage)

    evicted = evicted_keys()
    _report_check(ip, 'evicted_keys', evicted > 0, evicted)

    rejected = rejected_connections()
    _report_check(ip, 'rejected_connections', rejected > 0, rejected)
def check_alive(host, port):
    """Probe ``host:port`` with ``nc -z`` and return the exit status.

    Args:
        host: Host name or address to probe.
        port: TCP port to probe.

    Returns:
        The exit status of ``nc`` (0 means the probe succeeded), or 127 when
        ``nc`` cannot be started - the status a shell reports for a command
        it cannot find.
    """
    import os  # local import: only needed for os.devnull

    # Argument list instead of a shell string: host/port are passed verbatim
    # and cannot be interpreted by a shell.
    argv = ['nc', '-z', str(host), str(port)]
    try:
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(argv, stdout=devnull, stderr=devnull)
    except OSError:
        return 127
#定义函数,抓取集群info

def clusterstatus(var):
    """Map a ``cluster_state`` value to a status code.

    Returns:
        ``0`` when ``var`` equals ``'ok'``, ``1`` for anything else.
    """
    return 0 if var == 'ok' else 1
    
def clusterslotsfail(var):
    """Return the ``cluster_slots_fail`` value unchanged."""
    return var

def clusterknownnodes(var):
    """Return the ``cluster_known_nodes`` value unchanged."""
    return var
def _parse_cluster_info(raw):
    """Parse a raw ``CLUSTER INFO`` reply into a ``{field: value}`` dict."""
    # On Python 3 the reply may arrive as bytes; on Python 2 bytes is str.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        raw = raw.decode('utf-8')
    fields = {}
    for line in raw.splitlines():
        key, sep, value = line.partition(':')
        if sep:  # skip blank lines and anything without a "key:value" shape
            fields[key.strip()] = value.strip()
    return fields


def _check_cluster(ip, nodes):
    """Run the cluster-level checks through the node at ``ip``."""
    try:
        fields = _parse_cluster_info(redisconn.execute_command('cluster', 'info'))
    except Exception:
        send_alarm(localtime, dl, headers, ip + ":集群查询失败", ip)
        return

    # Missing fields come back as None and therefore fail their check
    # instead of leaving a variable unbound.
    clusters_status = clusterstatus(fields.get('cluster_state'))
    clusters_lotsfail = clusterslotsfail(fields.get('cluster_slots_fail'))
    clusters_knownnodes = clusterknownnodes(fields.get('cluster_known_nodes'))

    if str(clusters_status) == '0':
        log.info("%s clusters_status: normal ", ip)
    else:
        # str(): the status code is an int and cannot be concatenated as is.
        param = 'clusters_status:' + ip + ":" + str(clusters_status)
        send_alarm(localtime, dl, headers, param, ip)

    if clusters_lotsfail == '0':
        log.info("%s clusters_lotsfail: normal ", ip)
    else:
        param = 'clusters_lotsfail:' + ip + ":" + str(clusters_lotsfail)
        send_alarm(localtime, dl, headers, param, ip)

    # ``nodes`` is the expected node count from the node list file (a string).
    if clusters_knownnodes == nodes:
        log.info("%s clusters_knownnodes:normal ", ip)
    else:
        param = 'clusters_knownnodes:' + ip + ":" + str(clusters_knownnodes)
        send_alarm(localtime, dl, headers, param, ip)


if __name__ == '__main__':
    for node_fields in nodes_ip():
        if len(node_fields) != 4:
            # Expected columns: ip, mode, password, node count.
            log.error("skipping malformed node line: %r", node_fields)
            continue
        ip, mode, keyword, nodes = node_fields
        try:
            redisconn = redis.StrictRedis(host=ip, port=12201, password=keyword,
                                          socket_connect_timeout=1)
            # Creating the client does not contact the server; INFO is the
            # first real round-trip, so it belongs inside the try to turn an
            # unreachable node into an alarm instead of a crash.
            info = redisconn.info()
        except Exception:
            send_alarm(localtime, dl, headers, ip + ":连接失败", ip)
            continue
        alarm(ip)
        if mode == 'cluster':
            _check_cluster(ip, nodes)

nodes_ip.txt 格式(每行 4 列,以空白分隔,与脚本中 ip,mode,keyword,nodes 的解包一致):
ip single/cluster 密码 null/集群节点数(与 cluster_known_nodes 比较)
1.1.1.1 single password null
2.2.2.2 cluster password 100

你可能感兴趣的:(redis,redis)