redis server 监控:
redis存活判断:ping判断,如果指定时间返回PONG表示存活,否则redis不能响应请求,可能阻塞或死亡
机器端口检查:nc 判断端口是否正常。
连接数:connected_clients >5000 时告警
连接数使用率:connected_clients/maxclients >90% 告警
因list阻塞调用(如BLPOP)而被阻塞的连接个数:blocked_clients >0 告警
redis内存使用率:used_memory/maxmemory(未设置maxmemory时按系统总内存total_system_memory计算)> 80% 告警
因最大内存容量限制而被驱逐(evict)的键数量:evicted_keys >0 告警,说明内存已达到设置的最大内存
因为最大客户端数量限制而被拒绝的连接请求数量: rejected_connections >0
请求键的命中率:keyspace_hits(查找数据库键成功的次数)/ 总查询次数(keyspace_hits + keyspace_misses)<50% 告警
redis_cluster 监控
集群健康状态:cluster_state不为OK则告警
集群的节点数:cluster_known_nodes 为集群中redis节点的个数,与nodes_ip.txt中配置的节点数不一致则告警。
检测下线的数据槽slots个数:集群正常运行时,cluster_slots_fail 应该为0. 如果大于0说明集群有slot存在故障
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 功能说明: redis监控
# 创建者: zhouwj
# 创建时间: 2019/12/03
# 修改历史:
import redis
import sys
import subprocess
import json
import logging
import time
import requests
import logging.handlers
# Scratch list of [key, value] pairs parsed from CLUSTER INFO by the main loop.
data = list()

# Path of the log file handed to the rotating file handler in loghandler().
log_filename = '/home/zhouwj/zhouwj/bin/redis_monitor/redis_monitor.log'

# Wall-clock stamp taken once at start-up; send_alarm() prefixes it to messages.
localtime = time.strftime("[%H:%M:%S]")

# Separator placed between the timestamp and the alarm text.
dl = "\n-------------------------------------\n"

# Headers passed along to send_alarm() for the JSON webhook request.
headers = dict([("Content-Type", "application/json")])
# logging
def loghandler(name, filename=None):
    """Return the logger called *name*, writing to a daily-rotated log file.

    The logger is set to INFO and gets one TimedRotatingFileHandler that
    rotates once per day and keeps 7 old files.  Calling this function again
    for the same logger and file returns the logger unchanged instead of
    attaching a second handler (which would duplicate every log line).

    Args:
        name: logger name passed to ``logging.getLogger``.
        filename: log file path; defaults to the module-level ``log_filename``.

    Returns:
        The configured ``logging.Logger``.
    """
    import os
    import re

    if filename is None:
        filename = log_filename
    # Initialise logging (installs a default handler on the root logger).
    logging.basicConfig()
    log = logging.getLogger(name)
    # Set the log level.
    log.setLevel(logging.INFO)

    # FileHandler stores the absolute path in baseFilename; compare against
    # that to detect a handler that is already attached for this file.
    target = os.path.abspath(filename)
    for existing in log.handlers:
        if (isinstance(existing, logging.handlers.TimedRotatingFileHandler)
                and existing.baseFilename == target):
            return log

    # Rotate once a day and keep 7 old log files.
    timefilehandler = logging.handlers.TimedRotatingFileHandler(
        filename=filename, when='D', interval=1, backupCount=7)
    timefilehandler.suffix = "%Y-%m-%d.log"
    # The handler finds old files to delete by matching their suffix against
    # extMatch; keep the pattern in step with the custom ".log" suffix so the
    # backupCount limit is actually enforced.
    timefilehandler.extMatch = re.compile(r"^\d{4}-\d{2}-\d{2}\.log$")
    # Output format of each record.
    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(name)s %(message)s')
    timefilehandler.setFormatter(formatter)
    # Attach the handler to the logger.
    log.addHandler(timefilehandler)
    return log
# Call loghandler() to create the module-wide logger used for all log output.
log=loghandler('redis')
def nodes_ip(path='/home/zhouwj/zhouwj/bin/redis_monitor/nodes_ip.txt'):
    """Yield the whitespace-split fields of each non-blank line of the node list.

    Blank or whitespace-only lines are skipped: they would otherwise be
    yielded as empty lists and break callers that unpack the fields.

    Args:
        path: node list file to read; defaults to the deployed nodes_ip.txt.

    Yields:
        A list of the whitespace-separated fields of one line.
    """
    # Read everything first so the file is closed before the first yield.
    with open(path, 'rt') as f:
        rows = [line.split() for line in f]
    for fields in rows:
        if fields:
            yield fields
# Number of client connections; the caller alarms when it exceeds 5000.
def redis_connections():
    """Return ``info['connected_clients']``, or 0 when it cannot be read."""
    try:
        clients = info['connected_clients']
    except Exception:
        clients = 0
    return clients
# Connection usage of redis: connected clients as a percentage of maxclients.
def redis_connections_usage():
    """Return connected clients / maxclients as a percentage string.

    Returns:
        The percentage formatted with two decimals (e.g. ``"12.50"``).  When
        either value cannot be obtained or the division fails, ``"0.00"`` is
        returned, so the result is always a string and callers may safely use
        string methods such as ``.strip("%")`` on it.
    """
    try:
        curr_connections = redis_connections()
        max_clients = parse_config('maxclients')
        rate = float(curr_connections) / float(max_clients)
    except Exception:
        # Best effort: an unreadable or zero maxclients is reported as no usage.
        return "0.00"
    return "%.2f" % (rate * 100)
# Amount of memory used by redis.
def redis_used_memory():
    """Return ``info['used_memory']``, or 0 when it cannot be read.

    The value is only returned; the leftover debug print that used to echo it
    to stdout on every call has been removed.
    """
    try:
        return info['used_memory']
    except Exception:
        return 0
# Memory usage ratio of redis.
def redis_memory_usage():
    """Return used memory as a percentage string of the memory limit.

    The limit is ``info['maxmemory']`` when it is set to a positive value;
    otherwise ``info['total_system_memory']`` is used.  The system total is
    only looked up when it is needed, so a server whose INFO output lacks
    that field still gets a correct ratio whenever maxmemory is configured.

    Returns:
        The percentage formatted with two decimals (e.g. ``"37.50"``), or
        ``"0.00"`` when the required fields are missing or invalid.
    """
    try:
        used_memory = float(info['used_memory'])
        limit = float(info.get('maxmemory') or 0)
        if limit <= 0:
            # No explicit limit configured: compare against the host's memory.
            limit = float(info['total_system_memory'])
        rate = used_memory / limit
    except Exception:
        return "0.00"
    return "%.2f" % (rate * 100)
# Number of rejected connections.
def rejected_connections():
    """Return ``info['rejected_connections']``; 999 when it cannot be read."""
    try:
        rejected = info['rejected_connections']
    except Exception:
        return 999
    else:
        return rejected
# Number of keys evicted since the server started.
def evicted_keys():
    """Return ``info['evicted_keys']``; 999 when it cannot be read."""
    try:
        evicted = info['evicted_keys']
    except Exception:
        return 999
    else:
        return evicted
# Number of clients currently waiting on a blocking call.
def blocked_clients():
    """Return ``info['blocked_clients']``, or 0 when it cannot be read."""
    try:
        waiting = info['blocked_clients']
    except Exception:
        waiting = 0
    return waiting
# OPS of redis: the near-real-time commands-per-second figure kept by redis.
def ops(self=None):
    """Return ``info['instantaneous_ops_per_sec']``, or 0 when unavailable.

    Args:
        self: ignored.  This is a module-level function, so the argument is
            kept only for callers that already pass one and now defaults to
            None so the function can be called like its sibling helpers.
    """
    try:
        return info['instantaneous_ops_per_sec']
    except Exception:
        return 0
# Key hit rate; the alarm threshold is a hit rate below 50%.
def hitRate():
    """Return keyspace_hits / (keyspace_hits + keyspace_misses) as a percentage.

    Returns:
        The percentage formatted with two decimals (e.g. ``"75.00"``).  When
        the counters are missing or no lookups have happened yet (division by
        zero), ``"0.00"`` is returned so the result is always a string.
    """
    try:
        hits = int(info['keyspace_hits'])
        misses = int(info['keyspace_misses'])
        rate = float(hits) / float(hits + misses)
    except Exception:
        return "0.00"
    return "%.2f" % (rate * 100)
# Read one setting from the server configuration (used for maxclients).
def parse_config(type):
    """Return the value of config key *type* via CONFIG GET, or None on failure."""
    try:
        settings = redisconn.config_get(type)
        value = settings[type]
    except Exception:
        value = None
    return value
def send_alarm(localtime, dl, headers, param, ip):
    """Log an alarm and raise it through the ``msalarm`` command.

    Args:
        localtime: timestamp text placed at the start of the logged message.
        dl: separator text placed between the timestamp and the alarm body.
        headers: unused; kept so existing callers need no change.
        param: alarm detail text, passed to ``msalarm -p``.
        ip: host the alarm is about, passed to ``msalarm -h``.

    Returns:
        None.  A missing or non-executable ``msalarm`` is logged, not raised.
    """
    # NOTE: the webhook push that used to be built here was already disabled
    # (commented out); its unused locals, including a hard-coded webhook key,
    # were removed.
    MSG = localtime + dl + "DCR-db_error:" + param
    alarm_name = 'DCR_db_error'
    # An argument list (no shell) passes ip/param to msalarm verbatim, so
    # spaces or shell metacharacters in them cannot alter the command.
    alarm_argv = ['msalarm', '-h', ip, '-n', alarm_name, '-p', param]
    log.error(MSG)
    log.error(' '.join(alarm_argv))
    try:
        subprocess.call(alarm_argv)
    except OSError as err:
        # Without a shell, a missing executable raises instead of returning 127.
        log.error("msalarm could not be executed: %s", err)
def alarm(ip):
    """Run every per-instance health check for *ip* and alarm on each breach.

    Checks, in order: PING reply, TCP port 12201, connected clients (> 5000),
    blocked clients (> 0), connection usage (> 90%), memory usage (> 80%),
    evicted keys (> 0) and rejected connections (> 0).  Each metric is read
    once, so the value that is tested is the value that is reported.  A breach
    calls ``send_alarm`` and logs the detail at ERROR; otherwise an INFO line
    is logged.

    Args:
        ip: address of the instance being checked; used in messages and for
            the port check.
    """
    def report(name, breached, value):
        # Emit either the alarm or the "normal" log line for one check.
        if breached:
            param = ip + ":" + name + ":" + str(value)
            send_alarm(localtime, dl, headers, param, ip)
            log.error(param)
        else:
            log.info("%s " + name + ": normal", ip)

    def as_percent(value):
        # The usage helpers normally return strings such as "12.34", but may
        # fall back to a plain number; accept both without raising.
        try:
            return float(str(value).strip("%"))
        except ValueError:
            return 0.0

    # A PING that raises (e.g. server unreachable) counts as a failed ping
    # instead of aborting the remaining checks.
    try:
        pong = redisconn.ping()
    except Exception:
        pong = False
    report('redis_ping', str(pong) != 'True', pong)

    report('check_alive', check_alive(ip, 12201) != 0, 'port_fail')

    connections = redis_connections()
    report('redis_connections', connections > 5000, connections)

    blocked = blocked_clients()
    report('blocked_clients', blocked > 0, blocked)

    connections_usage = redis_connections_usage()
    report('redis_connections_usage', as_percent(connections_usage) > 90,
           connections_usage)

    memory_usage = redis_memory_usage()
    report('redis_memory_usage', as_percent(memory_usage) > 80, memory_usage)

    evicted = evicted_keys()
    report('evicted_keys', evicted > 0, evicted)

    rejected = rejected_connections()
    report('rejected_connections', rejected > 0, rejected)

    # The key hit-rate check (< 50%) was already disabled and stays disabled.
def check_alive(host, port, timeout=3):
    """Probe a TCP port with ``nc -z`` and return the command's exit status.

    Args:
        host: host name or address to probe.
        port: TCP port to probe.
        timeout: seconds passed to ``nc -w`` so an unresponsive host cannot
            stall the monitor.

    Returns:
        0 when nc reports the port reachable, non-zero otherwise; 127 when nc
        itself cannot be executed (the status a shell would have reported).
    """
    import os

    # Argument list instead of a shell string: host/port are never re-parsed
    # by a shell.
    argv = ['nc', '-z', '-w', str(timeout), str(host), str(port)]
    with open(os.devnull, 'w') as devnull:
        try:
            return subprocess.call(argv, stdout=devnull, stderr=devnull)
        except OSError:
            return 127
# Helper functions that interpret fields of the cluster info output.
def clusterstatus(var):
    """Map a cluster_state value to a status code: 0 for 'ok', otherwise 1."""
    return 0 if var == 'ok' else 1
def clusterslotsfail(var):
    """Return the cluster_slots_fail value unchanged."""
    return var
def clusterknownnodes(var):
    """Return the cluster_known_nodes value unchanged."""
    return var
def parse_cluster_info(raw):
    """Parse a CLUSTER INFO reply into a ``{field: value}`` dict of strings.

    Args:
        raw: the reply as text or bytes made of "key:value" lines, or a dict
            (returned as a copy).

    Returns:
        A dict with one entry per "key:value" line; lines without a colon
        (including the empty trailing line) are ignored.
    """
    if isinstance(raw, dict):
        return dict(raw)
    if isinstance(raw, bytes) and not isinstance(raw, str):
        # Python 3 client without decode_responses returns bytes.
        raw = raw.decode('utf-8', 'replace')
    fields = {}
    for line in raw.splitlines():
        key, sep, value = line.partition(':')
        if sep:
            fields[key.strip()] = value.strip()
    return fields


if __name__ == '__main__':
    for x in nodes_ip():
        # Each line must provide exactly: ip, mode, password, expected nodes.
        if len(x) != 4:
            log.error("skipping malformed nodes_ip line: %r", x)
            continue
        ip, mode, keyword, nodes = x
        # redisconn and info stay module-level globals: the check helpers
        # read them directly.
        try:
            redisconn = redis.StrictRedis(host=ip, port=12201, password=keyword,
                                          socket_connect_timeout=1)
            # redis-py connects lazily, so INFO is the first real round-trip;
            # it must sit inside the try for a dead node to be reported.
            info = redisconn.info()
        except Exception:
            param = ip + ":连接失败"
            send_alarm(localtime, dl, headers, param, ip)
            continue
        alarm(ip)
        if mode != 'cluster':
            continue
        try:
            cluster = parse_cluster_info(
                redisconn.execute_command('cluster', 'info'))
        except Exception:
            param = ip + ":集群查询失败"
            send_alarm(localtime, dl, headers, param, ip)
            continue
        # Parsed per node, so no values leak over from a previous node; a
        # missing field yields None and is reported as a failed check.
        clusters_status = clusterstatus(cluster.get('cluster_state'))
        clusters_lotsfail = clusterslotsfail(cluster.get('cluster_slots_fail'))
        clusters_knownnodes = clusterknownnodes(cluster.get('cluster_known_nodes'))
        checks = (
            ('clusters_status', str(clusters_status) == '0',
             clusters_status, "%s clusters_status: normal "),
            ('clusters_lotsfail', str(clusters_lotsfail) == '0',
             clusters_lotsfail, "%s clusters_lotsfail: normal "),
            ('clusters_knownnodes', str(clusters_knownnodes) == nodes,
             clusters_knownnodes, "%s clusters_knownnodes:normal "),
        )
        for label, healthy, value, ok_message in checks:
            if healthy:
                log.info(ok_message, ip)
            else:
                # str() because the status code is an int, not text.
                param = label + ':' + ip + ":" + str(value)
                send_alarm(localtime, dl, headers, param, ip)
nodes_ip.txt 格式(每行4列,以空白字符分隔):
ip single/cluster 密码 null/节点数
1.1.1.1 single <密码> null
2.2.2.2 cluster <密码> 100