线上zookeeper集群越来越多,很有必要对其运行状态进行监控。由于线上监控大多采用的是zabbix,因此只需要写好采集数据的脚本即可,不用关心web端的展示和监控报警,比较方便。
1.导入zookeeper模板,模板内容在最后,复制粘贴保存为zookeeper.xml即可
点击template/import,选择文件,导入,如果报错,会提示哪一行有问题,删除对应的那一块,再导入即可。
2.导入后,在template页面就可以看到了,然后添加相应的host。
3.模板中item的采集数据方式为zabbix trapper。也就是通过zabbix_sender来发送数据。关于zabbix_trapper,请参考https://www.2goo.info/weblog/detail/233434
4.测试数据的采集
利用zabbix_sender命令,参数介绍:
-z: zabbix_server的IP地址
-s: 被监控主机的主机名(与zabbix_agent配置文件中的Hostname以及zabbix页面上的主机名一致)
-k: 模板里key的名字
-o:key对应的value
-vv: 开启debug模式,输出详细信息
bin/zabbix_sender -z 10.10.64.202 -s "oss-mon-kafka-bjc-001" -k "zookeeper.status[zk_num_alive_connections]" -o 100 -vv
发送成功后,在页面端点击monitor/latest data/,然后筛选对应的HostName和application,就能查看到对应的value
5.采集发送打通后,接下来就要编写采集脚本,然后定时发送。采集脚本采用python,如下:
#!/usr/bin/python
#coding:utf-8
""" Check Zookeeper Cluster
zookeeper version should be newer than 3.4.x
# echo mntr|nc 127.0.0.1 2181
zk_version 3.4.6-1569965, built on 02/20/2014 09:09 GMT
zk_avg_latency 0
zk_max_latency 4
zk_min_latency 0
zk_packets_received 84467
zk_packets_sent 84466
zk_num_alive_connections 3
zk_outstanding_requests 0
zk_server_state follower
zk_znode_count 17159
zk_watch_count 2
zk_ephemerals_count 1
zk_approximate_data_size 6666471
zk_open_file_descriptor_count 29
zk_max_file_descriptor_count 102400
# echo ruok|nc 127.0.0.1 2181
imok
"""
import sys
import socket
import re
import subprocess
from StringIO import StringIO
import os
# These two paths depend on where Zabbix is installed on this host.
zabbix_sender = '/usr/bin/zabbix_sender'
zabbix_conf = '/etc/zabbix/zabbix_agentd.conf'
# Module-level switch read by ZooKeeperServer.send_to_zabbix (note: same name
# as that method): a value > 0 runs zabbix_sender, anything else only prints
# the command that would have been executed.
send_to_zabbix = 1
############# get zookeeper server status
class ZooKeeperServer(object):
    """Query a ZooKeeper server with four-letter-word commands and push
    the collected metrics to Zabbix through ``zabbix_sender``.

    The metrics gathered by :meth:`get_stats` are kept in ``self._result``
    (a dict mapping metric name to an int or a string).
    """

    # Metrics that the original script treats as leader-only and defaults to 0.
    _LEADER_ONLY_KEYS = ('zk_followers', 'zk_synced_followers', 'zk_pending_syncs')

    def __init__(self, host='localhost', port='2181', timeout=1):
        """Remember the server address and the socket timeout (seconds).

        ``port`` may be given as a string or an int; it is converted with
        ``int()``.
        """
        self._address = (host, int(port))
        self._timeout = timeout
        self._result = {}

    def _create_socket(self):
        """Return a new, unconnected socket (separate method so it can be
        overridden)."""
        return socket.socket()

    def _send_cmd(self, cmd):
        """Send a four-letter-word command and return the whole reply as text.

        The reply is read until the peer closes the connection, so answers
        longer than one ``recv`` buffer are not truncated.  The socket is
        always closed, even when connecting or reading fails.

        Raises whatever the socket layer raises (connection refused,
        ``socket.timeout`` when nothing at all was received, ...).
        """
        # On Python 2 ``bytes`` is ``str`` so this is a no-op there; on
        # Python 3 the socket API requires bytes.
        if not isinstance(cmd, bytes):
            cmd = cmd.encode('ascii')
        chunks = []
        s = self._create_socket()
        try:
            s.settimeout(self._timeout)
            s.connect(self._address)
            s.sendall(cmd)
            while True:
                try:
                    chunk = s.recv(4096)
                except socket.timeout:
                    # Keep what was already received if the peer is slow to
                    # close; only fail when there is no data at all.
                    if chunks:
                        break
                    raise
                if not chunk:
                    break
                chunks.append(chunk)
        finally:
            s.close()
        data = b''.join(chunks)
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')
        return data

    def get_stats(self):
        """Run ``mntr`` and ``ruok``, merge their results and return the map.

        The merged dict is also stored in ``self._result``.  When none of the
        leader-only metrics is present they are all added with value 0.  An
        empty reply to either command simply contributes no entries.
        """
        data_mntr = self._send_cmd('mntr')
        data_ruok = self._send_cmd('ruok')
        merged = {}
        if data_mntr:
            merged.update(self._parse(data_mntr))
        if data_ruok:
            # Applied second so that, as before, 'ruok' wins on a key clash.
            merged.update(self._parse_ruok(data_ruok))
        if not any(key in merged for key in self._LEADER_ONLY_KEYS):
            # These metrics are only exposed by the leader; report 0 elsewhere.
            for key in self._LEADER_ONLY_KEYS:
                merged[key] = 0
        self._result = merged
        return self._result

    def _parse(self, data):
        """Parse ``mntr`` output (one ``key<TAB>value`` pair per line) into a
        dict, silently skipping lines that do not have that shape."""
        result = {}
        for line in data.split('\n'):
            try:
                key, value = self._parse_line(line)
            except ValueError:
                continue  # ignore broken or empty lines
            result[key] = value
        return result

    def _parse_ruok(self, data):
        """Parse ``ruok`` output: the first line, stripped of surrounding
        whitespace, is stored under ``zk_server_ruok`` when non-empty."""
        result = {}
        first_line = data.split('\n', 1)[0].strip()
        if first_line:
            result['zk_server_ruok'] = first_line
        return result

    def _parse_line(self, line):
        """Split one ``key<TAB>value`` line and return ``(key, value)``.

        The value is converted to ``int`` when possible, otherwise it stays a
        string.  Raises ``ValueError`` when the line does not consist of
        exactly two tab-separated fields or when the key is empty.
        """
        fields = [part.strip() for part in line.split('\t')]
        if len(fields) != 2:
            raise ValueError('Found invalid line: %s' % line)
        key, value = fields
        if not key:
            raise ValueError('The key is mandatory and should not be empty')
        try:
            value = int(value)
        except (TypeError, ValueError):
            pass  # non-numeric metrics (version, server state) stay strings
        return key, value

    def get_pid(self):
        """Return the pid (as a string) of the first process whose ``ps -ef``
        line mentions both ``java`` and ``zookeeper``, or ``''`` if none."""
        pidarg = '''ps -ef|grep java|grep zookeeper|grep -v grep|awk '{print $2}' '''
        proc = subprocess.Popen(pidarg, shell=True, stdout=subprocess.PIPE,
                                universal_newlines=True)
        # communicate() drains the pipe and waits for the child, so no zombie
        # process or open pipe is left behind.
        output = proc.communicate()[0]
        lines = output.splitlines()
        return lines[0].strip() if lines else ''

    def send_to_zabbix(self, metric):
        """Send ``self._result[metric]`` to Zabbix as item
        ``zookeeper.status[<metric>]``.

        Depending on the module-level ``send_to_zabbix`` flag this either runs
        ``zabbix_sender`` (its output is discarded) or only prints the command
        that would have been run.  Raises ``KeyError`` when ``metric`` has not
        been collected.
        """
        key = "zookeeper.status[" + metric + "]"
        value = str(self._result[metric])
        # Inside a method the bare name refers to the module-level flag, not
        # to this method.
        if send_to_zabbix > 0:
            try:
                # Open the null device here rather than relying on a global
                # file handle created by the calling script.
                with open(os.devnull, 'w') as devnull:
                    subprocess.call(
                        [zabbix_sender, "-c", zabbix_conf, "-k", key, "-o", value],
                        stdout=devnull, stderr=devnull, shell=False)
            except OSError as detail:
                print("Something went wrong while exectuting zabbix_sender :  %s" % detail)
        else:
            command = " ".join([zabbix_sender, "-c", zabbix_conf, "-k", key, "-o", value])
            print("Simulation: the following command would be execucted :\n" + command + " \n")
def usage():
    """Write the command-line help to standard output and exit with status 1."""
    help_text = (
        "\nUsage :  %s  alive|all\n"
        "Modes : \n\talive : Return pid of running zookeeper\n\tall : Send zookeeper stats as well\n"
    ) % sys.argv[0]
    sys.stdout.write(help_text)
    sys.exit(1)
# Entry point: exactly one argument, 'alive' or 'all', is required.
accepted_modes = ['alive', 'all']
if len(sys.argv) == 2 and sys.argv[1] in accepted_modes:
    mode = sys.argv[1]
else:
    usage()  # prints the help text and exits, so 'mode' is always set below

zk = ZooKeeperServer()
pid = zk.get_pid()
if pid != "" and mode == 'all':
    # ZooKeeper is running: collect every metric and push each one to Zabbix.
    zk.get_stats()
    # Debug aid: uncomment to dump the collected metrics.
    # print(zk._result)
    # Kept as a module-level name because send_to_zabbix may read it.
    FNULL = open(os.devnull, 'w')
    try:
        for key in zk._result:
            zk.send_to_zabbix(key)
    finally:
        # Close the null device even when sending a metric raises.
        FNULL.close()
    print(pid)
elif pid != "" and mode == "alive":
    print(pid)
else:
    # No ZooKeeper process found: 0 is the value reported in that case.
    print(0)
执行结果:
#获取zookeeper的所有key的值(下面的字典是调试输出,需要在脚本中打印zk._result才会显示;默认只输出pid)
[root@oss-mon-kafka-bjc-001 scripts]# python check_zookeeper.py all
{'zk_followers': 0, 'zk_outstanding_requests': 0, 'zk_approximate_data_size': 50747, 'zk_packets_sent': 31693631, 'zk_pending_syncs': 0, 'zk_avg_latency': 0, 'zk_version': '3.4.6-1569965, built on 02/20/2014 09:09 GMT', 'zk_watch_count': 47, 'zk_packets_received': 31693282, 'zk_open_file_descriptor_count': 33, 'zk_server_ruok': 'imok', 'zk_server_state': 'follower', 'zk_synced_followers': 0, 'zk_max_latency': 1643, 'zk_num_alive_connections': 6, 'zk_min_latency': 0, 'zk_ephemerals_count': 83, 'zk_znode_count': 700, 'zk_max_file_descriptor_count': 655350}
2750
#获取zookeeper的pid
[root@oss-mon-kafka-bjc-001 scripts]# python check_zookeeper.py alive
2750
6.配置zabbix的采集配置策略
增加脚本可执行权限
chmod +x /usr/local/zabbix-agent/scripts/check_zookeeper.py
zabbix配置文件
vim /etc/zabbix/zabbix_agentd.d/check_zookeeper.conf
增加以下内容
UserParameter=zookeeper.status[*],/usr/bin/python /usr/local/zabbix-agent/scripts/check_zookeeper.py $1
重新启动zabbix-agent服务
service zabbix-agent restart
7.web端观察采集数据是否正常,如果没有数据,则需调试check_zookeeper.py,看问题在哪。
最后附上zookeeper的监控模板,比较长,貌似不支持附件,只能贴上来了。
2.0
2016-02-27T15:15:09Z
Templates
Template ZooKeeper
Template ZooKeeper
Templates
ZooKeeper Status
-
zookeeper alive connections
2
0
zookeeper.status[zk_num_alive_connections]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper approximate data size
2
0
zookeeper.status[zk_approximate_data_size]
0
90
365
0
3
B
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper average latency
2
0
zookeeper.status[zk_avg_latency]
0
90
365
0
3
tick
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper ephemerals count
2
0
zookeeper.status[zk_ephemerals_count]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper leader's followers
2
0
zookeeper.status[zk_followers]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper leader's pending syncs
2
0
zookeeper.status[zk_pending_syncs]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper leader's synced followers
2
0
zookeeper.status[zk_synced_followers]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper max file descriptor count
2
0
zookeeper.status[zk_max_file_descriptor_count]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper max latency
2
0
zookeeper.status[zk_max_latency]
0
90
365
0
3
tick
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper min latency
2
0
zookeeper.status[zk_min_latency]
0
90
365
0
3
tick
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper opened file descriptor count
2
0
zookeeper.status[zk_open_file_descriptor_count]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper outstanding requests
2
0
zookeeper.status[zk_outstanding_requests]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper packages received
2
0
zookeeper.status[zk_packets_received]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper packages sent
2
0
zookeeper.status[zk_packets_sent]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper pid
0
0
zookeeper.status[all]
30
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper response checking
2
0
zookeeper.status[zk_server_ruok]
0
90
365
0
1
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper state role
2
0
zookeeper.status[zk_server_state]
0
90
365
0
1
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper version
2
0
zookeeper.status[zk_version]
0
90
365
0
1
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper watches count
2
0
zookeeper.status[zk_watch_count]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
-
zookeeper znodes count
2
0
zookeeper.status[zk_znode_count]
0
90
365
0
3
0
0
0
0
1
0
0
0
ZooKeeper Status
{Template ZooKeeper:zookeeper.status[zk_outstanding_requests].last()}>10
big outstanding requests number
0
4
0
{Template ZooKeeper:zookeeper.status[zk_pending_syncs].last()}>10
big pending syncs
0
4
0
{Template ZooKeeper:zookeeper.status[zk_avg_latency].last()}>10
large average latency
0
4
0
{Template ZooKeeper:zookeeper.status[zk_open_file_descriptor_count].last()} > {Template ZooKeeper:zookeeper.status[zk_max_file_descriptor_count].last()}*0.85
large file descriptor used
0
4
0
{Template ZooKeeper:zookeeper.status[all].last()}=0
zookeeper is not running
0
4
0
{Template ZooKeeper:zookeeper.status[zk_server_state].abschange()}>0
zookeeper state role has been changed
0
2
0
ZooKeeper Alive Connections
900
200
0.0000
100.0000
1
1
1
1
0
0.0000
0.0000
0
0
0
0
0
0
00DDDD
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_num_alive_connections]
ZooKeeper Data Size
900
200
0.0000
100.0000
1
1
1
1
0
0.0000
0.0000
0
0
0
0
0
0
00C800
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_approximate_data_size]
ZooKeeper Latency
900
200
0.0000
100.0000
1
1
0
1
0
0.0000
0.0000
0
0
0
0
0
2
00C800
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_avg_latency]
1
2
C80000
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_min_latency]
2
2
0000C8
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_max_latency]
ZooKeeper Packages Received/Sent
900
200
0.0000
100.0000
1
1
1
1
0
0.0000
0.0000
0
0
0
0
1
0
00C800
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_packets_received]
0
0
FF3333
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_packets_sent]
ZooKeeper Watches Count
900
200
0.0000
100.0000
1
1
1
1
0
0.0000
0.0000
0
0
0
0
0
0
660066
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_watch_count]
ZooKeeper Znodes Count
900
200
0.0000
100.0000
1
1
0
1
0
0.0000
0.0000
0
0
0
0
0
1
FFCCFF
0
2
0
-
Template ZooKeeper
zookeeper.status[zk_znode_count]
参考文章
http://blog.csdn.net/reblue520/article/details/52352689