分布式监测即为被监测服务器将信息传给分布式nagios服务器,然后分布式nagios服务器将各个被监测服务器信息上传给nagios中心服务器。由nagios中心服务器对外提供web信息。具体可参考“分布式监测(转载-参考)”。以下对本项目进行改造,使其成为一个分布式监测系统。
将192.168.50.22改造成分布式服务器。在一台windows机器上,安装centos5.5虚拟机,用虚拟机作为nagios中心服务器。
由于这台windows连接着Internet,因此虚拟机也能连接到Internet。可由这台虚拟机的nagios中心服务器对外发送邮件,或者对外发送飞信短信。
以下对192.168.50.22进行改造。
(网上有说NRDP比NSCA好用,以后有机会研究下NRDP)
安装nsca-2.7.2.tar.gz
# tar -zxvf nsca-2.7.2.tar.gz
cd nsca-2.7.2
./configure
make all
cp sample-config/send_nsca.cfg /usr/local/nagios/etc/
chown nagios.nagios /usr/local/nagios/etc/send_nsca.cfg
cp src/send_nsca /usr/local/nagios/bin/
chown nagios.nagios /usr/local/nagios/bin/send_nsca
vi /usr/local/nagios/libexec/submit_service_check_result
#!/bin/sh
# Arguments:
# $1 = host_name (Short name of host that the service is
# associated with)
# $2 = svc_description (Description of the service)
# $3 = state_string (A string representing the status of
# the given service - "OK", "WARNING", "CRITICAL"
# or "UNKNOWN")
# $4 = plugin_output (A text string that should be used
# as the plugin output for the service checks)
#
# Convert the state string to the corresponding return code
return_code=-1
case "$3" in
OK)
return_code=0
;;
WARNING)
return_code=1
;;
CRITICAL)
return_code=2
;;
UNKNOWN)
return_code=3
;;
esac
# pipe the service check info into the send_nsca program, which
# in turn transmits the data to the nsca daemon on the central
# monitoring server
/usr/bin/printf "%s\t%s\t%s\t%s\n" "$1" "$2" "$return_code" "$4" | /usr/local/nagios/bin/send_nsca -H 192.168.50.114 -c /usr/local/nagios/etc/send_nsca.cfg
#此处的IP地址为监控中心服务器。
#chmod +x /usr/local/nagios/libexec/submit_service_check_result
#chown nagios.nagios /usr/local/nagios/libexec/submit_service_check_result
在nagios center监测中心如果启动了nsca,则可以用以下命令进行测试:
submit_service_check_result 主机名 '服务名' 服务状态 '信息描述'
# /usr/local/nagios/libexec/submit_service_check_result 0.15_rudp 'Uptime' OK 'aaa'
1 data packet(s) sent to host successfully.
#
信息已经成功发送。否则会报异常,则根据异常信息修改脚本。
vi /usr/local/nagios/libexec/submit_host_check_result
#!/bin/sh
# Arguments:
# $1 = host_name (Short name of host)
# $2 = state_string (A string representing the status of
# the given host - "UP", "DOWN", or "UNREACHABLE")
# $3 = plugin_output (A text string that should be used
# as the plugin output for the host checks)
#
# Convert the state string to the corresponding return code
return_code=-1
case "$2" in
UP)
return_code=0
;;
DOWN)
return_code=1
;;
UNREACHABLE)
return_code=2
;;
esac
# pipe the host check info into the send_nsca program, which
# in turn transmits the data to the nsca daemon on the central
# monitoring server
/usr/bin/printf "%s\t%s\t%s\n" "$1" "$return_code" "$3" | /usr/local/nagios/bin/send_nsca -H 192.168.50.114 -c /usr/local/nagios/etc/send_nsca.cfg
#此处的IP地址为监控中心服务器。
#chmod +x /usr/local/nagios/libexec/submit_host_check_result
#chown nagios.nagios /usr/local/nagios/libexec/submit_host_check_result
在nagios center监测中心如果启动了nsca,则可以用以下命令进行测试:
submit_host_check_result 主机名 主机状态 '信息描述'
# /usr/local/nagios/libexec/submit_host_check_result 0.15_rudp UP 'lalala'
1 data packet(s) sent to host successfully.
#
信息已经成功发送。否则会报异常,则根据异常信息修改脚本。
#vi /usr/local/nagios/etc/objects/commands.cfg #增加如下检测命令
define command{
command_name submit_service_check_result
command_line /usr/local/nagios/libexec/submit_service_check_result $HOSTNAME$ '$SERVICEDESC$' $SERVICESTATE$ '$SERVICEOUTPUT$'
}
define command{
command_name submit_host_check_result
command_line /usr/local/nagios/libexec/submit_host_check_result $HOSTNAME$ $HOSTSTATE$ '$HOSTOUTPUT$'
}
vi /usr/local/nagios/etc/nagios.cfg
enable_notifications=0 #禁用告警
obsess_over_services=1 #开启被动监控
ocsp_command=submit_service_check_result #定义每次执行完服务检查后执行的命令(将数据传到监测中心)
obsess_over_hosts=1 #开启主机被动监控
ochp_command=submit_host_check_result #定义每次执行完主机检查后执行的命令(将数据传到监测中心)
vi /usr/local/nagios/etc/send_nsca.cfg
password=admin #设置密码,此处设置的密码要和监控中心服务器一致
encryption_method=1
vi /usr/local/nagios/etc/nsca.cfg
server_address=192.168.50.114 #设置Nagios监测中心ip地址
password=admin #设置密码,此处设置的密码要和监控中心服务器一致
decryption_method=1
被监测服务器监测脚本文件所有的服务增加以下参数(以下参数没有经过仔细确认):
# passive_checks_enabled 1
# active_checks_enabled 0
check_freshness 1
freshness_threshold 10
由于所有的服务都是继承generic-service,因此,可以在模板文件/usr/local/nagios/etc/objects/templates.cfg中的generic-service定义中进行添加:
define service{
name generic-service ; The 'name' of this service template
# active_checks_enabled 1 ; Active service checks are enabled
# passive_checks_enabled 1 ; Passive service checks are enabled/accepted
parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems)
obsess_over_service 1 ; We should obsess over this service (if necessary)
# check_freshness 0 ; Default is to NOT check service 'freshness'
# notifications_enabled 1 ; Service notifications are enabled
event_handler_enabled 1 ; Service event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
failure_prediction_enabled 1 ; Failure prediction is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
is_volatile 0 ; The service is not volatile
check_period 24x7 ; The service can be checked at any time of the day
max_check_attempts 3 ; Re-check the service up to 3 times in order to determine its final (hard) state
normal_check_interval 10 ; Check the service every 10 minutes under normal conditions
retry_check_interval 2 ; Re-check the service every two minutes until a hard state can be determined
contact_groups admins ; Notifications get sent out to everyone in the 'admins' group
notification_options w,u,c,r ; Send notifications about warning, unknown, critical, and recovery events
notification_interval 60 ; Re-notify about service problems every hour
notification_period 24x7 ; Notifications can be sent out at any time
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
# passive_checks_enabled 1
# active_checks_enabled 0
check_freshness 1
freshness_threshold 10
}
由于绝大部分的服务器都是继承linux-vod-sw和linux-anyview-sw,因此,可以在模板文件/usr/local/nagios/etc/objects/templates.cfg中的linux-vod-sw和linux-anyview-sw定义中进行添加:
define host{
name linux-vod-sw ; Name of this template
use generic-host ; Inherit default values
check_period 24x7
check_interval 5
retry_interval 1
max_check_attempts 10
check_command check-host-alive
icon_image linux40.png
statusmap_image linux40.gd2
# parents 50.252_VodSw
# passive_checks_enabled 1
# active_checks_enabled 0
# notification_period 24x7
# notification_interval 30
# notification_options d,r
notifications_enabled 0
contact_groups admins
register 0 ; DONT REGISTER THIS - ITS A TEMPLATE
check_freshness 1
freshness_threshold 10
# passive_checks_enabled 1
# active_checks_enabled 0
}
define host{
name linux-anyview-sw ; Name of this template
use generic-host ; Inherit default values
check_period 24x7
check_interval 5
retry_interval 1
max_check_attempts 10
check_command check-host-alive
icon_image linux40.png
statusmap_image linux40.gd2
# parents 50.253_AnyviewSw
# passive_checks_enabled 1
# active_checks_enabled 0
# notification_period 24x7
# notification_interval 30
# notification_options d,r
notifications_enabled 0
contact_groups admins
register 0 ; DONT REGISTER THIS - ITS A TEMPLATE
check_freshness 1
freshness_threshold 10
# passive_checks_enabled 1
# active_checks_enabled 0
}
/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
service nagios restart
请注意分布式服务端与主服务端定义主机与服务配置的区别,当分布式服务端定义了主机与服务时,同样需要在主服务端定义一次。以下几个参数无需在分布式服务端特殊定义。
check_freshness,freshness_threshold,passive_checks_enabled,active_checks_enabled
freshness_threshold 强制刷新时间,主要的作用是当分布式服务端未提交新的数据时,服务端可以强制进行状态刷新,进行及时的预警。
中心服务器的安装跟192.168.50.22的安装类似,特别是安装nsca之前的安装和配置,可参考1.2小节。
1, 首先确认监控中心服务器已经安装了apache且禁用了Selinux
2, 创建系统用户,安装nagios.tar.gz,创建nagios的ie登录用户,安装nagios-plugins.tar.gz,nsca.tar.gz,修改配置文件等;
1) 创建nagios用户
#useradd -m nagios
passwd nagios
groupadd nagcmd
usermod -a -G nagcmd nagios
usermod -a -G nagcmd apache
2) 安装nagios-3.5.0.tar.gz
#tar zxvf nagios-3.5.0.tar.gz
#cd nagios-3.5.0
#./configure --with-command-group=nagcmd --prefix=/usr/local/nagios --with-gd-lib=/usr/local/lib --with-gd-inc=/usr/local/include
#make all && make install
#make install-init && make install-commandmode && make install-config
#chown -R nagios.nagios /usr/local/nagios (执行这个命令前可以先查看这个文件的属性,如果权限已经是nagios,就不需要再次执行了)
#make install-webconf
#htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
输入密码如:admin
#/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
#service httpd start
#service nagios start
在ie上使用http://192.168.50.114/nagios,输入nagiosadmin以及刚才创建的密码就能登录nagios主页了
如果出现以下错误,则应该是php包没有装全:
请安装以下几个php包:
php-cli-5.1.6-32.el5、php-common-5.1.6-32.el5、php-5.1.6-32.el5
安装完毕后重启httpd和nagios就能进去了。
3) 安装nagios-plugins-1.4.11.tar.gz
tar xzf nagios-plugins-1.4.11.tar.gz
cd nagios-plugins-1.4.11
./configure --with-nagios-user=nagios --with-nagios-group=nagcmd
make
make install
chkconfig --add nagios
chkconfig nagios on
安装nsca-2.7.2.tar.gz
tar -zxvf nsca-2.7.2.tar.gz
cd nsca-2.7.2
./configure
make all
cp src/nsca /usr/local/nagios/bin/
chown nagios:nagios /usr/local/nagios/bin/nsca
cp sample-config/nsca.cfg /usr/local/nagios/etc
chown nagios:nagios /usr/local/nagios/etc/nsca.cfg
vi /usr/local/nagios/etc/nsca.cfg
password=admin #此处和分布式监控服务器密码一致
vi /usr/local/nagios/etc/nagios.cfg
check_external_commands=1 # 配置nagios检查扩展命令
accept_passive_service_checks=1 # 配置接受被动服务检测的结果
accept_passive_host_checks=1 #配置接受被动主机检测的结果
在host和service定义中,增加以下4个参数:
check_freshness 1 #开启强制刷新
freshness_threshold 480 #主服务端强制刷新的时间,具体含义请参考手册
passive_checks_enabled 1 #开启被动检测模式
active_checks_enabled 0 #关闭主服务端对该服务的主动检测
如果有防火墙,则开启5667端口:
iptables -I RH-Firewall-1-INPUT -m state --state NEW -m tcp -p tcp --dport 5667 -j ACCEPT
启动nsca:
/usr/local/nagios/bin/nsca -c /usr/local/nagios/etc/nsca.cfg
service nagios restart
在nagios监测中心会发现有很多warning和critical。
1, 分布式服务器监测本机问题
由于192.168.50.22本机也进行了监测,而监测脚本为了方便,是复制localhost.cfg,里面的监测service项都是监测本机的命令,因此,复制到监测中心后,使用这个脚本居然监测到了监测中心机器的一些信息,如硬盘空间。由于监测中心配置的硬盘偏小,因此报警了。
处理方式是将监测192.168.50.22的脚本改成通过nrpe对本机进行监测。因此,需要在50.22上安装nrpe并启动nrpe。
2, 发现dhcp机器check_dhcp报错:Warning: This plugin must be either run as root or setuid root.
查看ls -l /usr/local/nagios/libexec/check_dhcp,居然分布式服务器上和监测中心服务器上两个文件权限不一致。
192.168.50.22上为root.nagios;而50.114上为nagios.nagios。
在50.114上进行如下处理:
chown root.nagios check_dhcp
chmod u+s check_dhcp
3, Return code of 127 is out of bounds - plugin may be missing
然后安装nrpe后,发现报错变成了:CHECK_NRPE: Error - Could not complete SSL handshake.
将所有的被监测机器的nrpe.cfg文件中的allowed_hosts都加上监控中心的ip地址192.168.50.114后问题解除。
不过将以上的allowed_hosts增加后,还有不少机器出现connect time out等问题。为了定位问题,将主机和服务正常的6台cdn、4台vss和4台tss配置脚本在分布式服务器50.22和中心服务器50.114删除后,重新启动服务,诧异的是原来那些有问题的机器,居然都好了。感觉可能是监测的轮询时间过短,于是将分布式服务器和监测中心服务器的nagios.cfg的参数command_check_interval由原来的10s改成20s,然后重启服务,所有的被监测信息都正常了!!!
不过观察一段时间下来,有时候还会出现很多问题。可能是分布式服务器和中心服务器的某个参数,比如时间等参数设置不正确导致的。或者是虚拟机性能不够强大导致。
http://blog.chinaunix.net/uid-23886490-id-3205869.html
nagios分布式部署详细文档2012-05-13 00:00:06
分类:
原文地址:nagios分布式部署详细文档 作者:qingchn
由于公司业务服务器分布比较广,如果按照之前的监控架构的话,就是每个IDC增加一个nagios监控,想来这样子也有好处,可以互相监控,但是由于每个IDC部署一个nagios,无疑增加了监控人员的查看难度,所以就研究了一下分布式nagios监控。
一,分角色
1,监控中心服务器,分布式服务器,被监控服务器
监控中心服务器:通过NSCA获取分布式监控服务器的相关状态,呈现相关服务器状态和发出报警等;
分布式服务器:通过对被监控服务器状态采集并且把被监控服务器的状态通过send_nsca发送给监控中心服务器。
被监控服务器:被监控服务器就是生产环境服务器。
二,详细部署
1,被监控服务器
tar -zxvf nagios-plugins-1.4.15.tar.gz
cd nagios-plugins-1.4.15
./configure
make
make install
chown nagios.nagios /usr/local/nagios
chown -R nagios.nagios /usr/local/nagios/libexec
cd ..
ls
tar -zxvf nrpe-2.12.tar.gz
pwd
ls
cd nrpe-2.12
./configure
make all
make install-plugin
make install-daemon
make install-daemon-config
vi /usr/local/nagios/etc/nrpe.cfg
将allowed_hosts=127.0.0.1
修改成你的nagios分布式服务器的ip
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d #启动nrpe
netstat -anl|grep 5666 #测试监听端口
2,安装分布式服务器
useradd nagios
passwd nagios
groupadd nagcmd
usermod -a -G nagcmd nagios
usermod -a -G nagcmd apache #创建Nagios用户 创建组 把用户加入组 并加入apache
tar -zxvf nagios-3.2.3.tar.gz
cd nagios-3.2.3
./configure --with-command-group=nagcmd
make all
make install
make install-init
make install-config
make install-commandmode
tar -zxvf nagios-plugins-1.4.15.tar.gz
cd nagios-plugins-1.4.15
./configure --with-nagios-user=nagios --with-nagios-group=nagcmd
make
make install
chkconfig --add nagios
chkconfig nagios on
tar -zxvf nrpe-2.12.tar.gz
cd nrpe-2.12
./configure
make all
make install-plugin
/usr/local/nagios/libexec/check_nrpe -H 192.168.20.100 #测试被监控服务器是否连通,正常情况下会返回被监控端的NRPE版本
vi /usr/local/nagios/etc/objects/commands.cfg
#check nrpe
define command{
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
} #添加nrpe外部检测命令
tar -zxvf nsca-2.7.2.tar.gz
cd nsca-2.7.2
./configure
make all
cp sample-config/send_nsca.cfg /usr/local/nagios/etc/
chown nagios.nagios /usr/local/nagios/etc/send_nsca.cfg
cp src/send_nsca /usr/local/nagios/bin/
chown nagios.nagios /usr/local/nagios/bin/send_nsca
vi /usr/local/nagios/libexec/submit_check_result #创建脚本
(这里有问题,这个脚本只是上传service服务信息,并不适合上传host主机信息,否则会出现监测中心的host都为pending。对主机上传需要另外创建脚本,参考http://blog.sina.com.cn/s/blog_8db5baf90100xlnp.html。本文上述已经对此进行了修正)
#!/bin/sh
# Arguments:
# $1 = host_name (Short name of host that the service is
# associated with)
# $2 = svc_description (Description of the service)
# $3 = state_string (A string representing the status of
# the given service - "OK", "WARNING", "CRITICAL"
# or "UNKNOWN")
# $4 = plugin_output (A text string that should be used
# as the plugin output for the service checks)
#
# Convert the state string to the corresponding return code
return_code=-1
case "$3" in
OK)
return_code=0
;;
WARNING)
return_code=1
;;
CRITICAL)
return_code=2
;;
UNKNOWN)
return_code=3
;;
esac
# pipe the service check info into the send_nsca program, which
# in turn transmits the data to the nsca daemon on the central
# monitoring server
/usr/bin/printf "%s\t%s\t%s\t%s\n" "$1" "$2" "$return_code" "$4" | /usr/local/nagios/bin/send_nsca -H 192.168.20.195 -c /usr/local/nagios/etc/send_nsca.cfg #此处的IP地址为监控中心服务器。
chmod +x /usr/local/nagios/libexec/submit_check_result
chown nagios.nagios /usr/local/nagios/libexec/submit_check_result
vi /usr/local/nagios/etc/objects/commands.cfg #增加如下检测命令
define command{
command_name submit_check_result
command_line /usr/local/nagios/libexec/submit_check_result $HOSTNAME$ '$SERVICEDESC$' $SERVICESTATE$ '$SERVICEOUTPUT$'
}
vi /usr/local/nagios/etc/nagios.cfg
enable_notifications=0 #禁用告警
obsess_over_services=1 #开启被动监控
ocsp_command=submit_check_result #定义每次执行完检查后执行的命令
obsess_over_hosts=1 #开启主机被动监控
ochp_command=submit_check_result #指定每次执行完主机检查后执行的命令
vi /usr/local/nagios/etc/send_nsca.cfg
password=urgamer #设置密码,此处设置的密码要和监控中心服务器一致
配置被监控的服务器,此处配置在分布式监控服务器上配置
cd /usr/local/nagios/etc/objects/
vi hosts.cfg
define host{
use linux-server ; Name of host template to use
; This host definition will inherit all variables that are defined
; in (or inherited by) the linux-server host template definition.
host_name urg-test01
alias linux-test01
address 192.168.20.100
}
vi services.cfg
define service{
use local-service ; Name of service template to use
host_name urg-test01
service_description PING
check_command check_ping!100.0,20%!500.0,60%
}
define service{
use local-service ; Name of service template to use
host_name urg-test01
service_description Root Partition
check_command check_nrpe!check_local_disk!20%!10%!/
}
define service{
use local-service ; Name of service template to use
host_name urg-test01
service_description Current Users
check_command check_nrpe!check_local_users!20!50
}
define service{
use local-service ; Name of service template to use
host_name urg-test01
service_description Total Processes
check_command check_nrpe!check_local_procs!250!400!RSZDT
}
vi /usr/local/nagios/etc/nagios.cfg #添加以下两行配置
cfg_file=/usr/local/nagios/etc/objects/hosts.cfg
cfg_file=/usr/local/nagios/etc/objects/services.cfg
/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg #检查配置文件
service nagios start #启动nagios
3,安装监控中心服务器
首先确认监控中心服务器已经安装了apache且禁用了SElinux
useradd nagios
passwd nagios
groupadd nagcmd
usermod -a -G nagcmd nagios
usermod -a -G nagcmd apache #创建Nagios用户 创建组 把用户加入组 并加入apache
tar -zxvf nagios-3.2.3.tar.gz
cd nagios-3.2.3
./configure --with-command-group=nagcmd
make all
make install
make install-init
make install-config
make install-commandmode
make install-webconf
htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
tar xzf nagios-plugins-1.4.11.tar.gz
cd nagios-plugins-1.4.11
./configure --with-nagios-user=nagios --with-nagios-group=nagcmd
make
make install
chkconfig --add nagios
chkconfig nagios on
tar -zxvf nsca-2.7.2.tar.gz
cd nsca-2.7.2
./configure
make all
cp /usr/local/src/nsca-2.7.2/src/nsca /usr/local/nagios/bin/
chown nagios:nagios /usr/local/nagios/bin/nsca
cp /usr/local/src/nsca-2.7.2/sample-config/nsca.cfg /usr/local/nagios/etc
chown nagios:nagios /usr/local/nagios/etc/nsca.cfg
vi /usr/local/nagios/etc/nsca.cfg
password=urgamer #此处和分布式监控服务器密码一致
vi /usr/local/nagios/etc/nagios.cfg
check_external_commands=1 # 配置nagios检查扩展命令
accept_passive_service_checks=1 # 配置接受被动服务检测的结果
accept_passive_host_checks=1 #配置接受被动主机检测的结果
cd /usr/local/nagios/etc/
mkdir monitor
cd monitor
vi monitor.cfg
define host{
use linux-server
host_name urg-test01
address 192.168.20.100
passive_checks_enabled 1
active_checks_enabled 0
}
define service{
use local-service
host_name urg-test01
service_description Root Partition
check_command check_local_disk!30%!10!/
check_freshness 1
freshness_threshold 450
passive_checks_enabled 1
active_checks_enabled 0
}
/usr/local/nagios/bin/nsca -d -c /usr/local/nagios/etc/nsca.cfg
service nagios restart
此时重新打开浏览器就会显示新加的服务器。