nagios监控配置

作为Linux系统管理员,不可能随时查看系统的状态,而流量异常、负载突然增高等情况都可能引发事故。通过监控软件可以自动监控系统状态,发现异常就会报警;也可以通过自定义脚本实现监控。

1、Nagios

Nagios是开源软件,可以免费使用,用于监控主机状态,Windows、Linux、打印机等设备都可以监控。

基于web界面,登录网站查看各项指标。

支持短信或邮件通知。

可以通过自定义脚本实现定制化监控。

官网www.nagios.org

2、安装nagios服务端

分为服务端(centos6.4 192.168.0.105)和客户端(centos6.4 192.168.0.104)。客户端监控主机的状态,数据上报给服务端,服务端去处理数据。

安装扩展源

[root@client ~]# rpm -ivh http://www.lishiming.net/data/attachment/forum/month_1211/epel-release-6-7.noarch.rpm
//有些系统默认已经安装

安装nagios

[root@client ~]# yum install -y httpd nagios nagios-plugins nagios-plugins-all nrpe nagios-plugins-nrpe

设置http登录密码

[root@client ~]# htpasswd -c /etc/nagios/passwd nagiosadmin
New password:
Re-type new password:
Adding password for user nagiosadmin

查看配置文件

[root@client ~]# vim /etc/nagios/nagios.cfg

检测配置文件是否出错

[root@client ~]# nagios -v /etc/nagios/nagios.cfg

启动服务

[root@client ~]# service httpd start
Starting httpd: httpd: apr_sockaddr_info_get() failed for client
httpd: Could not reliably determine the server's fully qualified domain name, using 127.0.0.1 for ServerName
                                                           [  OK  ]
[root@client ~]# service nagios start
Starting nagios: done.

浏览器上访问:http://192.168.0.105/nagios

3、安装nagios客户端

安装扩展源

[root@localhost ~]# rpm -ivh http://www.lishiming.net/data/attachment/forum/month_1211/epel-release-6-7.noarch.rpm

安装nagios

[root@localhost ~]# yum install -y nagios-plugins nagios-plugins-all nrpe nagios-plugins-nrpe

修改配置文件

[root@localhost ~]# vim /etc/nagios/nrpe.cfg
allowed_hosts=127.0.0.1,192.168.0.105
dont_blame_nrpe=1

启动客户端

[root@localhost ~]# /etc/init.d/nrpe start
Starting nrpe:                                             [  OK  ]

4、监控中心添加被监控主机(服务端)

[root@client ~]# cd /etc/nagios/conf.d/
[root@client conf.d]# vim 192.168.0.104.cfg   //以客户端IP命名
define host{
        use                     linux-server            ; Name of host template to use
                                                        ; This host definition will inherit all variables that are defined
                                                        ; in (or inherited by) the linux-server host template definition.
        host_name               192.168.0.104
        alias                   0.12
        address                 192.168.0.104
        }
define service{
        use                     generic-service
        host_name               192.168.0.104
        service_description     check_ping
        check_command           check_ping!100.0,20%!200.0,50%
        max_check_attempts 5
        normal_check_interval 1
}
//监控ping服务
define service{
        use                     generic-service
        host_name               192.168.0.104
        service_description     check_ssh
        check_command           check_ssh
        max_check_attempts      5
        normal_check_interval 1
}
//监控ssh服务
define service{
        use                     generic-service
        host_name               192.168.0.104
        service_description     check_http
        check_command           check_http
        max_check_attempts      5
        normal_check_interval 1
}
//监控http服务
[root@client ~]# cd /etc/nagios/conf.d/
[root@client conf.d]# vim 192.168.0.104.cfg
define host{
        use                     linux-server           
//Name of host template to use
 //This host definition will inherit all variables that are defined
 //in (or inherited by) the linux-server host template definition.
        host_name               192.168.0.12
        alias                   0.12
        address                 192.168.0.12
        }
define service{
        use                     generic-service
        host_name               192.168.0.12
        service_description     check_ping
        check_command           check_ping!100.0,20%!200.0,50%
        max_check_attempts 5     //遇到问题时,连续检测5次仍异常才报警
        normal_check_interval 1   //正常情况下的检测时间间隔,1分钟
        notification_interval  60   //服务出现异常后,如果故障一直没解决,每隔60分钟向联系人重复发出一次通知
}
//监控ping服务
define service{
        use                     generic-service
        host_name               192.168.0.12
        service_description     check_ssh
        check_command           check_ssh
        max_check_attempts      5
        normal_check_interval 1
}
//监控ssh服务
define service{
        use                     generic-service
        host_name               192.168.0.12
        service_description     check_http
        check_command           check_http
        max_check_attempts      5
        normal_check_interval 1
}
//监控http服务

5、实现监控远程的服务(服务端)

[root@client conf.d]# vim /etc/nagios/objects/commands.cfg
//添加下面的语句
define command{
        command_name    check_nrpe
        command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
        }
[root@client conf.d]# vim /etc/nagios/conf.d/192.168.0.104.cfg
define service{
        use     generic-service
        host_name       192.168.0.104
        service_description     check_load
        check_command           check_nrpe!check_load
//check_load是远程主机上的检测脚本
        max_check_attempts 5
        normal_check_interval 1
}
define service{
        use     generic-service
        host_name       192.168.0.104
        service_description     check_disk_hda1
        check_command           check_nrpe!check_hda1
        max_check_attempts 5
        normal_check_interval 1
}
define service{
        use     generic-service
        host_name       192.168.0.104
        service_description     check_disk_hda2
        check_command           check_nrpe!check_hda2
        max_check_attempts 5
        normal_check_interval 1
}

查看check_load(客户端)

[root@localhost ~]# vim /etc/nagios/nrpe.cfg
command[check_users]=/usr/lib/nagios/plugins/check_users -w 5 -c 10
command[check_load]=/usr/lib/nagios/plugins/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /dev/hda1
//hda1修改成sda1
//剩余空间低于20%时发出警告(warning),低于10%时发出严重告警(critical,页面显示为红色)
command[check_zombie_procs]=/usr/lib/nagios/plugins/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 150 -c 200
[root@localhost ~]# /usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /dev/sda1
DISK OK - free space: /boot 429 MB (93% inode=99%);| /boot=29MB;387;435;0;484
//可以自定义写监控脚本,只要产生的结果格式是一致的

定义一个check_hda2

[root@localhost ~]# vim /etc/nagios/nrpe.cfg   //客户端
//添加下面一句
command[check_hda2]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /dev/sda2

客户端、服务端重启服务,刷新页面测试

查看可用的监控插件

[root@client conf.d]# ls /usr/lib/nagios/plugins/
check_breeze    check_icmp         check_nrpe      check_smtp
check_by_ssh    check_ide_smart    check_nt        check_snmp
check_clamd     check_imap         check_ntp       check_spop
check_cluster   check_ircd         check_ntp_peer  check_ssh
check_dhcp      check_jabber       check_ntp.pl    check_ssmtp
check_dig       check_ldap         check_ntp_time  check_swap
check_disk      check_ldaps        check_nwstat    check_tcp
check_disk_smb  check_load         check_oracle    check_time
check_dns       check_log          check_overcr    check_udp
check_dummy     check_mailq        check_pgsql     check_ups
check_file_age  check_mrtg         check_ping      check_users
check_flexlm    check_mrtgtraf     check_pop       check_wave
check_fping     check_mysql        check_procs     eventhandlers
check_ftp       check_mysql_query  check_real      negate
check_game      check_nagios       check_rpc       urlize
check_hpjd      check_nntp         check_sensors   utils.pm
check_http      check_nntps        check_simap     utils.sh
//监控命令,大多是二进制文件

6、配置邮件告警

[root@client conf.d]# vim /etc/nagios/objects/contacts.cfg
define contact{
        contact_name               123    //自定义名字
        use                             generic-contact  //模板
        alias                           aming
        email              zhouyan_wy@163.com //邮件
        }

自定义告警策略

notifications_enabled
//是否开启提醒功能。1为开启,0为禁用。一般,这个选项会在主配置文件(nagios.cfg)中定义,效果相同。
notification_interval
//之前刚介绍过,表示重复发送提醒信息的最短间隔时间。默认间隔时间是60分钟。如果这个值设置为0,将不会发送重复提醒。
notification_period
//发送提醒的时间段。非常重要的主机(服务)我定义为7×24,一般的主机(服务)就定义为上班时间。如果不在定义的时间段内,无论什么问题发生,都不会发送提醒。
notification_options
//这个参数定义了发送提醒包括的情况。对主机:d = 状态为DOWN, u = 状态为UNREACHABLE, r = 状态恢复为OK, f = 状态抖动(flapping), n = 不发送提醒。对服务则为:w = WARNING, u = UNKNOWN, c = CRITICAL, r = 恢复为OK, f = flapping, n = 不发送提醒。

































































你可能感兴趣的:(监控,nagios)