nagios 笔记

nagios安装

apt-get install nagios3
#Nagios web administration password 
#123123

默认安装apache2 监听80端口
访问 http://10.10.100.54/nagios3/
默认管理员是nagiosadmin 密码自己设置的
密码文件/etc/nagios3/htpasswd.users

修改管理员密码

#注意: -c 会重新创建密码文件,文件中已有的用户(包括默认的nagiosadmin)会被清空;只想新增用户或修改已有用户密码时去掉 -c
htpasswd -c /etc/nagios3/htpasswd.users admin
New password: 
123123
Re-type new password: 
123123

设置管理员权限

vim /etc/nagios3/cgi.cfg

use_authentication=1
authorized_for_system_information=admin
authorized_for_configuration_information=admin
authorized_for_system_commands=admin
authorized_for_all_services=admin
authorized_for_all_hosts=admin
authorized_for_all_service_commands=admin
authorized_for_all_host_commands=admin

手动执行任务

vim /etc/nagios3/nagios.cfg 
check_external_commands=1 #允许手动在页面上执行任务 **System>Scheduling Queue**

错误

#有可能报错
Error: Could not stat() command file '/var/lib/nagios3/rw/nagios.cmd'!
sudo /etc/init.d/nagios3 stop
sudo dpkg-statoverride --update --add nagios www-data 2710 /var/lib/nagios3/rw
sudo dpkg-statoverride --update --add nagios nagios 751 /var/lib/nagios3
sudo /etc/init.d/nagios3 start

重启服务就可以看到页面的Current Status>Hosts已经默认监控本机

添加需要监控的主机

vim /etc/nagios3/conf.d/hosts.cfg #默认没这文件

define host {
    use generic-host
    host_name lvs  #主机名称 
    alias lvs #主机别名不设置默认为host_name
    address 10.10.100.100 #需要监控主机的IP
    check_interval 1 #检查的间隔 1分钟
}

定义主机组

vim /etc/nagios3/conf.d/hostgroups_nagios2.cfg

# Some generic hostgroup definitions

# A simple wildcard hostgroup
define hostgroup {
        hostgroup_name  all
                alias           All Servers
                members         *  #所有的主机的组
        }

# A list of your Debian GNU/Linux servers
define hostgroup {
        hostgroup_name  debian-servers
                alias           Debian GNU/Linux Servers
                members         localhost
        }

# A list of your web servers
define hostgroup {
        hostgroup_name  http-servers
                alias           HTTP servers
                members         localhost,lvs #指定组的主机,按,分割,可以添加多个,
        }

# A list of your ssh-accessible servers
define hostgroup {
        hostgroup_name  ssh-servers
                alias           SSH servers
                members         localhost
        }

#添加ftp监控
define hostgroup {
        hostgroup_name  ftp-servers
        alias           FTP Servers
        members         lvs
}

定义服务项

vim /etc/nagios3/conf.d/services_nagios2.cfg
# check that web services are running
define service {
        hostgroup_name                  http-servers
        service_description             HTTP
        check_command                   check_http
        use                             generic-service
        notification_interval           0 ; set > 0 if you want to be renotified
}

# check that ssh services are running
define service {
        hostgroup_name                  ssh-servers
        service_description             SSH
        check_command                   check_ssh
        use                             generic-service
        notification_interval           0 ; set > 0 if you want to be renotified
}

# check that ftp services are running
define service {
        hostgroup_name                  ftp-servers  #这必须在hostgroups_nagios2.cfg有这个主机组 
        service_description             FTP
        check_command                   check_ftp ;检查FTP,插件默认路径为/usr/lib/nagios/plugins/
        use                             generic-service
        notification_interval           1 ; 重复通知的间隔,0为只通知一次、不再重复通知

}

NRPE

我要监控远程主机的 CPU、硬盘空间、内存等等
Nagios 提供了一个外挂插件,叫 NRPE
它可以让 nagios server 在固定时间去抓 nagios client 被监控的项目回来判断是否ok。 和zabbix agent功能类似

远端

#在需要监控的机器上安装
apt-get install nagios-nrpe-server
vim /etc/nagios/nrpe.cfg
......
allowed_hosts=10.10.100.54 #Nagios服务器端的地址
......

#添加监控项
#当前登录的用户大于1警告(warning),大于2危急(critical)
command[check_users]=/usr/lib/nagios/plugins/check_users -w 1 -c 2
#同load average
#当1分钟多于15个进程等待,5分钟多于10个,15分钟多于5个则为警告状态
#当1分钟多于30个进程等待,5分钟多于25个,15分钟多于20个则为危急状态
command[check_load]=/usr/lib/nagios/plugins/check_load -w 15,10,5 -c 30,25,20
#如果空闲空间小于40%就是警告阀值
#如果空闲空间小于10%就是危急阀值
#-p分区
command[check_hda1]=/usr/lib/nagios/plugins/check_disk -w 40% -c 10% -p /dev/sda1 
#检查进程
command[check_zombie_procs]=/usr/lib/nagios/plugins/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 150 -c 200
......
#重启服务
service nagios-nrpe-server restart

监控端

apt-get install nagios-nrpe-plugin
#检查通讯是否正常
/usr/lib/nagios/plugins/check_nrpe -H 10.10.100.100
NRPE v2.15
vim /etc/nagios-plugins/config/check_nrpe.cfg

.......
#添加
define service {
        use                             generic-service
        hostgroup_name                  all
        service_description             NRPE check_hda1 #和远端(被监控端) nrpe.cfg 里的 command[check_hda1]对应
        check_command                   check_nrpe_1arg!check_hda1
        notification_interval           0
}

define service {
        use                             generic-service
        hostgroup_name                  all
        service_description             NRPE check_total_procs
        check_command                   check_nrpe_1arg!check_total_procs
        notification_interval           0
}

define service {
        use                             generic-service
        hostgroup_name                  all
        service_description             NRPE check_users
        check_command                   check_nrpe_1arg!check_users
        notification_interval           0
}

重启服务后就可以看到效果

通知

定义联系人

vim /etc/nagios3/conf.d/contacts_nagios2.cfg


define contact{
        contact_name                    hu  #联系人称呼
        service_notification_period     24x7 #当服务出现异常时,发送通知的时间段,这个时间段"24x7"在timeperiods_nagios2.cfg文件中定义
        host_notification_period        24x7 #当主机出现异常时,发送通知的时间段,这个时间段"24x7"在timeperiods_nagios2.cfg文件中定义
        service_notification_options    w,u,c,r  #这个定义的是"通知可以被发出的情况"。w(warning)表示警告状态,u(unknown)表示不明状态,c(critical)表示紧急状态,r(recovery)表示恢复状态。也就是在服务出现警告状态、未知状态、紧急状态和重新恢复状态时都发送通知给使用者。
        host_notification_options       d,r #定义主机在什么状态下需要发送通知给使用者,d(down)表示宕机状态,r(recovery)表示重新恢复状态。 
        service_notification_commands   notify-service-by-email  #服务故障时,发送通知的方式,可以是邮件和短信,这里发送的方式是邮件,在commands.cfg文件中定义 
        host_notification_commands      notify-host-by-email #主机故障时,发送通知的方式,可以是邮件和短信,这里发送的方式是邮件,在commands.cfg文件中定义
        email                           xxx@xxx.com #发送邮件的邮箱
}

定义联系人组

define contactgroup{
        contactgroup_name       hus
        members                 hu  #多个人用,分割
        }

定义服务器异常的联系人

vim /etc/nagios-plugins/config/check_nrpe.cfg

define service {
        use                             generic-service
        hostgroup_name                  all
        service_description             NRPE check_hda1 
        check_command                   check_nrpe_1arg!check_hda1
        contact_groups                  hus #如果有异常通知hus组的人
}

添加发送邮件

vim /etc/nagios3/commands.cfg

#测试使用sendEmail发送QQ邮件
define command{
        command_name    notify-host-by-email #contact里面定义的host_notification_commands
        command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" |sendEmail -f 9656951@qq.com -t $CONTACTEMAIL$ -s smtp.qq.com -u "** 主机: $HOSTALIAS$ is $HOSTSTATE$ **" -xu 9656951@qq.com -xp xxxxxx
        }

define command{
        command_name    notify-service-by-email #contact里面定义的service_notification_commands
        command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" |sendEmail -f 9656951@qq.com -t $CONTACTEMAIL$ -s smtp.qq.com -u "** 主机: $HOSTALIAS$ 服务: $SERVICEDESC$ is $SERVICESTATE$ **" -xu 9656951@qq.com -xp xxx
        }

你可能感兴趣的:(运维,nagios)