Nagios 安装、配置与监控的整个过程

Nagios

1#########################

通过rz或sftp命令将CentOS-Base.repo.oldboy上传到linux上

/bin/cp CentOS-Base.repo.oldboy /etc/yum.repos.d

cd /etc/yum.repos.d              #←先进入yum源配置目录,再备份并替换原文件

mv CentOS-Base.repo CentOS-Base.repo.save

/bin/cp CentOS-Base.repo.oldboy CentOS-Base.repo

这样yum安装源就配好了

 

解决perl编译问题:

echo 'export LC_ALL=C'>> /etc/profile

source /etc/profile              #←使修改生效

 

关闭nagios server iptables防火墙

/etc/init.d/iptables stop

 

禁止自启动

chkconfig iptables off

 

解决时间同步

/usr/sbin/ntpdate pool.ntp.org

 

安装nagios需要的基础软件包

yum install gcc glibc glibc-common -y

yum install gd gd-devel -y

yum install httpd php -y

 

vi /etc/yum.conf

more /etc/yum.repos.d/CentOS-Base.repo

 

创建nagios需要的用户及组

/usr/sbin/useradd -m nagios

/usr/sbin/useradd apache -M -s /sbin/nologin

/usr/sbin/groupadd nagcmd

/usr/sbin/usermod -a -G nagcmd nagios

 

/usr/sbin/usermod -a -G nagcmd apache

 

上传软件包到指定目录或通过URL下载

mkdir -p /home/rhy/tools/nagios

cd /home/rhy/tools/nagios

rz -y #从本地上传oldboytraining_nagiossoft.zip

 

 

安装nagios软件包

cd /home/rhy/tools/nagios

unzip oldboytraining_nagiossoft.zip #←使用unzip解压

tar xzf nagios-3.2.0.tar.gz

cd nagios-3.2.0

./configure --with-command-group=nagcmd

make all

make install

make install-init

make install-config

make install-commandmode

 

 

安装nagios web配置文件及创建登陆用户

#安装nagios web配置文件

make install-webconf

cd ../

注:如果apache是单独用源代码编译安装的,则操作如下

mkdir -p /etc/httpd/conf.d/

make install-webconf

cp /etc/httpd/conf.d/nagios.conf /usr/local/apache2/conf/extra/

假定apache安装路径为/usr/local/apache2,以下同,不再提及。

编辑vi /usr/local/apache2/conf/httpd.conf 增加

Include conf/extra/nagios.conf

注:如果为yum安装的httpd,上面针对源代码编译apache的几行(原文中以绿色标出)不需要操作。

 

#创建nagios监控界面登入需要的用户名及密码

htpasswd -c /usr/local/nagios/etc/htpasswd.users oldboy

 

添加监控报警的接收email

vi /usr/local/nagios/etc/objects/contacts.cfg +35

修改如下行:

email nagios@localhost

改为:

email 你的邮箱地址        #←填写实际接收报警的邮箱(原文中的地址被网页邮箱保护功能替换成了"[email protected]")

保存,退出。

注意:此功能依赖本机的sendmail服务,可执行service sendmail start 开启(可稍后再启动)。

 

配置apache服务

vi /etc/httpd/conf/httpd.conf +231

启动apache并加入系统自启动:

service httpd start

chkconfig httpd on

检查apache port

netstat -lntup|grep 80

tcp 0 0 :::80 :::* LISTEN

 

安装nagios插件软件包

tar zxvf nagios-plugins-1.4.13.tar.gz

cd nagios-plugins-1.4.13

./configure --with-nagios-user=nagios --with-nagios-group=nagios --enable-perl-modules

make

make install

cd ..

 

#此编译如遇到 make: *** [all] Error 2,可在执行 ./configure 时增加参数 --with-mysql=/usr/local/mysql 解决

检查插件个数:

ls /usr/local/nagios/libexec/|wc -l

63

#

 

添加到系统开机自启动

chkconfig --add nagios

chkconfig nagios on

验证nagios配置文件(检查语法)

/etc/init.d/nagios checkconfig #←用此方法检查需要做下特殊处理,详见附录九

[root@linux training_soft]# /etc/init.d/nagios checkconfig

Running configuration check... OK.

如果检查时不显示具体的错误信息,可以执行

vim /etc/init.d/nagios +178

把该行中 > 及其后面的内容(输出重定向)删掉即可,这样检查结果会完整显示出来,例如:

Total Warnings: 0

Total Errors: 0

Things look okay - No serious problems were detected during the pre-flight check

 

 

启动nagios

service nagios start或/etc/init.d/nagios start

出现问题

[root@linux ~]# /etc/init.d/nagios start

Starting nagios:su: warning: cannot change directory to /home/nagios: No such file or directory

需要建立目录

mkdir -p /home/nagios

 

 

检查进程:

[root@nagiosserver nagios-plugins-1.4.13]# ps -ef|grep nagios

nagios 17686 1 0 21:29 ? 00:00:00 /usr/local/nagios/bin/nagios -d /usr/local/nagios/etc/nagios.cfg

 

关闭服务器开启的SElinux

setenforce 0 #使之直接生效,

然后编辑文件

vi /etc/selinux/config #修改SELINUX项为disabled

SELINUX=disabled #重启生效

或者

chcon -R -t httpd_sys_content_t /usr/local/nagios/sbin/

chcon -R -t httpd_sys_content_t /usr/local/nagios/share/

 

打开网页输入 虚拟机ip/nagios

 

 

 

安装nrpe

tar zxvf nrpe-2.12.tar.gz

cd nrpe-2.12

./configure

make all

make install-plugin

make install-daemon

make install-daemon-config

cd ..

2#######################################################

Nagios客户端安装部分

nagios client端基础准备:

echo 'export LC_ALL=C'>> /etc/profile

source /etc/profile  #←使修改生效

 

关闭nagios client iptables防火墙

/etc/init.d/iptables stop

禁止自启动

chkconfig iptables off

 

解决时间同步

/usr/sbin/ntpdate pool.ntp.org

 

安装nagios 客户端插件

mkdir -p /home/rhy/tools/nagios

cd /home/rhy/tools/nagios

rz -y #←从本地上传oldboytraining_nagiossoft.zip 或自行下载nagios软件:

unzip oldboytraining_nagiossoft.zip

/usr/sbin/adduser nagios -M

tar zxvf nagios-plugins-1.4.13.tar.gz

cd nagios-plugins-1.4.13

./configure --prefix=/usr/local/nagios --enable-perl-modules --enable-redhat-pthread-workaround

make

make install

cd ..

 

检查插件个数:

ls /usr/local/nagios/libexec/|wc -l

63

 

#此编译如遇到 make: *** [all] Error 2,可在执行 ./configure 时增加参数 --with-mysql=/usr/local/mysql 解决

#如果为red hat linux,可能会卡到如下位置:

configure: See http://nagiosplugins.org/faq/compile/configure_appears_to_hang if this next part takes a long time

checking for redhat spopen problem...

解决:

多等会就OK了。或增加编译参数 --enable-redhat-pthread-workaround重新编译

即:./configure --prefix=/usr/local/nagios --enable-perl-modules --enable-redhat-pthread-workaround

 

 

#安装nrpe

tar zxvf nrpe-2.12.tar.gz

cd nrpe-2.12

./configure

make all

make install-plugin

make install-daemon

##生成nrpe.cfg

make install-daemon-config

cd ..

 

#安装其它相关的插件

tar zxvf Params-Validate-0.91.tar.gz

cd Params-Validate-0.91

perl Makefile.PL

make

make install

cd ..

 

tar zxvf Class-Accessor-0.31.tar.gz

cd Class-Accessor-0.31

perl Makefile.PL

make

make install

cd ..

 

tar zxvf Config-Tiny-2.12.tar.gz

cd Config-Tiny-2.12

perl Makefile.PL

make

make install

cd ..

 

tar zxvf Math-Calc-Units-1.07.tar.gz

cd Math-Calc-Units-1.07

perl Makefile.PL

make

make install

cd ..

 

tar zxvf Regexp-Common-2010010201.tar.gz

cd Regexp-Common-2010010201

perl Makefile.PL

make

make install

cd ..

 

tar zxvf Nagios-Plugin-0.34.tar.gz

cd Nagios-Plugin-0.34

perl Makefile.PL

make

make install

cd ..

 

yum install sysstat -y

*:上文提到的各种插件本文附带了一个大的软件包,用户也可自行去下载

 

 

配置开发的几个基础脚本插件

/bin/cp /home/rhy/tools/nagios/training_soft/check_memory.pl /usr/local/nagios/libexec

/bin/cp /home/rhy/tools/nagios/training_soft/check_iostat /usr/local/nagios/libexec  #注意路径

chmod 755 /usr/local/nagios/libexec/check_memory.pl

chmod 755 /usr/local/nagios/libexec/check_iostat

dos2unix /usr/local/nagios/libexec/check_memory.pl

dos2unix /usr/local/nagios/libexec/check_iostat

 

配置nrpe

#config nrpe.cfg

cd /usr/local/nagios/etc

vi nrpe.cfg +79

 

加入可以监控该服务器的nagios server端的IP。

 

allowed_hosts=127.0.0.1,172.16.1.196,172.16.1.190

(↑逗号后面不要留空格;这里填写nagios server端主机的IP)

shift+g到结尾

 

注释掉或者干脆删除199-203行即下面几行

#command[check_users]=/usr/local/nagios//libexec/check_users -w 5 -c 10

#command[check_load]=/usr/local/nagios//libexec/check_load -w 15,10,5 -c 30,25,20

#command[check_hda1]=/usr/local/nagios//libexec/check_disk -w 20% -c 10% -p /dev/hda1

#command[check_zombie_procs]=/usr/local/nagios//libexec/check_procs -w 5 -c 10 -s Z

#command[check_total_procs]=/usr/local/nagios//libexec/check_procs -w 150 -c 200

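同时在下面新添加要监控的内容(原文此处的具体命令定义缺失;需要为后面service.cfg 中通过check_nrpe 引用的check_load、check_mem、check_swap、check_disk、check_iostat 分别添加对应的command[...] 行):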

 

启动:nagios client

/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

echo "/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d" >> /etc/rc.local

ps -ef|grep nagios

nagios 28466 1 0 21:50 ? 00:00:00 /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

注意:

1#执行完启动命令,进行检查是个良好的习惯

2 tips:重启nagios nrpe组合命令

pkill nrpe && /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d

 

另外注意防火墙要允许5666端口,把规则加到/etc/sysconfig/iptables里,然后重启iptables。

vi /etc/sysconfig/iptables

-A INPUT -p tcp --dport 5666 -j ACCEPT

-A INPUT -s 10.0.0.0/255.255.255.0 -p tcp -m tcp -j ACCEPT

-A INPUT -s 10.0.0.0/255.255.255.0 -p udp -m udp -j ACCEPT

 

测试环境,建议最好先把iptables关掉,等nagios都测试通了再来调试iptables,否则会给学习测试增加复杂度。关闭命令:/etc/init.d/iptables stop

 

 

配置 nagios 监控服务

以下为server 端的操作

nagios.cfg 基本配置

在 nagios.cfg 文件中找到cfg_file 的部分,进行如下设置:

 

vi /usr/local/nagios/etc/nagios.cfg +34

#包含配置目录,目录下所有cfg 文件将被包含,通过这种方式便于写脚本批量处理。

需要增加的仅为以下三行

cfg_dir=/usr/local/nagios/etc/objects/commands

cfg_dir=/usr/local/nagios/etc/objects/services

cfg_dir=/usr/local/nagios/etc/objects/hosts

 

#注*:相关配置文件用途上面已列表格说明,在此不重复了

下面的配置(原文中以绿色标出)当前不需要添加,但以后会经常编辑,因此说明一下

# You can specify individual object config files as shown below:

cfg_file=/usr/local/nagios/etc/objects/commands.cfg

cfg_file=/usr/local/nagios/etc/objects/contacts.cfg

cfg_file=/usr/local/nagios/etc/objects/timeperiods.cfg

cfg_file=/usr/local/nagios/etc/objects/templates.cfg

# Definitions for monitoring etiantian.org server and service

cfg_file=/usr/local/nagios/etc/objects/host.cfg

cfg_file=/usr/local/nagios/etc/objects/service.cfg

#cfg_file=/usr/local/nagios/etc/objects/servicegroups.cfg

注意:需要注释掉如下localhost.cfg 行(原第 36 行),否则会和我们要配置的内容冲突

# Definitions for monitoring the local (Linux) host

#cfg_file=/usr/local/nagios/etc/objects/localhost.cfg

生产库各配置文件实战配置例子写法

 

nagios 实战实例配置过程细节:

a.配置监控客户端服务器的磁盘分区、Load、Mem、Swap、磁盘io

首先需要在host.cfg 里添加客户端主机和主机组(HOST GROUP)

cd /usr/local/nagios/etc/objects/  #←进入到辅助配置文件的目录

vi host.cfg     #←默认情况下host.cfg 是不存在的,需要手工创建

 

define host {

use linux-server

host_name 197-etiantian-1-1

alias 197-etiantian-1-1

address 172.16.1.197                    #←注意是改成客户端的IP

check_command check-host-alive

max_check_attempts 3

normal_check_interval 2

retry_check_interval 2

check_period 24x7

notification_interval 300

notification_period 24x7

notification_options d,u,r

contact_groups admins

process_perf_data 1

}

#######################################################

# HOST GROUP DEFINITION

#create by ryan (qq:49000448)

#######################################################

# Define an optional hostgroup for Linux machines

define hostgroup{

hostgroup_name linux-servers ; The name of the hostgroup

alias Linux Servers ; Long name of the group

members 197-etiantian-1-1

}

3#################################################################

同理添加要监控的服务配置到service.cfg

vi service.cfg

define service {

use generic-service

host_name 197-etiantian-1-1

service_description Current Load

check_command check_nrpe!check_load

max_check_attempts 2

normal_check_interval 4

retry_check_interval 4

check_period 24x7

notification_interval 1440

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

process_perf_data 1

}

define service {

use generic-service

host_name 197-etiantian-1-1

service_description MEM Useage

check_command check_nrpe!check_mem

max_check_attempts 2

normal_check_interval 4

retry_check_interval 4

check_period 24x7

notification_interval 1440

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

process_perf_data 1

}

define service {

use generic-service

host_name 197-etiantian-1-1

service_description Swap Useage

check_command check_nrpe!check_swap

max_check_attempts 10

normal_check_interval 3

retry_check_interval 4

check_period 24x7

notification_interval 480

notification_period workhours

notification_options w,u,c,r

contact_groups admins

process_perf_data 1

}

define service {

use generic-service

host_name 197-etiantian-1-1

service_description Disk Partition

check_command check_nrpe!check_disk

max_check_attempts 8

normal_check_interval 3

retry_check_interval 2

check_period 24x7

notification_interval 360

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

process_perf_data 1

}

define service {

use generic-service

host_name 197-etiantian-1-1

service_description Disk Iostat

check_command check_nrpe!check_iostat!5!11

max_check_attempts 2

normal_check_interval 4

retry_check_interval 4

check_period 24x7

notification_interval 1440

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

process_perf_data 1

}

#*:

1.以上service.cfg 中添加了对磁盘分区、Load、Mem、Swap、磁盘io 的监控

2.以上host.cfg、service.cfg 的内容的详细解释见附录八

 

 

若此时执行如下检查nagios 语法命令:

检查语法:

/etc/init.d/nagios checkconfig

会发现报错:

Checking services...

Error: Service check command 'check_nrpe' specified in service 'Swap Useage' for host '197-etiantian-1-1' not

defined anywhere!

省略若干。。

Total Warnings: 0

Total Errors: 5

根据错误提示,我们可以知道,是check_nrpe 插件没有定义导致。

 

#注意没有出现

Total Warnings: 0

Total Errors: 5

而是出现的

[root@linux objects]# /etc/init.d/nagios checkconfig

Running configuration check...

Nagios Core 3.2.0

……………………….

Processing object config file '/usr/local/nagios/etc/objects/commands.cfg'...

Processing object config file '/usr/local/nagios/etc/objects/contacts.cfg'...

Processing object config file '/usr/local/nagios/etc/objects/timeperiods.cfg'...

Processing object config file '/usr/local/nagios/etc/objects/templates.cfg'...

Processing object config directory '/usr/local/nagios/etc/commands'...

Error: Could not open config directory '/usr/local/nagios/etc/commands' for reading.

………………………….

 CONFIG ERROR!  Check your Nagios configuration.

#配置文件没有改IP

 

 

 

需要在commands.cfg 中加入check_nrpe 的插件配置

#vi commands.cfg 进入后按shift+g 切到结尾加入下面内容。

# 'check_nrpe' command definition

define command{

command_name check_nrpe

command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$

}

此时重新执行检查语法命令:

检查语法:

/etc/init.d/nagios checkconfig

Total Warnings: 0

Total Errors: 0

注*:修改配置不需要restart

 

打开网页会出现

It appears as though you do not have permission to view information for any of the services you requested...

解决方法

[root@nagiosserver objects]#cd /usr/local/nagios/etc

[root@nagiosserver objects]#vi cgi.cfg +119

#把在1.6 节建立的用户oldboy 加到后面,注意用逗号隔开。

#default_user_name=oldboy

authorized_for_system_information=nagiosadmin,oldboy

authorized_for_configuration_information=nagiosadmin,oldboy

authorized_for_system_commands=nagiosadmin,oldboy

authorized_for_all_services=nagiosadmin,oldboy

authorized_for_all_hosts=nagiosadmin,oldboy

authorized_for_all_service_commands=nagiosadmin,oldboy

authorized_for_all_host_commands=nagiosadmin,oldboy

 

 

记得reload nagios 命令为:/etc/init.d/nagios reload

 

 

 

增加从nagios 服务器端发起的监控:如url 地址,端口监控

1.添加要监控的服务配置到service.cfg

########check url

#check_weburl (http://blog.etiantian.org) 197-etiantian-1-1

define service{

use generic-service

host_name 197-etiantian-1-1

service_description blog_url

check_command check_weburl!-H blog.etiantian.org

max_check_attempts 3

normal_check_interval 2

retry_check_interval 1

check_period 24x7

notification_interval 30

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

}

#check_weburl(http://blog.etiantian.org/oldboy/) 197-etiantian-1-1

define service{

use generic-service

host_name 197-etiantian-1-1

service_description blog_oldboy_url

check_command check_weburl!-H blog.etiantian.org -u /oldboy/

max_check_attempts 3

normal_check_interval 2

retry_check_interval 1

check_period 24x7

notification_interval 30

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

}

#请注意看以上监控URL 的不同

# 如果你的机器没有blog.etiantian.org 的WEB 配置, 可以直接用我的博客的公网地址

http://blog.etiantian.org/oldboy/,这样下面就不需要本地解析了。

 

 

检查语法:

/etc/init.d/nagios checkconfig

会发现报错:

Checking services...

Error: Service check command 'check_weburl' specified in service 'blog_oldboy_url' for host '197-etiantian-1-1' not defined

anywhere!

Error: Service check command 'check_weburl' specified in service 'blog_url' for host '197-etiantian-1-1' not defined anywhere!

省略若干

Total Warnings: 0

Total Errors: 2

根据错误可以知道,是check_weburl 插件没有定义导致。

 

 

 

2.需要在commands.cfg 中加入check_weburl 的插件配置

# 'check_weburl' command definition

define command{

command_name check_weburl

command_line $USER1$/check_http $ARG1$ -w 10 -c 30

}

注意还要添加本地监控的插件:

# 'check_mem' command definition

define command{

command_name check_mem

command_line $USER1$/check_mem -w $ARG1$ -c $ARG2$

}

# 'check_load' command definition

define command{

command_name check_load

command_line $USER1$/check_load -w $ARG1$ -c $ARG2$

#command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$

}

# 'check_disk' command definition

define command{

command_name check_disk

command_line $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$

}

注意:以上三个插件命令没加也是正常的,这点有空我再细看看。

 

如果是测试,注意在nagios 服务器端的/etc/hosts 下加

192.168.1.106 blog.etiantian.org

192.168.1.107 www.etiantian.org

 

检查语法:

/etc/init.d/nagios checkconfig

Total Warnings: 0

Total Errors: 0

4########################################################

同理可以监控任意端口如:80,110,25,8080,873 都可以。

cd /usr/local/nagios/etc/objects/

vi service.cfg     #←原文此处写的是host.cfg;下面添加的是服务定义,与前文一致应放在service.cfg 中

切到末尾加上

define service{

use generic-service

host_name 197-etiantian-1-1

service_description ssh_22

check_command check_tcp!22

max_check_attempts 3

normal_check_interval 2

retry_check_interval 1

check_period 24x7

notification_interval 30

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

}

define service{

use generic-service

host_name 197-etiantian-1-1

service_description http_80

check_command check_tcp!80

max_check_attempts 3

normal_check_interval 2

retry_check_interval 1

check_period 24x7

notification_interval 30

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

}

define service{

use generic-service

host_name 197-etiantian-1-1

service_description http_80_ssh_22

check_command check_tcp!80!22

max_check_attempts 5

normal_check_interval 3

retry_check_interval 3

check_period 24x7

notification_interval 20

notification_period 24x7

notification_options w,u,c,r

contact_groups admins

process_perf_data 1

}

 


你可能感兴趣的:(nagios安装.配置与监控的整个过程)