Nagios 客户端要求必须在dell服务器上安装 OMSA(Openmanage Server Administrator) Nagios客户端安装OMSA (可参考http://linux.dell.com/repo/hardware/OMSA_7.4.0/) 1、增加dell的yum库 ( 可以访问 http://linux.dell.com/repo/hardware 查看最新版本 ) wget -q -O - http://linux.dell.com/repo/hardware/OMSA_7.4.0/bootstrap.cgi | bash 2、安装srvadmin yum install srvadmin-all -y 3、启动srvadmin /opt/dell/srvadmin/sbin/srvadmin-services.sh start Nagios 服务端配置 1、dell官方OMSA监控脚本下载(下载至Nagios 服务器 /usr/local/nagios/libexec下,并赋nagios执行权限) wget http://folk.uio.no/trondham/software/check_openmanage-3.7.11/check_openmanage 另,由于check_openmanage为perl脚本,故需perl解释器 需安装 perl-Net-SNMP yum install perl-Net-SNMP 2、以下是手动执行脚本获取硬件状态 #电压 ./check_openmanage -H 192.168.1.100 --only voltage VOLTAGE OK - 20 voltage probes checked #cpu ./check_openmanage -H 192.168.1.100 --only cpu PROCESSORS OK - 1 processors checked #风扇转速 ./check_openmanage -H 192.168.1.100 --only fans FANS OK - 12 fan probes checked #存储 ./check_openmanage -H 192.168.1.100 --only storage STORAGE OK - 3 physical drives, 1 logical drives #内存 ./check_openmanage -H 192.168.1.100 --only memory MEMORY OK - 2 memory modules, 32768 MB total memory #电池 ./check_openmanage -H 192.168.1.100 --only batteries BATTERIES OK - 1 batteries checked check_openmanage 脚本更多详细用法请参考: http://folk.uio.no/trondham/software/check_openmanage.html 如果以上无报错时,便可配置到Nagios 中了,配置方法网上较多,在这里就不多说了 #卸载openManage Server Administrator yum erase $(rpm -qa | grep srvadmin)
1.当系统日志出现 Server Administrator (Shared Library): Data Engine EventID: 0 A semaphore set has to be created but the system limit for the maximum number of semaphore sets has been exceeded 出现这条日志的时候表示你的监控已经不能用了。 大概意思是说:需要创建一个信号量集(semaphore set),但已超出系统允许的信号量集最大数量,Data Engine未能成功开启。 这需要修改系统内核对于 semaphore sets 的设定。方法如下: ipcs -l ------ Shared Memory Limits -------- max number of segments = 4096 max seg size (kbytes) = 67108864 max total shared memory (kbytes) = 17179869184 min seg size (bytes) = 1 ------ Semaphore Limits -------- max number of arrays = 128 max semaphores per array = 250 max semaphores system wide = 32000 max ops per semop call = 32 semaphore max value = 32767 ------ Messages: Limits -------- max queues system wide = 16 max size of message (bytes) = 65536 default max size of queue (bytes) = 65536 sysctl -a | grep shm vm.hugetlb_shm_group = 0 kernel.shmmni = 4096 kernel.shmall = 4294967296 kernel.shmmax = 68719476736 解决办法 调大 max queues system wide 和 max number of arrays sysctl -w kernel.msgmni=16384 sysctl -w kernel.sem="250 32000 100 1024" ################################################## echo "kernel.msgmni=16384" >> /etc/sysctl.conf echo "kernel.sem=250 32000 100 1024" >> /etc/sysctl.conf 再次查看 ipcs -l ------ Shared Memory Limits -------- max number of segments = 4096 max seg size (kbytes) = 67108864 max total shared memory (kbytes) = 17179869184 min seg size (bytes) = 1 ------ Semaphore Limits -------- max number of arrays = 1024 max semaphores per array = 250 max semaphores system wide = 32000 max ops per semop call = 100 semaphore max value = 32767 ------ Messages: Limits -------- max queues system wide = 16384 max size of message (bytes) = 65536 default max size of queue (bytes) = 65536 重启 /opt/dell/srvadmin/sbin/srvadmin-services.sh restart
2
refused smux peer: oid SNMPv2-SMI::enterprises.674.10892.1, descr Systems Management SNMP MIB Plug-in Manager SNMP 被拒绝
如果你的配置是正确的,执行 /etc/init.d/snmpd restart 即可
3
ipmi_si: Could not enable interrupts, failed set, using polled mode. 意思是:无法启用中断,设置失败,改用轮询模式。这个问题多半是服务器的共享内存队列太长所导致,使用 ipcs -a | grep nagios | awk '{print $2}' | xargs -n1 ipcrm -s 来删除nagios用户的信号量(注意 ipcrm -s 只删除信号量,共享内存和消息队列需分别使用 ipcrm -m 和 ipcrm -q),让系统重新创建。 脚本check_openmanage超时也可以使用这个命令来操作 /opt/dell/srvadmin/sbin/srvadmin-services.sh stop /etc/init.d/ipmi stop /opt/dell/srvadmin/sbin/srvadmin-services.sh start 即可解决超时问题
下面我们将用自己的办法监控机器的状态
Get_Dell_Server_Detail.py 搜集DELL硬件信息 保存到/tmp目录下
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: /data/program/nagios-client/libexec/Get_Dell_Server_Detail.py
"""
The Dell Server Hardware Detail

Runs check_openmanage with ``-s -d`` and stores its output in
/tmp/Dell_Hardware_Detail.txt, the file the check_*.py plugins read.

author jastme
"""
import os
import subprocess

CHECK_COMMAND = ['/data/program/nagios-client/libexec/check_openmanage',
                 '-s', '-d']
DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'

# os.replace only exists on Python 3; os.rename is the Python 2 equivalent.
_replace = getattr(os, 'replace', os.rename)


def DellServer(command=None, detail_file=DETAIL_FILE):
    """Run the hardware report command and save its output.

    command     -- argument list to execute; defaults to CHECK_COMMAND.
    detail_file -- path of the report file to (re)write.

    stdout and stderr are captured together and one trailing newline is
    dropped, as commands.getoutput() did.  If the command cannot be started
    the error text is stored instead, so readers of the file see no healthy
    rows rather than a stale report.
    """
    if command is None:
        command = CHECK_COMMAND
    try:
        proc = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        detail = proc.communicate()[0]
    except OSError as exc:
        detail = 'cannot run %s: %s' % (command[0], exc)
    if detail.endswith('\n'):
        detail = detail[:-1]

    # Write to a sibling file and rename it into place so that a plugin
    # reading the report at the same moment never sees a truncated file.
    tmp_file = detail_file + '.tmp'
    with open(tmp_file, 'w') as handle:
        handle.write(detail)
    _replace(tmp_file, detail_file)


if __name__ == '__main__':
    DellServer()
###########################################
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: check_raid_card.py
"""
CHECK DELL SERVER --> RAID CARD

Reports the "Controller" rows found in /tmp/Dell_Hardware_Detail.txt.

author jastme
"""
import sys

DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'


def _matching_lines(path, keyword):
    """Return the lines of *path* that contain *keyword*, without newlines.

    An unreadable or missing file yields an empty list.
    """
    try:
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle if keyword in line]
    except (IOError, OSError):
        return []


def RaidCard(detail_file=DETAIL_FILE):
    """Print the controller status line and exit 0 (healthy) or 2.

    detail_file -- report file to read.

    Every controller row must contain both 'OK' and 'Ready'; previously the
    test ran over all rows at once, so one healthy controller hid a failed
    one.  No rows at all is reported as a failure (exit 2).
    """
    lines = _matching_lines(detail_file, 'Controller')
    # Each row is "STATE | ID | MESSAGE"; strip the column padding.
    rows = [[field.strip(' ') for field in line.split('|')] for line in lines]
    ok_count = sum(row.count('OK') for row in rows)
    healthy = bool(lines) and all(
        'OK' in line and 'Ready' in line for line in lines)
    summary = ' '.join(' '.join(row) for row in rows)
    print('%s | Status = %sOK;' % (summary, ok_count))
    sys.exit(0 if healthy else 2)


if __name__ == '__main__':
    RaidCard()
########################################################################
./check_raid_card.py OK 0 Controller 0 [PERC H710 Mini] is Ready | Status = 1OK; 这个是输出格式,可用于pnp4nagios出图
######################################################################
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: check_raid.py
"""
CHECK DELL SERVER --> RAID

Reports the "Logical" drive rows found in /tmp/Dell_Hardware_Detail.txt.

author jastme
"""
import sys

DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'


def _matching_lines(path, keyword):
    """Return the lines of *path* that contain *keyword*, without newlines.

    An unreadable or missing file yields an empty list.
    """
    try:
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle if keyword in line]
    except (IOError, OSError):
        return []


def Raid(detail_file=DETAIL_FILE):
    """Print the logical-drive status line and exit 0 (healthy) or 2.

    detail_file -- report file to read.

    Every logical-drive row must contain both 'OK' and 'Ready'; previously
    the test ran over all rows at once, so one healthy drive hid a degraded
    one.  No rows at all is reported as a failure (exit 2).
    """
    lines = _matching_lines(detail_file, 'Logical')
    # Each row is "STATE | ID | MESSAGE"; strip the column padding.
    rows = [[field.strip(' ') for field in line.split('|')] for line in lines]
    ok_count = sum(row.count('OK') for row in rows)
    healthy = bool(lines) and all(
        'OK' in line and 'Ready' in line for line in lines)
    summary = ' '.join(' '.join(row) for row in rows)
    print('%s | Status = %sOK;' % (summary, ok_count))
    sys.exit(0 if healthy else 2)


if __name__ == '__main__':
    Raid()
./check_raid.py OK 0:0 Logical Drive '/dev/sda' [RAID-5, 1,115.50 GB] is Ready | Status = 1OK;
###########################################################
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: check_power_supply.py
"""
CHECK DELL SERVER --> Power Supply

Reports the "Power Supply" rows found in /tmp/Dell_Hardware_Detail.txt.

author jastme
"""
import sys

DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'


def _matching_lines(path, keywords):
    """Return the lines of *path* containing every keyword, without newlines.

    An unreadable or missing file yields an empty list.
    """
    try:
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle
                    if all(word in line for word in keywords)]
    except (IOError, OSError):
        return []


def Power_Supply(detail_file=DETAIL_FILE, expected=2):
    """Print the power-supply status line and exit 0 (healthy) or 2.

    detail_file -- report file to read.
    expected    -- number of power supplies that must report OK (default 2,
                   the value that used to be hard-coded).

    The failure branch used to call ``l.join(l)`` on a list, which raised
    AttributeError, so a failed supply produced a traceback instead of
    exit code 2.  Both outcomes now print the same list of rows.
    """
    lines = _matching_lines(detail_file, ('Power', 'Supply'))
    ok_count = sum(1 for line in lines if 'OK' in line)
    # Same row rendering as before: '|' separators become spaces.
    rows = [line.replace('|', ' ') + ' \n' for line in lines]
    print('%s | Status = %sOK;' % (rows, ok_count))
    sys.exit(0 if ok_count == expected else 2)


if __name__ == '__main__':
    Power_Supply()
./check_power_supply.py [' OK 0 Power Supply 0 [AC]: Presence Detected \n', ' OK 1 Power Supply 1 [AC]: Presence Detected \n'] | Status = 2OK;
##########################################
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: check_fans.py
"""
CHECK DELL SERVER --> Physical fan

Reports the "fan" rows found in /tmp/Dell_Hardware_Detail.txt.

author jastme
"""
import sys

DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'

# Tokens dropped from each row to shorten the printed text.
_NOISE_WORDS = ('', '[System', 'Chassis', 'Board', 'RPM]')


def _matching_lines(path, keyword):
    """Return the lines of *path* that contain *keyword*, without newlines.

    An unreadable or missing file yields an empty list.
    """
    try:
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle if keyword in line]
    except (IOError, OSError):
        return []


def Fans(detail_file=DETAIL_FILE):
    """Print the fan status line and exit 0 (healthy) or 2.

    detail_file -- report file to read.

    Healthy means at least one fan row exists and every row contains 'OK'.
    Previously an empty report compared 0 == 0 and exited 0, and a missing
    report file crashed in int() on the shell error text.
    """
    lines = _matching_lines(detail_file, 'fan')
    ok_count = sum(1 for line in lines if 'OK' in line)
    rows = []
    for line in lines:
        text = line.replace('|', ' ') + ' \n'
        rows.append(' '.join(
            word for word in text.split(' ') if word not in _NOISE_WORDS))
    healthy = bool(lines) and ok_count == len(lines)
    print('%s | Status = %sOK;' % (rows, ok_count))
    sys.exit(0 if healthy else 2)


if __name__ == '__main__':
    Fans()
./check_fans.py ['OK 0 fan 0 Fan1A reading: 3360 RPM \n', 'OK 1 fan 1 Fan2A reading: 3480 RPM \n', 'OK 2 fan 2 Fan3A reading: 3600 RPM \n', 'OK 3 fan 3 Fan4A reading: 3240 RPM \n', 'OK 4 fan 4 Fan5A reading: 3480 RPM \n', 'OK 5 fan 5 Fan6A reading: 3480 RPM \n', 'OK 6 fan 6 Fan7A reading: 3600 RPM \n', 'OK 7 fan 7 Fan1B reading: 3120 RPM \n', 'OK 8 fan 8 Fan2B reading: 3240 RPM \n', 'OK 9 fan 9 Fan3B reading: 3240 RPM \n', 'OK 10 fan 10 Fan4B reading: 3120 RPM \n', 'OK 11 fan 11 Fan5B reading: 3240 RPM \n', 'OK 12 fan 12 Fan6B reading: 3240 RPM \n', 'OK 13 fan 13 Fan7B reading: 3120 RPM \n'] | Status = 14OK;
################################################################
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: check_Physical_Disk.py
"""
CHECK DELL SERVER --> Physical Disk

Reports the "Physical" disk rows found in /tmp/Dell_Hardware_Detail.txt.

author wubo
"""
import sys

DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'


def _matching_lines(path, keyword):
    """Return the lines of *path* that contain *keyword*, without newlines.

    An unreadable or missing file yields an empty list.
    """
    try:
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle if keyword in line]
    except (IOError, OSError):
        return []


def Disk(detail_file=DETAIL_FILE):
    """Print the physical-disk status line and exit 0 (healthy) or 2.

    detail_file -- report file to read.

    Healthy means at least one disk row exists and every row contains 'OK'.
    Previously an empty report compared 0 == 0 and exited 0, and a missing
    report file crashed in int() on the shell error text.
    """
    lines = _matching_lines(detail_file, 'Physical')
    ok_count = sum(1 for line in lines if 'OK' in line)
    # Same row rendering as before: '|' separators become spaces.
    rows = [line.replace('|', ' ') + ' \n' for line in lines]
    healthy = bool(lines) and ok_count == len(lines)
    print('%s | Status = %sOK;' % (rows, ok_count))
    sys.exit(0 if healthy else 2)


if __name__ == '__main__':
    Disk()
./check_Physical_Disk.py [' OK 0:0:1:0 Physical Disk 0:1:0 [SAS-HDD 300GB] on ctrl 0 is Online \n', ' OK 0:0:1:1 Physical Disk 0:1:1 [SAS-HDD 300GB] on ctrl 0 is Online \n', ' OK 0:0:1:2 Physical Disk 0:1:2 [SAS-HDD 300GB] on ctrl 0 is Online \n', ' OK 0:0:1:3 Physical Disk 0:1:3 [SAS-HDD 300GB] on ctrl 0 is Online \n', ' OK 0:0:1:4 Physical Disk 0:1:4 [SAS-HDD 300GB] on ctrl 0 is Online \n', ' OK 0:0:1:5 Physical Disk 0:1:5 [SAS-HDD 300GB] on ctrl 0 is Ready (Global HS) \n'] | Status = 6OK;
#################################################
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
# File: check_memory_module.py
"""
CHECK DELL SERVER --> Memory module

Reports the "Memory" rows found in /tmp/Dell_Hardware_Detail.txt.

author jastme
"""
import sys

DETAIL_FILE = '/tmp/Dell_Hardware_Detail.txt'


def _matching_lines(path, keyword):
    """Return the lines of *path* that contain *keyword*, without newlines.

    An unreadable or missing file yields an empty list.
    """
    try:
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle if keyword in line]
    except (IOError, OSError):
        return []


def Memory_module(detail_file=DETAIL_FILE):
    """Print the memory-module status line and exit 0 (healthy) or 2.

    detail_file -- report file to read.

    A row counts as good when it contains both 'OK' and 'Ok'.  Healthy means
    at least one memory row exists and every row is good.  Previously an
    empty report compared 0 == 0 and exited 0, and a missing report file
    crashed in int() on the shell error text.
    """
    lines = _matching_lines(detail_file, 'Memory')
    ok_count = sum(1 for line in lines if 'OK' in line and 'Ok' in line)
    # Same row rendering as before: '|' separators become spaces.
    rows = [line.replace('|', ' ') + ' \n' for line in lines]
    healthy = bool(lines) and ok_count == len(lines)
    print('%s | Status = %sOK;' % (rows, ok_count))
    sys.exit(0 if healthy else 2)


if __name__ == '__main__':
    Memory_module()
./check_memory_module.py [' OK 0 Memory module 0 [DIMM_A1, 16384 MB] is Ok \n', ' OK 1 Memory module 1 [DIMM_A2, 16384 MB] is Ok \n', ' OK 2 Memory module 2 [DIMM_A3, 16384 MB] is Ok \n', ' OK 3 Memory module 3 [DIMM_A4, 16384 MB] is Ok \n', ' OK 4 Memory module 4 [DIMM_A5, 16384 MB] is Ok \n', ' OK 5 Memory module 5 [DIMM_A6, 16384 MB] is Ok \n', ' OK 6 Memory module 6 [DIMM_A7, 16384 MB] is Ok \n', ' OK 7 Memory module 7 [DIMM_A8, 16384 MB] is Ok \n'] | Status = 8OK;
##############################################################################################
以上就是全部的代码
为nagios用户创建cron #dell */5 * * * * /data/program/nagios-client/libexec/Get_Dell_Server_Detail.py 每5分钟搜集一次系统状态
然后启动nrpe
后面就是页面的展示了