MegaCLI是LSI提供的用户空间管理RAID卡(LSI芯片)的工具,适用于大多数的Dell服务器。
MegaCLI介绍:
http://zh.community.dell.com/techcenter/b/weblog/archive/2013/03/07/megacli-command-share
http://blog.chinaunix.net/uid-25135004-id-3139293.html
Zabbix提供low_level_discovery的机制去实现自动发现监控目标、自动添加监控项的功能。Zabbix默认就基于low_level_discovery提供了文件系统挂载点和网卡的自动发现和监控。
所以,物理硬盘的自动发现和监控也是基于zabbix的low_level_discovery机制,我所需要做的就是写一个Python脚本来衔接Zabbix和MegaCLI。后面就不再阐述原理和细节了,过程如下:
去LSI官网上下载一个最新版本的MegaCLI,注意操作系统32位还是64位。
安装包默认是rpm的,CentOS等系统能轻松安装。
Ubuntu或Debian可参考下面步骤安装:
mkdir /opt/MegaCLI
cd /opt/MegaCLI
wget -c http://xxx/8.07.14_MegaCLI.zip
unzip 8.07.14_MegaCLI.zip
cd /opt/MegaCLI/Linux
apt-get install rpm2cpio
rpm2cpio MegaCli-8.07.14-1.noarch.rpm | cpio -idmv
mv opt/MegaRAID /opt/
root@controller:~# ls -lh /opt/MegaRAID/MegaCli
total 5.7M
-rw-r--r-- 1 root root 296 Sep 24 19:10 CmdTool.log
-rwx------ 1 root root 528K Dec 16 2013 libstorelibir-2.so.14.07-0
-rwxr-xr-x 1 root root 2.4M Dec 16 2013 MegaCli
-rwsr-sr-x 1 root root 2.6M Dec 16 2013 MegaCli64
-rw-r--r-- 1 root root 139K Oct 10 17:43 MegaSAS.log
后面都默认MegaCli安装在/opt/MegaRAID/MegaCli
https://gist.github.com/AlexYangYu/14161ce866417f817508
/opt/DiskMonitoring/raid.py (chmod +x /opt/DiskMonitoring/raid.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Description:
#   This application is used to discover the physical disks by using the
#   MegaCLI tool.
#
# Author: Alex Yang <[email protected]>
#
"""Report MegaCLI physical-disk information in a form Zabbix can consume.

Commands (first positional argument):

* ``pd_discovery``  -- print a JSON low-level-discovery document listing
  every physical disk as ``{#DISK_ID}`` / ``{#WWN}``.
* ``mec <disk_id>`` -- print the Media Error Count of the disk.
* ``oec <disk_id>`` -- print the Other Error Count of the disk.
* ``pfc <disk_id>`` -- print the Predictive Failure Count of the disk.

The script only uses single-argument ``print(...)`` calls so that it runs
unchanged on both Python 2 and Python 3.
"""

import json
import os
import subprocess
import sys
from optparse import OptionParser

MEGACLI_EXEC = '/opt/MegaRAID/MegaCli/MegaCli64'
LIST_DISK_OPT = '-PDList -aALL'

# Line prefixes of the MegaCLI "-PDList" output that this script parses.
SLOT_NUMBER = 'Slot Number'
DEVICE_ID = 'Device Id'
WWN = 'WWN'
MEC = 'Media Error Count'
OEC = 'Other Error Count'
PFC = 'Predictive Failure Count'
PD_TYPE = 'PD Type'
RAW_SIZE = 'Raw Size'
FIRMWARE_STATE = 'Firmware state'
INQUIRY_DATA = 'Inquiry Data'

# (output label, Disk constructor argument) pairs.  INQUIRY_DATA is treated
# as the last field of a disk record and triggers creation of the Disk.
_FIELD_LABELS = (
    (SLOT_NUMBER, 'slot_number'),
    (DEVICE_ID, 'dev_id'),
    (WWN, 'wwn'),
    (MEC, 'mec'),
    (OEC, 'oec'),
    (PFC, 'pfc'),
    (PD_TYPE, 'pd_type'),
    (RAW_SIZE, 'raw_size'),
    (FIRMWARE_STATE, 'firmware_state'),
    (INQUIRY_DATA, 'inquiry_data'),
)


class Disk(object):
    """Plain record holding the parsed attributes of one physical disk.

    All values are stored exactly as passed in; ``make_disk_array`` passes
    the stripped text found after the label in the MegaCLI output, or
    ``None`` when the label was absent for that disk.
    """

    def __init__(self, dev_id, slot_number, wwn, mec, oec, pfc, pd_type,
                 raw_size, firmware_state, inquiry_data):
        self.dev_id = dev_id
        self.slot_number = slot_number
        self.wwn = wwn
        # Media Error Count
        self.mec = mec
        # Other Error Count
        self.oec = oec
        # Predictive Failure Count
        self.pfc = pfc
        # PD Type
        self.pd_type = pd_type
        # Size
        self.raw_size = raw_size
        # Firmware State ("Failed", "Online, Spun Up", "Online, Spun Down",
        # "Unconfigured(bad)", "Unconfigured(good), Spun down",
        # "Hotspare, Spun down", "Hotspare, Spun up" or "not Online")
        self.firmware_state = firmware_state
        # Inquiry data
        self.inquiry_data = inquiry_data

    def jsonfiy(self):
        """Placeholder kept for interface compatibility; returns None."""
        pass

    def __str__(self):
        return '%s %s %s %s %s %s %s %s %s %s' % (
            self.dev_id, self.slot_number, self.wwn, self.mec, self.oec,
            self.pfc, self.pd_type, self.raw_size, self.firmware_state,
            self.inquiry_data
        )


def check_megacli(cli_path):
    """Exit with status 1 unless ``cli_path`` exists and is executable."""
    if not os.path.exists(cli_path) or not os.access(cli_path, os.X_OK):
        print('MegaCLI is needed in %s with executable priviledge.'
              % (cli_path))
        # The original called os.exit(), which does not exist.
        sys.exit(1)


def line_generator(string):
    """Yield the lines of ``string`` split on ``'\\n'``.

    A final line that is not newline-terminated is yielded as well, so the
    last line of command output is never lost.  A single trailing newline
    does not produce an extra empty line.
    """
    lines = string.split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    for line in lines:
        yield line


def get_value(line):
    """Return the stripped text following the first ``':'`` in ``line``.

    Everything after the first colon is kept, so values that themselves
    contain a colon are not truncated.  Returns ``''`` if there is no colon.
    """
    return line.partition(':')[2].strip()


def _match_field(line):
    """Return the Disk argument name whose label prefixes ``line``, or None."""
    for label, attr in _FIELD_LABELS:
        if line.startswith(label):
            return attr
    return None


def make_disk_array(mega_output):
    """Parse MegaCLI ``-PDList`` text into a list of ``Disk`` objects.

    Fields are accumulated line by line; an ``Inquiry Data`` line closes the
    current record.  The accumulator is reset after every disk so a field
    missing from one record is reported as ``None`` instead of raising or
    silently inheriting the previous disk's value.
    """
    disk_array = []
    fields = {}
    for line in line_generator(mega_output):
        attr = _match_field(line)
        if attr is None:
            continue
        fields[attr] = get_value(line)
        if attr == 'inquiry_data':
            values = dict(
                (name, fields.get(name)) for _label, name in _FIELD_LABELS
            )
            disk_array.append(Disk(**values))
            fields = {}
    return disk_array


def discovery_physical_disk(disk_array):
    """Return the JSON discovery document for ``disk_array``.

    The document has the form ``{"data": [{"{#DISK_ID}": ..., "{#WWN}": ...}]}``.
    """
    array = [
        {'{#DISK_ID}': d.dev_id, '{#WWN}': d.wwn}
        for d in disk_array
    ]
    return json.dumps({'data': array}, indent=4, separators=(',', ':'))


def _find_disk(disk_array, disk_id):
    """Return the first disk whose device id equals ``disk_id`` as an int.

    Returns None when no disk matches or when ``disk_id`` is not an integer.
    Disks whose own device id is not an integer are skipped.
    """
    try:
        wanted = int(disk_id)
    except (TypeError, ValueError):
        return None
    for disk in disk_array:
        try:
            if int(disk.dev_id) == wanted:
                return disk
        except (TypeError, ValueError):
            continue
    return None


def _counter_for(disk_array, disk_id, attr):
    """Return attribute ``attr`` of the matching disk, or ``'-1'``.

    ``'-1'`` is returned when the disk is not found or the counter was not
    present in the MegaCLI output for that disk.
    """
    disk = _find_disk(disk_array, disk_id)
    value = getattr(disk, attr, None)
    return '-1' if value is None else value


def count_media_error(disk_array, disk_id):
    """Return the Media Error Count of disk ``disk_id`` or ``'-1'``."""
    return _counter_for(disk_array, disk_id, 'mec')


def count_other_error(disk_array, disk_id):
    """Return the Other Error Count of disk ``disk_id`` or ``'-1'``."""
    return _counter_for(disk_array, disk_id, 'oec')


def count_predictive_error(disk_array, disk_id):
    """Return the Predictive Failure Count of disk ``disk_id`` or ``'-1'``."""
    return _counter_for(disk_array, disk_id, 'pfc')


def get_disk_array():
    """Run MegaCLI and return the parsed list of ``Disk`` objects.

    Exits with status 1 when MegaCLI is missing, cannot be started, or
    returns a non-zero exit status.
    """
    check_megacli(MEGACLI_EXEC)
    try:
        # Argument list instead of a shell string; stderr is merged into
        # stdout to mirror what commands.getstatusoutput() captured.
        proc = subprocess.Popen(
            [MEGACLI_EXEC] + LIST_DISK_OPT.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True
        )
        output = proc.communicate()[0]
        status = proc.returncode
    except OSError:
        output = ''
        status = -1
    if status != 0:
        print('Exec MegaCLI failed, please check the log.')
        sys.exit(1)
    return make_disk_array(output)


def init_option():
    """Build the command-line parser."""
    usage = "%prog <pd_discovery|mec|oec|pfc> [disk_id]"
    parser = OptionParser(usage=usage, version="0.1")
    return parser


parser = init_option()

# Commands that take a disk id and print one counter.
_COUNTERS = {
    'mec': count_media_error,
    'oec': count_other_error,
    'pfc': count_predictive_error,
}


def main(argv=None):
    """Dispatch the requested command and return the process exit status.

    The command line is validated before MegaCLI is invoked; a missing
    command, an unknown command, or a missing disk id prints the help text
    and returns 1.
    """
    (options, args) = parser.parse_args(argv)
    if len(args) < 1:
        parser.print_help()
        return 1

    command = args.pop(0)
    if command == 'pd_discovery':
        print(discovery_physical_disk(get_disk_array()))
        return 0

    counter = _COUNTERS.get(command)
    if counter is None or not args:
        parser.print_help()
        return 1

    print(counter(get_disk_array(), args.pop()))
    return 0


if __name__ == '__main__':
    sys.exit(main())
编辑zabbix_agentd.conf,确保如下两个配置正确。
Include=/etc/zabbix/zabbix_agentd.conf.d/
UnsafeUserParameters=1
将zabbix用户添加到sudoers中
echo "zabbix ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/zabbix
编辑/etc/zabbix/zabbix_agentd.conf.d/disk.conf,
添加自定义用户参数
UserParameter=raid.phy.discovery,sudo /opt/DiskMonitoring/raid.py pd_discovery
UserParameter=raid.phy.mec[*],sudo /opt/DiskMonitoring/raid.py mec $1
UserParameter=raid.phy.oec[*],sudo /opt/DiskMonitoring/raid.py oec $1
UserParameter=raid.phy.pfc[*],sudo /opt/DiskMonitoring/raid.py pfc $1
4. 配置Zabbix Server
创建一个template,然后创建一个discovery rule,然后创建3个ITEM原型
Media Error Count的配置参考
后面只需要将模板关联到相关机器,并在相关机器上部署监控脚本即可。报警什么的就可以按自己的需求去设置。