需求: 远程unix主机cpu load 有时过高,现在需要监控它,当超过阈值时通过email报警。访问该主机只能通过telnet,不能安装其他任何library。
现有资源: 已经有nagios环境,python已自带telnetlib模块,可以通过执行uptime命令获取最近1、5、15分钟的平均cpu load。
通常,用shell script 编写nagios命令规范如下:
echo "OK status: …"; exit 0
echo "WARNING status: …"; exit 1
echo "CRITICAL status: …"; exit 2
echo "UNKNOWN status: …"; exit 3
接口规范依赖exit code.
完成后的python(2.7) 代码如下:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Nagios plugin (Python 2.7): check a remote host's CPU load over telnet.

Logs in to the host with telnet, runs ``uptime``, extracts the 1-minute load
average and compares it with ``--cpuLoadLimit``.  The result is reported as a
message on stdout plus the process exit code:

    0  OK        load is below the limit
    1  WARNING   load equals the limit
    2  CRITICAL  load is above the limit
    3  UNKNOWN   bad/missing arguments, telnet failure or unparsable output

Options: --host, --port (default 23), --user, --password, --cpuLoadLimit,
--prompt (shell prompt to wait for after login, default 'telecom> ') and
-t/--timeout (timeout handed to the telnet connection, default 30; the short
form -t is accepted because that is what check_nrpe passes).
"""
import sys
import os
import datetime
import traceback
import logging
import logging.handlers
import getopt

LOGGING_MSG_FORMAT = '[%(asctime)s] [%(levelname)s] [%(module)s] [%(funcName)s] [%(lineno)d] %(message)s'
LOGGING_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Exit codes defined by the Nagios plugin interface.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

CRLF = '\r\n'
DEFAULT_PROMPT = 'telecom> '
LOAD_MARKER = 'load average:'

logging.basicConfig(level=logging.DEBUG, format=LOGGING_MSG_FORMAT, datefmt=LOGGING_DATE_FORMAT)
log = logging.getLogger('check_cpu_load_average_via_telnet')


def script_path():
    """Return the absolute directory that contains the running script."""
    path = os.path.realpath(sys.argv[0])
    if os.path.isfile(path):
        path = os.path.dirname(path)
    return os.path.abspath(path)


def _add_file_handler():
    """Attach a midnight-rotating file handler to ``log``.

    The log file lives in /usr/local/nagios/logs when /usr/local/nagios
    exists, otherwise in a ``logs`` directory next to the script.
    """
    if os.path.exists("/usr/local/nagios"):
        log_path = os.path.join("/usr/local/nagios", 'logs')
    else:
        log_path = os.path.join(script_path(), 'logs')
    try:
        os.makedirs(log_path)
    except OSError:
        # Already present (possibly created by a check running in parallel)
        # is fine; anything else is a real error.
        if not os.path.isdir(log_path):
            raise
    log_file = os.path.join(log_path, 'check_cpu_load_average_via_telnet.log')
    handler = logging.handlers.TimedRotatingFileHandler(log_file, 'midnight', 1)
    handler.setFormatter(logging.Formatter(LOGGING_MSG_FORMAT))
    log.addHandler(handler)


def parse_options(argv):
    """Parse the command-line arguments (without the program name).

    Returns a dict with the keys host, port, username, password,
    cpuLoadLimit, timeout and prompt.  Options that were not supplied keep
    their defaults (None for the required ones).

    Raises getopt.GetoptError for unknown options and ValueError for
    non-numeric --port / --cpuLoadLimit / --timeout values.
    """
    opts, _ = getopt.getopt(
        argv, "t:",
        ["host=", "port=", "user=", "password=", "cpuLoadLimit=", "timeout=", "prompt="])
    settings = {
        'host': None,
        'port': 23,
        'username': None,
        'password': None,
        'cpuLoadLimit': None,
        'timeout': 30,
        'prompt': DEFAULT_PROMPT,
    }
    for name, value in opts:
        # getopt always reports the full long-option name, so exact
        # comparison is correct (and avoids accidental substring matches).
        if name == '--host':
            settings['host'] = value
        elif name == '--port':
            settings['port'] = int(value)
        elif name == '--user':
            settings['username'] = value
        elif name == '--password':
            settings['password'] = value
        elif name == '--cpuLoadLimit':
            settings['cpuLoadLimit'] = float(value)
        elif name == '--prompt':
            settings['prompt'] = value
        elif name in ('-t', '--timeout'):
            settings['timeout'] = round(float(value))
    return settings


def missing_options(settings):
    """Return the names of required options that were not supplied."""
    required = (
        ('--host', 'host'),
        ('--user', 'username'),
        ('--password', 'password'),
        ('--cpuLoadLimit', 'cpuLoadLimit'),
    )
    return [option for option, key in required if settings[key] is None]


def parse_load_average(output):
    """Extract the 1-minute load average from captured ``uptime`` output.

    The value is the first comma-separated field after the last
    'load average:' marker.  Raises ValueError when the marker is absent or
    the field is not a number.
    """
    if LOAD_MARKER not in output:
        raise ValueError("no %r found in command output" % LOAD_MARKER)
    first_field = output.split(LOAD_MARKER)[-1].strip().split(",")[0]
    return float(first_field)


def evaluate_load(load, limit, host):
    """Map a load value onto a ``(nagios_exit_code, message)`` pair.

    Below the limit is OK, exactly at the limit is WARNING and anything
    above it is CRITICAL.
    """
    if load < limit:
        return STATE_OK, "[ok] cpu load = %s for %s" % (str(load), host)
    if load == limit:
        return STATE_WARNING, "[warn] cpu load approach to %s for %s" % (str(load), host)
    return STATE_CRITICAL, "[Not ok] cpu load = %s[limit:%s] for %s" % (str(load), str(limit), host)


def fetch_uptime_output(host, port, username, password, timeout, prompt=DEFAULT_PROMPT):
    """Log in via telnet, run ``uptime`` then ``exit``; return all output.

    The connection is closed on every path, including failures.  Errors from
    telnetlib / the socket layer propagate to the caller.
    """
    # Imported here because this is the only function that needs it; the
    # parsing helpers stay importable without a telnet implementation.
    import telnetlib

    tn = telnetlib.Telnet(host=host, port=port, timeout=timeout)
    try:
        # read_until() returns whatever arrived if the expected text does not
        # show up within its timeout, so a slightly different banner or
        # prompt only costs a delay.
        tn.read_until('login: ', timeout=5)
        tn.write(username + CRLF)
        tn.read_until('Password: ', timeout=5)
        tn.write(password + CRLF)
        tn.read_until(prompt, timeout=5)
        tn.write('uptime && sleep 1 ' + CRLF)
        log.info("run command: uptime")
        # Exiting the remote shell closes the session so read_all() sees EOF.
        tn.write("exit && sleep 1" + CRLF)
        log.info("run command: exit")
        return tn.read_all()
    finally:
        tn.close()


def main(argv):
    """Run the check and return the Nagios exit code; never raises Exception."""
    try:
        # Python 2 idiom: reload(sys) makes setdefaultencoding() available
        # again so utf-8 becomes the default string encoding.
        reload(sys)
        sys.setdefaultencoding("utf-8")
        # Inside the try block so that an unwritable log directory yields
        # UNKNOWN (3) instead of an uncaught traceback with exit code 1.
        _add_file_handler()

        settings = parse_options(argv)
        host = settings['host']
        limit = settings['cpuLoadLimit']
        log.info("telnet %s:%s with user:%s, set cpu load limit=%s,timeout=%s",
                 host, str(settings['port']), settings['username'],
                 str(limit), str(settings['timeout']))

        missing = missing_options(settings)
        if missing:
            # Without this a missing limit would compare against None and be
            # reported as CRITICAL on Python 2.
            print("missing required option(s): %s" % ", ".join(missing))
            return STATE_UNKNOWN

        output = fetch_uptime_output(host, settings['port'], settings['username'],
                                     settings['password'], settings['timeout'],
                                     settings['prompt'])
        log.info("Result of Commands:%s", output)

        try:
            load = parse_load_average(output)
        except ValueError:
            # Log before returning so the failure is actually recorded.
            log.error(traceback.format_exc())
            print("exception occured while parsing the cpu_load_1_minute")
            return STATE_UNKNOWN

        state, message = evaluate_load(load, limit, host)
        print(message)
        return state
    except Exception:
        errMsg = traceback.format_exc()
        print("exception occured, err msg:%s" % errMsg)
        return STATE_UNKNOWN


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
备注: 防止check_nrpe命令timeout(默认10sec), 加了参数-t 30.