脚本背景:
我所在的公司是一家运营CDN业务的IDC公司。客户域名的流量图经常会出现毛刺,但服务的域名非常多,每天逐个查看流量图耗时耗力,因此我用Python写了一个脚本,可以自动检测rrd文件里的异常数值并发送报警邮件。
由于我们的rrd文件是以服务域名命名的,所以先在相应的API上获取服务域名,然后根据域名扫描rrd文件。我设的是扫描半小时的数值,每10分钟执行一次,大概有2000来个rrd文件,执行一次6、7秒左右。
代码如下:
#!/usr/bin/env python
#coding:utf-8
"""Scan per-domain RRD files for traffic spikes and e-mail an alert.

For every domain returned by the domain-list URL, the matching
``<rrd_dir>/<domain>.rrd`` file is read with ``rrdtool fetch``.  The last few
samples are averaged; any sample whose second column exceeds the average by
the spike ratio is reported by mail together with a 12-hour graph.

The script targets Python 2 (``urllib2``), but only uses syntax that is also
valid on Python 3.
"""
from pyrrd.graph import DEF, CDEF, AREA
from pyrrd.graph import Graph
from pyrrd.graph import ColorAttributes
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from datetime import datetime
import calendar
import contextlib
import os
import subprocess
import time
import urllib2
import smtplib
import email
import email.utils
import sys

# --- configuration ---------------------------------------------------------
# Defined at module level (not only under the __main__ guard) so that the
# functions below also work when this file is imported.
rrd_dir = '/data/rrd/db/1/billing'
rrd_bak = '/data/rrd/db/1/billing/bak'
smtpserver = 'xxx'
smtpuser = 'xxx'
smtppass = 'yyy'
to_all = ['xxx', 'yyy']
subj = 'check the flow of CDN!!!!'
domain_list_url = 'xxx'
image_time = time.strftime("%d-%H-%M")
# NOTE: despite its name this is the path of the graph image file.  It is
# fixed for the whole run, so each alert overwrites the previous image.
image_dir = '%s/rrdgraph_%s.png' % (rrd_bak, image_time)

# Minimum average of the second data column for a file to be checked at all.
# NOTE(review): the log message below says "200M", but with the conversion
# used in update() (value * 8 / 300000000) this threshold is about 93M --
# confirm which one is intended.
MIN_CDN_AVERAGE = 3500000000
# A sample is a spike when sample / average exceeds this ratio.
SPIKE_RATIO = 1.6

# --- per-file state, reset by main() before each RRD file ------------------
text = ''        # accumulated HTML body of the alert mail
dict_data = {}   # timestamp string -> list of column value strings


def _domain_of(rrd_file):
    """Return the domain an RRD file is named after (basename, no extension)."""
    return os.path.splitext(os.path.basename(rrd_file))[0]


def _run_command(argv):
    """Run *argv* without a shell and return its stdout as text.

    Returns '' when the program cannot be started, which mirrors what the
    caller would have read from a failed shell pipeline.
    """
    try:
        proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as ex:
        log('[ERRORS: %s] could not be executed: %s\n' % (argv[0], ex))
        return ''
    return proc.communicate()[0]


def _append_to(path, message):
    """Append *message* to the file at *path*, always closing the file."""
    with open(path, 'a') as handle:
        handle.write(message)


def graphrrd(files):
    """Draw the last 12 hours of the RX/TX data sources of *files*.

    The picture is written to the module-level ``image_dir`` path.
    """
    now_utc = calendar.timegm(datetime.utcnow().utctimetuple())
    def1 = DEF(rrdfile=files, vname='back', dsName='RX')
    def2 = DEF(rrdfile=files, vname='CDN', dsName='TX')
    cdef1 = CDEF(vname='back_flow', rpn='%s,0.026,*' % def1.vname)
    cdef2 = CDEF(vname='CDN_flow', rpn='%s,0.026,*' % def2.vname)
    area1 = AREA(defObj=cdef1, color='#002A97FF', legend='back_flow')
    area2 = AREA(defObj=cdef2, color='#00CF00FF', legend='CDN_flow')

    ca = ColorAttributes()
    ca.back = '#333333'
    ca.canvas = '#333333'
    ca.shadea = '#000000'
    ca.shadeb = '#111111'
    ca.mgrid = '#CCCCCC'
    ca.axis = '#FFFFFF'
    ca.frame = '#AAAAAA'
    ca.font = '#FFFFFF'
    ca.arrow = '#FFFFFF'

    g = Graph(image_dir, start=now_utc - 43200, end=now_utc,
              vertical_label='flow', title=_domain_of(files))
    # area2 is added before area1, exactly as in the original drawing order.
    g.data.extend([def1, def2, cdef1, cdef2, area2, area1])
    g.write()


def connect():
    """Open an authenticated SMTP session; the caller must close it."""
    server = smtplib.SMTP(smtpserver)
    server.ehlo()
    server.login(smtpuser, smtppass)
    return server


def sendmessage(server, to, subj, content):
    """Send *content* as an HTML mail with the graph image attached inline.

    Sending is best effort: a failure is printed and not re-raised.
    """
    msg = MIMEMultipart('related')
    msg['Subject'] = subj
    msg['From'] = smtpuser
    msg['To'] = to
    msg['Date'] = email.utils.formatdate()
    msg.attach(MIMEText(content, "html", "utf-8"))

    with open(image_dir, 'rb') as fp:
        msgImage = MIMEImage(fp.read())
    # The HTML body refers to this part as cid:image1.
    msgImage.add_header('Content-ID', '<image1>')
    msg.attach(msgImage)

    try:
        server.sendmail(smtpuser, to, msg.as_string())
    except Exception as ex:
        print('%s %s' % (Exception, ex))
        print('Error - send failed')


def aver(rrd_file, n=6):
    """Load the last *n* samples of *rrd_file* and return their averages.

    Parsed rows are stored in the module-level ``dict_data`` (timestamp ->
    column strings).  Returns ``[avg1, avg2, avg3]``, or ``[]`` when there
    are too few usable rows or the second column's average is below
    ``MIN_CDN_AVERAGE``.
    """
    output = _run_command(['rrdtool', 'fetch', rrd_file, 'AVERAGE',
                           '-s', '-1d'])
    # Same order as the former "tail -n | grep -v nan | grep -v RX" pipeline:
    # take the last n lines first, then drop the unusable ones.
    tail = output.splitlines(True)[-n:] if n > 0 else []
    data = [line for line in tail
            if 'nan' not in line and 'RX' not in line]

    # "not data" additionally protects the divisions below when n < 2.
    if not data or len(data) < n // 2:
        log("[ERRORS: %s] has not enough record ! please check it!!\n" % rrd_file)
        return []

    for line in data:
        if len(line) > 25:
            dict_data[line[:10]] = line.strip()[12:].split()

    sum1 = 0
    sum2 = 0
    sum3 = 0
    for values in dict_data.values():
        try:
            sum1 = sum1 + float(values[0])
            sum2 = sum2 + float(values[1])
            sum3 = sum3 + float(values[2])
        except (ValueError, IndexError):
            # Malformed row: record it and keep going.
            log('%s %s\n' % (rrd_file, values))

    count = len(data)
    if sum2 / count < MIN_CDN_AVERAGE:
        log('WARNING: %s was less then 200M\n' % rrd_file)
        return []
    return [sum1 / count, sum2 / count, sum3 / count]


def check(average, ratio=SPIKE_RATIO):
    """Return the timestamps in ``dict_data`` whose second column is a spike.

    A sample is a spike when it exceeds *average* by more than *ratio*.
    """
    return [key for key in dict_data
            if float(dict_data[key][1]) / average > ratio]


def update(rrd_file, t, aver1, aver2, aver3):
    """Record the spike at timestamp *t*: error log plus mail body.

    *aver1* to *aver3* are accepted for interface compatibility and are not
    used.
    """
    global text
    date_output = _run_command(['date', '-d',
                                '1970-01-01 UTC %s seconds' % t])
    date_lines = date_output.splitlines()
    errors_time = date_lines[0].strip() if date_lines else ''

    domain = _domain_of(rrd_file)
    values = dict_data[t]
    content = ('<br/><br/>%s 异常信息:<br/>'
               ' 域名: %s <br/>'
               ' 时间: %s<br/>'
               ' 流量值: 回源带宽: %.2fM , cdn带宽 : %dM <br/>'
               ' <br/>rrd 异常信息:<br/>'
               ' 路径: %s<br/>'
               ' UTC 时间: %s<br/>'
               ' 异常值: [%s], [%s], [%s]<br/><br/>'
               '<img src="cid:image1">') % (
        domain, domain, errors_time,
        float(values[0]) * 8 / 300000000,
        int(float(values[1]) * 8 / 300000000),
        rrd_file, t, values[0], values[1], values[2])
    write_error('[ %s ]: at[ %s(%s) ],the value was [%s] [%s] [%s] \n' %
                (rrd_file, errors_time, t, values[0], values[1], values[2]))
    text = text + content


def log(log_write):
    """Append *log_write* to the general run log."""
    _append_to('%s/rrd_alt1.log' % rrd_bak, log_write)


def write_error(log_write):
    """Append *log_write* to the spike log."""
    _append_to('%s/rrd_error1.log' % rrd_bak, log_write)


def run_script(rrd_file):
    """Check one RRD file and mail every recipient if it contains spikes."""
    aver_rrd = aver(rrd_file)
    if not aver_rrd:
        return
    wrong_time = check(aver_rrd[1])
    if not wrong_time:
        log('[%s] no errors !\n' % (rrd_file))
        return

    for t in wrong_time:
        update(rrd_file, t, aver_rrd[0], aver_rrd[1], aver_rrd[2])
    graphrrd(rrd_file)

    if text:
        for to in to_all:
            server = connect()
            try:
                sendmessage(server, to, subj, text)
            finally:
                # Always release the connection; one is opened per recipient.
                try:
                    server.quit()
                except smtplib.SMTPException:
                    server.close()
            log('sendmail to %s\n' % to)


def main():
    """Fetch the domain list and check the RRD file of every domain."""
    global text, dict_data
    local_time = time.strftime("%m-%d %H:%M:%S")
    with contextlib.closing(urllib2.urlopen(domain_list_url)) as response:
        domains = response.readlines()
    url_list = ["%s/%s.rrd" % (rrd_dir, name.strip()) for name in domains]

    log("-" * 60 + "\n")
    log("the script run time at %s \n" % local_time)
    # Files are processed last-to-first, like the original pop() loop.
    for rrd_file in reversed(url_list):
        text = ''
        dict_data = {}
        if os.path.exists(rrd_file):
            run_script(rrd_file)
    log("-" * 60 + "\n")


if __name__ == '__main__':
    main()
邮件截图
本文出自 “哲就是我” 博客,谢绝转载!