最近发现了一个蛮有意思的监控软件Monit,它的功能远比zabbix要强大,而且要灵活得多,更重要的是它可以监控到的参数比较多。
Monit的安装比较简单,在这里就不做介绍了,我主要说一下在我们的生产环境上需要配置的东西。
不过这个配置目前还处于实验阶段,尚未在生产机器上实际使用。
下面是配置文件/etc/monitrc
# set daemon 120            # check services at 2-minute intervals
#     with start delay 240  # optional: delay the first check by 4-minutes (by
#                           # default Monit check immediately after Monit start)
# Run Monit as a daemon and poll every monitored service once per 90 seconds.
set daemon 90
# set logfile syslog facility log_daemon
# Write Monit's own log to a dedicated file rather than to syslog.
set logfile /var/log/monit.log
#setidfile/var/.monit.id
#setstatefile/var/.monit.state
#setmailservermail.bar.baz,#primarymailserver
#backup.bar.bazport10025,#backupmailserveronport10025
#localhost#fallbackrelay
#
# SMTP server used to deliver alert mail.
# NOTE(review): per the Monit manual, omitting this statement makes Monit try
# the SMTP server on localhost; alerts are dropped only when no mail server is
# reachable (and no event queue is configured) -- confirm for your version.
set mailserver 119.254.72.233
## By default Monit will drop alert events if no mail servers are available.
## If you want to keep the alerts for later delivery retry, you can use the
## EVENTQUEUE statement. The base directory where undelivered alerts will be
## stored is specified by the BASEDIR option. You can limit the maximal queue
## size using the SLOTS option (if omitted, the queue is limited by space
## available in the back end filesystem).
#
# set eventqueue
#     basedir /var/monit  # set the base directory where events will be stored
#     slots 100           # optionally limit the queue size
#
#
##SendstatusandeventstoM/Monit(formoreinformationsaboutM/Monit
##seehttp://mmonit.com/).
#
#setmmonithttp://monit:[email protected]:8080/collector
#setmmonithttp://119.254.72.248:8080/collector
#
#
##Monitbydefaultusesthefollowingalertmailformat:
##
##--8<--
#From:monit@$HOST#sender
#Subject:monitalert--$HOST$EVENT$SERVICE#subject
##
##$EVENTService$SERVICE#
###
##Date:$DATE#
##Action:$ACTION#
##Host:$HOST#body
##Description:$DESCRIPTION#
###
##Yourfaithfulemployee,#
##Monit#
##--8<--
##
##Youcanoverridethismessageformatorpartsofit,suchassubject
##orsenderusingtheMAIL-FORMATstatement.Macrossuchas$DATE,etc.
##areexpandedatruntime.Forexample,tooverridethesender,use:
#
#setmail-format{from:[email protected]}
# Alert mail sender and subject line.
# Both fields live in ONE statement: a separate "set mail-format" that names
# only the subject may reset the sender to Monit's built-in default
# (NOTE(review): confirm against the Monit version in use).
# The sender address was redacted in this copy of the file; replace the
# placeholder below with the real address before use.
# No comments inside the braces: text there is taken as part of the mail.
set mail-format {
    from: [email protected]
    subject: alert $HOST $SERVICE $DESCRIPTION
}
#
##Youcansetalertrecipientswhomwillreceivealertsif/whena
##servicedefinedinthisfilehaserrors.Alertsmayberestrictedon
##eventsbyusingafilterasinthesecondexamplebelow.
#
#[email protected]#receiveallalerts
#[email protected]{timeout}#receivejustservice-
##timeoutalert
#
# Global alert recipient: receives every alert event (no event filter).
# The "set alert" keywords and the address were lost to e-mail redaction in
# this copy; replace the placeholder with the real recipient address.
set alert [email protected]
#
## Monit has an embedded web server which can be used to view status of
## services monitored and manage services from a web interface. See the
## Monit Wiki if you want to enable SSL for the web server.
#
# set httpd port 2812 and
#     use address localhost  # only accept connection from localhost
#     allow localhost        # allow localhost to connect to the server and
#     allow admin:monit      # require user 'admin' with password 'monit'
#     allow @monit           # allow users of group 'monit' to connect (rw)
#     allow @users readonly  # allow users of group 'users' to connect readonly
#
#
# Monit's own embedded web interface (this is NOT the Apache server that is
# monitored further below): status pages and service start/stop control.
# The password is stored in clear text, so keep this file readable by its
# owner only.
set httpd port 2812 and              # Monit's HTTP server listens on TCP 2812
    use address 119.254.72.248       # bind the listener to this address
    allow localhost                  # accept connections from this host
    allow 203.86.46.224/29           # accept connections from this subnet
    allow 203.86.63.133              # accept connections from this single host
    allow sysadmin:monit12114        # login as user "sysadmin" with this password
###############################################################################
##Services
###############################################################################
##
##Checkgeneralsystemresourcessuchasloadaverage,cpuandmemory
##usage.Eachtestspecifiesaresource,conditionsandtheactiontobe
##performedshouldatestfail.
# Host-wide resource thresholds: load average, memory and CPU.
check system 119.254.72.248
    if loadavg (1min) > 4 then alert         # 1-minute load average above 4
    if loadavg (5min) > 2 then alert         # 5-minute load average above 2
    if memory usage > 75% then alert         # memory utilisation above 75%
    if cpu usage (user) > 70% then alert     # CPU time in user mode above 70%
    if cpu usage (system) > 30% then alert   # CPU time in kernel mode above 30%
    if cpu usage (wait) > 20% then alert     # CPU time waiting on I/O above 20%
#
#检测文件是否存在校验和权限UIDGID
##Checkafileforexistence,checksum,permissions,uidandgid.Inaddition
##toalertrecipientsintheglobalsection,customizedalertcanbesentto
##additionalrecipientsbyspecifyingalocalalerthandler.Theservicemay
##begroupedusingtheGROUPoption.Morethanonegroupcanbespecifiedby
##repeatingthe'groupname'statement.
#
#checkfileapache_binwithpath/usr/local/apache/bin/httpd
#iffailedchecksumand
#expectthesum8f7f419955cefa0b33a2ba316cba3659thenunmonitor
#iffailedpermission755thenunmonitor
#iffaileduidrootthenunmonitor
#iffailedgidrootthenunmonitor
#checksum,permission,uid,gid,unmonitor
#}withthemail-format{subject:Alarm!}
#groupserver
#
#检查apache运行状态:monit会向apache发送HTTP请求以确认其能正常响应,并检测apache占用的系统资源以及子进程数量。如果apache进程不在运行,monit会自动将其重启;如果重启过于频繁而问题依旧,可以使用timeout语句让monit停止对该服务的监控(停止的是对apache的监控,而不是monit本身)
## Check that a process is running, in this case Apache, and that it responds
## to HTTP and HTTPS requests. Check its resource usage such as cpu and memory,
## and number of children. If the process is not running, Monit will restart
## it by default. In case the service is restarted very often and the
## problem remains, it is possible to disable monitoring using the TIMEOUT
## statement. This service depends on another service (apache_bin) which
## is defined above.
#
#checkprocessapachewithpidfile/usr/local/apache/logs/httpd.pid
#startprogram="/etc/init.d/httpdstart"withtimeout60seconds
#stopprogram="/etc/init.d/httpdstop"
#ifcpu>60%for2cyclesthenalert
#ifcpu>80%for5cyclesthenrestart
#iftotalmem>200.0MBfor5cyclesthenrestart
#ifchildren>250thenrestart
#ifloadavg(5min)greaterthan10for8cyclesthenstop
#iffailedhostwww.tildeslash.comport80protocolhttp
#andrequest"/somefile.html"
#thenrestart
#iffailedport443typetcpsslprotocolhttp
#withtimeout15seconds
#thenrestart
#if3restartswithin5cyclesthentimeout
#dependsonapache_bin
#groupserver
#
# Main Apache instance, tracked through its pidfile.
# The command and its argument must be separated inside the quotes; without the
# space Monit would try to execute a program literally named "apachectlstart".
check process apache with pidfile /usr/local/apache/logs/httpd.pid
    start program = "/usr/local/apache/bin/apachectl start" with timeout 60 seconds
    stop program = "/usr/local/apache/bin/apachectl stop"
    # Restart when an HTTP request to port 80 fails.
    if failed host 119.254.72.248 port 80 protocol http then restart
    # Sustained CPU use: warn first, restart if it persists at a higher level.
    if cpu > 60% for 2 cycles then alert
    if cpu > 80% for 5 cycles then restart
    # if totalmem > 200.0 MB for 5 cycles then restart
    # Restart when the process has more than 140 children.
    if children > 140 then restart
    # if loadavg(5min) greater than 10 for 8 cycles then stop
#
# Second Apache instance (installed under /usr/local/http_post), port 8080.
# The command and its argument must be separated inside the quotes; without the
# space Monit would try to execute a program literally named "apachectlstart".
check process httpd with pidfile /usr/local/http_post/logs/httpd.pid
    start program = "/usr/local/http_post/bin/apachectl start" with timeout 60 seconds
    stop program = "/usr/local/http_post/bin/apachectl stop"
    # Restart when an HTTP request to port 8080 fails.
    if failed host 119.254.72.248 port 8080 protocol http then restart
#
###
# Nginx service process, tracked through its pidfile.
# NOTE(review): the stock nginx binary takes no "start"/"stop" arguments (it is
# started with no argument and stopped with "-s stop"), and a default source
# install places it in sbin/, not bin/. The commands below only work if
# /usr/local/nginx/bin/nginx is a wrapper script -- verify on the target host.
check process nginx with pidfile /usr/local/nginx/logs/nginx.pid
    start program = "/usr/local/nginx/bin/nginx start" with timeout 60 seconds
    stop program = "/usr/local/nginx/bin/nginx stop"
    # Restart when an HTTP request to port 81 fails.
    if failed host 119.254.72.248 port 81 protocol http then restart
## Check filesystem permissions, uid, gid, space and inode usage. Other services,
## such as databases, may depend on this resource and an automatically graceful
## stop may be cascaded to them before the filesystem will become full and data
## lost.
#
#checkfilesystemdatafswithpath/dev/sdb1
# Disk-space and inode alerts for the /, /home, /usr and /var filesystems.
# Space usage alerts at 70% (50% for /home); inode usage alerts at 85%.
# /home holds the mail directories and grows quickly, so it uses a lower space
# threshold, and alerts only after 5 breaches within 15 polling cycles.
check filesystem root with path /
    if space usage > 70% then alert
    if inode usage > 85% then alert

check filesystem home with path /home
    if space usage > 50% for 5 times within 15 cycles then alert
    if inode usage > 85% then alert

check filesystem usr with path /usr
    if space usage > 70% then alert
    if inode usage > 85% then alert

check filesystem var with path /var
    if space usage > 70% then alert
    if inode usage > 85% then alert
#startprogram="/bin/mount/data"
#stopprogram="/bin/umount/data"
#iffailedpermission660thenunmonitor
#iffaileduidrootthenunmonitor
#iffailedgiddiskthenunmonitor
#ifspaceusage>50%for5timeswithin15cyclesthenalert
#ifspaceusage>99%thenstop
#ifinodeusage>30000thenalert
#ifinodeusage>99%thenstop
#groupserver
#
#
##Checkafile'stimestamp.Inthisexample,wetestifafileisolder
##than15minutesandassumesomethingiswrongifitsnotupdated.Also,
##ifthefilesizeexceedagivenlimit,executeascript
#
#checkfiledatabasewithpath/data/mydatabase.db
#iffailedpermission700thenalert
#iffaileduiddatathenalert
#iffailedgiddatathenalert
#iftimestamp>15minutesthenalert
#ifsize>100MBthenexec"/my/cleanup/script"asuiddbaandgiddba
#
#
##Checkdirectorypermission,uidandgid.Aneventistriggeredifthe
##directorydoesnotbelongtotheuserwithuid0andgid0.Inaddition,
##thepermissionshavetomatchtheoctaldescriptionof755(seechmod(1)).
#
#checkdirectorybinwithpath/bin
#iffailedpermission755thenunmonitor
#iffaileduid0thenunmonitor
#iffailedgid0thenunmonitor
#
#
##Checkaremotehostavailabilitybyissuingapingtestandcheckthe
##contentofaresponsefromawebserver.Uptothreepingsaresentand
##connectiontoaportandanapplicationlevelnetworkcheckisperformed.
#
#checkhostmyserverwithaddress192.168.1.1
#iffailedicmptypeechocount3withtimeout3secondsthenalert
#iffailedport3306protocolmysqlwithtimeout15secondsthenalert
#iffailedurlhttp://user:[email protected]:8080/?querystring
#andcontent=='action="j_security_check"'
#thenalert
#
#
###############################################################################
##Includes
###############################################################################
##
##Itispossibletoincludeadditionalconfigurationpartsfromotherfilesor
##directories.
#
#include/etc/monit.d/*
#
未完待续……