监控软件Monit

最近发现了一个蛮有意思的监控软件Monit,它的功能远比zabbix要强大,而且要灵活得多,更重要的是它可以监控到的参数比较多。

Monit的安装比较简单,在这里就不做介绍了,我主要说一下在我们的生产环境上需要配置的东西。

不过这个配置目前还处于实验阶段,还没敢在真机上上手。

下面是配置文件/etc/monitrc

# set daemon 120           # check services at 2-minute intervals

#     with start delay 240 # optional: delay the first check by 4-minutes (by

#                          # default Monit check immediately after Monit start)

set daemon 90              # 设置monit进程检查的时间间隔是90秒

# set logfile syslog facility log_daemon

set logfile /var/log/monit.log    # 设置日志文件的路径

# set idfile /var/.monit.id

# set statefile /var/.monit.state

# set mailserver mail.bar.baz,               # primary mailserver

#                backup.bar.baz port 10025,  # backup mailserver on port 10025

#                localhost                   # fallback relay

#

set mailserver 119.254.72.233    # 设置发送报警邮件所用的邮件服务器;如果不设置该项,monit默认会尝试通过localhost发信,邮件服务器不可用时报警邮件将发不出去

## By default Monit will drop alert events if no mail servers are available.

## If you want to keep the alerts for later delivery retry, you can use the

## EVENTQUEUE statement. The base directory where undelivered alerts will be

## stored is specified by the BASEDIR option. You can limit the maximal queue

## size using the SLOTS option (if omitted, the queue is limited by space

## available in the back end filesystem).

#

# set eventqueue

#     basedir /var/monit  # set the base directory where events will be stored

#     slots 100           # optionally limit the queue size

#

#

## Send status and events to M/Monit (for more information about M/Monit

## see http://mmonit.com/).

#

# set mmonit http://monit:monit@192.168.1.10:8080/collector

# set mmonit http://119.254.72.248:8080/collector

#

#

## Monit by default uses the following alert mail format:

##

## --8<--

# From: monit@$HOST                                 # sender

# Subject: monit alert -- $HOST $EVENT $SERVICE     # subject

##

## $EVENT Service $SERVICE                          #

##                                                  #

##     Date:        $DATE                           #

##     Action:      $ACTION                         #

##     Host:        $HOST                           # body

##     Description: $DESCRIPTION                    #

##                                                  #

## Your faithful employee,                          #

## Monit                                            #

## --8<--

##

## You can override this message format or parts of it, such as subject

## or sender using the MAIL-FORMAT statement. Macros such as $DATE, etc.

## are expanded at runtime. For example, to override the sender, use:

#

# set mail-format { from: monit@foo.bar }

set mail-format { from: [email protected] }    # 设置报警发件人

set mail-format { Subject: alert $HOST $SERVICE $DESCRIPTION }    # 设置报警邮件的主题格式

#

## You can set alert recipients whom will receive alerts if/when a

## service defined in this file has errors. Alerts may be restricted on

## events by using a filter as in the second example below.

#

# set alert sysadm@foo.bar                       # receive all alerts

# set alert manager@foo.bar only on { timeout }  # receive just service-

#                                                # timeout alert

#

set alert [email protected]    # 接收报警邮件地址(接收所有)

#

## Monit has an embedded web server which can be used to view status of

## services monitored and manage services from a web interface. See the

## Monit Wiki if you want to enable SSL for the web server.

#

# set httpd port 2812 and

#     use address localhost  # only accept connection from localhost

#     allow localhost        # allow localhost to connect to the server and

#     allow admin:monit      # require user 'admin' with password 'monit'

#     allow @monit           # allow users of group 'monit' to connect (rw)

#     allow @users readonly  # allow users of group 'users' to connect readonly

#

#

# 设置monit内置的Web管理界面(httpd)

set httpd port 2812 and              # 设置monit内置Web界面的监听端口为2812

    use address 119.254.72.248       # 设置monit内置Web界面监听的IP地址

    allow localhost                  # 允许本机访问monit的Web界面

    allow 203.86.46.224/29           # 允许该IP段访问monit的Web界面

    allow 203.86.63.133              # 允许该IP访问monit的Web界面

    allow sysadmin:monit12114        # 设置访问monit的Web界面所需的用户名sysadmin和对应的密码

###############################################################################

## Services

###############################################################################

##

## Check general system resources such as load average, cpu and memory

## usage. Each test specifies a resource, conditions and the action to be

## performed should a test fail.

# 监控系统负载、CPU、内存使用情况

check system 119.254.72.248

    if loadavg (1min) > 4 then alert        # 如果最近一分钟的平均负载大于4则报警

    if loadavg (5min) > 2 then alert        # 如果最近五分钟的平均负载大于2则报警

    if memory usage > 75% then alert        # 如果内存使用率超过75%则报警

    if cpu usage (user) > 70% then alert    # 如果用户态CPU使用率超过70%则报警

    if cpu usage (system) > 30% then alert  # 如果内核态(系统)CPU使用率超过30%则报警

    if cpu usage (wait) > 20% then alert    # 如果I/O等待占用的CPU超过20%则报警

#

# 检测文件是否存在,以及校验和、权限、UID、GID

## Check a file for existence, checksum, permissions, uid and gid. In addition

## to alert recipients in the global section, customized alert can be sent to

## additional recipients by specifying a local alert handler. The service may

## be grouped using the GROUP option. More than one group can be specified by

## repeating the 'group name' statement.

#

# check file apache_bin with path /usr/local/apache/bin/httpd

#     if failed checksum and

#        expect the sum 8f7f419955cefa0b33a2ba316cba3659 then unmonitor

#     if failed permission 755 then unmonitor

#     if failed uid root then unmonitor

#     if failed gid root then unmonitor

#     alert security@foo.bar on {

#            checksum, permission, uid, gid, unmonitor

#         } with the mail-format { subject: Alarm! }

#     group server

#

# 检查apache运行状态:monit会检测apache是否正常响应HTTP请求,并检查apache占用的系统资源情况以及子进程数量。如果apache down了,monit会自动将apache重启;如果重启过于频繁而问题依旧,可以使用TIMEOUT语句停止对该服务的监控

## Check that a process is running, in this case Apache, and that it respond

## to HTTP and HTTPS requests. Check its resource usage such as cpu and memory,

## and number of children. If the process is not running, Monit will restart

## it by default. In case the service is restarted very often and the

## problem remains, it is possible to disable monitoring using the TIMEOUT

## statement. This service depends on another service (apache_bin) which

## is defined above.

#

# check process apache with pidfile /usr/local/apache/logs/httpd.pid

#     start program = "/etc/init.d/httpd start" with timeout 60 seconds

#     stop program = "/etc/init.d/httpd stop"

#     if cpu > 60% for 2 cycles then alert

#     if cpu > 80% for 5 cycles then restart

#     if totalmem > 200.0 MB for 5 cycles then restart

#     if children > 250 then restart

#     if loadavg(5min) greater than 10 for 8 cycles then stop

#     if failed host www.tildeslash.com port 80 protocol http

#        and request "/somefile.html"

#        then restart

#     if failed port 443 type tcpssl protocol http

#        with timeout 15 seconds

#        then restart

#     if 3 restarts within 5 cycles then timeout

#     depends on apache_bin

#     group server

#

check process apache with pidfile /usr/local/apache/logs/httpd.pid

    start program = "/usr/local/apache/bin/apachectl start" with timeout 60 seconds

    stop program = "/usr/local/apache/bin/apachectl stop"

    if failed host 119.254.72.248 port 80 protocol http then restart

    if cpu > 60% for 2 cycles then alert

    if cpu > 80% for 5 cycles then restart

#     if totalmem > 200.0 MB for 5 cycles then restart

    if children > 140 then restart

#     if loadavg(5min) greater than 10 for 8 cycles then stop

#

check process httpd with pidfile /usr/local/http_post/logs/httpd.pid

    start program = "/usr/local/http_post/bin/apachectl start" with timeout 60 seconds

    stop program = "/usr/local/http_post/bin/apachectl stop"

    if failed host 119.254.72.248 port 8080 protocol http then restart

#

###

# 检查Nginx服务进程

check process nginx with pidfile /usr/local/nginx/logs/nginx.pid

    start program = "/usr/local/nginx/bin/nginx start" with timeout 60 seconds

    stop program = "/usr/local/nginx/bin/nginx stop"

    if failed host 119.254.72.248 port 81 protocol http then restart

## Check filesystem permissions, uid, gid, space and inode usage. Other services,

## such as databases, may depend on this resource and an automatically graceful

## stop may be cascaded to them before the filesystem will become full and data

## lost.

#

# check filesystem datafs with path /dev/sdb1

# 检查系统磁盘空间使用情况并根据不同的设置来报警,一般情况下分两个级别,磁盘使用率达到70%、85%时,分别报警(下面的配置中,70%是磁盘空间使用率的阈值,85%是inode使用率的阈值)。主要是针对root、home、usr、var四个分区,尤其是home是邮件目录,占用磁盘空间比较大,磁盘空间上涨很快

check filesystem root with path /

    if space usage > 70% then alert

    if inode usage > 85% then alert

check filesystem home with path /home

    if space usage > 50% for 5 times within 15 cycles then alert

    if inode usage > 85% then alert

check filesystem usr with path /usr

    if space usage > 70% then alert

    if inode usage > 85% then alert

check filesystem var with path /var

    if space usage > 70% then alert

    if inode usage > 85% then alert

#     start program = "/bin/mount /data"

#     stop program = "/bin/umount /data"

#     if failed permission 660 then unmonitor

#     if failed uid root then unmonitor

#     if failed gid disk then unmonitor

#     if space usage > 50% for 5 times within 15 cycles then alert

#     if space usage > 99% then stop

#     if inode usage > 30000 then alert

#     if inode usage > 99% then stop

#     group server

#

#

## Check a file's timestamp. In this example, we test if a file is older

## than 15 minutes and assume something is wrong if its not updated. Also,

## if the file size exceed a given limit, execute a script

#

# check file database with path /data/mydatabase.db

#     if failed permission 700 then alert

#     if failed uid data then alert

#     if failed gid data then alert

#     if timestamp > 15 minutes then alert

#     if size > 100 MB then exec "/my/cleanup/script" as uid dba and gid dba

#

#

## Check directory permission, uid and gid. An event is triggered if the

## directory does not belong to the user with uid 0 and gid 0. In addition,

## the permissions have to match the octal description of 755 (see chmod(1)).

#

# check directory bin with path /bin

#     if failed permission 755 then unmonitor

#     if failed uid 0 then unmonitor

#     if failed gid 0 then unmonitor

#

#

## Check a remote host availability by issuing a ping test and check the

## content of a response from a web server. Up to three pings are sent and

## connection to a port and an application level network check is performed.

#

# check host myserver with address 192.168.1.1

#     if failed icmp type echo count 3 with timeout 3 seconds then alert

#     if failed port 3306 protocol mysql with timeout 15 seconds then alert

#     if failed url http://user:password@www.foo.bar:8080/?querystring

#        and content == 'action="j_security_check"'

#        then alert

#

#

###############################################################################

## Includes

###############################################################################

##

## It is possible to include additional configuration parts from other files or

## directories.

#

# include /etc/monit.d/*

#

未完待续……

<!--EndFragment-->

你可能感兴趣的:(it)