最近公司要做系统整体监控,所以我被分派写关于apache日志的分析,据说公司每天的access_log最大高达10G【约8千万行】,也不知道这个程序的处理效果如何。比较了PERL、sed、awk的速度发现,基本上从资源消耗上讲perl对RSS\SHARE\MEM的消耗大于sed、awk,但速度与CPU消耗明显小于2者。awk不多说了,什么都占用很大,而且一个字“慢”。最后发现sed的处理速度与CPU消耗基本与perl差不多,如处理1KW行数据时,perl耗时28秒,sed耗时42秒。所以最终选择了sed暂时处理该日志分析程序
 
apache原始日志格式如下:
 
2008.china.com.cn 172.16.20.73 - - [31/Jul/2008:16:52:05 +0800] "GET / HTTP/1.0" 304 - "-" "Wget/1.10.1 (Red Hat modified)"
==================================================================================
 
输出如下:
 
start_time
client_request
client_kbyte_out
sys_http200   
sys_http304   
sys_http403   
sys_http404   
sys_http500   
sys_http503
end_time
取5分钟内的平均值
==================================================================================
 
程序如下:
 
#!/bin/bash
#Program log.sh -- analyse an Apache access_log in 5-minute windows
#Version v1.2_3
#By IORI
#Create Date 2008-12-19 13:40
#Last Modify 2008-12-23 16:55

#################################################
interval=5    ##### seconds to sleep between two analysis rounds
#interval=300 ##### 300 seconds would be the theoretically suitable value
# BUG FIX: the original had no space between the FINAL_LOG value and its '###'
# comment, so the comment text was glued into the variable's value
# (a '#' only starts a comment at the beginning of a word).
MINUTE_LOG='./apache.tmp.log'  ### temp file holding the current 5-minute log slice
FINAL_LOG='./apache.final.log' ### prefix of the daily analysis log (date suffix appended)
#################################################
##################HELP FUNCTION###################定义帮助函数
# Print how the script must be invoked; it requires exactly one argument,
# the path to the apache access log to analyse.
# BUG FIX: the original printed "Usage:: $0  " -- doubled colon and no hint
# about the required argument.
help()
{
 echo "Usage: $0 <apache_access_log>"
}
#################HELP FUNCTION END###############
 
#################TEST $1 VALUE################### require exactly one command-line argument
if [ $# -eq 1 ]; then
  APACHE_LOG=$1   # path of the apache access log to analyse
else
  help
  exit 1
fi
#################################################
#################GET FILE STATUS#################################################
#FILE_STATUS_TIME=`stat -c %y $APACHE_LOG |awk -F '.' '{print $1}'`
#TIME_STRING=`date -d "$FILE_STATUS_TIME" +%s`
#################################################################################
#################PROGRAM START#####################################################
#################程序 开始##########################################################
 
if [ ! -f "$APACHE_LOG" ]; then # make sure the apache log file exists

  # BUG FIX: original message said "is exist" although this branch runs when
  # the file is MISSING.
  echo "You input $APACHE_LOG does not exist" && exit 2
else
  while :; do
    rm -f "$MINUTE_LOG"

    # Window-start timestamp. On the first pass it is seeded from the first
    # log line: "[31/Jul/2008:16:52:05 ..." -> "31 Jul 2008 16:52:05".
    # On later passes ${GET_TIME:-...} keeps the value set at the end of the
    # previous window.
    GET_TIME=${GET_TIME:-$(head -n 1 "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')}

    AWK_MINUTE=5 # window length in minutes; divides the per-second averages below

    # The log can have gaps, so probe for an end timestamp 5, 4, ... 0 minutes
    # after GET_TIME, shrinking the window until some end minute matches.
    for ((i = 300; i >= 0; i = i - 60)); do

      # "31 Jul 2008 16:52:05" -> "2008:16:52" (year:hour:minute).
      # NOTE(review): month/day are not part of this grep key, so a line from
      # the same hour:minute of a different day would also match -- behaviour
      # kept from the original.
      DATE_MINUTE=$(date -d "$GET_TIME" +%s | awk '{print strftime("%Y:%H:%M",$0)}')
      # same key shifted i seconds forward: candidate end of the window
      DATE_END_MINUTE=$(date -d "$GET_TIME" +%s | awk -v second="$i" '{print strftime("%Y:%H:%M",$0+second)}')

      # is the candidate end minute present in the log at all?
      if grep -qi "$DATE_END_MINUTE" "$APACHE_LOG"; then

        if [ "$DATE_MINUTE" != "$DATE_END_MINUTE" ]; then
          # Window spans several minutes: slice from the first line of the
          # start minute to the first line of the end minute.
          START_LINE=$(sed -n "/$DATE_MINUTE/=" "$APACHE_LOG" | head -n1)
          #END_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | tail -n1)
          END_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | head -n1)

          sed -n "${START_LINE},${END_LINE}p" "$APACHE_LOG" > "$MINUTE_LOG" # dump the window into the temp file

          # exact boundary timestamps, re-read from the boundary lines
          GET_START_TIME=$(sed -n "${START_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
          GET_END_TIME=$(sed -n "${END_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')

          START_TIME=$(date -d "$GET_START_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S",$0)}')
          END_TIME=$(date -d "$GET_END_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S",$0)}')

          # BUG FIX: the original had no space before the trailing '#' here,
          # so the comment was glued onto the filename argument
          # ("$APACHE_LOG#获得...") and sed read a nonexistent file.
          NEXT_LINE=$(sed -n "$((END_LINE + 1))p" "$APACHE_LOG") # first line after the window

          # the next window starts at that following line's timestamp
          GET_TIME=$(echo "$NEXT_LINE" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')

          break # window extracted -- leave the shrink loop

        else
          # Start and end fall inside the same minute: take every line of
          # that minute and average over 1 minute instead of 5.
          sed -n "/$DATE_END_MINUTE/p" "$APACHE_LOG" > "$MINUTE_LOG"
          START_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | head -n1)
          END_LINE=$(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | tail -n1)
          GET_START_TIME=$(sed -n "${START_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
          GET_END_TIME=$(sed -n "${END_LINE}p" "$APACHE_LOG" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
          START_TIME=$(date -d "$GET_START_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S",$0)}')
          END_TIME=$(date -d "$GET_END_TIME" +%s | awk '{print strftime("%Y-%m-%d %H:%M:%S",$0)}')
          NEXT_LINE=$(sed -n "$(($(sed -n "/$DATE_END_MINUTE/=" "$APACHE_LOG" | tail -n1) + 1))p" "$APACHE_LOG")
          GET_TIME=$(echo "$NEXT_LINE" | awk -F '[' '{print $2}' | awk '{print $1}' | sed 's#/# #g' | sed 's#:# #')
          AWK_MINUTE=1

          # BUG FIX: the original wrote "break#跳出..." with no space, which is
          # not the break builtin at all (bash looks up a command "break#...").
          break # leave the shrink loop
        fi

      else
        # end minute absent from the log: shrink the window by one minute
        AWK_MINUTE=$((AWK_MINUTE - 1))
        continue
      fi
    done # for ((i = 300; i >= 0; i = i - 60))

    #################### analyse the extracted 5-minute slice ####################
    DAY_LOG="${FINAL_LOG}_$(date +%Y_%m_%d)" # one output file per calendar day

    #start_time
    echo "start_time=$START_TIME" | tee -a "$DAY_LOG"
    #statistics client_request (requests per second)
    awk -v minute="$AWK_MINUTE" '{if($7~/GET|HEAD|POST|PUT/) count++}END{printf "client_request=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics client_kbyte_out (KB per second over 2xx/3xx responses)
    awk -v minute="$AWK_MINUTE" '{if($10~/20[0-6]|30[0-5]/)BYTE+=$11}END{printf "client_kbyte_out=%.4f KB ",BYTE/1024/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics sys_http200
    awk -v minute="$AWK_MINUTE" '{if($10~/200/) count++}END{printf "sys_http200=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics sys_http304
    awk -v minute="$AWK_MINUTE" '{if($10~/304/) count++}END{printf "sys_http304=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics sys_http403
    awk -v minute="$AWK_MINUTE" '{if($10~/403/) count++}END{printf "sys_http403=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics sys_http404
    awk -v minute="$AWK_MINUTE" '{if($10~/404/) count++}END{printf "sys_http404=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics sys_http500
    awk -v minute="$AWK_MINUTE" '{if($10~/500/) count++}END{printf "sys_http500=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #statistics sys_http503
    awk -v minute="$AWK_MINUTE" '{if($10~/503/) count++}END{printf "sys_http503=%.2f ",count/minute/60}' "$MINUTE_LOG" | tee -a "$DAY_LOG"
    #end_time
    echo "end_time=$END_TIME" | tee -a "$DAY_LOG"
    echo -e ' ' | tee -a "$DAY_LOG" # blank separator between windows

    ################## 5-minute window fully analysed ####################

    # BUG FIX: original "rm -f $MINUTE_LOG#等待..." had no space before '#',
    # so rm targeted a nonexistent path and the temp file was never removed.
    sleep "$interval" && rm -f "$MINUTE_LOG" # wait, then drop the temp slice
  done
fi
#################PROGRAM END###############################################

 
 
后记:在程序筛选过程中发觉,apache日志有不连续的状况,所以按照最初的思想START=1,END=1+300,
然后START=END,END=END+300的思想无法使用了,折腾了半天写了一个半残废的算法,大概实现了每5分钟内取日志后统计分析。