今天照着《hadoop实战》写了个小例子,解析tomcat日志统计各个浏览器的访问次数
package com.ice.stat;

import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that parses Tomcat access logs (comma-separated, 9 fields,
 * with the user-agent as the trailing field) and counts requests per browser
 * family: chrome / safari / firefox / other.
 *
 * <p>Usage: {@code hadoop jar tomcatLog.jar <input-path> <output-path>}
 */
public class TomcatLog {

    /** Emits (browser-family, 1) for every log line that matches the expected format. */
    static class TomcatMapper extends Mapper<Object, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        // Nine comma-separated fields; group 9 captures the rest of the line because
        // the user-agent string itself contains commas.
        private static final Pattern LOG_PATTERN = Pattern.compile(
                "([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),(.*)");

        // Reused output key — avoids allocating a Text per input record.
        private final Text browser = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            Matcher m = LOG_PATTERN.matcher(value.toString());
            if (!m.matches()) {
                return; // malformed line — skip silently
            }
            // Locale.ROOT keeps case-folding stable regardless of the JVM's default
            // locale (e.g. the Turkish dotless-I problem).
            String agent = m.group(9).toLowerCase(Locale.ROOT);
            // Order matters: a Chrome user-agent also contains "safari",
            // so "chrome" must be tested first.
            if (agent.contains("chrome")) {
                browser.set("chrome");
            } else if (agent.contains("safari")) {
                browser.set("safari");
            } else if (agent.contains("firefox")) {
                browser.set("firefox");
            } else {
                browser.set("other");
            }
            context.write(browser, ONE);
        }
    }

    /** Sums the per-browser counts emitted by the mapper. */
    static class TomcatReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value — avoids allocating an IntWritable per key.
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    /**
     * Configures and submits the job. Expects exactly two arguments:
     * the input path and the (non-existent) output path.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("参数个数不对");
            System.exit(-1);
        }
        Job job = new Job();
        job.setJarByClass(TomcatLog.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(TomcatMapper.class);
        // The sum reducer is associative and commutative, so it doubles as a
        // combiner to cut map-side shuffle traffic.
        job.setCombinerClass(TomcatReducer.class);
        job.setReducerClass(TomcatReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
日志样例:
172.16.2.12,-,-,[06/Sep/2011:10:03:13 +0800],GET /icestat/jpivot/toolbar/sort-asc-up.png HTTP/1.1,200,336,-,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2) 172.16.2.12,-,-,[06/Sep/2011:09:48:17 +0800],GET /icestat/ HTTP/1.1,200,171,http://10.65.11.241:8080/icestat/,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.43 Safari/534.24
把日志放入hdfs
hadoop fs -put icestat_access_log.2011-09-06.txt icestat_access_log.2011-09-06.txt
分析日志
hadoop jar tomcatLog.jar icestat_access_log.2011-09-06.txt output6
查看输出
[root@xxx hadoop-0.20.2]# hadoop fs -cat output6/part-r-00000 11/09/06 00:18:54 WARN conf.Configuration: DEPRECATED: hadoop-site.xml found in the classpath. Usage of hadoop-site.xml is deprecated. Instead use core-site.xml, mapred-site.xml and hdfs-site.xml to override properties of core-default.xml, mapred-default.xml and hdfs-default.xml respectively 11/09/06 00:18:54 WARN fs.FileSystem: "xxx :9000" is a deprecated filesystem name. Use "hdfs://xxx :9000/" instead. 11/09/06 00:18:54 WARN fs.FileSystem: "xxx :9000" is a deprecated filesystem name. Use "hdfs://xxx :9000/" instead. 11/09/06 00:18:54 WARN fs.FileSystem: "xxx :9000" is a deprecated filesystem name. Use "hdfs://xxx :9000/" instead. chrome 58 firefox 23 other 49 safari 5