1、输入数据
1.2.3.4 - - [20/Feb/2016:00:05:11 +0800] "POST /zhifubao/zhifu HTTP/1.1" 200 1286
1.2.3.4 - - [20/Feb/2016:00:05:14 +0800] "POST /pay/zf HTTP/1.1" 200 96
2.2.3.4 - - [20/Feb/2016:00:05:15 +0800] "POST /zhifubao/zhifu HTTP/1.1" 200 1290
2.2.3.4 - - [20/Feb/2016:00:05:18 +0800] "POST /pay2/pay.do HTTP/1.1" 200 32
1.2.4.4 - - [20/Feb/2016:00:05:22 +0800] "POST /zhifubao/zhifu HTTP/1.1" 200 1285
1.2.3.5 - - [20/Feb/2016:00:05:23 +0800] "POST /zhifubao/zhifu HTTP/1.1" 200 1291
1.3.3.4 - - [20/Feb/2016:00:05:25 +0800] "POST /pay2/pay.do HTTP/1.1" 200 1976
2、正则表达式 取IP 日期 支付方式
package com.dt.java.test;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small demo that parses one Apache access-log line with a regular expression
 * and prints every capture group.
 *
 * <p>Capture groups for the sample line
 * {@code 11.5.41.3 - - [18/Feb/2016:00:00:55 +0800] "POST /zhifubao/pay HTTP/1.1" 200 1285}:
 * <pre>
 *  group(1)  11.5.41.3       first field (client IP in the sample)
 *  group(2)  -               second field
 *  group(3)  -               third field
 *  group(4)  18/Feb/2016     date
 *  group(5)  :00:00:55       time, including the leading ':'
 *  group(6)  +0800           zone offset
 *  group(7)  POST            request method
 *  group(8)  /zhifubao/pay   request path
 *  group(9)  HTTP/1.1        protocol
 *  group(10) 200             3-digit status code
 *  group(11) 1285            trailing numeric field
 * </pre>
 */
public class RhzfApacheLog {

    /**
     * Pattern for one access-log line.
     *
     * <p>The third field is separated from the opening '[' by a single space.
     * Without that space the pattern cannot match the sample line at all,
     * because {@code \S+} cannot consume the blank before '['.
     */
    private static final Pattern LOG_LINE = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([\\w/]+)([\\w:/]+)\\s([+\\-]\\d{4})\\]"
            + " \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)");

    /**
     * Matches a hard-coded sample log line and prints the whole match, each
     * capture group, and finally the pattern's capture-group count.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String s = "11.5.41.3 - - [18/Feb/2016:00:00:55 +0800] \"POST /zhifubao/pay HTTP/1.1\" 200 1285";
        Matcher m = LOG_LINE.matcher(s);
        // The pattern is anchored with '^', so at most one match is possible.
        if (m.find()) {
            // Whole match first, then every numbered group.
            System.out.println("m.group():" + m.group());
            for (int i = 1; i <= m.groupCount(); i++) {
                System.out.println("m.group(" + i + "):" + m.group(i));
            }
            System.out.println();
        }
        // groupCount() is a property of the pattern; it is valid even when nothing matched.
        System.out.println("捕获个数:groupCount()=" + m.groupCount());
    }
}
3、输出结果
1.1.1.2||18/Feb/2016||/zhifubao/zhifu 50000
1.1.1.2||19/Feb/2016||/zhifubao/zhifu 60000
1.1.1.3||18/Feb/2016||/zhifubao/zhifu 70000
1.1.1.3||19/Feb/2016||/zhifubao/zhifu 50000
1.1.1.1||18/Feb/2016||/zhifubao/zhifu 60000
1.1.1.1||19/Feb/2016||/zhifubao/zhifu 70000
1.1.1.2||18/Feb/2016||/zhifubao/zhifu 70000
1.1.1.2||19/Feb/2016||/zhifubao/zhifu 50000
4、源代码
package com.dtspark.hadoop.hellomapreduce;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class RhzfApacheURLLog {
public static class DataMapper
extends Mapper<LongWritable, Text, Text, LongWritable>{
private String pattern ="^(\\S+) (\\S+) (\\S+)\\[([\\w/]+)([\\w:/]+)\\s([+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)";
private Pattern p = Pattern.compile(pattern);
private LongWritable resultValue = new LongWritable(1);
private Text text =new Text();
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
System.out.println("Map Methond Invoked!!!");
String line =value.toString();
String result = handleLine(line);
if (result != null && result.length() > 0 ){
text.set(result);
context.write(text, resultValue);
}
}
private String handleLine(String line) {
String handResult = null;
if(line.length()>0){
Matcher m = p.matcher(line);
while(m.find())
{ String mip =m.group(1);
String mdate =m.group(4);
String malipay =m.group(8);
handResult = mip.trim() +"||"+mdate.trim() + "||"+ malipay.trim();
System.out.println("m.group(1):"+m.group(1)); //打印组1
System.out.println("m.group(4):"+m.group(4));
System.out.println("m.group(8):"+m.group(8));
}
}
return handResult;
}
}
public static class DataReducer
extends Reducer<Text,LongWritable,Text, LongWritable> {
private LongWritable totalresultValue = new LongWritable(1);
public void reduce(Text key, Iterable<LongWritable> values,
Context context
) throws IOException, InterruptedException {
System.out.println("Reduce Methond Invoked!!!" );
int total =0;
for (LongWritable item : values){
total += item.get();
}
totalresultValue.set(total);
context.write(key, totalresultValue);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: RhzfApacheURLLog <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "RhzfApacheURLLog");
job.setJarByClass(RhzfApacheURLLog.class);
job.setMapperClass(DataMapper.class);
job.setReducerClass(DataReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}