pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.yinbodotcc</groupId>
  <artifactId>countwords</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>countwords</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.version>3.0.3</hadoop.version>
    <!-- pig.version and hbase.version are inferred names for the bare
         0.13.1 and 0.98.6-hadoop2 values -->
    <pig.version>0.13.1</pig.version>
    <hbase.version>0.98.6-hadoop2</hbase.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.mrunit</groupId>
      <artifactId>mrunit</artifactId>
      <version>1.1.0</version>
      <classifier>hadoop2</classifier>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client-minicluster</artifactId>
      <version>3.0.3</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>3.0.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.5.0</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
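With this POM in place, the job jar can be built with the standard Maven lifecycle (assuming Maven 3 and a JDK 8 are installed):

mvn clean package

Maven writes the jar to target/countwords-0.0.1-SNAPSHOT.jar, which packages the four classes listed below.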
1. The custom Writable data type LogWritable
package chapter4;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * HTTP server log processing sample from Chapter 4 of the Hadoop MapReduce
 * Cookbook.
 *
 * @author Thilina Gunarathne
 */
public class LogWritable implements WritableComparable<LogWritable> {

    private Text userIP, timestamp, request;
    private IntWritable responseSize, status;

    public LogWritable() {
        this.userIP = new Text();
        this.timestamp = new Text();
        this.request = new Text();
        this.responseSize = new IntWritable();
        this.status = new IntWritable();
    }

    public void set(String userIP, String timestamp, String request, int bytes, int status) {
        this.userIP.set(userIP);
        this.timestamp.set(timestamp);
        this.request.set(request);
        this.responseSize.set(bytes);
        this.status.set(status);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read back in exactly the order they were written.
        userIP.readFields(in);
        timestamp.readFields(in);
        request.readFields(in);
        responseSize.readFields(in);
        status.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        userIP.write(out);
        timestamp.write(out);
        request.write(out);
        responseSize.write(out);
        status.write(out);
    }

    @Override
    public int compareTo(LogWritable o) {
        // Order primarily by user IP, breaking ties with the timestamp.
        if (userIP.compareTo(o.userIP) == 0) {
            return timestamp.compareTo(o.timestamp);
        }
        return userIP.compareTo(o.userIP);
    }
    @Override
    public boolean equals(Object o) {
        // Added for consistency with compareTo and hashCode; two entries
        // compare equal when their user IP and timestamp match.
        if (!(o instanceof LogWritable)) {
            return false;
        }
        LogWritable other = (LogWritable) o;
        return userIP.equals(other.userIP) && timestamp.equals(other.timestamp);
    }

    @Override
    public int hashCode() {
        // Hash on the user IP only, so a hash partitioner groups all
        // entries of one IP into the same reduce partition.
        return userIP.hashCode();
    }
    public Text getUserIP() {
        return userIP;
    }

    public Text getTimestamp() {
        return timestamp;
    }

    public Text getRequest() {
        return request;
    }

    public IntWritable getResponseSize() {
        return responseSize;
    }

    public IntWritable getStatus() {
        return status;
    }
}
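Hadoop serializes a custom Writable by calling write() and rebuilds it with readFields(), which is why both methods must handle the fields in the same order. To see that contract outside of a job, the record can be round-tripped by hand. A minimal local sketch (the class name LogWritableRoundTrip and the sample values are illustrative, not part of the recipe):

package chapter4;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class LogWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        LogWritable original = new LogWritable();
        original.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "GET /history/apollo/ HTTP/1.0", 6245, 200);

        // Serialize exactly as the MapReduce framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and verify the fields survived.
        LogWritable copy = new LogWritable();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.getUserIP() + " -> " + copy.getResponseSize());
    }
}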
2. The Mapper implementation class LogProcessorMap
package chapter4;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * HTTP server log processing sample from Chapter 4 of the Hadoop MapReduce
 * Cookbook.
 *
 * @author Thilina Gunarathne
 */
public class LogProcessorMap extends Mapper<LongWritable, Text, Text, LogWritable> {

    // Compile the log pattern once per task rather than once per record.
    private static final Pattern LOG_ENTRY_PATTERN = Pattern.compile(
            "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)");

    private final LogWritable outValue = new LogWritable();
    private final Text outKey = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        Matcher matcher = LOG_ENTRY_PATTERN.matcher(value.toString());
        if (!matcher.matches()) {
            // Skip malformed records instead of failing the task.
            System.err.println("Bad Record : " + value);
            return;
        }
        String userIP = matcher.group(1);
        String timestamp = matcher.group(4);
        String request = matcher.group(5);
        int status = Integer.parseInt(matcher.group(6));
        int bytes = Integer.parseInt(matcher.group(7));
        outKey.set(userIP);
        outValue.set(userIP, timestamp, request, bytes, status);
        context.write(outKey, outValue);
    }
}
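The regular expression follows the Apache Common Log Format: group 1 is the client IP, groups 2 and 3 the identity and user fields, group 4 the bracketed timestamp, group 5 the quoted request line, group 6 the status code, and group 7 the response size in bytes. A standalone check against a line in the NASA log's format (the class LogPatternCheck is a hypothetical helper, not part of the recipe):

package chapter4;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogPatternCheck {
    public static void main(String[] args) {
        String logEntryPattern = "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)";
        String line = "199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "
                + "\"GET /history/apollo/ HTTP/1.0\" 200 6245";
        Matcher m = Pattern.compile(logEntryPattern).matcher(line);
        if (m.matches()) {
            // Prints: IP=199.72.81.55, time=01/Jul/1995:00:00:01 -0400, ...
            System.out.println("IP=" + m.group(1) + ", time=" + m.group(4)
                    + ", request=" + m.group(5) + ", status=" + m.group(6)
                    + ", bytes=" + m.group(7));
        }
    }
}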
3. The Reducer implementation class LogProcessorReduce
package chapter4;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * HTTP server log processing sample from Chapter 4 of the Hadoop MapReduce
 * Cookbook.
 *
 * @author Thilina Gunarathne
 */
public class LogProcessorReduce extends
        Reducer<Text, LogWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<LogWritable> values,
            Context context) throws IOException, InterruptedException {
        // Sum the response sizes of all log entries for this IP address.
        int sum = 0;
        for (LogWritable val : values) {
            sum += val.getResponseSize().get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
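Because the POM declares MRUnit in test scope, the reducer logic can be verified without a cluster. A sketch of such a test, assuming JUnit 4 is on the test classpath (MRUnit depends on it); the input records are illustrative:

package chapter4;

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class LogProcessorReduceTest {

    @Test
    public void reducerSumsResponseSizes() throws Exception {
        LogWritable first = new LogWritable();
        first.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "GET /history/apollo/ HTTP/1.0", 6245, 200);
        LogWritable second = new LogWritable();
        second.set("199.72.81.55", "01/Jul/1995:00:00:09 -0400",
                "GET /images/ksclogo-medium.gif HTTP/1.0", 5866, 200);

        // Two log entries for the same IP should reduce to the total byte count.
        ReduceDriver.newReduceDriver(new LogProcessorReduce())
                .withInput(new Text("199.72.81.55"), Arrays.asList(first, second))
                .withOutput(new Text("199.72.81.55"), new IntWritable(6245 + 5866))
                .runTest();
    }
}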
4. The driver class LogProcessor, containing the main function
package chapter4;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class LogProcessor extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new LogProcessor(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println(
                    "Usage: LogProcessor <input_path> <output_path> <num_reduce_tasks>");
            return -1;
        }
        /* input parameters */
        String inputPath = args[0];
        String outputPath = args[1];
        int numReduce = Integer.parseInt(args[2]);

        Job job = Job.getInstance(getConf(), "log-analysis");
        job.setJarByClass(LogProcessor.class);
        job.setMapperClass(LogProcessorMap.class);
        job.setReducerClass(LogProcessorReduce.class);
        job.setNumReduceTasks(numReduce);
        // The map output types differ from the final output types,
        // so both pairs must be declared explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LogWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job.waitForCompletion(true) ? 0 : 1;
    }
}
Input file download location: ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz. After downloading, decompress the file it contains into the input directory.
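Once the decompressed log is in the input directory (on HDFS for a cluster run), the job can be submitted with hadoop jar; the paths and the reducer count below are illustrative:

hadoop jar target/countwords-0.0.1-SNAPSHOT.jar chapter4.LogProcessor input output 1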