Original post: http://blog.fens.me/hadoop-mapreduce-log-kpi/
Data source: website access logs
The code first:
Parts of the original author's code were changed to run on a newer Hadoop release; I also wrote down the questions I ran into and the answers I found, so they are easy to recall quickly later.
package org.apache.hadoop.examples;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
public class Kpi {// bean that wraps one parsed access-log record and renders it as a String
private String remote_add;
private String remote_user;
private String time_local;
private String request;
private String statues;
private String body_bytes_sent;
private String http_referer;
private String http_user_agent;
private boolean valid = true;
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append("valid:"+this.valid);
sb.append("\nremote:_addr:"+this.remote_add);
sb.append("\nremote_user:"+this.remote_user);
sb.append("\ntime_local:"+this.time_local);
sb.append("\request:"+this.request);
sb.append("\nstatues:"+this.statues);
sb.append("\nbody_statues:"+this.body_bytes_sent);
sb.append("\nhttp_referer:"+this.http_referer);
sb.append("\nhttp_user_agent:"+this.http_user_agent);
return sb.toString();
}
public String getRemote_add() {
return remote_add;
}
public void setRemote_add(String remote_add) {
this.remote_add = remote_add;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return time_local;
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatues() {
return statues;
}
public void setStatues(String statues) {
this.statues = statues;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
if(http_referer.length()<8){
return http_referer;
}
String str = this.http_referer.replace("\"","").replace("http://", "").replace("https://", "");
return str.indexOf("/")>0?str.substring(0,str.indexOf("/")):str;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
public java.util.Date getTime_local_Date() throws ParseException{
SimpleDateFormat sdf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",Locale.US);
return sdf.parse(this.time_local);
}
public String getTime_local_Date_hour() throws ParseException{
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHH");
return sdf.format(this.getTime_local_Date());
}
public static void main(String args[]) {// quick test: verify that the sample log line is split into the expected fields
String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
System.out.println(line);
Kpi kpi = new Kpi();
String[] arr = line.split(" ");
kpi.setRemote_add(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatues(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
System.out.println(kpi);
try {
SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss", Locale.US);
System.out.println(df.format(kpi.getTime_local_Date()));
System.out.println(kpi.getTime_local_Date_hour());
System.out.println(kpi.getHttp_referer());
} catch (ParseException e) {
e.printStackTrace();
}
}
private static Kpi parser(String line) {
System.out.println(line);
Kpi kpi = new Kpi();
String[] arr = line.split(" ");
if (arr.length > 11) {
kpi.setRemote_add(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatues(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
if (arr.length > 12) {
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
kpi.setHttp_user_agent(arr[11]);
}
if (Integer.parseInt(kpi.getStatues()) >= 400) {// status >= 400 means an HTTP error
kpi.setValid(false);
}
} else {
kpi.setValid(false);
}
return kpi;
}
/**
* Group page views (PV) by page: keep only requests whose path is in the configured
* page set (the set itself is commented out below). A quick usage sketch follows the class.
*/
public static Kpi filterPVs(String line) {
Kpi kpi = parser(line);
/* Set pages = new HashSet();
pages.add("/about");
pages.add("/black-ip-list/");
pages.add("/cassandra-clustor/");
pages.add("/finance-rhive-repurchase/");
pages.add("/hadoop-family-roadmap/");
pages.add("/hadoop-hive-intro/");
pages.add("/hadoop-zookeeper-intro/");
pages.add("/hadoop-mahout-roadmap/");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}*/
return kpi;
}
}
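As a quick sanity check of the bean above, filterPVs can be fed a single raw log line and the parsed fields inspected. The small class below is a sketch added for illustration and is not part of the original post; the class name KpiQuickCheck and the shortened user-agent string are made up, and it assumes the class sits in the same package as Kpi:

package org.apache.hadoop.examples;
// Illustration only: parse one raw access-log line with the Kpi bean above and print
// the fields the MapReduce jobs rely on.
public class KpiQuickCheck {
    public static void main(String[] args) throws Exception {
        // one raw access-log line, same format as the sample in Kpi.main()
        String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "
                + "\"GET /images/my.jpg HTTP/1.1\" 200 19939 "
                + "\"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1)\"";
        Kpi kpi = Kpi.filterPVs(line);                     // parse (the page-set filter is currently commented out)
        System.out.println("valid: " + kpi.isValid());     // false when the status is >= 400 or the line is malformed
        System.out.println("page:  " + kpi.getRequest());  // /images/my.jpg
        System.out.println("hour:  " + kpi.getTime_local_Date_hour()); // 2013091806
        System.out.println("ref:   " + kpi.getHttp_referer());         // www.angularjs.cn
    }
}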
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.WordCount.IntSumReducer;
import org.apache.hadoop.examples.WordCount.TokenizerMapper;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class KpiPv {// MapReduce job that analyzes PV (page view) data for the specified pages
/*
* A MapReduce job runs in two stages, a map phase and a reduce phase. Each phase takes
* key-value pairs as input and emits key-value pairs as output, with types chosen by the
* programmer; the programmer's work is to write the map and reduce functions.
* In the map phase the input key is the byte offset of the current line from the start of
* the file. It is rarely needed and can usually be ignored. Record filtering can be done in map.
* Hadoop defines its own set of basic types that can be serialized for the network, which
* makes RPC and related machinery easier, so the built-in Java types are not used. Roughly,
* Text corresponds to Java's String and IntWritable to Integer.
* The input to map is a Text-like object, not a File object.
* 1. How does the input file reach the job? On the client we call
* FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileInputFormat is an
* InputFormat, and InputFormat declares a getSplits() method, so the splitting is already
* done before map runs.
* 2. How are the computed splits handed to map, and how are the word counts accumulated?
* InputFormat's other method, createRecordReader(), returns a RecordReader: an abstract class
* that reads key-value pairs one at a time from an input split. Its central method is
* nextKeyValue(), which advances to the next K-V pair of the split.
* Hadoop divides the input into roughly equal-sized chunks called input splits and creates
* one map task per split; that task runs the user-defined map function over every record in
* the split. (See the sketch methods below.)
*/
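// --- Illustration only (not part of the original post) ---
// The two private helpers below sketch how the pieces mentioned in the comment above fit
// together; the method names and parameters are made up for this note.
// (1) The client registers the input/output paths on the Job; the InputFormat later calls
// getSplits() over the input paths, and one map task is scheduled per split.
private static void sketchDriverWiring(Job job, String inputDir, String outputDir) throws IOException {
    FileInputFormat.addInputPath(job, new Path(inputDir));    // where getSplits() will look
    FileOutputFormat.setOutputPath(job, new Path(outputDir)); // where the reduce output goes
}
// (2) For each split, the InputFormat's createRecordReader() supplies a RecordReader; the
// framework conceptually runs a loop like this and feeds every pair to the map() function.
private static <K, V> void sketchSplitReadLoop(org.apache.hadoop.mapreduce.RecordReader<K, V> reader)
        throws IOException, InterruptedException {
    while (reader.nextKeyValue()) {
        K key = reader.getCurrentKey();     // for text input: the line's byte offset (LongWritable)
        V value = reader.getCurrentValue(); // for text input: the line itself (Text)
        // ... the framework would call map(key, value, context) here ...
    }
}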
public static class CountMapper extends
Mapper