一、项目要求
二、需求分析: KPI指标设计
PV(PageView): 页面访问量统计
IP: 页面独立IP的访问量统计
Time: 用户每小时PV的统计
Source: 用户来源域名的统计
Browser: 用户的访问设备统计
以下我着重分析浏览器统计
三、分析过程
1、 日志的一条nginx记录内容
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939
"http://www.angularjs.cn/A00n"
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
2、对上面的日志记录进行分析
remote_addr : 记录客户端的ip地址, 222.68.172.190
remote_user : 记录客户端的用户名, -
time_local: 记录访问时间与时区, [18/Sep/2013:06:49:57 +0000]
request: 记录请求的url与http协议, "GET /images/my.jpg HTTP/1.1"
status: 记录请求状态,成功是200, 200
body_bytes_sent: 记录发送给客户端文件主体内容大小, 19939
http_referer: 用来记录从哪个页面链接访问过来的, "http://www.angularjs.cn/A00n"
http_user_agent: 记录客户浏览器的相关信息, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
3、java语言分析上面一条日志记录(使用空格切分)
1 |
// Demo: tokenize one raw nginx access-log record on single spaces and
// print each token with its index, to discover which field lands where.
String line = "222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] \"GET /images/my.jpg HTTP/1.1\" 200 19939 \"http://www.angularjs.cn/A00n\" \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36\"";
String[] elementList = line.split(" ");
int index = 0;
for (String element : elementList) {
    System.out.println(index + " : " + element);
    index++;
}
测试结果:
01 |
0 : 222.68 . 172.190 |
02 |
1 : - |
03 |
2 : - |
04 |
3 : [ 18 /Sep/ 2013 : 06 : 49 : 57 |
05 |
4 : + 0000 ] |
06 |
5 : "GET |
07 |
6 : /images/my.jpg |
08 |
7 : HTTP/ 1.1 " |
09 |
8 : 200 |
10 |
9 : 19939 |
11 |
10 : "http://www.angularjs.cn/A00n" |
12 |
11 : "Mozilla/ 5.0 |
13 |
12 : (Windows |
14 |
13 : NT |
15 |
14 : 6.1 ) |
16 |
15 : AppleWebKit/ 537.36 |
17 |
16 : (KHTML, |
18 |
17 : like |
19 |
18 : Gecko) |
20 |
19 : Chrome/ 29.0 . 1547.66 |
21 |
20 : Safari/ 537.36 " |
01 |
/**
 * Value object holding one parsed nginx access-log record.
 * Field names deliberately mirror the nginx log variable names.
 */
public class Kpi {

    private String remote_addr;     // client IP address
    private String remote_user;     // client user name; "-" when absent
    private String time_local;      // access time and timezone
    private String request;         // requested URL (and, in raw form, HTTP protocol)
    private String status;          // HTTP status code; 200 means success
    private String body_bytes_sent; // size of the body sent to the client
    private String http_referer;    // page the visit was linked from
    private String http_user_agent; // client browser information
    private String method;          // request method: GET / POST
    private String http_version;    // HTTP protocol version

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getRemote_user() {
        return remote_user;
    }

    public void setRemote_user(String remote_user) {
        this.remote_user = remote_user;
    }

    public String getTime_local() {
        return time_local;
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public String getMethod() {
        return method;
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public String getHttp_version() {
        return http_version;
    }

    public void setHttp_version(String http_version) {
        this.http_version = http_version;
    }

    @Override
    public String toString() {
        return "Kpi [remote_addr=" + remote_addr + ", remote_user="
                + remote_user + ", time_local=" + time_local + ", request="
                + request + ", status=" + status + ", body_bytes_sent="
                + body_bytes_sent + ", http_referer=" + http_referer
                + ", http_user_agent=" + http_user_agent + ", method=" + method
                + ", http_version=" + http_version + "]";
    }
}
01 |
package org.aaa.kpi; |
02 |
|
03 |
public class KpiUtil { |
04 |
/*** |
05 |
* line记录转化成kpi对象 |
06 |
* @param line 日志的一条记录 |
07 |
* @author tianbx |
08 |
* */ |
09 |
public static Kpi transformLineKpi(String line){ |
10 |
String[] elementList = line.split( " " ); |
11 |
Kpi kpi = new Kpi(); |
12 |
kpi.setRemote_addr(elementList[ 0 ]); |
13 |
kpi.setRemote_user(elementList[ 1 ]); |
14 |
kpi.setTime_local(elementList[ 3 ].substring( 1 )); |
15 |
kpi.setMethod(elementList[ 5 ].substring( 1 )); |
16 |
kpi.setRequest(elementList[ 6 ]); |
17 |
kpi.setHttp_version(elementList[ 7 ]); |
18 |
kpi.setStatus(elementList[ 8 ]); |
19 |
kpi.setBody_bytes_sent(elementList[ 9 ]); |
20 |
kpi.setHttp_referer(elementList[ 10 ]); |
21 |
kpi.setHttp_user_agent(elementList[ 11 ] + " " + elementList[ 12 ]); |
22 |
return kpi; |
23 |
} |
24 |
} |
6、算法模型: 并行算法
Browser: 用户的访问设备统计
– Map: {key:$http_user_agent,value:1}
– Reduce: {key:$http_user_agent,value:求和(sum)}
7、map-reduce分析代码
01 |
import java.io.IOException; |
02 |
import java.util.Iterator; |
03 |
|
04 |
import org.apache.hadoop.fs.Path; |
05 |
import org.apache.hadoop.io.IntWritable; |
06 |
import org.apache.hadoop.io.Text; |
07 |
import org.apache.hadoop.mapred.FileInputFormat; |
08 |
import org.apache.hadoop.mapred.FileOutputFormat; |
09 |
import org.apache.hadoop.mapred.JobClient; |
10 |
import org.apache.hadoop.mapred.JobConf; |
11 |
import org.apache.hadoop.mapred.MapReduceBase; |
12 |
import org.apache.hadoop.mapred.Mapper; |
13 |
import org.apache.hadoop.mapred.OutputCollector; |
14 |
import org.apache.hadoop.mapred.Reducer; |
15 |
import org.apache.hadoop.mapred.Reporter; |
16 |
import org.apache.hadoop.mapred.TextInputFormat; |
17 |
import org.apache.hadoop.mapred.TextOutputFormat; |
18 |
import org.hmahout.kpi.entity.Kpi; |
19 |
import org.hmahout.kpi.util.KpiUtil; |
20 |
|
21 |
import cz.mallat.uasparser.UASparser; |
22 |
import cz.mallat.uasparser.UserAgentInfo; |
23 |
|
24 |
public class KpiBrowserSimpleV { |
25 |
|
26 |
public static class KpiBrowserSimpleMapper extends MapReduceBase |
27 |
implements Mapper<Object, Text, Text, IntWritable> { |
28 |
UASparser parser = null ; |
29 |
@Override |
30 |
public void map(Object key, Text value, |
31 |
OutputCollector<Text, IntWritable> out, Reporter reporter) |
32 |
throws IOException { |
33 |
Kpi kpi = KpiUtil.transformLineKpi(value.toString()); |
34 |
|
35 |
if (kpi!= null && kpi.getHttP_user_agent_info()!= null ){ |
36 |
if (parser== null ){ |
37 |
parser = new UASparser(); |
38 |
} |
39 |
UserAgentInfo info = |
40 |
parser.parseBrowserOnly(kpi.getHttP_user_agent_info()); |
41 |
if ( "unknown" .equals(info.getUaName())){ |
42 |
out.collect( new Text(info.getUaName()), new IntWritable( 1 )); |
43 |
} else { |
44 |
out.collect( new Text(info.getUaFamily()), new IntWritable( 1 )); |
45 |
} |
46 |
|
47 |
} |
48 |
} |
49 |
} |
50 |
|
51 |
public static class KpiBrowserSimpleReducer extends MapReduceBase implements |
52 |
Reducer<Text, IntWritable, Text, IntWritable>{ |
53 |
|
54 |
@Override |
55 |
public void reduce(Text key, Iterator<IntWritable> value, |
56 |
OutputCollector<Text, IntWritable> out, Reporter reporter) |
57 |
throws IOException { |
58 |
IntWritable sum = new IntWritable( 0 ); |
59 |
while (value.hasNext()){ |
60 |
sum.set(sum.get()+value.next().get()); |
61 |
} |
62 |
out.collect(key, sum); |
63 |
} |
64 |
} |
65 |
public static void main(String[] args) throws IOException { |
66 |
String input = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/input" ; |
67 |
String output = "hdfs://127.0.0.1:9000/user/tianbx/log_kpi/browerSimpleV" ; |
68 |
JobConf conf = new JobConf(KpiBrowserSimpleV. class ); |
69 |
conf.setJobName( "KpiBrowserSimpleV" ); |
70 |
String url = "classpath:" ; |
71 |
conf.addResource(url+ "/hadoop/core-site.xml" ); |
72 |
conf.addResource(url+ "/hadoop/hdfs-site.xml" ); |
73 |
conf.addResource(url+ "/hadoop/mapred-site.xml" ); |
74 |
|
75 |
conf.setMapOutputKeyClass(Text. class ); |
76 |
conf.setMapOutputValueClass(IntWritable. class ); |
77 |
|
78 |
conf.setOutputKeyClass(Text. class ); |
79 |
conf.setOutputValueClass(IntWritable. class ); |
80 |
|
81 |
conf.setMapperClass(KpiBrowserSimpleMapper. class ); |
82 |
conf.setCombinerClass(KpiBrowserSimpleReducer. class ); |
83 |
conf.setReducerClass(KpiBrowserSimpleReducer. class ); |
84 |
|
85 |
conf.setInputFormat(TextInputFormat. class ); |
86 |
conf.setOutputFormat(TextOutputFormat. class ); |
87 |
|
88 |
FileInputFormat.setInputPaths(conf, new Path(input)); |
89 |
FileOutputFormat.setOutputPath(conf, new Path(output)); |
90 |
|
91 |
JobClient.runJob(conf); |
92 |
System.exit( 0 ); |
93 |
} |
94 |
|
95 |
} |
8、输出文件log_kpi/browerSimpleV内容
AOL Explorer 1
Android Webkit 123
Chrome 4867
CoolNovo 23
Firefox 1700
Google App Engine 5
IE 1521
Jakarta Commons-HttpClient 3
Maxthon 27
Mobile Safari 273
Mozilla 130
Openwave Mobile Browser 2
Opera 2
Pale Moon 1
Python-urllib 4
Safari 246
Sogou Explorer 157
unknown 4685
9、R制作图片
# Load the (browser, count) pairs produced by the MapReduce job.
# NOTE(review): "borwer.txt" looks like a typo for "browser.txt", and the job
# output shown above appears whitespace-separated rather than comma-separated
# — confirm the file name and sep before running.
data<-read.table(file="borwer.txt",header=FALSE,sep=",")
names(data)<-c("borwer","num")
# Bar chart of hit count per browser (qplot is provided by ggplot2).
qplot(borwer,num,data=data,geom="bar")
四、待解决的问题
1、排除爬虫和程序点击,对抗作弊
解决的方法:页面做个检测鼠标是否动。
2、浏览量怎么排除图片?
3、浏览量怎么排除假点击?
4、从哪一个搜索引擎访问的?
5、点击哪一个关键词(keyword)访问的?
6、从哪一个地方访问的?
7、使用哪一个浏览器访问的?