"05/Jul/2015:00:01:04 +0800" "GET" "http%3A//jf.10086.cn/m/" "HTTP/1.1" "200" "http://jf.10086.cn/m/subject/100000000000009_0.html" "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; Lenovo A3800-d Build/LenovoA3800-d) AppleWebKit/533.1 (KHTML, like Gecko)Version/4.0 MQQBrowser/5.4 TBS/025438 Mobile Safari/533.1 MicroMessenger/6.2.0.70_r1180778.561 NetType/cmnet Language/zh_CN" "10.139.198.176" "480x854" "24" "%u5927%u7C7B%u5217%u8868%u9875_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3037487029517069460000" "3037487029517069460000" "1" "75"
"05/Jul/2015:00:01:04 +0800"
"GET" "http%3A//jf.10086.cn/portal/ware/web/SearchWareAction%3Faction%3DsearchWareInfo%26pager.offset%3D144" "HTTP/1.1" "200" "http://jf.10086.cn/portal/ware/web/SearchWareAction?action=searchWareInfo&pager.offset=156" "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-CN; HUAWEI MT2-L01 Build/HuaweiMT2-L01) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 UCBrowser/10.5.2.598 U3/0.8.0 Mobile Safari/534.30" "223.73.104.224" "720x1208" "32" "%u641C%u7D22_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3046252153674140570000" "3046252153674140570000" "1" "2699"
"05/Jul/2015:00:01:04 +0800"
"GET" "" "HTTP/1.1" "200" "http://jf.10086.cn/" "Mozilla/5.0 (Linux; Android 4.4.4; vivo Y13L Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36 baiduboxapp/5.1 (Baidu; P1 4.4.4)" "10.154.210.240" "480x855" "32" "%u9996%u9875_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE" "0" "3098781670304015290000" "3098781670304015290000" "0" "831"
| No. | Field name | Type | Meaning | Example |
| --- | --- | --- | --- | --- |
| 0 | time_local | string | Access time | 05/Jul/2015:05:01:05 +0800 |
| 1 | request_method | string | Request method | GET |
| 2 | arg_referrerPage | string | Page visited before the current page | http%3A//wap.jf.10086.cn/ |
| 3 | server_protocol | string | Protocol | HTTP/1.1 |
| 4 | status | string | Response status | 200 |
| 5 | http_referer | string | Requested page | http://jf.10086.cn/ |
| 6 | http_user_agent | string | User agent | Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Mobile/11D257 |
| 7 | http_x_forwarded_for | string | Client IP address | 117.95.112.54 |
| 8 | screenSize | string | Screen resolution | 320x568 |
| 9 | screenColor | string | Color depth | 32 |
| 10 | pageTitle | string | Page title | %u9996%u9875_%u4E2D%u56FD%u79FB%u52A8%u79EF%u5206%u5546%u57CE |
| 11 | siteType | string | Channel (0 = web, 1 = mobile) | 0 |
| 12 | uid | string | Unique visitor ID (uid) | 3011949129193080000000 |
| 13 | sid | string | Session ID (sid) | 3011949129193080000000 |
| 14 | sflag | string | Session flag (1 = new, 0 = update) | 1 |
| 15 | onloadTotalTime | string | Page load time | 452 |
| 16 | access_day | string | Partition column | |
create external table data_collect(
  time_local string,
  request_method string,
  referrerPage string,
  server_protocol string,
  status string,
  http_referer string,
  http_user_agent string,
  http_x_forwarded_for string,
  screenSize string,
  screenColor string,
  pageTitle string,
  siteType string,
  uid string,
  sid string,
  sflag string,
  onloadTotalTime string
)
partitioned by (access_day string)
row format SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  'input.regex'='"(.*?)[\\s][+0-9]*"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*"(.*?)"[\\s]*',
  'output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s')
STORED AS TEXTFILE;
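Before loading data it can help to confirm that the SerDe regex really splits a record into 16 columns. The stand-alone sketch below is not part of the original walkthrough; the class name RegexCheck and the shortened sample record are made up for illustration. It applies the same expression that is passed to RegexSerDe, written as a Java string literal:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexCheck {
    public static void main(String[] args) {
        // Same expression as 'input.regex' above: a quoted timestamp (the "+0800"
        // offset is dropped) followed by 15 more quoted fields.
        StringBuilder regex = new StringBuilder("\"(.*?)[\\s][+0-9]*\"");
        for (int i = 0; i < 15; i++) {
            regex.append("[\\s]*\"(.*?)\"");
        }
        regex.append("[\\s]*");

        // Shortened sample record with the same 16-field layout as the real log lines.
        String sample = "\"05/Jul/2015:00:01:04 +0800\" \"GET\" \"\" \"HTTP/1.1\" \"200\" "
                + "\"http://jf.10086.cn/\" \"Mozilla/5.0\" \"10.154.210.240\" \"480x855\" \"32\" "
                + "\"title\" \"0\" \"uid\" \"sid\" \"0\" \"831\"";

        Matcher m = Pattern.compile(regex.toString()).matcher(sample);
        if (m.matches()) {
            for (int g = 1; g <= m.groupCount(); g++) {
                System.out.println(g + " -> " + m.group(g));  // prints the 16 extracted columns
            }
        } else {
            System.out.println("record does not match the regex");
        }
    }
}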
load data local inpath '/home/hadoop/20150705' overwrite into table data_collect partition(access_day='20150705');
hive (jfyun)> add jar /home/hadoop/app/hive/lib/hive-contrib-0.13.0.jar;
hive (jfyun)> select time_local ,request_method,onloadTotalTime ,access_day from data_collect;
time_local                  request_method  onloadtotaltime  access_day
06/Jul/2015:00:01:04 +0800  GET             75               20150706
06/Jul/2015:01:01:04 +0800  GET             2699             20150706
06/Jul/2015:02:01:04 +0800  GET             831              20150706
06/Jul/2015:03:01:07 +0800  GET             135              20150706
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.UDF;

public class IpToProv extends UDF {

    // Area-code -> province mapping (see the province code list below)
    public static Map<String, String> map = new HashMap<String, String>();
    static {
        map.put("100", "北京");
        map.put("200", "广东");
        map.put("210", "上海");
        map.put("220", "天津");
        map.put("230", "重庆");
        map.put("240", "辽宁");
        map.put("250", "江苏");
        map.put("270", "湖北");
        map.put("280", "四川");
        map.put("290", "陕西");
        map.put("311", "河北");
        map.put("351", "山西");
        map.put("371", "河南");
        map.put("431", "吉林");
        map.put("451", "黑龙江");
        map.put("471", "内蒙古");
        map.put("531", "山东");
        map.put("551", "安徽");
        map.put("571", "浙江");
        map.put("591", "福建");
        map.put("731", "湖南");
        map.put("771", "广西");
        map.put("791", "江西");
        map.put("851", "贵州");
        map.put("871", "云南");
        map.put("891", "西藏");
        map.put("898", "海南");
        map.put("931", "甘肃");
        map.put("951", "宁夏");
        map.put("971", "青海");
        map.put("991", "新疆");
    }

    /**
     * Convert an IP address to a province name.
     *
     * @param ip client IP address
     * @return province name, or "北京" when the code is not in the map
     */
    public String evaluate(String ip) {
        // IPSeeker is an external IP-location helper; it returns an area code for the IP
        IPSeeker ipSeeker = new IPSeeker();
        String lookup = ipSeeker.lookup(ip, "999");
        return map.get(lookup) == null ? "北京" : map.get(lookup);
    }

    public static void main(String[] args) {
        IpToProv area = new IpToProv();
        String evaluate = area.evaluate("180.155.87.248");
        System.out.println(evaluate);
    }
}
add jar /home/hadoop/IpToProv.jar;
create temporary function ipToProv as 'cn.hive.IpToProv';
select ipToProv(http_x_forwarded_for) from data_collect limit 1;
select ipToProv(http_x_forwarded_for) province_name, count(sid) count
from data_collect
group by ipToProv(http_x_forwarded_for)
order by count desc
limit 10;
Note: province code list

| No. | Code | Province |
| --- | --- | --- |
| 1 | 100 | 北京 |
| 2 | 200 | 广东 |
| 3 | 210 | 上海 |
| 4 | 220 | 天津 |
| 5 | 230 | 重庆 |
| 6 | 240 | 辽宁 |
| 7 | 250 | 江苏 |
| 8 | 270 | 湖北 |
| 9 | 280 | 四川 |
| 10 | 290 | 陕西 |
| 11 | 311 | 河北 |
| 12 | 351 | 山西 |
| 13 | 371 | 河南 |
| 14 | 431 | 吉林 |
| 15 | 451 | 黑龙江 |
| 16 | 471 | 内蒙古 |
| 17 | 531 | 山东 |
| 18 | 551 | 安徽 |
| 19 | 571 | 浙江 |
| 20 | 591 | 福建 |
| 21 | 731 | 湖南 |
| 22 | 771 | 广西 |
| 23 | 791 | 江西 |
| 24 | 851 | 贵州 |
| 25 | 871 | 云南 |
| 26 | 891 | 西藏 |
| 27 | 898 | 海南 |
| 28 | 931 | 甘肃 |
| 29 | 951 | 宁夏 |
| 30 | 971 | 青海 |
| 31 | 991 | 新疆 |
select t.access_day,t.http_referer,t.count
from
(select access_day,http_referer,count(1) count from data_collect where access_day='20150705' group by access_day, http_referer) t
order by t.count desc
limit 20;
create external table top_n(
access_day string,
time_local string,
http_referer string,
uid string,
sid string,
loadtotaltime int
)
row format delimited
fields terminated by '\t';
insert overwrite table top_n
select access_day ,time_local,http_referer,uid,sid,onloadtotaltime from data_collect where access_day='20150705';
Writing the MapReduce program
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Compute the top-N URLs for each day.
 *
 * Command to run on the cluster:
 * hadoop jar urlTopN.jar hdfs://mycluster:9000/user/hive/warehouse/jfyun.db/top_n hdfs://mycluster:9000/topN
 *
 * @author shenfl
 */
public class URLTopN extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        conf.set("mapreduce.output.basename", "topN");                   // rename the files written by the reducer
        conf.set("mapreduce.output.textoutputformat.separator", "$$");   // separator for the reduce output
        conf.set("N", "20");                                             // N for the top-N

        Job job = Job.getInstance(conf, URLTopN.class.getSimpleName());
        job.setJarByClass(URLTopN.class);

        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setMapperClass(URLTopNMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setReducerClass(URLTopNReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        Path outputDir = new Path(args[1]);
        deleteOutDir(conf, outputDir);
        FileOutputFormat.setOutputPath(job, outputDir);

        boolean waitForCompletion = job.waitForCompletion(true);
        return waitForCompletion ? 0 : 1;
    }

    /**
     * Delete the output directory if it already exists.
     *
     * @param conf      job configuration
     * @param outputDir output directory
     * @throws IOException
     */
    public void deleteOutDir(Configuration conf, Path outputDir) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);
        }
    }

    public static void main(String[] args) {
        try {
            int run = ToolRunner.run(new Configuration(), new URLTopN(), args);
            System.exit(run);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class URLTopNMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        Text k2 = new Text();
        LongWritable v2 = new LongWritable();

        @Override
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            String line = v1.toString();
            String[] splited = line.split("\t");
            k2.set(splited[2]);   // http_referer column of the top_n table
            v2.set(1);
            context.write(k2, v2);
        }
    }

    public static class URLTopNReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        LogInfoWritable logInfo;
        public static Integer topN = Integer.MIN_VALUE;

        /**
         * Holds the visit count of every URL. A TreeMap is required: its keys are
         * sorted ascending by default, so the key class defines its own descending order.
         */
        TreeMap<LogInfoWritable, NullWritable> logMap = new TreeMap<LogInfoWritable, NullWritable>();

        /**
         * Called once when the reduce task starts.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            topN = Integer.parseInt(conf.get("N"));
        }

        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable v : v2s) {
                sum += v.get();
            }
            logInfo = new LogInfoWritable();
            logInfo.set(k2.toString(), sum);
            logMap.put(logInfo, NullWritable.get());
        }

        /**
         * Called once after all reduce calls have finished. Emits the top-N URLs:
         * logMap is iterated in descending order of visit count, so the first N
         * entries are the most visited URLs.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            int count = 0;
            for (Map.Entry<LogInfoWritable, NullWritable> entry : logMap.entrySet()) {
                if (++count <= topN) {
                    context.write(new Text(entry.getKey().getUrl()), new LongWritable(entry.getKey().getSum()));
                }
            }
        }
    }

    public static class LogInfoWritable implements WritableComparable<LogInfoWritable> {

        private String url;
        private long sum;

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public long getSum() {
            return sum;
        }

        public void setSum(long sum) {
            this.sum = sum;
        }

        public void set(String url, long sum) {
            this.url = url;
            this.sum = sum;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(url);
            out.writeLong(sum);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.url = in.readUTF();
            this.sum = in.readLong();
        }

        // Sort descending by visit count; break ties on the URL so that different
        // URLs with the same count are all kept in the TreeMap.
        @Override
        public int compareTo(LogInfoWritable o) {
            int cmp = Long.compare(o.sum, this.sum);
            return cmp != 0 ? cmp : this.url.compareTo(o.url);
        }

        @Override
        public String toString() {
            return this.url + "\t" + this.sum;
        }
    }
}
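The reducer's top-N logic hinges on the TreeMap being keyed by a type that sorts descending on the count. The small stand-alone sketch below is not from the original program; the Counted class is a made-up, plain-Java stand-in for LogInfoWritable. Because the key's compareTo orders by count from largest to smallest, iterating the map yields the most visited URLs first.

import java.util.Map;
import java.util.TreeMap;

public class TreeMapTopNDemo {

    // Plain-Java stand-in for LogInfoWritable: orders itself descending by count.
    static final class Counted implements Comparable<Counted> {
        final String url;
        final long sum;

        Counted(String url, long sum) {
            this.url = url;
            this.sum = sum;
        }

        @Override
        public int compareTo(Counted o) {
            int cmp = Long.compare(o.sum, this.sum);            // descending by count
            return cmp != 0 ? cmp : this.url.compareTo(o.url);  // keep distinct URLs with equal counts
        }
    }

    public static void main(String[] args) {
        TreeMap<Counted, Object> logMap = new TreeMap<Counted, Object>();
        logMap.put(new Counted("/a", 5), null);
        logMap.put(new Counted("/b", 42), null);
        logMap.put(new Counted("/c", 17), null);

        int n = 2;  // top-N
        for (Map.Entry<Counted, Object> entry : logMap.entrySet()) {
            if (n-- == 0) {
                break;
            }
            System.out.println(entry.getKey().url + "\t" + entry.getKey().sum);  // /b 42, then /c 17
        }
    }
}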
select screencolor,count(distinct uid) count
from data_collect
where access_day='20150705'
group by screencolor
order by count desc
limit 3;
select screensize,count(distinct uid) count
from data_collect
where access_day='20150705'
group by screensize
order by count desc;