Hive Log Analysis in Practice (Part 1)

Analyzing player churn rate
(Churn: if a user, after first logging into a given server of a game, logs into that server on fewer than 2 days during the following week, the user is considered churned.)
The log format is as follows:

{"cnt":3,"src":"bbs","time":20130622063117,"qid":"100005648","gkey":"yjjh","skey":"S9"}
{"cnt":2,"src":"null","time":20130622005615,"qid":"100015499","gkey":"dgwm","skey":"592120005"}
{"cnt":5,"src":"txt","time":20130622044917,"qid":"100021254","gkey":"yjjh","skey":"S1"}
{"cnt":1,"src":"null","time":20130622090137,"qid":"100023162","gkey":"wulin","skey":"S20"}
{"cnt":1,"src":"null","time":20130622090417,"qid":"100024132","gkey":"wulin","skey":"S20"}
{"cnt":1,"src":"null","time":20130622090526,"qid":"100025487","gkey":"wulin","skey":"S20"}
{"cnt":1,"src":"loginhistory","time":20130622101001,"qid":"100030555","gkey":"sxd","skey":"S149"}
{"cnt":1,"src":"se-yxlist-frxz","time":20130622101158,"qid":"100035304","gkey":"frxz","skey":"S12"}
{"cnt":5,"src":"se","time":20130622100838,"qid":"100035995","gkey":"ktpd","skey":"S9"}
{"cnt":2,"src":"null","time":20130622101413,"qid":"100035995","gkey":"xjsj","skey":"S22"}

Each log line records one user's logins to one game server within a single day:
[list]
[*] cnt: number of logins
[*] src: source
[*] time: time of the first login
[*] qid: user ID
[*] gkey: game ID
[*] skey: server ID
[/list]
Steps:
1. Load the data

create external table login_interm (
  cnt string, src string, time string, qid string, gkey string, skey string)
row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
with serdeproperties (
  'input.regex' = '\\{"cnt":([^,]*),"src":"([^"]*)","time":([^,]*),"qid":"([^"]*)","gkey":"([^"]*)","skey":"([^"]*)"\\}',
  'output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s')
location '/login_logs/';
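
Before pointing the table at the full dataset, it is worth verifying that input.regex really captures all six fields. A minimal standalone Python sketch (not part of the job; the doubled backslashes in the Hive string literal become single backslashes in the actual regex):

#!/usr/bin/python2.7
# Standalone sanity check of the SerDe regex against one sample log line.
import re

pattern = re.compile(r'\{"cnt":([^,]*),"src":"([^"]*)","time":([^,]*),'
                     r'"qid":"([^"]*)","gkey":"([^"]*)","skey":"([^"]*)"\}')
sample = '{"cnt":3,"src":"bbs","time":20130622063117,"qid":"100005648","gkey":"yjjh","skey":"S9"}'
m = pattern.match(sample)
print m.groups() if m else "no match"
# prints: ('3', 'bbs', '20130622063117', '100005648', 'yjjh', 'S9')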

create external table login_info (
  cnt string, src string, time string, qid string, gkey string, skey string)
partitioned by (pgkey string)
clustered by (skey, qid) sorted by (skey, qid, time) into 32 buckets
stored as sequencefile;

-- required for an insert whose only partition column is dynamic;
-- hive.enforce.bucketing makes the insert honor the 32-bucket clustering
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.enforce.bucketing=true;

from login_interm
insert overwrite table login_info partition (pgkey)
select cnt, src, time, qid, gkey, skey, gkey;

Notes:
[list]
[*] Because login_info uses dynamic partitioning (and bucketing), the raw log files cannot be loaded into it directly; the intermediate table login_interm is needed to do the regex parsing first.
[*] Number of dynamic partitions: on older kernels, if there are many partitions, the Python reduce step fails with "Hive. java.io.IOException: error=7, Argument list too long" (see [url]https://groups.google.com/a/cloudera.org/forum/#!topic/cdh-user/dSGrvvNhCcQ[/url]).
[/list]
2. Extract

create table login_stat_use_reduce (
  login_times int, login_days int, qid string, gkey string, skey string);

add file /home/hadoop/reduce.py;

from (
  select cnt, time, qid, gkey, skey
  from login_info
  where cnt is not null
  distribute by gkey, skey, qid
  sort by gkey, skey, qid, time) map_out
insert overwrite table login_stat_use_reduce
reduce cnt, time, qid, gkey, skey
using '/usr/bin/python2.7 ./reduce.py'
as login_times, login_days, qid, gkey, skey;


[list]
[*] distribute by gkey,skey,qid guarantees that all rows for the same (game, server, user) combination reach the same reducer, while sort by gkey,skey,qid,time orders them by login time within each group; reduce.py depends on this grouped, time-sorted input to detect group boundaries in a single pass (an alternative formulation with itertools.groupby is sketched after this list).
[/list]
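
For reference, the boundary detection that reduce.py below performs by hand can also be expressed with itertools.groupby. A hypothetical sketch, not the script used in this job, assuming the same tab-separated, pre-grouped and pre-sorted stdin (the 7-day churn window is omitted here):

#!/usr/bin/python2.7
# Alternative reducer skeleton built on itertools.groupby (sketch only).
# Assumes rows arrive grouped by (gkey, skey, qid) and sorted by time,
# exactly as the distribute by / sort by clause above guarantees.
import sys
from itertools import groupby

def parse(line):
    cnt, t, qid, gkey, skey = line.rstrip('\n').split('\t')
    return gkey, skey, qid, t, int(cnt)

rows = (parse(line) for line in sys.stdin if line.strip())
for (gkey, skey, qid), group in groupby(rows, key=lambda r: r[:3]):
    days = set()
    total = 0
    for _, _, _, t, cnt in group:
        days.add(t[:8])  # one entry per calendar day
        total += cnt
    print "%d\t%d\t%s\t%s\t%s" % (total, len(days), qid, gkey, skey)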
reduce.py itself:

#!/usr/bin/python2.7
#coding:utf-8
import datetime
import time
import sys, logging

def datetime_toString(dt):
    """Format a datetime as a YYYYMMDD string."""
    return dt.strftime("%Y%m%d")

def string_toDatetime(string):
    """Parse a YYYYMMDD string into a datetime."""
    return datetime.datetime.strptime(string, "%Y%m%d")

def string_toTimestamp(strTime):
    """Convert a YYYYMMDD string to a Unix timestamp."""
    return time.mktime(string_toDatetime(strTime).timetuple())

def timestamp_toString(stamp):
    """Convert a Unix timestamp to a YYYYMMDDHH string."""
    return time.strftime("%Y%m%d%H", time.localtime(stamp))

def datetime_toTimestamp(dateTime):
    """Convert a datetime to a Unix timestamp."""
    return time.mktime(dateTime.timetuple())

def substract_DateTime(dateStr1, dateStr2):
    """Return the timedelta between two YYYYMMDD date strings."""
    return string_toDatetime(dateStr2) - string_toDatetime(dateStr1)

def substract_TimeStamp(dateStr1, dateStr2):
    """Return the timestamp difference between two YYYYMMDD date strings."""
    return string_toTimestamp(dateStr1) - string_toTimestamp(dateStr2)

def compare_dateTime(dateStr1, dateStr2):
    """Return True if dateStr1 falls on a later calendar day than dateStr2."""
    return string_toDatetime(dateStr1).date() > string_toDatetime(dateStr2).date()

def dateTime_Add(dateStr, days=0):
    """Return the datetime `days` days after the given YYYYMMDD string."""
    return string_toDatetime(dateStr) + datetime.timedelta(days=days)

# Running state for the current (gkey, skey, qid) group; the input arrives
# grouped by gkey,skey,qid and sorted by time, so a single pass suffices.
first_line = 1
lost = 0
pre_login_days = 0
pre_login_cnt = 0
pre_gkey = ""
pre_skey = ""
pre_qid = ""
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='./worker.log',
                    filemode='a')

for line in sys.stdin:
    line = line.strip()
    logging.debug("data:" + line)
    if line == "":
        continue
    fields = line.split('\t')
    if len(fields) != 5:
        logging.error("bad record: " + line)
        continue
    # log_time rather than `time`, to avoid shadowing the time module
    cnt, log_time, qid, gkey, skey = fields
    if not cnt or not log_time or not qid or not gkey or not skey:
        continue
    login_date = log_time[0:8]
    if first_line:
        # initialize state from the very first record
        lost = 0
        pre_gkey = gkey
        pre_skey = skey
        pre_qid = qid
        pre_login_days = 1
        pre_login_date = login_date
        pre_login_cnt = int(cnt)
        first_line = 0
        continue
    if gkey != pre_gkey or skey != pre_skey or qid != pre_qid:
        # group boundary: emit the previous user if churned, i.e. fewer than
        # 3 login days (first day plus fewer than 2 days in the week after)
        if pre_login_days < 3:
            print "%d\t%d\t%s\t%s\t%s" % (pre_login_cnt, pre_login_days, pre_qid, pre_gkey, pre_skey)
        lost = 0
        pre_gkey = gkey
        pre_skey = skey
        pre_qid = qid
        pre_login_days = 1
        pre_login_date = login_date
        pre_login_cnt = int(cnt)
    else:
        # same user: once marked churned or past the threshold, skip ahead
        if lost or pre_login_days >= 3:
            continue
        # a login more than 7 days after the first closes the window with
        # fewer than 3 login days, so the user churned
        if compare_dateTime(login_date, datetime_toString(dateTime_Add(pre_login_date, 7))):
            lost = 1
            continue
        pre_login_days += 1
        pre_login_cnt += int(cnt)

# flush the final group
if not first_line and pre_login_days < 3:
    print "%d\t%d\t%s\t%s\t%s" % (pre_login_cnt, pre_login_days, pre_qid, pre_gkey, pre_skey)
