hive日志分析实战(一)

分析用户玩家流失率 
(流失的定义:如果某用户登录某游戏某区服后,在接下来一周时间内登录该区服的天数少于2天,则认为该用户已流失) 
日志格式如下: 

Text代码   收藏代码
  1. {"cnt":3,"src":"bbs","time":20130622063117,"qid":"100005648","gkey":"yjjh","skey":"S9"}  
  2. {"cnt":2,"src":"null","time":20130622005615,"qid":"100015499","gkey":"dgwm","skey":"592120005"}  
  3. {"cnt":5,"src":"txt","time":20130622044917,"qid":"100021254","gkey":"yjjh","skey":"S1"}  
  4. {"cnt":1,"src":"null","time":20130622090137,"qid":"100023162","gkey":"wulin","skey":"S20"}  
  5. {"cnt":1,"src":"null","time":20130622090417,"qid":"100024132","gkey":"wulin","skey":"S20"}  
  6. {"cnt":1,"src":"null","time":20130622090526,"qid":"100025487","gkey":"wulin","skey":"S20"}  
  7. {"cnt":1,"src":"loginhistory","time":20130622101001,"qid":"100030555","gkey":"sxd","skey":"S149"}  
  8. {"cnt":1,"src":"se-yxlist-frxz","time":20130622101158,"qid":"100035304","gkey":"frxz","skey":"S12"}  
  9. {"cnt":5,"src":"se","time":20130622100838,"qid":"100035995","gkey":"ktpd","skey":"S9"}  
  10. {"cnt":2,"src":"null","time":20130622101413,"qid":"100035995","gkey":"xjsj","skey":"S22"}  


每条日志记录某用户在一天之内登录某游戏区服的情况: 

  • cnt:登录次数
  • src:来源
  • time:第一次登录时间
  • qid:用户ID
  • gkey:游戏ID
  • skey:区服ID


步骤: 
1.导入数据 

Sql代码   收藏代码
  1. create external table login_interm (cnt string,src string,time string,qid string,gkey string,skey string) row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' with SERDEPROPERTIES ('input.regex' = '\\{"cnt":([^,]*),"src":"([^"]*)","time":([^,]*),"qid":"([^"]*)","gkey":"([^"]*)","skey":"([^"]*)"\}','output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s') location '/login_logs/';  
  2.   
  3. create external table login_info (cnt string,src string,time string,qid string,gkey string,skey string) PARTITIONED BY(pgkey STRING) CLUSTERED BY(skey,qid) SORTED BY(skey,qid,time) INTO 32 BUCKETS  stored as SEQUENCEFILE;  
  4.   
  5. from login_interm insert overwrite table login_info PARTITION (pgkey) select cnt,src,time,qid,gkey,skey,gkey;  


几点说明: 

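  • 因login_info使用了动态分区(dynamic partition),无法直接将日志文件load进login_info表,因此先用login_interm表完成字符串正则匹配,再insert到login_info;若尚未开启动态分区,执行insert前需先设置 set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict;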
  • 动态分区的数目不宜过多:对于低版本的Linux内核(其对命令行参数与环境变量总长度的限制较小),如果分区较多,使用python执行reduce操作时会报错"Hive. java.io.IOException: error=7, Argument list too long"(参见https://groups.google.com/a/cloudera.org/forum/#!topic/cdh-user/dSGrvvNhCcQ)


2.提取 

Sql代码   收藏代码
  1. create table login_stat_use_reduce (login_times int,login_days int,qid string,gkey string,skey string);  
  2. add file '/home/hadoop/reduce.py';  
  3. from (select cnt,time,qid,gkey,skey from login_info where cnt is not null distribute by gkey,skey,qid sort by gkey,skey,qid,time) map_out insert overwrite table login_stat_use_reduce reduce cnt,time,qid,gkey,skey using '/usr/bin/python2.7 ./reduce.py' as login_times,days,qid,gkey,skey;  

 

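  • distribute by gkey,skey,qid:保证同一游戏、同一区服、同一用户的记录被分发到同一个reduce进程;sort by gkey,skey,qid,time:保证这些记录在该进程内连续出现且按时间升序排列,reduce.py正是依赖这一顺序逐行累计登录天数和登录次数的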


其中reduce.py: 

Python代码   收藏代码
  1. #!/usr/bin/python2.7   
  2. #coding:utf-8  
  3. import datetime  
  4. import time  
  5. import sys,logging  
  6.   
  7. def datetime_toString(dt):  
  8.     """把datetime转成字符串"""  
  9.     return dt.strftime("%Y%m%d")  
  10. def string_toDatetime(string):  
  11.     """把字符串转成datetime"""  
  12.     return datetime.datetime.strptime(string, "%Y%m%d")  
  13. def string_toTimestamp(strTime):  
  14.     """把字符串转成时间戳形式"""   
  15.     return time.mktime(string_toDatetime(strTime).timetuple())  
  16. def timestamp_toString(stamp):  
  17.     """把时间戳转成字符串形式"""   
  18.     return time.strftime("%Y%m%d%H", time.localtime(stamp))  
  19. def datetime_toTimestamp(dateTime):  
  20.     """把datetime类型转为时间戳形式"""  
  21.     return time.mktime(dateTime.timetuple())  
  22.                                                           
  23. def substract_DateTime(dateStr1,dateStr2):  
  24.     """ 返回两个日期之间的差 """  
  25.     d1=string_toDatetime(dateStr1)  
  26.     d2=string_toDatetime(dateStr2)  
  27.     return d2-d1  
  28.   
  29. def substract_TimeStamp(dateStr1,dateStr2):  
  30.     """ 两个日期的 timestamp 差值 """  
  31.     ts1= string_toTimestamp(dateStr1)  
  32.     ts2= string_toTimestamp(dateStr2)  
  33.     return ts1-ts2  
  34.   
  35. def compare_dateTime(dateStr1,dateStr2):  
  36.     """两个日期的比较, 当然也可以用timestamp方法比较,都可以实现."""  
  37.     date1 = string_toDatetime(dateStr1)  
  38.     date2 = string_toDatetime(dateStr2)  
  39.     return date1.date()>date2.date()  
  40.   
  41. def dateTime_Add(dateStr,days=0):  
  42.     """ 指定日期加上若干天之后的日期 """  
  43.     date1= string_toDatetime(dateStr)  
  44.     return date1+datetime.timedelta(days=days)  
  45.   
  46. first_line = 1;  
  47. pre_login_days = 0  
  48. pre_login_cnt = 0  
  49. pre_gkey = ""  
  50. pre_skey = ""  
  51. pre_qid  = ""  
  52. logging.basicConfig(level=logging.DEBUG,  
  53.         format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',  
  54.         datefmt='%a, %d %b %Y %H:%M:%S',  
  55.         filename='./worker.log',  
  56.         filemode='a')  
  57. for line in sys.stdin:    
  58.     line = line.strip().strip('\t');  
  59.     logging.error("data:"+line)  
  60.     if line == "":  
  61.         continue  
  62.     cnt,time,qid,gkey,skey = line.split('\t')    
  63.     if not cnt or not time or not qid or not gkey or not skey:  
  64.         continue  
  65.     login_date = time[0:8]  
  66.     if first_line:  
  67.         lost = 0;  
  68.         pre_gkey = gkey  
  69.         pre_skey = skey  
  70.         pre_qid  = qid  
  71.         pre_login_days = 1   
  72.         pre_login_date = login_date  
  73.         pre_login_cnt  = eval(cnt)  
  74.         first_line = 0  
  75.         continue  
  76.     if gkey != pre_gkey or skey != pre_skey or qid != pre_qid:  
  77.         if pre_login_days < 3:  
  78.             print "%d\t%d\t%s\t%s\t%s"%(pre_login_cnt,pre_login_days,pre_qid,pre_gkey,pre_skey)  
  79.         lost = 0;  
  80.         pre_gkey = gkey  
  81.         pre_skey = skey  
  82.         pre_qid  = qid  
  83.         pre_login_days = 1  
  84.         pre_login_date = login_date  
  85.         pre_login_cnt = eval(cnt)  
  86.     else:  
  87.         if lost or pre_login_days >= 3:  
  88.             continue  
  89.         if compare_dateTime(login_date,datetime_toString(dateTime_Add(pre_login_date,7))):  
  90.             lost = 1  
  91.             continue  
  92.         pre_login_days += 1  
  93.         pre_login_cnt  += eval(cnt)  
  94. if pre_login_days < 3:  
  95.     print "%d\t%d\t%s\t%s\t%s"%(pre_login_cnt,pre_login_days,pre_qid,pre_gkey,pre_skey)  

http://godlovesdog.iteye.com/blog/1898200

你可能感兴趣的:(hive)