一、实现功能
1.分析网站日志,获得日期,uv,pv,登录人数,游客人数,平均访问时长,二跳率,独立ip数等关键信息。
其中:
登录:userid有值,会员,有账号登录
游客:userid无值,非登录人员
平均访问时长:每个会话在网站上的停留时间(该会话最后一条记录与第一条记录的时间差,单位为秒)的平均值
二跳率:在一次会话(同一个session)中,点击的页面数大于等于2的会话称为二跳会话;二跳率即二跳会话数占全部会话数的比例(也就是同一个session有多条记录的比例)
独立ip数:统计ip去重
2.日志数据集
yhd_source.id yhd_source.url yhd_source.referer yhd_source.keyword yhd_source.type yhd_source.guid yhd_source.pageid yhd_source.moduleid yhd_source.linkid yhd_source.attachedinfo yhd_source.sessionid yhd_source.trackeru yhd_source.trackertype yhd_source.ip yhd_source.trackersrc yhd_source.cookie yhd_source.ordercode yhd_source.tracktime yhd_source.enduserid yhd_source.firstlink yhd_source.sessionviewno yhd_source.productid yhd_source.curmerchantid yhd_source.provinceid yhd_source.cityid yhd_source.fee yhd_source.edmactivity yhd_source.edmemail yhd_source.edmjobid yhd_source.ieversion yhd_source.platform yhd_source.internalkeyword yhd_source.resultsum yhd_source.currentpage yhd_source.linkposition yhd_source.buttonposition yhd_source.date
121508281810000000 http://www.yhd.com/?union_ref=7&cp=0 3 PR4E9HWE38DMN4Z6HUG667SCJNZXMHSPJRER VFA5QRQ1N4UJNS9P6MH6HPA76SXZ737P 10977119545 124.65.159.122 unionKey:10977119545 2015-08-28 18:10:00 50116447 http://image.yihaodianimg.com/virtual-web_static/virtual_yhd_iframe_index_widthscreen.html?randid=2015828 6 1000 Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0 Win32 lunbo_tab_3 2015082818
121508281810000001 http://my.yhd.com/order/finishOrder.do?orderCode=5435446505152 http://buy.yhd.com/checkoutV3/index.do 3 YJ25S3QAVPAS31PHSB3HFGZ1E5AYMKX9XUTX 6W26QM41DM6HHND3R4FP42YYXXE1NKGA 222.73.202.251 2015-08-28 18:10:00 85133152 http://www.haosou.com/s?src=new_isearch&q=1%E5%8F%B7%E5%BA%97 25 0 1 Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 Win32 MY_ORDERCOMPLETION_EDITADDRESS 2015082818
121508281810000002 http://list.yhd.com/p/c5072-b-a-s1-v0-p1-price-d0-pid-pt1086211-pl1171565-m0-k?tp=44.1086211.0.0.0.Kxnn54p-11-FFJKr http://list.yhd.com/p/pt1086211-pl1171565?tp=44.1086211.1508.0.1.Kxnmyye-11-FFJKr 3 JRBWWU6ECXN15Q2Z5QT4TETNHKY7QHE3Y8B3 44.1086211.0.0.0.Kxnn54p-11-FFJKr 5Z5JZMYUGK9TP3QWHDDTU6G5T6PHEQRZ 4734 111.193.165.158 msessionid:DW6SB2FGG84ZZ2WD77DAZHFBXNV8D5776RQ4,uname:gaochentongxue,unionKey:4734,websiteId:A100215249 2015-08-28 18:10:00 116262550 http://www.yhd.com/?tracker_u=1624169&t=1440753050503 107 2 1000 Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 Win32 107 1 search_navi_cat_4 2015082818
二、数据处理
1.数据采集加载到hive
-- Create the working database when it is missing, then switch to it so that the
-- tables created afterwards are placed in yhd and not in the current database.
CREATE DATABASE IF NOT EXISTS yhd;
USE yhd;
-- Raw click-log table: one row per record of the tab-separated source file.
-- Every field is declared as STRING, so later steps convert values where needed.
-- The table is partitioned by a column named date. That name is a reserved word
-- in newer Hive releases, so it is written in backticks.
-- IF NOT EXISTS lets this statement be run again without failing.
CREATE TABLE IF NOT EXISTS yhd_source (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (`date` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
-- Load the local log file into its partition of yhd_source.
-- OVERWRITE replaces whatever the partition already holds, so running the load
-- a second time does not append a duplicate copy of every row.
LOAD DATA LOCAL INPATH '/opt/datas/2015082818'
OVERWRITE INTO TABLE yhd_source PARTITION (`date` = '2015082818');
2.数据清洗
创建会话信息表:
-- Session table: one row per session, filled by the INSERT that joins the two
-- intermediate tables.
-- pv and stay_time hold numbers but are declared as STRING, so aggregates over
-- them rely on implicit conversion (the published PV value 37843.0 reflects this).
-- The partition column is named date. That name is a reserved word in newer
-- Hive releases, so it is written in backticks.
-- IF NOT EXISTS lets this statement be run again without failing.
CREATE TABLE IF NOT EXISTS session_info (
    session_id      STRING,
    guid            STRING,
    trackerU        STRING,
    landing_url     STRING,
    landing_url_ref STRING,
    user_id         STRING,
    pv              STRING,
    stay_time       STRING,
    min_trackTime   STRING,
    ip              STRING,
    provinceId      STRING
)
PARTITIONED BY (`date` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3.创建两张临时表作为中间表
(1)两张临时表的结构关系
(2)创建第一张临时表:算出group才能得到的某些数值
-- First intermediate table: one row per sessionId with the values that can only
-- be obtained through GROUP BY.
--   pv             number of distinct url values recorded in the session
--   stay_time      seconds between the earliest and the latest trackTime
--   min_trackTime  earliest trackTime of the session
--   guid, user_id, ip, provinceId  the MAX value found within the session
-- NOTE(review): pv counts distinct urls, so repeated views of one page count once.
CREATE TABLE session_tmp AS
SELECT
    sessionId                  AS session_id,
    MAX(guid)                  AS guid,
    MAX(endUserId)             AS user_id,
    COUNT(DISTINCT url)        AS pv,
    (UNIX_TIMESTAMP(MAX(trackTime)) - UNIX_TIMESTAMP(MIN(trackTime))) AS stay_time,
    MIN(trackTime)             AS min_trackTime,
    MAX(ip)                    AS ip,
    MAX(provinceId)            AS provinceId
FROM yhd_source
WHERE `date` = '2015082818'
GROUP BY sessionId;
(3)创建第二张临时表
备注:只是取关键字段,不需要任何处理。
-- Second intermediate table: the per-record fields that describe a page visit.
-- Values are copied from yhd_source unchanged, some columns are only renamed.
CREATE TABLE track_tmp AS
SELECT
    sessionId AS session_id,
    trackTime AS trackTime,
    url       AS landing_url,
    trackerU  AS trackerU,
    referer   AS landing_url_ref
FROM yhd_source
WHERE `date` = '2015082818';
(4)两张临时表进行join
-- Fill the session_info partition: per-session aggregates from session_tmp plus
-- the landing record (url, referer, trackerU) from track_tmp, meaning the record
-- whose trackTime equals the earliest trackTime of the session.
-- The sample rows show trackTime with whole-second precision, so several records
-- of one session can share that earliest value. Joining all of them would write
-- the session more than once and inflate every sum, average and count computed
-- from session_info. ROW_NUMBER therefore keeps exactly one record per
-- session_id and trackTime, using the remaining columns as a tie-breaker.
INSERT OVERWRITE TABLE session_info PARTITION (`date` = '2015082818')
SELECT
    a.session_id,
    a.guid,
    b.trackerU,
    b.landing_url,
    b.landing_url_ref,
    a.user_id,
    a.pv,
    a.stay_time,
    a.min_trackTime,
    a.ip,
    a.provinceId
FROM session_tmp a
JOIN (
    SELECT
        session_id,
        trackTime,
        landing_url,
        trackerU,
        landing_url_ref,
        ROW_NUMBER() OVER (
            PARTITION BY session_id, trackTime
            ORDER BY landing_url, landing_url_ref, trackerU
        ) AS rn
    FROM track_tmp
) b
    ON a.session_id = b.session_id
   AND a.min_trackTime = b.trackTime
WHERE b.rn = 1;
4.数据分析:结果表
-- Result table: one row of site metrics for the 2015082818 partition.
--   PV           sum of the per-session pv values
--   UV           number of distinct guid values
--   login_user   number of distinct user_id values that are not empty
--   visitor      number of distinct guid values in sessions with an empty user_id
--   avg_time     average of the per-session stay_time
--   second_jump  share of sessions whose pv is at least 2
--   IP           number of distinct ip values
CREATE TABLE result2 AS
SELECT
    `date`                                                          AS `date`,
    SUM(pv)                                                         AS PV,
    COUNT(DISTINCT guid)                                            AS UV,
    COUNT(DISTINCT CASE WHEN LENGTH(user_id) != 0 THEN user_id END) AS login_user,
    COUNT(DISTINCT CASE WHEN LENGTH(user_id) = 0 THEN guid END)     AS visitor,
    AVG(stay_time)                                                  AS avg_time,
    COUNT(CASE WHEN pv >= 2 THEN session_id END) / COUNT(session_id) AS second_jump,
    COUNT(DISTINCT ip)                                              AS IP
FROM session_info
WHERE `date` = '2015082818'
GROUP BY `date`;
结果
日期 uv pv 登录人数 游客人数 平均访问时长 二跳率 独立ip数 (注:此处 uv 列在 pv 列之前,与上面 result2 查询中先 PV 后 UV 的列顺序不同)
2015082818 23928 37843.0 11411 12367 50.10636239012983 0.26695427788081605 19174