hive实现网站用户行为分析指标



字段解释
accessDate     //访问时间,精确到日期,String格式


accessTime   //访问时间,精确到毫秒,int格式


accessHour   //访问小时,区间为0-23,int格式 


requestMethod   //请求方式(get post 统计的时候没用到),String格式


requestProtocal   //请求协议(http https,统计的时候没用到),String格式


requestUrl   //请求URL地址,
e.g.:http://it18zhang.com/news/view?news_id=26,格式为String


requestIp   //请求IP地址,e.g.:192.168.1.1,String格式


returnStatus   //返回状态(不了解,统计的时候没用到),String格式


referUrl   //上一跳URL地址,
e.g.:http://www.baidu.com/s?wd=ggg&xxxx, 格式为String


referDomain   //上一跳域名,e.g.: baidu.com,String格式


userOrigin   //用户入口地址,e.g.:http://www.baidu.com, 格式为String
(这个不太了解,但是根据originWord觉得应该是搜索引擎)


originWord   //入口关键字,e.g.:it 18zhang it18zhang,格式为String


browser   //浏览器,e.g.:Firefox Chrome,格式为String


browserVersion   //浏览器版本,e.g.:51.0 50.1,格式为String


operateSystem   //操作系统,e.g.:Windows10 macOS Ubuntu,格式为String


ipNumber   //IP数,(这个不了解,统计时候没用到),int格式


userProvince   //用户省份,String格式


screenSize   //屏幕尺寸,e.g.:1366x768,String格式


screenColor   //屏幕颜色,e.g.:red blue green,String格式


pageTitle   //页面标题,e.g.:Python,BigData,String格式


siteType   //站型,(不太了解,默认为0),String格式


userFlag   //用户标示,从cookie中提取,相当于userID,String格式


visitFlag   //访问标示, 从cookie中提取,相当于sessionID,String格式


sFlag   //(不太清楚是做什么的,统计没有用到,分为1和0),String格式


timeOnPage //页面停留时间,精确到毫秒,int格式








以access_day(访问日期)作为分区字段,创建外部分区表
-- Raw web-access log: one row per page request, partitioned by load day
-- (access_day). Columns are positional, so their order must match the field
-- order of the space-delimited source files.
-- accessTime is declared bigint: it is documented as millisecond precision,
-- and epoch-millisecond values do not fit in a 32-bit int (Hive would read
-- them as NULL).
-- NOTE(review): the field delimiter is a single space; any field that itself
-- contains a space would shift the following columns -- confirm against the data.
create external table if not exists users (
    accessDate       string,
    accessTime       bigint,
    accessHour       int,
    requestMethod    string,
    referUrl         string,
    requestProtocal  string,
    returnStatus     string,
    requestUrl       string,
    referDomain      string,
    userOrigin       string,
    originWord       string,
    browser          string,
    browserVersion   string,
    operateSystem    string,
    requestIp        string,
    ipNumber         int,
    userProvince     string,
    screenSize       string,
    screenColor      string,
    pageTitle        string,
    siteType         string,
    userFlag         string,
    visitFlag        string,
    sFlag            string,
    timeOnPage       int
)
partitioned by (access_day string)
row format delimited
    fields terminated by ' '
stored as textfile;




-- Load one local log file per day into its matching access_day partition.
-- "overwrite" replaces whatever the target partition already contains.
load data local inpath '/home/hadoop/data/20160101.txt'
    overwrite into table users
    partition (access_day = '20160101');

load data local inpath '/home/hadoop/data/20160601.txt'
    overwrite into table users
    partition (access_day = '20160601');

load data local inpath '/home/hadoop/data/20170601.txt'
    overwrite into table users
    partition (access_day = '20170601');

load data local inpath '/home/hadoop/data/20170101.txt'
    overwrite into table users
    partition (access_day = '20170101');



统计方法
PV统计
1、 按天统计PV
-- Page views (row count) per accessDate within the 20160601 partition.
select
    accessdate,
    count(*)
from users
where access_day = '20160601'
group by accessdate;
2、 按小时统计PV
-- Page views per accessHour within the 20160601 partition.
select
    accesshour,
    count(*)
from users
where access_day = '20160601'
group by accesshour;
3、 统计每天每个省份PV
-- Page views per (accessDate, userProvince) within the 20160601 partition.
select
    accessdate,
    userprovince,
    count(*)
from users
where access_day = '20160601'
group by
    accessdate,
    userprovince;
4、 每天每个省份 每个小时统计
-- Page views per (accessDate, userProvince, accessHour) within the
-- 20160601 partition.
select
    accessdate,
    userprovince,
    accesshour,
    count(*)
from users
where access_day = '20160601'
group by
    accessdate,
    userprovince,
    accesshour;
5、 统计每天每个页面的访问量
-- Page views per (accessDate, requestUrl) within the 20160601 partition.
select
    accessdate,
    requesturl,
    count(*)
from users
where access_day = '20160601'
group by
    accessdate,
    requesturl;
UV统计
1、 统计总的访问人数,即访客(UV)
-- Distinct request IPs per access_day, scanning every partition.
-- NOTE(review): visitors are identified by requestIp here, while other
-- queries in this file use userFlag -- confirm which identifier is intended.
select
    access_day,
    count(distinct requestIp)
from users
group by access_day;
2、 统计当天平均访问页面数(页/人=PV/UV)
-- Page views and distinct request IPs for the 20160601 partition, side by side.
select
    count(*) pvSta,
    count(distinct requestIp) uvSta
from users
where access_day = '20160601';


-- The same two aggregates expressed as one ratio: pages per distinct IP.
select count(*) / count(distinct requestIp)
from users
where access_day = '20160601';


3、 统计每个页面访客数和最早时间和最晚时间
-- Per requested URL in the 20160601 partition: distinct request IPs plus the
-- smallest and largest accessTime, most-visited URLs first.
select
    requesturl,
    count(distinct requestIp) visitCount,
    min(accessTime) firstAccessTime,
    max(accessTime) recentAccessTime
from users
where access_day = '20160601'
group by requestUrl
order by visitCount desc;
网站停留时间统计
平均网站停留时间=网站总停留时间/会话的数量(访次)
//注意,本时间数据类型为int,为了方便讲解业务,故直接把两时间数据进行相减,具体业务还需实现用户自定义函数(UDF)把int转换为时间数据并进行相关操作。
1、 每个访客每天的网站停留时间=最后一次时间-首次访问时间
-- View pageTime: for each request IP in the 20170601 partition, the gap
-- between its earliest and latest accessTime, divided by 1000 and rounded up
-- (accessTime is documented as milliseconds, so the result is in seconds).
-- Nothing may follow the terminating semicolon on the same line, otherwise
-- the statement fails to parse when the file is run as a script.
create view pageTime (id, visitKeepTime) as
select
    requestIp,
    ceil((max(accessTime) - min(accessTime)) / 1000) visitKeepTime
from users
where access_day = '20170601'
group by requestIp;

-- Inspect the view; columns are listed explicitly instead of using "*".
select
    id,
    visitKeepTime
from pageTime;
2、 根据第一步的数据,统计分析网站的用户平均停留时间
-- Mean visitKeepTime over every row of the pageTime view.
select avg(visitKeepTime)
from pageTime;
客户设备相关信息统计
1、 浏览器比例分析,访问量。统计每个浏览器对应的版本有多少人访问
-- Distinct request IPs per (browser, browserVersion) for siteType '0' in the
-- 20160601 partition, sorted by browser then version.
select
    browser,
    browserVersion,
    count(distinct requestIp) staCount
from users
where siteType = '0'
  and access_day = '20160601'
group by
    browser,
    browserVersion
order by
    browser,
    browserVersion;
2、 操作系统统计,访问量。统计每个操作系统有多少人使用
-- Distinct userFlag values per operating system for siteType '0' in the
-- 20160601 partition, sorted by operating system.
-- The statement is terminated with ';' so it does not run into the next one.
select
    operateSystem,
    count(distinct userflag)
from users
where siteType = '0'
  and access_day = '20160601'
group by operateSystem
order by operateSystem;
3、 屏幕颜色统计,访问量。有多少人使用
-- Distinct userFlag values per screen colour for siteType '0' in the
-- 20160601 partition, sorted by screen colour.
-- The statement is terminated with ';' so it does not run into the next one.
select
    screenColor,
    count(distinct userflag)
from users
where siteType = '0'
  and access_day = '20160601'
group by screenColor
order by screenColor;
4、 屏幕尺寸,访问量。有多少人使用
-- Distinct userFlag values per screen size for siteType '0' in the
-- 20160601 partition, sorted by screen size.
-- The statement is terminated with ';' so it does not run into the next one.
select
    screenSize,
    count(distinct userflag)
from users
where siteType = '0'
  and access_day = '20160601'
group by screenSize
order by screenSize;
来源统计
1、 来源关键字统计。统计每个关键字使用的次数
-- Occurrences of each entry keyword for siteType '0' in the 20160601
-- partition, skipping rows whose keyword is the placeholder '-';
-- most frequent keywords first.
select
    originWord,
    count(*) staCount
from users
where siteType = '0'
  and access_day = '20160601'
  and originWord <> '-'
group by originWord
order by staCount desc;
2、 热门入口网站地址。
-- Row count per userOrigin (entry address) for siteType '0' in the 20160601
-- partition, most frequent first.
select
    userOrigin,
    count(*) staCount
from users
where siteType = '0'
  and access_day = '20160601'
group by userOrigin
order by staCount desc;
3、 热门标题统计
-- Row count per pageTitle for siteType '0' in the 20160601 partition,
-- most frequent first.
select
    pageTitle,
    count(*) staCount
from users
where siteType = '0'
  and access_day = '20160601'
group by pageTitle
order by staCount desc;
用户地区访问量分布
1、 主要统计每个省份的访问量
-- Row count (page views) per province for siteType '0' in the 20160601
-- partition.
select
    userprovince,
    count(*)
from users
where siteType = '0'
  and access_day = '20160601'
group by userProvince;
2、 统计每个省份访客数
-- Distinct request IPs per province for siteType '0' in the 20160601
-- partition.
select
    userprovince,
    count(distinct requestIp)
from users
where siteType = '0'
  and access_day = '20160601'
group by userProvince;




用户访问相关信息 留存率 
1、 当日回访人数占比
即一天内多次访问网站(产生了多个会话session)的独立访客数。
-- Returning visitors for 20160601 (siteType '0'): request IPs that produced
-- more than one session that day. A session is identified by visitFlag, so
-- the inner query counts distinct visitFlag values per IP; counting rows
-- would measure page views and flag any visitor who opened two pages in a
-- single session.
select count(distinct requestIp)
from (
    select
        requestIp,
        count(distinct visitFlag) visitNum
    from users
    where siteType = '0'
      and access_day = '20160601'
    group by requestIp
) a
where a.visitNum > 1;
2、 访客平均访问频度
平均每个独立访客一天内访问网站的次数(产生的session个数),访客平均访问频度=访问次数/独立访客。
-- Average visit frequency for 20160601 (siteType '0'):
-- number of sessions (distinct visitFlag) divided by number of distinct
-- request IPs. The ratio is computed over the whole partition in one row;
-- grouping by requestIp would force the denominator to 1 in every group.
-- NOTE(review): assumes visitFlag values are unique across visitors -- confirm.
select count(distinct visitFlag) / count(distinct requestIp)
from users
where siteType = '0'
  and access_day = '20160601';
3、 独立用户平均访问时长
平均每次访问(会话)在网站上的停留时间。平均访问时长=访问时长/访问次数。体现网站对访客的吸引程度。
-- Average duration per session for the 20170601 partition:
-- total session time divided by the number of sessions.
-- The inner query computes, per visitFlag (session id), the gap between the
-- earliest and latest accessTime, divided by 1000 and rounded up (accessTime
-- is documented as milliseconds, so the result is in seconds).
-- Nothing may follow the terminating semicolon on the same line.
select sum(visitKeepTime) / count(distinct visitFlag)
from (
    select
        visitFlag,
        ceil((max(accessTime) - min(accessTime)) / 1000) visitKeepTime
    from users
    where access_day = '20170601'
    group by visitFlag
) a;
4、 平均访问深度   
平均每次访问(会话)产生的PV。平均访问深度=浏览次数/访问次数。体现网站对访客的吸引程度。
-- Average visit depth for 20160601: page views divided by the number of
-- sessions (distinct visitFlag). Both aggregates come from one scan of the
-- partition, so no cross join of two sub-queries is needed.
select count(*) / count(distinct visitFlag)
from users
where access_day = '20160601';
5、 人均浏览页面
-- Pages per visitor for 20160601: page views divided by distinct request IPs.
-- Computed in a single pass; this avoids an implicit (comma) cross join,
-- which Hive rejects when Cartesian-product strict checks are enabled.
select count(*) / count(distinct requestIp)
from users
where access_day = '20160601';
6、 新增独立访客
-- New unique visitors on 20160601: request IPs whose earliest accessDate over
-- all partitions is 20160601.
-- The outer filter only uses columns the sub-query exposes (requestIp, edate).
-- NOTE(review): cast(accessDate as int) assumes accessDate is formatted as
-- yyyyMMdd -- confirm against the data; other formats cast to NULL.
select count(distinct requestIp)
from (
    select
        requestIp,
        min(cast(accessDate as int)) edate
    from users
    group by requestIp
) a
where a.edate = 20160601;


-- Distinct request IPs over all partitions whose sFlag equals '1'.
-- sFlag is a string column, so it is compared with a string literal rather
-- than relying on implicit string-to-number coercion.
-- NOTE(review): the meaning of sFlag is not documented in this file -- confirm
-- that '1' marks the rows this metric is meant to count.
select count(distinct requestIp)
from users
where sFlag = '1';


7、 来路域名
  select referUrl,count(*) from users group by referUrl;






你可能感兴趣的:(hive)