stored as textfile;
-- Load the four daily log extracts from the local filesystem, one per
-- access_day partition of users. "overwrite" replaces whatever the target
-- partition already holds. Listed in chronological order.
load data local inpath '/home/hadoop/data/20160101.txt'
    overwrite into table users
    partition (access_day='20160101');
load data local inpath '/home/hadoop/data/20160601.txt'
    overwrite into table users
    partition (access_day='20160601');
load data local inpath '/home/hadoop/data/20170101.txt'
    overwrite into table users
    partition (access_day='20170101');
load data local inpath '/home/hadoop/data/20170601.txt'
    overwrite into table users
    partition (access_day='20170601');
统计方法
PV统计
1、 按天统计PV
-- Page views (row count) per access date within the 20160601 partition.
select
    accessdate,
    count(*)
from users
where access_day = '20160601'
group by accessdate;
2、 按小时统计PV
-- Page views (row count) per access hour within the 20160601 partition.
select
    accesshour,
    count(*)
from users
where access_day = '20160601'
group by accesshour;
3、 统计每天每个省份PV
-- Page views per (access date, province) within the 20160601 partition.
select
    accessdate,
    userprovince,
    count(*)
from users
where access_day = '20160601'
group by
    accessdate,
    userprovince;
4、 每天每个省份 每个小时统计
-- Page views per (access date, province, hour) within the 20160601 partition.
select
    accessdate,
    userprovince,
    accesshour,
    count(*)
from users
where access_day = '20160601'
group by
    accessdate,
    userprovince,
    accesshour;
5、 统计每天每个页面的访问量
-- Page views per (access date, requested URL) within the 20160601 partition.
select
    accessdate,
    requesturl,
    count(*)
from users
where access_day = '20160601'
group by
    accessdate,
    requesturl;
UV统计
1、 统计总的访问人数,即访客(UV)
-- Unique visitors (distinct request IPs) for every access_day partition.
select
    access_day,
    count(distinct requestIp)
from users
group by access_day;
2、 统计当天平均访问页面数(页/人=PV/UV)
-- Total page views and unique visitors (distinct request IPs) side by side
-- for the 20160601 partition.
select
    count(*) as pvSta,
    count(distinct requestIp) as uvSta
from users
where access_day = '20160601';

-- Average pages per visitor for the same partition: PV divided by UV.
select
    count(*) / count(distinct requestIp)
from users
where access_day = '20160601';
3、 统计每个页面访客数和最早时间和最晚时间
-- For each requested URL in the 20160601 partition: number of distinct
-- request IPs plus the smallest and largest accessTime, busiest pages first.
select
    requesturl,
    count(distinct requestIp) as visitCount,
    min(accessTime) as firstAccessTime,
    max(accessTime) as recentAccessTime
from users
where access_day = '20160601'
group by requesturl
order by visitCount desc;
网站停留时间统计
平均网站停留时间=网站总停留时间/会话的数量(访次)
//注意,本时间数据类型为int,为了方便讲解业务,故直接把两时间数据进行相减,具体业务还需实现用户自定义函数(UDF)把int转换为时间数据并进行相关操作。
1、 每个访客每天的网站停留时间=最后一次时间-首次访问时间
-- View with one row per request IP in the 20170601 partition:
--   id            = the request IP
--   visitKeepTime = (last accessTime - first accessTime) / 1000, rounded up
-- NOTE(review): the subtraction assumes accessTime is a numeric value in
-- milliseconds (suggested by the /1000) - confirm against the table DDL.
-- The stray "bug" token that followed the terminating semicolon was removed;
-- it was being prepended to the next statement and made it fail to parse.
create view pageTime (id, visitKeepTime) as
select
    requestIp,
    ceil((max(accessTime) - min(accessTime)) / 1000) as visitKeepTime
from users
where access_day = '20170601'
group by requestIp;

-- Inspect the per-visitor stay times produced by the view.
select
    id,
    visitKeepTime
from pageTime;
2、 根据第一步的数据,统计分析网站的用户平均停留时间
-- Mean of the per-visitor stay times exposed by the pageTime view.
select
    avg(visitKeepTime)
from pageTime;
客户设备相关信息统计
1、 浏览器比例分析,访问量。统计每个浏览器对应的版本有多少人访问
-- Distinct request IPs per (browser, browser version) for rows with
-- siteType '0' in the 20160601 partition, sorted by browser then version.
select
    browser,
    browserVersion,
    count(distinct requestIp) as staCount
from users
where siteType = '0'
  and access_day = '20160601'
group by
    browser,
    browserVersion
order by
    browser,
    browserVersion;
2、 操作系统统计,访问量。统计每个操作系统有多少人使用
-- Distinct userflag values per operating system for rows with siteType '0'
-- in the 20160601 partition, sorted by operating system.
-- The terminating semicolon was missing, so in a script the following line
-- was parsed as part of this statement.
-- NOTE(review): this counts distinct userflag while the browser breakdown
-- counts distinct requestIp - confirm which column identifies a visitor.
select
    operateSystem,
    count(distinct userflag)
from users
where siteType = '0'
  and access_day = '20160601'
group by operateSystem
order by operateSystem;
3、 屏幕颜色统计,访问量。有多少人使用
-- Distinct userflag values per screen colour setting for rows with
-- siteType '0' in the 20160601 partition, sorted by screen colour.
-- The terminating semicolon was missing, so in a script the following line
-- was parsed as part of this statement.
select
    screenColor,
    count(distinct userflag)
from users
where siteType = '0'
  and access_day = '20160601'
group by screenColor
order by screenColor;
4、 屏幕尺寸,访问量。有多少人使用
-- Distinct userflag values per screen size for rows with siteType '0'
-- in the 20160601 partition, sorted by screen size.
-- The terminating semicolon was missing, so in a script the following line
-- was parsed as part of this statement.
select
    screenSize,
    count(distinct userflag)
from users
where siteType = '0'
  and access_day = '20160601'
group by screenSize
order by screenSize;
来源统计
1、 来源关键字统计。统计每个关键字使用的次数
-- Usage count per source keyword for rows with siteType '0' in the
-- 20160601 partition, skipping the '-' placeholder; most used first.
select
    originWord,
    count(*) as staCount
from users
where siteType = '0'
  and access_day = '20160601'
  and originWord <> '-'
group by originWord
order by staCount desc;
2、 热门入口网站地址。
-- Row count per userOrigin value for rows with siteType '0' in the
-- 20160601 partition; most frequent entry sites first.
select
    userOrigin,
    count(*) as staCount
from users
where siteType = '0'
  and access_day = '20160601'
group by userOrigin
order by staCount desc;
3、 热门标题统计
-- Row count per page title for rows with siteType '0' in the 20160601
-- partition; most viewed titles first.
select
    pageTitle,
    count(*) as staCount
from users
where siteType = '0'
  and access_day = '20160601'
group by pageTitle
order by staCount desc;
用户地区访问量分布
1、 主要统计每个省份的访问量
-- Page views (row count) per province for rows with siteType '0' in the
-- 20160601 partition.
select
    userprovince,
    count(*)
from users
where siteType = '0'
  and access_day = '20160601'
group by userprovince;
2、 统计每个省份访客数
-- Unique visitors (distinct request IPs) per province for rows with
-- siteType '0' in the 20160601 partition.
select
    userprovince,
    count(distinct requestIp)
from users
where siteType = '0'
  and access_day = '20160601'
group by userprovince;
用户访问相关信息 留存率
1、 当日回访人数占比
即一天内多次访问网站(产生了多个会话session)的独立访客数。
-- Number of request IPs that appear on more than one row with siteType '0'
-- in the 20160601 partition. The per-IP threshold is applied with "having"
-- inside the derived table, so only qualifying IPs reach the outer count.
select
    count(distinct requestIp)
from (
    select
        requestIp
    from users
    where siteType = '0'
      and access_day = '20160601'
    group by requestIp
    having count(requestIp) > 1
) a;
2、 访客平均访问频度
平均每个独立访客一天内访问网站的次数(产生的session个数),访客平均访问频度=访问次数/独立访客。
-- Average visit frequency: total rows divided by the number of distinct
-- request IPs, for rows with siteType '0' in the 20160601 partition.
-- The earlier form grouped by requestIp, which forced the distinct count to 1
-- in every group and returned a per-IP row count, not the overall average
-- (visits / unique visitors). Aggregating over the whole filtered set yields
-- the single ratio.
select
    count(*) / count(distinct requestIp)
from users
where siteType = '0'
  and access_day = '20160601';
3、 独立用户平均访问时长
平均每次访问(会话)在网站上的停留时间。平均访问时长=访问时长/访问次数。体现网站对访客的吸引程度。
-- Average stay time per unique visitor in the 20170601 partition: the
-- derived table computes (last accessTime - first accessTime) / 1000,
-- rounded up, per request IP; the outer query divides the total by the
-- number of distinct IPs.
-- NOTE(review): assumes accessTime is a numeric millisecond value - confirm.
-- The stray "session" token that followed the terminating semicolon was
-- removed; it was being prepended to the next statement in a script.
select
    sum(visitKeepTime) / count(distinct requestIp)
from (
    select
        requestIp,
        ceil((max(accessTime) - min(accessTime)) / 1000) as visitKeepTime
    from users
    where access_day = '20170601'
    group by requestIp
) a;
4、 平均访问深度
平均每次访问(会话)产生的PV。平均访问深度=浏览次数/访问次数。体现网站对访客的吸引程度。
-- Average visit depth for the 20160601 partition: page views divided by the
-- number of distinct request IPs.
-- Computed in a single scan. The earlier form cross-joined two one-row
-- derived tables with a comma join, which scans the partition twice and is
-- rejected as a Cartesian product when Hive runs in strict mode. The result
-- value is the same.
-- NOTE(review): distinct request IPs stand in for sessions here - confirm
-- whether a session identifier column exists.
select
    count(*) / count(distinct requestIp)
from users
where access_day = '20160601';
5、 人均浏览页面
-- Pages viewed per user for the 20160601 partition: page views divided by
-- the number of distinct request IPs.
-- Computed in a single scan. The earlier form cross-joined two one-row
-- derived tables with a comma join, which scans the partition twice and is
-- rejected as a Cartesian product when Hive runs in strict mode. The result
-- value is the same.
select
    count(*) / count(distinct requestIp)
from users
where access_day = '20160601';
6、 新增独立访客
-- New unique visitors on 20160601: request IPs whose earliest accessDate
-- across the whole table is that day.
-- The outer filter previously also tested "Flag=1", but the derived table
-- projects only requestIp and edate, so that column could not be resolved
-- and the query failed to compile. The predicate was dropped.
-- NOTE(review): assumes accessDate casts to an int of the form yyyyMMdd -
-- confirm the column format.
select
    count(distinct requestIp)
from (
    select
        requestIp,
        min(cast(accessDate as int)) as edate
    from users
    group by requestIp
) a
where a.edate = 20160601;
-- Distinct request IPs among rows whose sFlag equals 1, across all partitions.
-- NOTE(review): sFlag presumably marks a first-time visitor - confirm.
select
    count(distinct requestIp)
from users
where sFlag = 1;
7、 来路域名
-- Row count per referring URL across all partitions.
select
    referUrl,
    count(*)
from users
group by referUrl;