ods层与宽表ods_weblog_detail数据
(1)建表
-- Page-view (PV) counts bucketed by month / day / hour.
-- One partition per datestr value (this file loads '20130918').
CREATE TABLE dw_pvs_everyhour_oneday
(
    month STRING,
    day   STRING,
    hour  STRING,
    pvs   BIGINT
)
PARTITIONED BY (datestr STRING);
(2)插入数据:
对每月,每日,每小时进行聚合,求count(1)
-- Count the rows of ods_weblog_detail per (month, day, hour) within the
-- 20130918 partition and append them to the same partition of the target.
INSERT INTO TABLE dw_pvs_everyhour_oneday PARTITION (datestr = '20130918')
SELECT
    src.month,
    src.day,
    src.hour,
    COUNT(*) AS pvs
FROM ods_weblog_detail src
WHERE src.datestr = '20130918'
GROUP BY
    src.month,
    src.day,
    src.hour;
(3)查询
+--------------------------------+------------------------------+-------------------------------+------------------------------+----------------------------------+--+
| dw_pvs_everyhour_oneday.month | dw_pvs_everyhour_oneday.day | dw_pvs_everyhour_oneday.hour | dw_pvs_everyhour_oneday.pvs | dw_pvs_everyhour_oneday.datestr |
+--------------------------------+------------------------------+-------------------------------+------------------------------+----------------------------------+--+
| 09 | 18 | 06 | 111 | 20130918 |
| 09 | 18 | 07 | 1010 | 20130918 |
| 09 | 18 | 08 | 2052 | 20130918 |
| 09 | 18 | 09 | 1374 | 20130918 |
| 09 | 18 | 10 | 568 | 20130918 |
| 09 | 18 | 11 | 571 | 20130918 |
| 09 | 18 | 12 | 621 | 20130918 |
| 09 | 18 | 13 | 531 | 20130918 |
| 09 | 18 | 14 | 514 | 20130918 |
| 09 | 18 | 15 | 759 | 20130918 |
| 09 | 18 | 16 | 475 | 20130918 |
| 09 | 18 | 17 | 382 | 20130918 |
| 09 | 18 | 18 | 262 | 20130918 |
| 09 | 18 | 19 | 390 | 20130918 |
| 09 | 18 | 20 | 211 | 20130918 |
| 09 | 18 | 21 | 213 | 20130918 |
| 09 | 18 | 22 | 351 | 20130918 |
| 09 | 18 | 23 | 382 | 20130918 |
| 09 | 19 | 00 | 312 | 20130918 |
| 09 | 19 | 01 | 324 | 20130918 |
| 09 | 19 | 02 | 546 | 20130918 |
| 09 | 19 | 03 | 552 | 20130918 |
| 09 | 19 | 04 | 569 | 20130918 |
| 09 | 19 | 05 | 540 | 20130918 |
| 09 | 19 | 06 | 150 | 20130918 |
+--------------------------------+------------------------------+-------------------------------+------------------------------+----------------------------------+--+
(1)建表
-- Page-view (PV) counts bucketed by month / day.
-- One partition per datestr value.
CREATE TABLE dw_pvs_everyday
(
    month STRING,
    day   STRING,
    pvs   BIGINT
)
PARTITIONED BY (datestr STRING);
(2)插入数据
对每月,每日进行聚合,求count(1)
-- Count the rows of ods_weblog_detail per (month, day) within the
-- 20130918 partition and append them to the same partition of the target.
INSERT INTO TABLE dw_pvs_everyday PARTITION (datestr = '20130918')
SELECT
    src.month,
    src.day,
    COUNT(*) AS pvs
FROM ods_weblog_detail src
WHERE src.datestr = '20130918'
GROUP BY
    src.month,
    src.day;
(3)查询
+--------+------+--------+--+
| month | day | pvs |
+--------+------+--------+--+
| 09 | 18 | 10777 |
| 09 | 19 | 2993 |
+--------+------+--------+--+
(1)建表
-- PV counts per referer URL / referer host, bucketed by month / day / hour.
-- One partition per datestr value.
CREATE TABLE dw_pvs_referer_everyhour
(
    referer_url    STRING,
    referer_host   STRING,
    month          STRING,
    day            STRING,
    hour           STRING,
    pv_referer_cnt BIGINT
)
PARTITIONED BY (datestr STRING);
(2)插入数据
对每月,每日,每小时,每个url,host进行聚合,求count(1)
-- Per-hour PV count for every (http_referer, ref_host) pair.
--
-- The source is restricted to the partition that matches the target
-- partition; without that predicate, rows from every datestr of
-- ods_weblog_detail would be written into the 20130918 partition.
-- ref_host IS NOT NULL is a row-level condition on a grouping key, so it is
-- applied in WHERE (before aggregation) rather than in HAVING.
INSERT INTO TABLE dw_pvs_referer_everyhour PARTITION (datestr = '20130918')
SELECT
    http_referer,
    ref_host,
    month,
    day,
    hour,
    COUNT(1) AS pv_referer_cnt
FROM ods_weblog_detail
WHERE datestr = '20130918'
  AND ref_host IS NOT NULL
GROUP BY
    http_referer,
    ref_host,
    month,
    day,
    hour
ORDER BY
    hour ASC,
    day ASC,
    month ASC,
    pv_referer_cnt DESC;
(3)查询
+--------+------+-------+-------------------+----------------------------------------------------+-----------------+--+
| month | day | hour | referer_host | referer_url | pv_referer_cnt |
+--------+------+-------+-------------------+----------------------------------------------------+-----------------+--+
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/mongodb-replica-set/" | 20 |
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/vps-ip-dns/" | 19 |
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/nodejs-grunt-intro/" | 7 |
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/nodejs-socketio-chat/" | 7 |
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/wp-content/themes/silesia/style.css" | 7 |
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/nodejs-async/" | 5 |
| 09 | 18 | 06 | www.angularjs.cn | "http://www.angularjs.cn/A00n" | 2 |
| 09 | 18 | 06 | blog.fens.me | "http://blog.fens.me/nodejs-express3/" | 2 |
| 09 | 18 | 06 | www.angularjs.cn | "http://www.angularjs.cn/" | 1 |
| 09 | 18 | 06 | www.google.com | "http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=6&cad=rja&ved=0CHIQFjAF&url=http%3A%2F%2Fblog.fens.me%2Fvps-ip-dns%2F&ei=j045UrP5AYX22AXsg4G4DQ&usg=AFQjCNGsJfLMNZnwWXNpTSUl6SOEzfF6tg&sig2=YY1oxEybUL7wx3IrVIMfHA&bvm=bv.52288139,d.b2I" | 1 |
+--------+------+-------+-------------------+----------------------------------------------------+-----------------+--+
(1)建表
-- PV counts per referer host, bucketed by month / day / hour.
-- One partition per datestr value.
CREATE TABLE dw_pvs_refererhost_everyhour
(
    ref_host      STRING,
    month         STRING,
    day           STRING,
    hour          STRING,
    ref_host_cnts BIGINT
)
PARTITIONED BY (datestr STRING);
(2)插入数据
对每月,每日,每小时,每个host进行聚合,求count(1)
-- Per-hour PV count for every referer host.
--
-- The source is restricted to the partition that matches the target
-- partition; without that predicate, rows from every datestr of
-- ods_weblog_detail would be written into the 20130918 partition.
-- ref_host IS NOT NULL is a row-level condition on a grouping key, so it is
-- applied in WHERE (before aggregation) rather than in HAVING.
INSERT INTO TABLE dw_pvs_refererhost_everyhour PARTITION (datestr = '20130918')
SELECT
    ref_host,
    month,
    day,
    hour,
    COUNT(1) AS ref_host_cnts
FROM ods_weblog_detail
WHERE datestr = '20130918'
  AND ref_host IS NOT NULL
GROUP BY
    ref_host,
    month,
    day,
    hour
ORDER BY
    hour ASC,
    day ASC,
    month ASC,
    ref_host_cnts DESC;
(3)查询
+--------+------+-------+--------------------+----------------+--+
| month | day | hour | ref_host | ref_host_cnts |
+--------+------+-------+--------------------+----------------+--+
| 09 | 18 | 06 | blog.fens.me | 68 |
| 09 | 18 | 06 | www.angularjs.cn | 3 |
| 09 | 18 | 06 | www.google.com | 2 |
| 09 | 18 | 06 | www.baidu.com | 1 |
| 09 | 18 | 06 | cos.name | 1 |
| 09 | 18 | 07 | blog.fens.me | 711 |
| 09 | 18 | 07 | www.google.com.hk | 20 |
| 09 | 18 | 07 | www.angularjs.cn | 20 |
| 09 | 18 | 07 | www.dataguru.cn | 10 |
| 09 | 18 | 07 | www.fens.me | 6 |
+--------+------+-------+--------------------+----------------+--+
(1)建表
-- Top-N referer hosts per hour bucket.
-- One partition per datestr value.
-- NOTE(review): toporder and ref_host_cnts are declared STRING although the
-- load statement in this file fills them from ROW_NUMBER() and a BIGINT
-- count; numeric types may be preferable - confirm with downstream readers.
CREATE TABLE dw_pvs_refhost_topn_everyhour
(
    hour          STRING,
    toporder      STRING,
    ref_host      STRING,
    ref_host_cnts STRING
)
PARTITIONED BY (datestr STRING);
(2)插入数据
使用窗口函数ROW_NUMBER() over求取前3pv量的host来源
-- Keep the three referer hosts with the highest PV count in each hour bucket.
--
-- Hosts are ranked with ROW_NUMBER() inside each CONCAT(month, day, hour)
-- bucket, ordered by ref_host_cnts descending; rows ranked 1..3 are stored.
-- The source is limited to the 20130918 partition so that rows loaded under
-- other datestr values cannot take part in the ranking or be written into
-- this partition.
INSERT INTO TABLE dw_pvs_refhost_topn_everyhour PARTITION (datestr = '20130918')
SELECT
    t.hour,
    t.od,
    t.ref_host,
    t.ref_host_cnts
FROM (
    SELECT
        ref_host,
        ref_host_cnts,
        CONCAT(month, day, hour) AS hour,
        ROW_NUMBER() OVER (
            PARTITION BY CONCAT(month, day, hour)
            ORDER BY ref_host_cnts DESC
        ) AS od
    FROM dw_pvs_refererhost_everyhour
    WHERE datestr = '20130918'
) t
WHERE t.od <= 3;
第一步:使用一个ip地址代表一个人,对ip地址去重
-- Number of distinct visitor IPs (remote_addr) in the 20130918 partition.
-- Limited to the same partition as the per-visitor average query that uses
-- this figure as its denominator, so both steps describe the same day.
SELECT COUNT(DISTINCT remote_addr)
FROM ods_weblog_detail
WHERE datestr = '20130918';
第二步:总页数/去重后的ip地址
-- Average page views per visitor IP for 20130918:
-- the inner query yields one row per remote_addr with its row count; the
-- outer query divides the summed counts by the number of non-NULL addresses.
SELECT
    '20130918',
    SUM(per_ip.pvs) / COUNT(per_ip.remote_addr)
FROM (
    SELECT
        remote_addr,
        COUNT(1) AS pvs
    FROM ods_weblog_detail
    WHERE datestr = '20130918'
    GROUP BY remote_addr
) per_ip;
使用request字段来代表我们访问的页面
(1)查询
-- Twenty most frequently requested pages across ods_weblog_detail.
-- NULL requests are discarded in WHERE (a row-level filter belongs before
-- aggregation, not in HAVING); the result set is unchanged.
SELECT
    request,
    COUNT(request) AS request_counts
FROM ods_weblog_detail
WHERE request IS NOT NULL
GROUP BY request
ORDER BY request_counts DESC
LIMIT 20;
(2)结果
+----------------------------------------------------+-----------------+--+
| request | request_counts |
+----------------------------------------------------+-----------------+--+
| / | 3139 |
| /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 | 361 |
| /wp-includes/js/jquery/jquery.js?ver=1.10.2 | 358 |
| /js/baidu.js | 318 |
| /wp-admin/admin-ajax.php | 308 |
| /js/google.js | 308 |
| /wp-content/themes/silesia/js/jquery.cycle.all.min.js | 293 |
| /wp-content/themes/silesia/functions/js/shortcode.js | 290 |
| /wp-content/themes/silesia/js/load.js | 290 |
| /wp-includes/js/comment-reply.min.js?ver=3.6 | 285 |
| /feed/ | 263 |
| /wp-content/themes/silesia/style.css | 255 |
| /wp-content/themes/silesia/functions/css/shortcodes.css | 254 |
| /wp-content/themes/silesia/images/slide-bg.png | 238 |
| /wp-content/themes/silesia/images/natty-logo.png | 238 |
| /wp-content/themes/silesia/images/crubms-div.png | 238 |
| /wp-content/themes/silesia/images/ico-twitter.png | 236 |
| /wp-content/themes/silesia/images/home-ico.png | 236 |
| /wp-content/themes/silesia/images/ico-meta.gif | 235 |
| /wp-content/themes/silesia/images/sprites/post-type.png | 233 |
+----------------------------------------------------+-----------------+--+
(1)建表
-- Most requested pages per day: day label, request URL and its PV count.
-- The type of `day` is spelled STRING (the misspelling "stirng" makes the
-- statement fail to parse).
-- NOTE(review): pvs is declared STRING although the load statement in this
-- file fills it from COUNT(); a numeric type may be preferable - confirm
-- with downstream readers before changing it.
CREATE TABLE dw_hotpages_everyday
(
    day STRING,
    url STRING,
    pvs STRING
);
(2)插入数据
聚合,limit
-- Store the ten most requested pages of the 20130918 partition.
-- The inner query counts requests per page; NULL requests are removed in
-- WHERE (row-level filter before aggregation, rather than in HAVING).
-- The outer query labels each row with the day and keeps the top ten by count.
INSERT INTO TABLE dw_hotpages_everyday
SELECT
    '20130918',
    a.request,
    a.request_counts
FROM (
    SELECT
        request,
        COUNT(request) AS request_counts
    FROM ods_weblog_detail
    WHERE datestr = '20130918'
      AND request IS NOT NULL
    GROUP BY request
) a
ORDER BY a.request_counts DESC
LIMIT 10;
(3)查询
+-----------+----------------------------------------------------+-------+--+
| day | url | pvs |
+-----------+----------------------------------------------------+-------+--+
| 20130918 | / | 3139 |
| 20130918 | /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 | 361 |
| 20130918 | /wp-includes/js/jquery/jquery.js?ver=1.10.2 | 358 |
| 20130918 | /js/baidu.js | 318 |
| 20130918 | /wp-admin/admin-ajax.php | 308 |
| 20130918 | /js/google.js | 308 |
| 20130918 | /wp-content/themes/silesia/js/jquery.cycle.all.min.js | 293 |
| 20130918 | /wp-content/themes/silesia/functions/js/shortcode.js | 290 |
| 20130918 | /wp-content/themes/silesia/js/load.js | 290 |
| 20130918 | /wp-includes/js/comment-reply.min.js?ver=3.6 | 285 |
+-----------+----------------------------------------------------+-------+--+
(1)sql语句
-- Ten highest (request, month, day) PV counts in the 20130918 partition,
-- with CONCAT(month, day) as a combined day label in the first column.
-- NULL requests are removed in WHERE (row-level filter before aggregation,
-- rather than in HAVING). The outer ORDER BY makes the displayed order
-- explicit instead of relying on the ordering of the subquery.
SELECT
    CONCAT(a.month, a.day),
    a.month,
    a.day,
    a.request,
    a.request_counts
FROM (
    SELECT
        month,
        day,
        request,
        COUNT(1) AS request_counts
    FROM ods_weblog_detail
    WHERE datestr = '20130918'
      AND request IS NOT NULL
    GROUP BY request, month, day
    ORDER BY request_counts DESC
    LIMIT 10
) a
ORDER BY a.request_counts DESC;
(2)结果
+-------+----------+--------+----------------------------------------------------+-------------------+--+
| _c0 | a.month | a.day | a.request | a.request_counts |
+-------+----------+--------+----------------------------------------------------+-------------------+--+
| 0918 | 09 | 18 | / | 2268 |
| 0919 | 09 | 19 | / | 871 |
| 0918 | 09 | 18 | /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 | 293 |
| 0918 | 09 | 18 | /wp-includes/js/jquery/jquery.js?ver=1.10.2 | 290 |
| 0918 | 09 | 18 | /js/baidu.js | 269 |
| 0918 | 09 | 18 | /js/google.js | 259 |
| 0918 | 09 | 18 | /wp-content/themes/silesia/js/jquery.cycle.all.min.js | 244 |
| 0918 | 09 | 18 | /wp-content/themes/silesia/js/load.js | 243 |
| 0918 | 09 | 18 | /wp-content/themes/silesia/functions/js/shortcode.js | 242 |
| 0918 | 09 | 18 | /wp-includes/js/comment-reply.min.js?ver=3.6 | 223 |
+-------+----------+--------+----------------------------------------------------+-------------------+--+
独立访客:按照时间维度(比如小时)来统计独立访客及其产生的pv。对于独立访客的识别,如果在原始日志中有用户标识,则根据用户标识即可很好地实现;此处,由于原始日志中并没有用户标识,以访客IP来模拟,技术上是一样的,只是精确度相对较低。
(1)建表
-- PV count per visitor IP (remote_addr) and hour bucket. Not partitioned.
CREATE TABLE dw_user_dstc_ip_h
(
    remote_addr STRING,
    pvs         BIGINT,
    hour        STRING
);
(2)插入数据
-- PV count per visitor IP and hour bucket for the 20130918 partition.
-- The hour bucket is CONCAT(month, day, hour).
-- Written as INSERT INTO TABLE, matching every other load statement in this file.
INSERT INTO TABLE dw_user_dstc_ip_h
SELECT
    remote_addr,
    COUNT(1) AS pvs,
    CONCAT(month, day, hour) AS hour
FROM ods_weblog_detail
WHERE datestr = '20130918'
GROUP BY
    remote_addr,
    CONCAT(month, day, hour);
(1)创建访客累计表(无重复用户)
-- Cumulative visitor table: one (day, ip) row per visitor.
-- One partition per datestr value.
CREATE TABLE dw_user_dsct_history
(
    day STRING,
    ip  STRING
)
PARTITIONED BY (datestr STRING);
(2)创建每日新访客表
-- Daily new-visitor table: one (day, ip) row per newly seen visitor.
-- One partition per datestr value.
CREATE TABLE dw_user_new_d
(
    day STRING,
    ip  STRING
)
PARTITIONED BY (datestr STRING);
(3)每日新访客插入到新访客表
使用每日访客表与累计访客总表进行左外连接,再用where进行筛选,选择tmp.old_addr IS NULL的记录,即为新增的访客,再加入到总表
-- Load the visitor IPs of 20130918 that are not yet present in
-- dw_user_dsct_history.
-- The day's distinct remote_addr values are left-joined to the cumulative
-- history table on ip; rows with no match (old_addr IS NULL) are kept.
INSERT INTO TABLE dw_user_new_d PARTITION (datestr = '20130918')
SELECT
    tmp.day,
    tmp.today_addr
FROM (
    SELECT
        today.day         AS day,
        today.remote_addr AS today_addr,
        old.ip            AS old_addr
    FROM (
        SELECT DISTINCT
            remote_addr,
            "20130918" AS day
        FROM ods_weblog_detail
        WHERE datestr = '20130918'
    ) today
    LEFT JOIN dw_user_dsct_history old
        ON today.remote_addr = old.ip
) tmp
WHERE tmp.old_addr IS NULL;
(4)每日新访客追加到访客累积表
-- Append the new visitors recorded for 20130918 in dw_user_new_d to the
-- same partition of the cumulative history table.
INSERT INTO TABLE dw_user_dsct_history PARTITION (datestr = '20130918')
SELECT
    day,
    ip
FROM dw_user_new_d
WHERE datestr = '20130918';
(1)建表
-- Returning visitors: day label, visitor IP and its visit count.
-- One partition per datestr value.
-- NOTE(review): acc_cnt is declared STRING although the load statement in
-- this file fills it from COUNT(); confirm whether a numeric type is wanted.
CREATE TABLE dw_user_returning
(
    day         STRING,
    remote_addr STRING,
    acc_cnt     STRING
)
PARTITIONED BY (datestr STRING);
(2)插入数据
对session进行count,统计visit表中每个用户的来访次数,再用where进行筛选,选择tmp.acc_cnt大于1的为回头客
-- Rebuild the 20130918 partition of dw_user_returning with every visitor IP
-- that has more than one session in ods_click_stream_visit.
-- The source is limited to the 20130918 partition so that sessions loaded
-- under other datestr values are not counted towards this day's result.
INSERT OVERWRITE TABLE dw_user_returning PARTITION (datestr = '20130918')
SELECT
    tmp.day,
    tmp.remote_addr,
    tmp.acc_cnt
FROM (
    SELECT
        '20130918'     AS day,
        remote_addr,
        COUNT(session) AS acc_cnt
    FROM ods_click_stream_visit
    WHERE datestr = '20130918'
    GROUP BY remote_addr
) tmp
WHERE tmp.acc_cnt > 1;
总visit数/去重ip后的用户数
-- Summed pagevisits divided by the number of distinct remote_addr values in
-- the 20130918 partition of ods_click_stream_visit.
-- NOTE(review): the caption preceding this query describes the metric as
-- "total visits / distinct IPs", whereas SUM(pagevisits) adds up the
-- pagevisits column rather than counting rows - confirm which is intended.
SELECT
    SUM(pagevisits) / COUNT(DISTINCT remote_addr)
FROM ods_click_stream_visit
WHERE datestr = '20130918';