流量分析--分组TopN统计
=========================================================================================
统计每小时pv量最大的来源TopN
--需求:按照时间维度,统计一天内各小时产生最多pvs的来源topN
-- Peek at ten rows of the hourly referrer-host table to see its columns
-- (ref_host, month, day, hour, ref_host_cnts, datestr).
select *
from dw_pvs_refererhost_everyhour
limit 10;
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
| dw_pvs_refererhost_everyhour.ref_host | dw_pvs_refererhost_everyhour.month | dw_pvs_refererhost_everyhour.day | dw_pvs_refererhost_everyhour.hour | dw_pvs_refererhost_everyhour.ref_host_cnts | dw_pvs_refererhost_everyhour.datestr |
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
| blog.fens.me | 11 | 02 | 00 | 222 | 20181101 |
| www.fens.me | 11 | 02 | 00 | 26 | 20181101 |
| h2w.iask.cn | 11 | 02 | 00 | 12 | 20181101 |
| www.google.com.hk | 11 | 02 | 00 | 6 | 20181101 |
| angularjs.cn | 11 | 02 | 00 | 6 | 20181101 |
| cnodejs.org | 11 | 02 | 00 | 2 | 20181101 |
| www.leonarding.com | 11 | 02 | 00 | 2 | 20181101 |
| www.itpub.net | 11 | 02 | 00 | 2 | 20181101 |
| blog.fens.me | 11 | 02 | 01 | 178 | 20181101 |
| cos.name | 11 | 02 | 01 | 6 | 20181101 |
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
-- row_number(): number the referrer hosts inside each month+day+hour bucket,
-- highest ref_host_cnts first. The bucket key is left unaliased, so Hive
-- names that output column _c2.
-- NOTE(review): there is no tie-breaker after ref_host_cnts, so hosts with
-- equal counts may be numbered in a different order from run to run.
select
    ref_host,
    ref_host_cnts,
    concat(month, day, hour),
    row_number() over (
        partition by concat(month, day, hour)
        order by ref_host_cnts desc
    ) as od
from dw_pvs_refererhost_everyhour;
+-------------------+----------------+---------+-----+--+
| ref_host | ref_host_cnts | _c2 | od |
+-------------------+----------------+---------+-----+--+
| blog.fens.me | 136 | 110106 | 1 |
| blog.fens.me | 136 | 110106 | 2 |
| www.angularjs.cn | 6 | 110106 | 3 |
| www.angularjs.cn | 6 | 110106 | 4 |
| www.google.com | 4 | 110106 | 5 |
| www.google.com | 4 | 110106 | 6 |
| www.baidu.com | 2 | 110106 | 7 |
| cos.name | 2 | 110106 | 8 |
| www.baidu.com | 2 | 110106 | 9 |
| cos.name | 2 | 110106 | 10 |
+-------------------+----------------+---------+-----+--+
-- Putting the pieces together: start by removing any previous copy of the
-- Top-N result table. "if exists" keeps this step re-runnable when the table
-- is not there yet, independent of the hive.exec.drop.ignorenonexistent
-- setting.
drop table if exists dw_pvs_refhost_topn_everyhour;
-- Create the Top-N result table (data is loaded by the insert further down).
-- One partition per datestr. Per that insert, "hour" receives the
-- concatenated month+day+hour key and "toporder" receives the row_number()
-- rank.
-- NOTE(review): toporder and ref_host_cnts are declared as string although
-- they hold numbers; sorting on them here would be lexicographic. Left as-is
-- because changing the types alters the schema consumers see - confirm
-- before changing.
create table dw_pvs_refhost_topn_everyhour (
    hour          string,
    toporder      string,
    ref_host      string,
    ref_host_cnts string
)
partitioned by (datestr string);
-- Show the columns and partition information of the newly created table.
describe dw_pvs_refhost_topn_everyhour;
+--------------------------+-----------------------+-----------------------+--+
| col_name | data_type | comment |
+--------------------------+-----------------------+-----------------------+--+
| hour | string | |
| toporder | string | |
| ref_host | string | |
| ref_host_cnts | string | |
| datestr | string | |
| | NULL | NULL |
| # Partition Information | NULL | NULL |
| # col_name | data_type | comment |
| | NULL | NULL |
| datestr | string | |
+--------------------------+-----------------------+-----------------------+--+
-- Load step: keep the top 3 referrer hosts of every month+day+hour bucket
-- and write them to the datestr='20181101' partition.
--   * insert overwrite: a re-run replaces the partition instead of appending
--     a second copy of every row.
--   * where datestr = '20181101': read only the day being loaded, so rows
--     belonging to other days are never filed under this partition.
--   * order by ref_host_cnts desc, ref_host: ref_host breaks ties, so hosts
--     with equal counts receive the same rank on every run.
-- Columns are matched to the target by position:
-- hour, toporder, ref_host, ref_host_cnts.
-- NOTE(review): the recorded sample output lists the same host twice with an
-- identical count inside one hour, which suggests duplicated rows in the
-- source table - confirm upstream before trusting ranks 1 and 2.
insert overwrite table dw_pvs_refhost_topn_everyhour partition (datestr = '20181101')
select
    ranked.month_day_hour,
    ranked.od,
    ranked.ref_host,
    ranked.ref_host_cnts
from (
    select
        ref_host,
        ref_host_cnts,
        concat(month, day, hour) as month_day_hour,
        row_number() over (
            partition by concat(month, day, hour)
            order by ref_host_cnts desc, ref_host
        ) as od
    from dw_pvs_refererhost_everyhour
    where datestr = '20181101'
) ranked
where ranked.od <= 3;
-- Spot-check the loaded rows. limit 10 matches the ten sample rows recorded
-- below, and the order by returns them in hour / rank order so the check is
-- reproducible.
select *
from dw_pvs_refhost_topn_everyhour
order by hour, toporder
limit 10;
+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------------+----------------------------------------+--+
| dw_pvs_refhost_topn_everyhour.hour | dw_pvs_refhost_topn_everyhour.toporder | dw_pvs_refhost_topn_everyhour.ref_host | dw_pvs_refhost_topn_everyhour.ref_host_cnts | dw_pvs_refhost_topn_everyhour.datestr |
+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------------+----------------------------------------+--+
| 110106 | 1 | blog.fens.me | 136 | 20181101 |
| 110106 | 2 | blog.fens.me | 136 | 20181101 |
| 110106 | 3 | www.angularjs.cn | 6 | 20181101 |
| 110107 | 1 | blog.fens.me | 1422 | 20181101 |
| 110107 | 2 | blog.fens.me | 1422 | 20181101 |
| 110107 | 3 | www.google.com.hk | 40 | 20181101 |
| 110108 | 1 | blog.fens.me | 3112 | 20181101 |
| 110108 | 2 | blog.fens.me | 3112 | 20181101 |
| 110108 | 3 | www.fens.me | 52 | 20181101 |
| 110109 | 1 | blog.fens.me | 2094 | 20181101 |
+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------------+----------------------------------------+--+