3、流量分析--分组TopN统计

流量分析--分组TopN统计
=========================================================================================
统计 pv 总量最大的来源 TopN
--需求:按照时间维度,统计一天内每个小时 pv 数最多的来源 TopN


-- Preview the first 10 rows of the hourly referer-host table.
select *
from dw_pvs_refererhost_everyhour
limit 10;
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
| dw_pvs_refererhost_everyhour.ref_host  | dw_pvs_refererhost_everyhour.month  | dw_pvs_refererhost_everyhour.day  | dw_pvs_refererhost_everyhour.hour  | dw_pvs_refererhost_everyhour.ref_host_cnts  | dw_pvs_refererhost_everyhour.datestr  |
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+
| blog.fens.me                           | 11                                  | 02                                | 00                                 | 222                                         | 20181101                              |
| www.fens.me                            | 11                                  | 02                                | 00                                 | 26                                          | 20181101                              |
| h2w.iask.cn                            | 11                                  | 02                                | 00                                 | 12                                          | 20181101                              |
| www.google.com.hk                      | 11                                  | 02                                | 00                                 | 6                                           | 20181101                              |
| angularjs.cn                           | 11                                  | 02                                | 00                                 | 6                                           | 20181101                              |
| cnodejs.org                            | 11                                  | 02                                | 00                                 | 2                                           | 20181101                              |
| www.leonarding.com                     | 11                                  | 02                                | 00                                 | 2                                           | 20181101                              |
| www.itpub.net                          | 11                                  | 02                                | 00                                 | 2                                           | 20181101                              |
| blog.fens.me                           | 11                                  | 02                                | 01                                 | 178                                         | 20181101                              |
| cos.name                               | 11                                  | 02                                | 01                                 | 6                                           | 20181101                              |
+----------------------------------------+-------------------------------------+-----------------------------------+------------------------------------+---------------------------------------------+---------------------------------------+--+

-- row_number(): number the referer hosts inside each month+day+hour bucket,
-- largest ref_host_cnts first. The concat() column is left unaliased, so it
-- shows up as _c2 in the result.
select
    ref_host,
    ref_host_cnts,
    concat(month, day, hour),
    row_number() over (
        partition by concat(month, day, hour)
        order by ref_host_cnts desc
    ) as od
from dw_pvs_refererhost_everyhour;
+-------------------+----------------+---------+-----+--+
|     ref_host      | ref_host_cnts  |   _c2   | od  |
+-------------------+----------------+---------+-----+--+
| blog.fens.me      | 136            | 110106  | 1   |
| blog.fens.me      | 136            | 110106  | 2   |
| www.angularjs.cn  | 6              | 110106  | 3   |
| www.angularjs.cn  | 6              | 110106  | 4   |
| www.google.com    | 4              | 110106  | 5   |
| www.google.com    | 4              | 110106  | 6   |
| www.baidu.com     | 2              | 110106  | 7   |
| cos.name          | 2              | 110106  | 8   |
| www.baidu.com     | 2              | 110106  | 9   |
| cos.name          | 2              | 110106  | 10  |
+-------------------+----------------+---------+-----+--+


-- Putting the above together: remove any previous copy of the TopN result table.
-- "if exists" keeps the script re-runnable on a fresh warehouse; a plain
-- "drop table" errors on a missing table when hive.exec.drop.ignorenonexistent
-- is set to false.
drop table if exists dw_pvs_refhost_topn_everyhour;

-- Create the result table that the per-hour TopN rows are inserted into.
-- One partition per datestr.
-- NOTE(review): toporder and ref_host_cnts receive numeric values but are
-- declared string; sorting or comparing them directly is lexicographic unless
-- cast -- confirm this is intended.
create table dw_pvs_refhost_topn_everyhour (
    hour          string,
    toporder      string,
    ref_host      string,
    ref_host_cnts string
)
partitioned by (datestr string);

-- Show the columns and partition information of the result table.
describe dw_pvs_refhost_topn_everyhour;
+--------------------------+-----------------------+-----------------------+--+
|         col_name         |       data_type       |        comment        |
+--------------------------+-----------------------+-----------------------+--+
| hour                     | string                |                       |
| toporder                 | string                |                       |
| ref_host                 | string                |                       |
| ref_host_cnts            | string                |                       |
| datestr                  | string                |                       |
|                          | NULL                  | NULL                  |
| # Partition Information  | NULL                  | NULL                  |
| # col_name               | data_type             | comment               |
|                          | NULL                  | NULL                  |
| datestr                  | string                |                       |
+--------------------------+-----------------------+-----------------------+--+


-- Insert: load the top 3 referer hosts of every hour into partition 20181101.
-- row_number() ranks hosts inside each month+day+hour bucket by ref_host_cnts,
-- largest first; ref_host is a tie-breaker so hosts with equal counts are
-- ranked the same way on every run.
-- The source is filtered to datestr = '20181101' so that only the day being
-- written is read; without the filter, rows of every datestr present in the
-- source would be loaded into the 20181101 target partition.
-- NOTE(review): "insert into" appends, so re-running this statement adds the
-- same rows again; consider "insert overwrite" if this job owns the partition.
insert into table dw_pvs_refhost_topn_everyhour partition (datestr = '20181101')
select
    t.hour,
    t.od,
    t.ref_host,
    t.ref_host_cnts
from (
    select
        ref_host,
        ref_host_cnts,
        concat(month, day, hour) as hour,
        row_number() over (
            partition by concat(month, day, hour)
            order by ref_host_cnts desc, ref_host
        ) as od
    from dw_pvs_refererhost_everyhour
    where datestr = '20181101'
) t
where t.od <= 3;

-- Preview the loaded TopN rows. The limit is 10 to match the 10-row sample
-- result that follows this statement (the statement previously said limit 5).
select *
from dw_pvs_refhost_topn_everyhour
limit 10;
+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------------+----------------------------------------+--+
| dw_pvs_refhost_topn_everyhour.hour  | dw_pvs_refhost_topn_everyhour.toporder  | dw_pvs_refhost_topn_everyhour.ref_host  | dw_pvs_refhost_topn_everyhour.ref_host_cnts  | dw_pvs_refhost_topn_everyhour.datestr  |
+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------------+----------------------------------------+--+
| 110106                              | 1                                       | blog.fens.me                            | 136                                          | 20181101                               |
| 110106                              | 2                                       | blog.fens.me                            | 136                                          | 20181101                               |
| 110106                              | 3                                       | www.angularjs.cn                        | 6                                            | 20181101                               |
| 110107                              | 1                                       | blog.fens.me                            | 1422                                         | 20181101                               |
| 110107                              | 2                                       | blog.fens.me                            | 1422                                         | 20181101                               |
| 110107                              | 3                                       | www.google.com.hk                       | 40                                           | 20181101                               |
| 110108                              | 1                                       | blog.fens.me                            | 3112                                         | 20181101                               |
| 110108                              | 2                                       | blog.fens.me                            | 3112                                         | 20181101                               |
| 110108                              | 3                                       | www.fens.me                             | 52                                           | 20181101                               |
| 110109                              | 1                                       | blog.fens.me                            | 2094                                         | 20181101                               |
+-------------------------------------+-----------------------------------------+-----------------------------------------+----------------------------------------------+----------------------------------------+--+

 

你可能感兴趣的:(数据分析(网站流量日志))