如果需求中出现"每xxx"、"各xxx"、"按xxx"这样的字样 , 很大可能 xxx 就是分组的字段条件
设置智能本地模式 : set hive.exec.mode.local.auto=true;
需求 : 计算该处理批次(一天)中各小时pvs
处理数据所在的表 : ods_weblog_detail
分组条件 : 时间维度 (day hour)
表中的天(datestr)是分区字段 , 因此不需要通过group by , 直接用where条件即可过滤出所需要的那一天
-- Page views (pvs) for each hour of one day.
-- The day is selected through the datestr partition column, so no extra grouping is needed for it.
select
    t.month,
    t.day,
    t.hour,
    count(*) as pvs
from ods_weblog_detail t
where t.datestr = '20130918'
group by t.month, t.day, t.hour;
-- Persist the query result (stored on HDFS as a Hive table).
-- Approach 1: query the single table ods_weblog_detail directly.
-- Computes the page views of every hour for this processing batch (one day).
-- First create the table that will hold the query result.
drop table dw_pvs_everyhour_oneday;
create table dw_pvs_everyhour_oneday (
    month string,
    day   string,
    hour  string,
    pvs   bigint
)
partitioned by (datestr string);
-- Store the query result in the newly created table.
insert into table dw_pvs_everyhour_oneday partition (datestr = '20130918')
select
    src.month as month,
    src.day   as day,
    src.hour  as hour,
    count(*)  as pvs
from ods_weblog_detail src
where src.datestr = '20130918'
group by
    src.month,
    src.day,
    src.hour;
方式一 : 在上一个基础上sum每个小时就构成了一天
-- Daily total: sum the hourly page views already stored in dw_pvs_everyhour_oneday.
-- The day condition is a plain row filter, so it is applied in where (before grouping)
-- instead of having (after grouping); the result is the same.
insert into table dw_pvs_everyday
select
    sum(pvs) as pvs,
    month,
    day
from dw_pvs_everyhour_oneday
where day = '18'
group by month, day;
方式二 : 只能查询出一天的pvs量
-- Total page views of a single day, selected through the datestr partition.
select
    count(*) as pvs
from ods_weblog_detail t
where t.datestr = '20130918';
方式三 : 可以按天和月进行分组
-- Page views grouped by month and day for the selected partition.
select
    t.month,
    t.day,
    count(*) as pvs
from ods_weblog_detail t
where t.datestr = '20130918'
group by t.month, t.day;
-- Persist the query result (stored on HDFS as a Hive table).
-- Query the single table ods_weblog_detail directly.
drop table dw_pvs_everyday;
create table dw_pvs_everyday (
    pvs   bigint,
    month string,
    day   string
);
-- One row per (month, day) found in ods_weblog_detail.
insert into table dw_pvs_everyday
select
    count(*)  as pvs,
    src.month as month,
    src.day   as day
from ods_weblog_detail src
group by
    src.month,
    src.day;
方式四 :跟时间的维度表进行join
-- Dimension: month. Join the distinct months of the time dimension table with the log detail.
drop table dw_pvs_everymonth;
create table dw_pvs_everymonth (
    pvs   bigint,
    month string
);
insert into table dw_pvs_everymonth
select
    count(*) as pvs,
    dim.month
from (
    select distinct month
    from t_dim_time
) dim
join ods_weblog_detail log
    on dim.month = log.month
group by dim.month;
-- Dimension: day. Join the distinct (month, day) pairs of the time dimension with the log detail.
select
    count(*)  as pvs,
    dim.month as month,
    dim.day   as day
from (
    select distinct month, day
    from t_dim_time
) dim
join ods_weblog_detail log
    on  dim.month = log.month
    and dim.day   = log.day
group by
    dim.month,
    dim.day;
-- Dimension: hour. Join the distinct (month, day, hour) triples of the time dimension with the log detail.
select
    count(*)  as pvs,
    dim.month as month,
    dim.day   as day,
    dim.hour  as hour
from (
    select distinct month, day, hour
    from t_dim_time
) dim
join ods_weblog_detail log
    on  dim.month = log.month
    and dim.day   = log.day
    and dim.hour  = log.hour
group by
    dim.month,
    dim.day,
    dim.hour;
执行最终无结果 , 原因是:宽表中hour字段的提取有误
需要将 substring(time_local,11,3) as hour
改成 substring(time_local,12,2) as hour
需求 : 统计每小时各来访url产生的pvs
表 : ods_weblog_detail
分组字段 : 时间(hour) url(http_referer)
-- Page views per hour for every referring url (no filtering of empty referers yet).
select
    t.http_referer,
    t.ref_host,
    t.month,
    t.day,
    t.hour,
    count(*) as pvs
from ods_weblog_detail t
where t.datestr = '20130918'
group by t.http_referer, t.ref_host, t.month, t.day, t.hour
limit 10;
能执行 没考虑无意义数据
-- Page views per hour for every referring url, ignoring rows without a referer host.
-- ref_host is a grouping column, so filtering it in where (before grouping) returns the same
-- groups as filtering in having while aggregating fewer rows.
select
    t.http_referer,
    t.ref_host,
    t.month,
    t.day,
    t.hour,
    count(*) as pvs
from ods_weblog_detail t
where t.datestr = '20130918'
  and t.ref_host is not null
group by t.http_referer, t.ref_host, t.month, t.day, t.hour
limit 10;
-- The 10 (referer url, hour) combinations with the most page views,
-- ignoring rows without a referer host.
select
    t.http_referer,
    t.ref_host,
    t.month,
    t.day,
    t.hour,
    count(*) as pvs
from ods_weblog_detail t
where t.datestr = '20130918'
  and t.ref_host is not null
group by t.http_referer, t.ref_host, t.month, t.day, t.hour
order by pvs desc
limit 10;
-- Page views per hour for every referring url; the result is stored in dw_pvs_referer_everyhour.
drop table if exists dw_pvs_referer_everyhour;
create table dw_pvs_referer_everyhour (
    referer_url    string,
    referer_host   string,
    month          string,
    day            string,
    hour           string,
    pv_referer_cnt bigint
)
partitioned by (datestr string);
-- Only the source partition that matches the target partition is read; without this filter
-- every day present in ods_weblog_detail would be written into the 20130918 partition.
insert into table dw_pvs_referer_everyhour partition (datestr = '20130918')
select
    http_referer,
    ref_host,
    month,
    day,
    hour,
    count(1) as pv_referer_cnt
from ods_weblog_detail
where datestr = '20130918'
  and ref_host is not null
group by http_referer, ref_host, month, day, hour
order by hour asc, day asc, month asc, pv_referer_cnt desc;
表 : ods_weblog_detail
分组 : 时间(hour) host
方式一:在上一个基础之上 根据host分组 sum每个pvs
-- The 10 (referer host, hour) combinations with the most page views,
-- ignoring rows without a referer host.
select
    t.ref_host,
    t.hour,
    count(1) as pvs
from ods_weblog_detail t
where t.ref_host is not null
group by t.ref_host, t.hour
order by pvs desc
limit 10;
-- Page views per hour for every referring host; the result is stored in dw_pvs_refererhost_everyhour.
drop table if exists dw_pvs_refererhost_everyhour;
create table dw_pvs_refererhost_everyhour (
    ref_host      string,
    month         string,
    day           string,
    hour          string,
    ref_host_cnts bigint
)
partitioned by (datestr string);
-- Only the source partition that matches the target partition is read; without this filter
-- every day present in ods_weblog_detail would be written into the 20130918 partition.
insert into table dw_pvs_refererhost_everyhour partition (datestr = '20130918')
select
    ref_host,
    month,
    day,
    hour,
    count(1) as ref_host_cnts
from ods_weblog_detail
where datestr = '20130918'
  and ref_host is not null
group by ref_host, month, day, hour
order by hour asc, day asc, month asc, ref_host_cnts desc;
扩展了解:User Agent 也简称 UA。它是一个特殊字符串头,是一种向访问网站提供所使用的浏览器类型及版本、操作系统及版本、浏览器内核、等信息的标识。
- 按终端维度(了解)
-- Sample of distinct user-agent strings that contain "Chrome".
select distinct
    http_user_agent
from ods_weblog_detail
where http_user_agent like '%Chrome%'
limit 200;
- 按栏目维度(了解)
- 按 referer 维度(了解)
-- Page views per hour for every referring url.
-- (These statements had been collapsed onto one line behind a leading "--", which turned the
-- whole script into a comment; they are split back into separate statements here.)
drop table if exists dw_pvs_referer_everyhour;
create table dw_pvs_referer_everyhour (
    referer_url    string,
    referer_host   string,
    month          string,
    day            string,
    hour           string,
    pv_referer_cnt bigint
)
partitioned by (datestr string);
-- Read only the source partition that matches the target partition.
insert into table dw_pvs_referer_everyhour partition (datestr = '20130918')
select
    http_referer,
    ref_host,
    month,
    day,
    hour,
    count(1) as pv_referer_cnt
from ods_weblog_detail
where datestr = '20130918'
  and ref_host is not null
group by http_referer, ref_host, month, day, hour
order by hour asc, day asc, month asc, pv_referer_cnt desc;

-- Page views per hour for every referring host, sorted.
drop table if exists dw_pvs_refererhost_everyhour;
create table dw_pvs_refererhost_everyhour (
    ref_host      string,
    month         string,
    day           string,
    hour          string,
    ref_host_cnts bigint
)
partitioned by (datestr string);
-- Read only the source partition that matches the target partition.
insert into table dw_pvs_refererhost_everyhour partition (datestr = '20130918')
select
    ref_host,
    month,
    day,
    hour,
    count(1) as ref_host_cnts
from ods_weblog_detail
where datestr = '20130918'
  and ref_host is not null
group by ref_host, month, day, hour
order by hour asc, day asc, month asc, ref_host_cnts desc;
每 按 各
分组字段 : 时间(hour) 来源(host) pvs
表数据 : dw_pvs_refererhost_everyhour
知识点 : TOPN ( 分组 TOP)
语法:row_number() over (partition by xxx order by xxx) rank
,rank 为排序序号列的别名,相当于新增一个字段为 rank。
常用的三个排名函数 : row_number ,rank ,dense_rank
1 a 10
2 a 12
3 b 13
4 b 12
5 a 14
6 a 15
7 a 13
8 b 11
9 a 16
10 b 17
11 a 14
-- Compare the three ranking functions over the same window (per name, highest sal first).
-- name and sal are selected as well so the result has the six columns shown in the sample output.
select
    id,
    name,
    sal,
    rank()       over (partition by name order by sal desc) as rp,   -- ties share a rank, following ranks are skipped
    dense_rank() over (partition by name order by sal desc) as drp,  -- ties share a rank, no ranks are skipped
    row_number() over (partition by name order by sal desc) as rmp   -- always unique and consecutive
from f_test;
10 b 17 1 1 1
3 b 13 2 2 2
4 b 12 3 3 3
8 b 11 4 4 4
9 a 16 1 1 1
6 a 15 2 2 2
11 a 14 3 3 3
5 a 14 3 3 4
7 a 13 5 4 5
2 a 12 6 5 6
1 a 10 7 6 7
rank() over()
考虑了数据的重复性(并列的值序号相同) , 挤占坑位(并列之后的序号会被跳过)。语法 : rank() over(partition by xxx order by xxx) as rank
dense_rank() over()
考虑了数据的重复性(并列的值序号相同) , 不挤占坑位(并列之后的序号保持连续)。语法 : dense_rank() over(partition by xxx order by xxx) as rank
row_number() over()
不考虑数据的重复性(序号始终唯一且连续)。语法 : row_number() over(partition by xxx order by xxx) as rank
一般找出topN我们都采用dense_rank over()
-- Number the referring hosts inside every hour by descending page-view count.
select
    ref_host,
    ref_host_cnts,
    concat(month, day, hour),
    row_number() over (
        partition by concat(month, day, hour)
        order by ref_host_cnts desc
    ) od
from dw_pvs_refererhost_everyhour;
-- Same query, with the alias introduced through the explicit "as" keyword.
select
    ref_host,
    ref_host_cnts,
    concat(month, day, hour),
    row_number() over (
        partition by concat(month, day, hour)
        order by ref_host_cnts desc
    ) as od
from dw_pvs_refererhost_everyhour;
-- Top 3 referring hosts of every hour, stored in dw_pvs_refhost_topn_everyhour.
drop table if exists dw_pvs_refhost_topn_everyhour;
create table dw_pvs_refhost_topn_everyhour (
    hour          string,
    toporder      string,
    ref_host      string,
    ref_host_cnts string
)
partitioned by (datestr string);
-- The source table is partitioned by datestr as well; only the partition matching the target
-- partition is read, otherwise other days would also land in the 20130918 partition.
insert into table dw_pvs_refhost_topn_everyhour partition (datestr = '20130918')
select
    t.hour,
    t.od,
    t.ref_host,
    t.ref_host_cnts
from (
    select
        ref_host,
        ref_host_cnts,
        concat(month, day, hour) as hour,
        row_number() over (
            partition by concat(month, day, hour)
            order by ref_host_cnts desc
        ) as od
    from dw_pvs_refererhost_everyhour
    where datestr = '20130918'
) t
where od <= 3;
concat : concat()函数用于将多个字符串连接成一个字符串
返回结果为连接参数产生的字符串。如有任何一个参数为NULL ,则返回值为 NULL。
concat_ws() 代表 CONCAT With Separator ,是CONCAT()的特殊形式。第一个参数是其它参数的分隔符。分隔符的位置放在要连接的两个字符串之间。分隔符可以是一个字符串,也可以是其它参数。注意:如果分隔符为 NULL,则结果为 NULL。函数会忽略任何分隔符参数后的 NULL 值。
和MySQL中concat函数不同的是, concat_ws函数在执行的时候,不会因为NULL值而返回NULL
表 : ods_weblog_detail
-- Page views per visitor IP (remote_addr) for the selected day.
select
    t.remote_addr,
    count(1) as pvs
from ods_weblog_detail t
where t.datestr = '20130918'
group by t.remote_addr;
-- NOTE(review): incomplete fragment - the outer select list that preceded "from (" is not present here.
-- The inner query counts the page views per visitor IP (remote_addr) for partition 20130918.
from (select
t.remote_addr,count(1) as pvs
from ods_weblog_detail t
where t.datestr='20130918'
group by t.remote_addr) a;
报错: Invalid column reference 't' 。原因 : 在嵌套子查询中 , 外层不能多级嵌套引用内层的表别名 ; 可以在子查询中给字段起别名 , 外层通过别名引用。
-- Average page views per visitor: total page views divided by the number of visitor IPs.
-- The inner column is aliased (ip) so the outer query can reference it as a.ip.
select
    sum(a.pvs)/count(a.ip) as avgpvs
from (
    select
        t.remote_addr as ip,
        count(1) as pvs
    from ods_weblog_detail t
    where t.datestr = '20130918'
    group by t.remote_addr
) a;
-- Average page views per visitor of one day, stored in dw_avgpv_user_everyday.
drop table dw_avgpv_user_everyday;
create table dw_avgpv_user_everyday (
    day   string,
    avgpv string
);
insert into table dw_avgpv_user_everyday
select
    '20130918',
    sum(per_ip.pvs)/count(per_ip.remote_addr)
from (
    -- page views per visitor IP
    select
        remote_addr,
        count(1) as pvs
    from ods_weblog_detail
    where datestr = '20130918'
    group by remote_addr
) per_ip;
-- Average page views per visitor, written as a doubly nested query.
-- NOTE(review): the outermost select list was missing from the fragment; selecting
-- allen.avgpvs is a reconstruction - confirm against the intended query.
select
    allen.avgpvs
from (
    select
        sum(a.pvs)/count(a.ip) as avgpvs
    from (
        select
            t.remote_addr as ip,
            count(*) as pvs
        from ods_weblog_detail t
        where t.datestr = '20130918'
        group by t.remote_addr
    ) a
) allen;
需求: 统计每日最热门的页面 top10
分组:天(分区字段 where)页面(request)
方式一 :
row_number() over (partition by xxx order by xx) as step
因为此处的分组字段天恰巧也是分区字段 就不需要通过groupby 过滤
-- The 10 most requested pages of the selected day.
select
    t.request,
    count(*) as pages
from ods_weblog_detail t
where t.datestr = '20130918'
group by t.request
order by pages desc
limit 10;
-- The 10 most requested pages of one day, stored in dw_hotpages_everyday.
drop table dw_hotpages_everyday;
create table dw_hotpages_everyday (
    day string,
    url string,
    pvs string
);
insert into table dw_hotpages_everyday
select
    '20130918',
    hot.request,
    hot.request_counts
from (
    -- request count per page, skipping rows without a request
    select
        request as request,
        count(request) as request_counts
    from ods_weblog_detail
    where datestr = '20130918'
    group by request
    having request is not null
) hot
order by hot.request_counts desc
limit 10;
-- Page views per visitor IP and hour, stored in dw_user_dstc_ip_h.
drop table dw_user_dstc_ip_h;
create table dw_user_dstc_ip_h (
    remote_addr string,
    pvs         bigint,
    hour        string
);
insert into table dw_user_dstc_ip_h
select
    remote_addr,
    count(1) as pvs,
    concat(month, day, hour) as hour
from ods_weblog_detail
where datestr = '20130918'
group by
    concat(month, day, hour),
    remote_addr;
-- Number of rows (one per visitor IP) in every hour.
select
    count(1) as dstc_ip_cnts,
    hour
from dw_user_dstc_ip_h
group by hour;
-- Time dimension: day. Row count per visitor IP and day.
select
    remote_addr,
    count(1) as counts,
    concat(month, day) as day
from ods_weblog_detail
where datestr = '20130918'
group by
    concat(month, day),
    remote_addr;
-- Time dimension: month. Row count per visitor IP and month.
select
    remote_addr,
    count(1) as counts,
    month
from ods_weblog_detail
group by
    month,
    remote_addr;
tips : concat可以将三个字段作为一个来划分
from 今天 left join 历史 on 今天.ip=历史.ip
where 历史.ip is null;
今天 : (如何获取今天所有的访客) 也就是今天的独立访客(UV)
-- All distinct visitor IPs of the selected day.
select distinct
    t.remote_addr
from ods_weblog_detail t
where t.datestr = '20130918';
-- New visitors: IPs seen today that have no match in the history table.
-- remote_addr is aliased to ip because the join condition and filter refer to today.ip.
select
    today.ip
from (
    select distinct
        t.remote_addr as ip
    from ods_weblog_detail t
    where t.datestr = '20130918'
) today
left join dw_user_dsct_history history
    on today.ip = history.ip
where history.ip is null;
-- Returning visitors: IPs seen today that also appear in the history table.
select
    today.ip
from (
    select distinct
        t.remote_addr as ip
    from ods_weblog_detail t
    where t.datestr = '20130918'
) today
left join dw_user_dsct_history history
    on today.ip = history.ip
where history.ip is not null;
-- History of distinct visitors; each day's new visitors are appended as that day's partition.
drop table dw_user_dsct_history;
create table dw_user_dsct_history (
    day string,
    ip  string
)
partitioned by (datestr string);
-- Append the new visitors of the day (taken from dw_user_new_d) to the history.
insert into table dw_user_dsct_history partition (datestr = '20130918')
select
    day,
    ip
from dw_user_new_d
where datestr = '20130918';
-- New visitors of each day, stored in dw_user_new_d.
drop table dw_user_new_d;
create table dw_user_new_d (
    day string,
    ip  string
)
partitioned by (datestr string);
-- Left-join today's distinct IPs to the history; rows without a history match are new visitors.
insert into table dw_user_new_d partition (datestr = '20130918')
select
    tmp.day as day,
    tmp.today_addr as new_ip
from (
    select
        today.day as day,
        today.remote_addr as today_addr,
        old.ip as old_addr
    from (
        select distinct
            remote_addr as remote_addr,
            "20130918" as day
        from ods_weblog_detail
        where datestr = "20130918"
    ) today
    left outer join dw_user_dsct_history old
        on today.remote_addr = old.ip
) tmp
where tmp.old_addr is null;
-- Verification: distinct visitor IPs in the whole log detail table.
select
    count(distinct remote_addr)
from ods_weblog_detail;
-- Verification: rows in the 20130918 partition of the history table.
select
    count(1)
from dw_user_dsct_history
where datestr = '20130918';
-- Verification: rows in the 20130918 partition of the new-visitor table.
select
    count(1)
from dw_user_new_d
where datestr = '20130918';
表 : ods_click_stream_visit
-- Number of visits (sessions) per visitor IP for the selected day.
select
    t.remote_addr,
    count(t.session) as visits
from ods_click_stream_visit t
where t.datestr = '20130918'
group by t.remote_addr;
-- Visitors with exactly one visit on the selected day.
-- NOTE(review): the outer select list was missing; both columns of the subquery are returned
-- here - confirm this matches the intended output.
select
    a.remote_addr,
    a.visits
from (
    select
        t.remote_addr,
        count(t.session) as visits
    from ods_click_stream_visit t
    where t.datestr = '20130918'
    group by t.remote_addr
) a
where a.visits = 1;
-- Visitors with more than one visit on the selected day.
-- NOTE(review): the outer select list was missing; both columns of the subquery are returned
-- here - confirm this matches the intended output.
select
    a.remote_addr,
    a.visits
from (
    select
        t.remote_addr,
        count(t.session) as visits
    from ods_click_stream_visit t
    where t.datestr = '20130918'
    group by t.remote_addr
) a
where a.visits > 1;
或者 :
-- Visitors with more than one visit, filtering on the aggregate directly with having.
select
    t.remote_addr,
    count(t.session) as visits
from ods_click_stream_visit t
where t.datestr = '20130918'
group by t.remote_addr
having visits > 1;
-- Visitors with exactly one visit, filtering on the aggregate directly with having.
select
    t.remote_addr,
    count(t.session) as visits
from ods_click_stream_visit t
where t.datestr = '20130918'
group by t.remote_addr
having visits = 1;
-- Returning / single-visit visitor statistics.
-- The result is persisted (stored on HDFS as a Hive table).
drop table dw_user_returning;
create table dw_user_returning (
    day         string,
    remote_addr string,
    acc_cnt     string
)
partitioned by (datestr string);
-- Visitors with more than one visit on the day. Only the matching source partition is read;
-- without the datestr filter visits of other days would be counted into the 20130918 partition.
insert overwrite table dw_user_returning partition (datestr = '20130918')
select
    tmp.day,
    tmp.remote_addr,
    tmp.acc_cnt
from (
    select
        '20130918' as day,
        remote_addr,
        count(session) as acc_cnt
    from ods_click_stream_visit
    where datestr = '20130918'
    group by remote_addr
) tmp
where tmp.acc_cnt > 1;
如果存在需求上的模糊点 需要进行沟通。
-- Average number of visits per visitor: sessions divided by distinct visitor IPs.
select
    count(t.session)/count(distinct t.remote_addr)
from ods_click_stream_visit t
where t.datestr = '20130918';
-- Average number of visited pages per visitor: sum of pagevisits divided by distinct visitor IPs.
select
    sum(t.pagevisits)/count(distinct t.remote_addr)
from ods_click_stream_visit t
where t.datestr = '20130918';
只有当自己和自己join的时候 当前的状态就会跟之前的状态出现在一行中。
测试 :
-- Test table for the running-total example; the data file is comma separated.
create table t_salary_detail (
    username string,
    month    string,
    salary   int
)
row format delimited
fields terminated by ',';
-- Load the sample rows from the local file system.
load data local inpath '/root/hivedata/t_salary_detail.dat'
into table t_salary_detail;
-- Show the loaded rows.
select *
from t_salary_detail;
| t_salary_detail.username | t_salary_detail.month | t_salary_detail.salary |
| A | 2015-01 | 5 |
| A | 2015-01 | 15 |
| B | 2015-01 | 5 |
| A | 2015-01 | 8 |
| B | 2015-01 | 25 |
| A | 2015-01 | 5 |
| A | 2015-02 | 4 |
| A | 2015-02 | 6 |
| B | 2015-02 | 10 |
| B | 2015-02 | 5 |
| A | 2015-03 | 7 |
| A | 2015-03 | 9 |
| B | 2015-03 | 11 |
| B | 2015-03 | 6 |
1、第一步,先求每个用户的月总金额(基于上述明细数据)
-- Total salary per user and month.
select
    username,
    month,
    sum(salary) as salary
from t_salary_detail
group by
    username,
    month;
| username | month | salary | total(累加)
| A | 2015-01 | 33 | 33
| A | 2015-02 | 10 | 43
| A | 2015-03 | 16 | 59
| B | 2015-01 | 30 | 30
| B | 2015-02 | 15 | 45
| B | 2015-03 | 17 | 62
2、第二步,将月总金额表与自己连接(自连接)
由于我们需要将需求中的数据(当前月与之前各月)整理到一行 , 因此我们使用自己join自己
-- Self-join of the monthly totals: every month of a user is paired with that user's
-- current and earlier months.
-- The username condition is required; without it months of different users are paired too,
-- which contradicts the sample output (only A-A and B-B rows).
select
    A.*,
    B.*
from (
    select username, month, sum(salary) as salary
    from t_salary_detail
    group by username, month
) A
inner join (
    select username, month, sum(salary) as salary
    from t_salary_detail
    group by username, month
) B
    on A.username = B.username
where B.month <= A.month;
| a.username | a.month | a.salary | b.username | b.month | b.salary |
| A | 2015-01 | 33 | A | 2015-01 | 33 |
| A | 2015-02 | 10 | A | 2015-01 | 33 |
| A | 2015-02 | 10 | A | 2015-02 | 10 |
| A | 2015-03 | 16 | A | 2015-01 | 33 |
| A | 2015-03 | 16 | A | 2015-02 | 10 |
| A | 2015-03 | 16 | A | 2015-03 | 16 |
| B | 2015-01 | 30 | B | 2015-01 | 30 |
| B | 2015-02 | 15 | B | 2015-01 | 30 |
| B | 2015-02 | 15 | B | 2015-02 | 15 |
| B | 2015-03 | 17 | B | 2015-01 | 30 |
| B | 2015-03 | 17 | B | 2015-02 | 15 |
| B | 2015-03 | 17 | B | 2015-03 | 17 |
进行分组查询,分组的字段是a.username a.month
求月累计值: 将b.month <= a.month的所有b.salary求和即可
-- Running total per user: for every (user, month) sum the monthly totals of that user's
-- current and earlier months.
-- max(A.salary) just picks the month's own total, which is repeated on every joined row.
select
    A.username,
    A.month,
    max(A.salary) as salary,
    sum(B.salary) as accumulate
from (
    select username, month, sum(salary) as salary
    from t_salary_detail
    group by username, month
) A
inner join (
    select username, month, sum(salary) as salary
    from t_salary_detail
    group by username, month
) B
    on A.username = B.username  -- keep each user's months separate
where B.month <= A.month
group by A.username, A.month
order by A.username, A.month;
| a.username | a.month | salary | accumulate |
| A | 2015-01 | 33 | 33 |
| A | 2015-02 | 10 | 43 |
| A | 2015-03 | 16 | 59 |
| B | 2015-01 | 30 | 30 |
| B | 2015-02 | 15 | 45 |
| B | 2015-03 | 17 | 62 |
需求 : 在一条指定的业务流程中,各个步骤的完成人数及相对上一个步骤的百分比。
tips: union操作符合并两个或多个 SELECT 语句的结果。
-- Load the click-stream page views of 20130920 (replaces the existing partition content).
load data local inpath '/root/hivedata/click-part-r-00000' overwrite into table ods_click_pageviews partition(datestr='20130920');
-- Number of distinct visitors that reached each step of the funnel, one row per step.
-- The four selects are combined with union all; the step labels make the rows disjoint,
-- so no de-duplication is needed.
create table dw_oute_numbs as
select 'step1' as step, count(distinct remote_addr) as numbs
from ods_click_pageviews
where datestr = '20130920' and request like '/item%'
union all
select 'step2' as step, count(distinct remote_addr) as numbs
from ods_click_pageviews
where datestr = '20130920' and request like '/category%'
union all
select 'step3' as step, count(distinct remote_addr) as numbs
from ods_click_pageviews
where datestr = '20130920' and request like '/order%'
union all
select 'step4' as step, count(distinct remote_addr) as numbs
from ods_click_pageviews
where datestr = '20130920' and request like '/index%';
-- Show the funnel table.
select * from dw_oute_numbs;
| dw_oute_numbs.step | dw_oute_numbs.numbs |
| step1 | 1029 |
| step2 | 1029 |
| step3 | 1028 |
| step4 | 1018 |
-- Pair every funnel step with every funnel step (cartesian product of the table with itself).
select
    rn.step  as rnstep,
    rn.numbs as rnnumbs,
    rr.step  as rrstep,
    rr.numbs as rrnumbs
from dw_oute_numbs rn
cross join dw_oute_numbs rr;
| rnstep | rnnumbs | rrstep | rrnumbs |
| step1 | 1029 | step1 | 1029 |
| step2 | 1029 | step1 | 1029 |
| step3 | 1028 | step1 | 1029 |
| step4 | 1018 | step1 | 1029 |
| step1 | 1029 | step2 | 1029 |
| step2 | 1029 | step2 | 1029 |
| step3 | 1028 | step2 | 1029 |
| step4 | 1018 | step2 | 1029 |
| step1 | 1029 | step3 | 1028 |
| step2 | 1029 | step3 | 1028 |
| step3 | 1028 | step3 | 1028 |
| step4 | 1018 | step3 | 1028 |
| step1 | 1029 | step4 | 1018 |
| step2 | 1029 | step4 | 1018 |
| step3 | 1028 | step4 | 1018 |
| step4 | 1018 | step4 | 1018 |
-- Ratio of every step relative to the first step (step1).
select
    tmp.rnstep,
    tmp.rnnumbs/tmp.rrnumbs as ratio
from (
    select
        rn.step as rnstep,
        rn.numbs as rnnumbs,
        rr.step as rrstep,
        rr.numbs as rrnumbs
    from dw_oute_numbs rn
    inner join dw_oute_numbs rr
) tmp
where tmp.rrstep = 'step1';
| rnstep | rnnumbs | rrstep | rrnumbs |
| step1 | 1029 | step1 | 1029 |
| step2 | 1029 | step1 | 1029 |
| step3 | 1028 | step1 | 1029 |
| step4 | 1018 | step1 | 1029 |
-- Pair every step (rn) with the step that directly follows it (rr), using the digit at
-- position 5 of the step label as the step number.
select
    rn.step  as rnstep,
    rn.numbs as rnnumbs,
    rr.step  as rrstep,
    rr.numbs as rrnumbs
from dw_oute_numbs rn
inner join dw_oute_numbs rr
where cast(substr(rn.step, 5, 1) as int) = cast(substr(rr.step, 5, 1) as int) - 1;
注意:cast为Hive内置函数 类型转换
-- Examples of cast() converting a value to another type; the trailing comment on each line shows the result.
select cast(1 as float); --1.0
select cast('2016-05-22' as date); --2016-05-22
| step1 | 1029 | step2 | 1029 |
| step2 | 1029 | step3 | 1028 |
| step3 | 1028 | step4 | 1018 |
| rnstep | rnnumbs | rrstep | rrnumbs |
| step1 | 1029 | step2 | 1029 |
| step2 | 1029 | step3 | 1028 |
| step3 | 1028 | step4 | 1018 |
-- Ratio of every step relative to the step directly before it.
select
    tmp.rrstep as step,
    tmp.rrnumbs/tmp.rnnumbs as leakage_rate
from (
    select
        rn.step as rnstep,
        rn.numbs as rnnumbs,
        rr.step as rrstep,
        rr.numbs as rrnumbs
    from dw_oute_numbs rn
    inner join dw_oute_numbs rr
) tmp
where cast(substr(tmp.rnstep,5,1) as int) = cast(substr(tmp.rrstep,5,1) as int) - 1;
-- Per step: visitor count, ratio relative to step1 (abs_ratio) and ratio relative to the
-- previous step (leakage_rate).
-- step1 has no previous step, so the left outer join leaves its leakage_rate null.
select
    abs.step,
    abs.numbs,
    abs.rate as abs_ratio,
    rel.rate as leakage_rate
from (
    -- ratio of every step relative to step1
    select
        tmp.rnstep as step,
        tmp.rnnumbs as numbs,
        tmp.rnnumbs/tmp.rrnumbs as rate
    from (
        select
            rn.step as rnstep,
            rn.numbs as rnnumbs,
            rr.step as rrstep,
            rr.numbs as rrnumbs
        from dw_oute_numbs rn
        inner join dw_oute_numbs rr
    ) tmp
    where tmp.rrstep = 'step1'
) abs
left outer join (
    -- ratio of every step relative to the step directly before it
    select
        tmp.rrstep as step,
        tmp.rrnumbs/tmp.rnnumbs as rate
    from (
        select
            rn.step as rnstep,
            rn.numbs as rnnumbs,
            rr.step as rrstep,
            rr.numbs as rrnumbs
        from dw_oute_numbs rn
        inner join dw_oute_numbs rr
    ) tmp
    where cast(substr(tmp.rnstep,5,1) as int) = cast(substr(tmp.rrstep,5,1) as int) - 1
) rel
    on abs.step = rel.step;