需求描述
统计视频网站的常规指标,各种TopN指标:
创建原始数据表(textfile):chbvideo_ori,chbvideo_user_ori
创建ORC表:chbvideo_orc,chbvideo_user_orc
-- Text-format table that receives the raw video file (see the load statement
-- in this file) and is the source for the insert into chbvideo_orc.
-- Fields are tab-separated, elements of the two array columns are separated by "&".
create table chbvideo_ori (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
row format delimited
    fields terminated by "\t"
    collection items terminated by "&"
stored as textfile;
-- Text-format table that receives the raw user file and is the source for
-- the insert into chbvideo_user_orc. Fields are tab-separated.
create table chbvideo_user_ori (
    uploader string,
    videos   int,
    friends  int
)
row format delimited
    fields terminated by "\t"
stored as textfile;
-- Load the video file from its HDFS path into chbvideo_ori, replacing whatever
-- the table currently holds (overwrite).
load data inpath '/tmp/hivetest/chbVideoOut/video/2008/0222'
    overwrite into table chbvideo_ori;

-- Load the user file from the local filesystem into chbvideo_user_ori.
-- NOTE(review): there is no overwrite here, so running this statement again
-- appends the same rows a second time - confirm that is intended.
load data local inpath '/uardata1/hivetest/chbVideo/user/2008/0903'
    into table chbvideo_user_ori;
-- ORC copy of chbvideo_ori that the analysis queries in this file read from.
-- Rows are bucketed by uploader into 8 buckets.
-- The former row format / delimiter clauses were removed on purpose: an ORC
-- table is read and written through the ORC SerDe, so text delimiters have no
-- effect on it and only suggested a text layout that does not exist.
create table chbvideo_orc (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
clustered by (uploader) into 8 buckets
stored as orc;
-- ORC copy of chbvideo_user_ori that the uploader queries in this file read from.
-- No row format clause is given: an ORC table is read and written through the
-- ORC SerDe, so a text field delimiter has no effect on it.
create table chbvideo_user_orc (
    uploader string,
    videos   int,
    friends  int
)
stored as orc;
# 导入到表中
chbvideo_orc:
-- Copy every video row from the text table into the bucketed ORC table.
-- Columns are listed explicitly (instead of select *) so the copy keeps
-- working correctly if either table later gains or reorders columns.
-- NOTE(review): on Hive releases before 2.0 the session needs
-- hive.enforce.bucketing=true for the 8 buckets to be honored - confirm for
-- the target version.
insert into table chbvideo_orc
select
    videoId,
    uploader,
    age,
    category,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from chbvideo_ori;
chbvideo_user_orc:
-- Copy every user row from the text table into the ORC table.
-- Columns are listed explicitly (instead of select *) so the copy keeps
-- working correctly if either table later gains or reorders columns.
insert into table chbvideo_user_orc
select
    uploader,
    videos,
    friends
from chbvideo_user_ori;
思路:使用order by按照views字段做一个全局排序即可,同时我们设置只显示前10条。
-- Top 10 videos by view count: sort all rows on views, highest first,
-- and keep the first 10. relatedId is not part of the output.
select
    videoId,
    uploader,
    age,
    category,
    length,
    views,
    rate,
    ratings,
    comments
from chbvideo_orc
order by views desc
limit 10;
思路:先用lateral view explode把category数组拆成多行(一个视频一个类别一行),再按类别分组统计视频个数,按个数倒序取前10。
-- Top 10 categories by number of videos.
-- A video that lists several categories is counted once in each of them.
select
    per_category.category_name as category,
    count(per_category.videoId) as hot
from (
    -- one row per (video, category) pair
    select
        videoId,
        category_name
    from chbvideo_orc
    lateral view explode(category) t_category as category_name
) per_category
group by per_category.category_name
order by hot desc
limit 10;
思路:先按views倒序取出观看数最高的20个视频,再把它们的category数组拆成多行,最后按类别分组统计每个类别包含的视频个数。
-- For the 20 most viewed videos: how many of them fall into each category,
-- most frequent category first.
select
    category_name as category,
    count(per_category.videoId) as hot_with_views
from (
    -- one row per (video, category) pair, restricted to the top 20 videos
    select
        videoId,
        category_name
    from (
        -- the 20 most viewed videos
        select
            *
        from chbvideo_orc
        order by views desc
        limit 20
    ) top_viewed lateral view explode(category) t_category as category_name
) per_category
group by category_name
order by hot_with_views desc;
思路:先求出观看数前50的视频(t1),把它们的相关视频id拆成多行(t2),去重后关联回视频表取得类别并拆开(t5),最后按类别统计个数并排序。
t1:观看数前50的视频
-- Step t1: the 50 most viewed videos, all columns.
select
    *
from chbvideo_orc
order by views desc
limit 50;
t2:将相关视频的id进行列转行操作
-- Step t2: turn each relatedId array into one row per related video id.
-- t1 is not a real table: it stands for the result of the previous step and
-- appears as a subquery alias in the complete query further down.
select
    explode(relatedId) as videoId
from t1;
t5:得到两列数据,一列是category,一列是之前查询出来的相关视频id
-- Step t5 (fragment of the complete query below, not runnable on its own).
-- t2 stands for the exploded related-video ids from the previous step.
-- The inner select keeps the distinct (videoId, category array) pairs of the
-- related videos that exist in chbvideo_orc, then the lateral view yields one
-- row per category of each such video.
(select
distinct(t2.videoId),
t3.category
from
t2
inner join
chbvideo_orc t3 on t2.videoId = t3.videoId) t4 lateral view explode(category) t_catetory as category_name;
-- Category ranking of the videos related to the 50 most viewed videos:
-- number of distinct related videos per category, largest first.
select
    category_name as category,
    count(related_by_category.videoId) as hot
from (
    -- one row per (related video, category) pair
    select
        videoId,
        category_name
    from (
        -- distinct related videos found in chbvideo_orc, with their category array
        select distinct
            related.videoId,
            video.category
        from (
            -- one row per related video id of the top 50 videos
            select
                explode(relatedId) as videoId
            from (
                -- the 50 most viewed videos
                select
                    *
                from chbvideo_orc
                order by views desc
                limit 50
            ) top_viewed
        ) related
        inner join chbvideo_orc video
            on related.videoId = video.videoId
    ) related_video lateral view explode(category) t_category as category_name
) related_by_category
group by category_name
order by hot desc;
-- ORC table holding one row per (video, category) pair: same columns as
-- chbvideo_orc, except that the category array is replaced by a single
-- categoryId string. It is filled by the insert that explodes chbvideo_orc.
-- No row format clause is given: an ORC table is read and written through the
-- ORC SerDe, so text delimiters have no effect on it.
create table chbvideo_category (
    videoId    string,
    uploader   string,
    age        int,
    categoryId string,
    length     int,
    views      int,
    rate       float,
    ratings    int,
    comments   int,
    relatedId  array<string>
)
stored as orc;
-- Fill chbvideo_category with one row per (video, category) pair by exploding
-- the category array of chbvideo_orc.
-- The lateral view is not an outer one, so a video whose category array is
-- empty or null produces no row here.
insert into table chbvideo_category
select
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from chbvideo_orc
lateral view explode(category) exploded_category as categoryId;
-- Top 10 videos of the "Music" category by view count.
select
    videoId,
    views
from chbvideo_category
where categoryId = "Music"
order by views desc
limit 10;
思路:与上一个查询相同,在chbvideo_category中筛选categoryId为"Music"的视频,只是改为按ratings倒序排序后取前10。
-- Top 10 videos of the "Music" category by the ratings column.
select
    videoId,
    views,
    ratings
from chbvideo_category
where categoryId = "Music"
order by ratings desc
limit 10;
思路:先在chbvideo_user_orc中按videos倒序取出上传视频最多的10个用户,再用uploader关联chbvideo_orc得到这些用户上传的视频,按views倒序取前20。
-- The 10 users with the highest videos value, all columns.
select
    *
from chbvideo_user_orc
order by videos desc
limit 10;
-- Videos uploaded by the 10 users with the highest videos value, sorted by
-- view count, keeping the 20 most viewed rows over all of those users together.
-- The uploader itself is not part of the output.
select
    t2.videoId,
    t2.views,
    t2.ratings,
    t1.videos,
    t1.friends
from (
    -- the 10 users with the highest videos value
    select
        *
    from chbvideo_user_orc
    order by videos desc
    limit 10
) t1
inner join chbvideo_orc t2
    on t1.uploader = t2.uploader
order by views desc
limit 20;
思路:在chbvideo_category上用row_number()按categoryId分区、按views倒序给每个类别内的视频编号,再在外层筛选编号不超过10的行,得到每个类别观看数前10的视频。
-- Top 10 videos by view count inside every category.
-- row_number() numbers the rows of each category from 1 in descending views
-- order, so tied videos still receive different numbers.
select
    t1.*
from (
    select
        videoId,
        categoryId,
        views,
        row_number() over (partition by categoryId order by views desc) as rank
    from chbvideo_category
) t1
where rank <= 10;