创建临时视频表
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
创建临时用户表
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
加载原数据到临时表
load data local inpath "/opt/module/data/video" into table gulivideo_ori;
load data local inpath "/opt/module/user" into table gulivideo_user_ori;
创建视频表
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
创建用户表
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
由于初始表和最终表他们两者中的结构都是一样的,所以对其中一种进行解读
对于用户表
视频表
需求Ⅰ:统计视频观看数 Top10
代码:
select videoId
from gulivideo_orc
order by views
limit 10;
需求Ⅱ:统计视频类别热度 Top10
步骤代码:
--步骤一使用下列任意一种都可以,第一种使用侧偏移,第二种直接炸裂
select
videoId,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name;t1
select
explode(category) categoru_name
from gulivideo_orc;t1
--第二步代码:
select
category_name,
count(*) ct
from t1
group by category_name
order by ct desc
limit 10;
最终代码:
select
category_name,
count(*) ct
from
(select
explode(category) category_name
from gulivideo_orc)t1
group by category_name
order by ct desc
limit 10;
需求Ⅲ:统计出视频观看数最高的 20 个视频的所属类别以及每一个类别包含 Top20 视频的个数
select
category
from gulivideo_orc
order by views desc
limit 20;t1
select
explode(category) category_name
from t1;t2
select
category_name,
count(*) video_sum
from t2
group by category_name;
最终代码:
select
category_name,
count(*) video_sum
from
(select
explode(category) category_name
from
(select
category
from gulivideo_orc
order by views desc
limit 20)t1)t2
group by category_name;
需求Ⅳ: 统计视频观看数 Top50 所关联视频的所属类别排序
分步代码:
select
relatedId
from gulivideo_orc
order by views desc
limit 50;t1
select
explode(relatedId) relatedId_id
from t1;t2
select
category
from t2
join gulivideo_orc t3
on t2.relatedId_id = t3.videoId;t4
select
explode(category) category_name
from t4;t5
select
category_name,
count(*) ct
from t5
group by category_name
order by ct desc;
代码:
select
category_name,
count(*) ct
from
(select
explode(category) category_name
from
(select
category
from
(select
explode(relatedId) relatedId_id
from
(select
relatedId
from gulivideo_orc
order by views desc
limit 50)t1)t2
join gulivideo_orc t3
on t2.relatedId_id = t3.videoId)t4)t5
group by category_name
order by ct desc;
需求Ⅴ:统计每个类别中的视频热度 Top10,以 Music 为例
分布代码:
select
videoId,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name;t1
select
videoId,
views,
category_name
from t1
where category_name = "Music"
order by views desc
limit 10;
代码:
select
videoId,
views,
category_name
from
(select
videoId,
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name)t1
where category_name = "Music"
order by views desc
limit 10;
需求Ⅵ:统计每个类别视频观看数 Top10
分步代码:
select
videoId,
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name;t1
select
videoId,
views,
category_name
rank() over(partition by t1.category_name order by t1.views desc) rk
from t1;t2
select
videoId,
views,
category_name,
rk
from t2
where t2.rk <= 10;
代码:
select
videoId,
views,
category_name
rk
from (
select
videoId,
views,
category_name,
rank() over(partition by t1.category_name order by t1.views desc) rk
from
(select
videoId,
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp_category as category_name)t1)t2
where t2.rk <= 10;
需求Ⅶ:统计上传视频最多的用户 Top10 以及他们上传的视频观看次数在前 20 的视频
分布代码:
select
uploader
from
gulivideo_user_orc
order by videos desc
limit 10;t1
select
videoId,
views,
t2.uploader
from t1
join gulivideo_orc g
on t1.uploader = g.uploader;t2
select
videoId,
views,
uploader,
rank()over(partition by uploader order by views desc) rk
from t2;t3
select
videoId,
views,
uploader
from t3
where rk<= 20;
代码:
select
videoId,
views,
uploader,
rank()over(partition by uploader order by views desc) rk
from
(select
videoId,
views,
t1.uploader
from
(select
uploader
from
gulivideo_user_orc
order by videos desc
limit 10)t1
join gulivideo_orc g
on t1.uploader = g.uploader) t2;