字段 备注 详细描述
videoId 视频唯一 id(String) 11 位字符串
uploader 视频上传者(String) 上传视频的用户名 String
age 视频年龄(int) 视频在平台上的整数天
category 视频类别(Array<String>) 上传视频指定的视频分类
length 视频长度(Int) 整形数字标识的视频长度
views 观看次数(Int) 视频被浏览的次数
rate 视频评分(Double) 满分 5 分
Ratings 流量(Int) 视频的流量,整型数字
conments 评论数(Int) 一个视频的整数评论数
relatedId 相关视频 id(Array<String>) 相关视频的 id,最多 20 个
字段 备注 字段类型
uploader 上传者用户名 string
videos 上传视频数 int
friends 朋友数量 int
创建最终表:video_ori,video_user_ori
video_ori
create table video_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
create table video_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
load data local inpath "/opt/module/data/video" into table video_ori;
load data local inpath "/opt/module/user" into table video_user_ori;
测试数据
思路:使用 order by 按照 views 字段做一个全局排序即可,同时我们设置只显示前 10条。
最终代码:
SELECT
videoId,
views
FROM
video_ori
ORDER BY
views DESC
LIMIT 10;
思路:
SELECT
t1.category_name ,
COUNT(t1.videoId) hot
FROM
(
SELECT
videoId,
category_name
FROM
video_ori
lateral VIEW explode(category) video_ori_tmp AS category_name
) t1
GROUP BY
t1.category_name
ORDER BY
hot
DESC
LIMIT 10
思路:
SELECT
t2.category_name,
COUNT(t2.videoId) video_sum
FROM
(
SELECT
t1.videoId,
category_name
FROM
(
SELECT
videoId,
views ,
category
FROM
video_ori
ORDER BY
views
DESC
LIMIT 20
) t1
lateral VIEW explode(t1.category) t1_tmp AS category_name
) t2
GROUP BY t2.category_name
SELECT
t6.category_name,
t6.video_sum,
rank() over(ORDER BY t6.video_sum DESC ) rk
FROM
(
SELECT
t5.category_name,
COUNT(t5.relatedid_id) video_sum
FROM
(
SELECT
t4.relatedid_id,
category_name
FROM
(
SELECT
t2.relatedid_id ,
t3.category
FROM
(
SELECT
relatedid_id
FROM
(
SELECT
videoId,
views,
relatedid
FROM
video_ori
ORDER BY
views
DESC
LIMIT 50
)t1
lateral VIEW explode(t1.relatedid) t1_tmp AS relatedid_id
)t2
JOIN
gulivideo_orc t3
ON
t2.relatedid_id = t3.videoId
) t4
lateral VIEW explode(t4.category) t4_tmp AS category_name
) t5
GROUP BY
t5.category_name
ORDER BY
video_sum
DESC
) t6
思路:
SELECT
t1.videoId,
t1.views,
t1.category_name
FROM
(
SELECT
videoId,
views,
category_name
FROM video_ori
lateral VIEW explode(category) video_ori_tmp AS category_name
)t1
WHERE
t1.category_name = "Music"
ORDER BY
t1.views
DESC
LIMIT 10
t2.videoId,
t2.views,
t2.category_name,
t2.rk
FROM
(
SELECT
t1.videoId,
t1.views,
t1.category_name,
rank() over(PARTITION BY t1.category_name ORDER BY t1.views DESC ) rk
FROM
(
SELECT
videoId,
views,
category_name
FROM video_orc
lateral VIEW explode(category) video_orc_tmp AS category_name
)t1
)t2
WHERE t2.rk <=10
思路:
t2.videoId,
t2.views,
t2.uploader
FROM
(
SELECT
uploader,
videos
FROM video_user_orc
ORDER BY
videos
DESC
LIMIT 10
) t1
JOIN video_orc t2
ON t1.uploader = t2.uploader
ORDER BY
t2.views
DESC
LIMIT 20
如下数据为蚂蚁森林中用户领取的减少碳排放量
id dt lowcarbon
1001 2021-12-12 123
1002 2021-12-12 45
1001 2021-12-13 43
1001 2021-12-13 45
1001 2021-12-13 23
1002 2021-12-14 45
1001 2021-12-14 230
1002 2021-12-15 45
1001 2021-12-15 23
找出连续 3 天及以上减少碳排放量在 100 以上的用户
select
id,
dt,
sum(lowcarbon) lowcarbon
from test1
group by id,dt
having lowcarbon>100;t1
1001 2021-12-12 123
1001 2021-12-13 111
1001 2021-12-14 230
等差数列法:两个等差数列如果等差相同,则相同位置的数据相减等到的结果相同
select
id,
dt,
lowcarbon,
rank() over(partition by id order by dt) rk
from t1;t2
select
id,
dt,
lowcarbon,
date_sub(dt,rk) flag
from t2;t3
select
id,
flag,
count(*) ct
from t3
group by id,flag
having ct>=3;
select
id,
flag,
count(*) ct
from
(select
id,
dt,
lowcarbon,
date_sub(dt,rk) flag
from
(select
id,
dt,
lowcarbon,
rank() over(partition by id order by dt) rk
from
(select
id,
dt,
sum(lowcarbon) lowcarbon
from test1
group by id,dt
having lowcarbon>100)t1)t2)t3
group by id,flag
having ct>=3;
如下为电商公司用户访问时间数据
id ts(秒)
1001 17523641234
1001 17523641256
1002 17523641278
1001 17523641334
1002 17523641434
1001 17523641534
1001 17523641544
1002 17523641634
1001 17523641638
1001 17523641654
某个用户连续的访问记录如果时间间隔小于 60 秒,则分为同一个组,结果为:
id ts(秒) group
1001 17523641234 1
1001 17523641256 1
1001 17523641334 2
1001 17523641534 3
1001 17523641544 3
1001 17523641638 4
1001 17523641654 4
1002 17523641278 1
1002 17523641434 2
1002 17523641634 3
select
id,
ts,
lag(ts,1,0) over(partition by id order by ts) lagts
from
test2;t1
1001 17523641234 0
1001 17523641256 17523641234
1001 17523641334 17523641256
1001 17523641534 17523641334
1001 17523641544 17523641534
1001 17523641638 17523641544
1001 17523641654 17523641638
1002 17523641278 0
1002 17523641434 17523641278
1002 17523641634 17523641434
select
id,
ts,
ts-lagts tsdiff
from
t1;t2
select
id,
ts,
ts-lagts tsdiff
from
(select
id,
ts,
lag(ts,1,0) over(partition by id order by ts) lagts
from
test2)t1;t2
1001 17523641234 17523641234
1001 17523641256 22
1001 17523641334 78
1001 17523641534 200
1001 17523641544 10
1001 17523641638 94
1001 17523641654 16
1002 17523641278 17523641278
1002 17523641434 156
1002 17523641634 200
select
id,
ts,
sum(if(tsdiff>=60,1,0)) over(partition by id order by ts) groupid
from
t2;
select
id,
ts,
sum(if(tsdiff>=60,1,0)) over(partition by id order by ts) groupid
from
(select
id,
ts,
ts-lagts tsdiff
from
(select
id,
ts,
lag(ts,1,0) over(partition by id order by ts) lagts
from
test2)t1)t2;
某游戏公司记录的用户每日登录数据
id dt
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20
计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
思路:分组
1001 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1001 2021-12-19
1001 2021-12-20
1001 2021-12-12 1970-01-01
1001 2021-12-13 2021-12-12
1001 2021-12-14 2021-12-13
1001 2021-12-16 2021-12-14
1001 2021-12-19 2021-12-16
1001 2021-12-20 2021-12-19
select
id,
dt,
lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from
test3;t1
1001 2021-12-12 564564
1001 2021-12-13 1
1001 2021-12-14 1
1001 2021-12-16 2
1001 2021-12-19 3
1001 2021-12-20 1
select
id,
dt,
datediff(dt,lagdt) flag
from
t1;t2
1001 2021-12-12 1
1001 2021-12-13 1
1001 2021-12-14 1
1001 2021-12-16 1
1001 2021-12-19 2
1001 2021-12-20 2
select
id,
dt,
sum(if(flag>2,1,0)) over(partition by id order by dt) flag
from
t2;t3
select
id,
flag,
datediff(max(dt),min(dt)) days
from
t3
group by id,flag;t4
select
id,
max(days)+1
from
t4
group by id;
select
id,
max(days)+1
from
(select
id,
flag,
datediff(max(dt),min(dt)) days
from
(select
id,
dt,
sum(if(flag>2,1,0)) over(partition by id order by dt) flag
from
(select
id,
dt,
datediff(dt,lagdt) flag
from
(select
id,
dt,
lag(dt,1,'1970-01-01') over(partition by id order by dt) lagdt
from
test3)t1)t2)t3
group by id,flag)t4
group by id;
如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
brand stt edt
oppo 2021-06-05 2021-06-09
oppo 2021-06-11 2021-06-21
vivo 2021-06-05 2021-06-15
vivo 2021-06-09 2021-06-21
redmi 2021-06-05 2021-06-21
redmi 2021-06-09 2021-06-15
redmi 2021-06-17 2021-06-26
huawei 2021-06-05 2021-06-26
huawei 2021-06-09 2021-06-15
huawei 2021-06-17 2021-06-21
计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-05 到 2021-06-15,第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。
select
id,
stt,
edt,
max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
from test4;t1
redmi 2021-06-05 2021-06-21 null
redmi 2021-06-09 2021-06-15 2021-06-21
redmi 2021-06-17 2021-06-26 2021-06-21
select
id,
if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
edt
from t1;t2
redmi 2021-06-05 2021-06-21
redmi 2021-06-22 2021-06-15
redmi 2021-06-22 2021-06-26
select
id,
datediff(edt,stt) days
from
t2;t3
redmi 16
redmi -4
redmi 4
select
id,
sum(if(days>=0,days+1,0)) days
from
t3
group by id;
redmi 22
select
id,
sum(if(days>=0,days+1,0)) days
from
(select
id,
datediff(edt,stt) days
from
(select
id,
if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
edt
from
(select
id,
stt,
edt,
max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
from test4)t1)t2)t3
group by id;
如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数。
id stt edt
1001 2021-06-14 12:12:12 2021-06-14 18:12:12
1003 2021-06-14 13:12:12 2021-06-14 16:12:12
1004 2021-06-14 13:15:12 2021-06-14 20:12:12
1002 2021-06-14 15:12:12 2021-06-14 16:12:12
1005 2021-06-14 15:18:12 2021-06-14 20:12:12
1001 2021-06-14 20:12:12 2021-06-14 23:12:12
1006 2021-06-14 21:12:12 2021-06-14 23:15:12
1007 2021-06-14 22:12:12 2021-06-14 23:10:12
select id,stt dt,1 p from test5
union
select id,edt dt,-1 p from test5;t1
1001 2021-06-14 12:12:12 1
1001 2021-06-14 18:12:12 -1
1001 2021-06-14 20:12:12 1
1001 2021-06-14 23:12:12 -1
1002 2021-06-14 15:12:12 1
1002 2021-06-14 16:12:12 -1
1003 2021-06-14 13:12:12 1
1003 2021-06-14 16:12:12 -1
1004 2021-06-14 13:15:12 1
1004 2021-06-14 20:12:12 -1
1005 2021-06-14 15:18:12 1
1005 2021-06-14 20:12:12 -1
1006 2021-06-14 21:12:12 1
1006 2021-06-14 23:15:12 -1
1007 2021-06-14 22:12:12 1
1007 2021-06-14 23:10:12 -1
select
id,
dt,
sum(p) over(order by dt) sum_p
from
(select id,stt dt,1 p from test5
union
select id,edt dt,-1 p from test5)t1;t2
select
max(sum_p)
from
(select
id,
dt,
sum(p) over(order by dt) sum_p
from
(select id,stt dt,1 p from test5
union
select id,edt dt,-1 p from test5)t1)t2;