尚大练习网站
在这里插入图片描述](https://img-blog.csdnimg.cn/24d8c48e679e4cca95eab16c70945984.png)
with
t1 as (
SELECT
sku_id,
sum(sku_num) as num
from
order_detail
group by
sku_id
),
t2 as (
SELECT
sku_id,
num,
rank() over (
order BY
num desc
) rk
FROM t1
)
SELECT sku_id FROM
t2
where
rk = 2;
这道题刚开始用的rank()函数进行排序,rank()的排序规则是 1 1 3 ,即出现排名相同的会处于同一名次,会出现如下的情况
即同一天登录的无法去重
使用row_number() 可以解决问题,因为row_number()的排序规则是1,2,3,4
SELECT
user_id
FROM
(
SELECT
user_id,
count(*) as cnt
FROM
(
SELECT
user_id,
create_date,
rk,
date_sub(create_date, rk) as ds
FROM
(
SELECT
user_id,
create_date,
row_number() over(
partition by user_id
order by
create_date
) as rk
FROM
order_info
) t1
) t2
GROUP by
user_id,
ds
HAVING
cnt >= 3
) t3;
2022-11-11
注意点:在hql中 group by中出现的字段,在select中要出现,要么是要查询的,要么用聚合函数包裹住
包裹比较多,有更好的解法还请多多指教。
SELECT
category_id,
category_name,
sku_id,
name,
order_num,
sku_cnt
FROM
(
SELECT
category_id,
category_name,
sku_id,
name,
sku_cnt,
order_num,
rank() over(
PARTITION by category_id
order by
order_num desc
) as rk
FROM
(
select
t1.category_id,
t1.category_name,
od.sku_id,
si.name,
sum(od.sku_num) as order_num,
t1.sku_cnt
FROM
(
SELECT
ci.category_id,
ci.category_name,
count(*) as sku_cnt
FROM
sku_info si
JOIN category_info ci on si.category_id = ci.category_id
GROUP by
ci.category_id,
ci.category_name
) t1
join sku_info si ON t1.category_id = si.category_id
JOIN order_detail od ON si.sku_id = od.sku_id
GROUP by
t1.category_id,
t1.category_name,
od.sku_id,
si.name,
t1.sku_cnt
) t2
) t3
where rk=1;
从订单信息表(order_info)中统计每个用户截止其每个下单日期的累积消费金额,以及每个用户在其每个下单日期的VIP等级。
用户vip等级根据累积消费金额计算,计算规则如下:
设累积消费总额为X,
若0=
若30000<=X<50000,则vip等级为白银会员
若50000<=X<80000,则vip为黄金会员
若80000<=X<100000,则vip等级为白金会员
若X>=100000,则vip等级为钻石会员
SELECT
user_id,
create_date,
t1.sum_so_far,
(
CASE
when t1.sum_so_far >= 0 and sum_so_far<10000 THEN '普通会员'
when t1.sum_so_far >= 10000 and sum_so_far<30000 THEN '青铜会员'
when t1.sum_so_far >= 30000 and sum_so_far<50000 THEN '白银会员'
when t1.sum_so_far >= 50000 and sum_so_far<80000 THEN '黄金会员'
when t1.sum_so_far >= 80000 and sum_so_far<100000 THEN '白金会员'
else '钻石会员'
END
) vip_level
FROM
(
SELECT
DISTINCT
user_id,
create_date,
sum(total_amount) over(
partition by user_id
order by
create_date
) as sum_so_far
FROM
order_info
) t1
注意点
case when… then …else …end 语句的 使用 用于复杂条件的判断
if语句 if (表达式,result_true,result_false) 一般用于二义性的判断
distinct 去重要放在select的后面
前备知识
求日期差的函数 datediff(date1_start,date2_end)
TIMESTAMPDIFF(时间粒度,开始时间,结束时间)
时间粒度
2/SECOND--秒:计算开始时间与结束时间相差的秒数。
4/MINUTE--分:计算开始时间与结束时间相差多少分钟。
8/HOUR--小时:计算开始时间与结束时间相差多少小时。
16/DAY--天:计算开始时间与结束时间相差的天数。
32/WEEK--周:计算开始时间与结束时间相差的周数。
64/MONTH--月:计算开始时间与结束时间相差多少月。
128/QUARTER--季:计算开始时间与结束时间相差的季度数。
256/YEAR--年:计算开始时间与结束时间相差的年数。
cast函数 转换成指定的格式
CAST(expr AS CHAR | CHARACTER | VARCHAR | NCHAR | NVARCHAR)
CAST(expr AS CHAR(n) | CHARACTER(n) | VARCHAR(n) )
CAST(expr AS CHAR VARYING | CHARACTER VARYING)
CAST(expr AS INT | INTEGER | BIGINT | SMALLINT | TINYINT)
CAST(expr AS DEC | DECIMAL | NUMERIC)
CAST(expr AS DEC(p[,s]) | DECIMAL(p[,s]) | NUMERIC(p[,s]) )
CAST(expr AS DOUBLE)
CAST(expr AS MONEY | SMALLMONEY)
CAST(expr AS DATE)
CAST(expr AS TIME)
CAST(expr AS POSIXTIME)
CAST(expr AS TIMESTAMP | DATETIME | SMALLDATETIME)
CAST(expr AS BIT)
CAST(expr AS BINARY | BINARY VARYING | VARBINARY)
CAST(expr AS BINARY(n) | BINARY VARYING(n) | VARBINARY(n) )
CAST(expr AS GUID)
步骤
1.通过min(create_date) over(partition by user_id) as first_order_date 查到用户第一次登录的时间
SELECT
user_id,
create_date,
min(create_date) over(partition by user_id) as first_order_date
FROM
order_info;t1
2.根据t1表的create_date 和first_order_date 通过datediff函数 找到第二天下单的用户,打标记
SELECT
user_id,
create_date,
first_order_date,
CASE
WHEN datediff(create_date, first_order_date) = 1 then 1
else 0
END as flag
from t1;t2
3.根据t2表
case
WHEN flag = 1 then user_id
ELSE NULL
END
统计flag为1的数量,
count(DISTINCT user_id) over() as all_cnt 统计用户总数 需要去重
SELECT
DISTINCT flag,
count(
case
WHEN flag = 1 then user_id
ELSE NULL
END
) over(PARTITION by flag) as flag_cnt,
count(DISTINCT user_id) over() as all_cnt
FROM
t2;t3
计算比率
调用cast 函数进行转换且保留3位小数
SELECT
flag,
max(
CAST (flag_cnt / all_cnt AS decimal(18, 3))
)
as ration
FROM
5.拼接百分数
要先转换为double类型的,然后再拼接,否则小数点后的0会被省略。
SELECT
concat(cast(round(ration,3)*100 as double),'%') as percentage
FROM
(
SELECT
flag,
max(
CAST (flag_cnt / all_cnt AS decimal(18, 3))
)
as ration
FROM
(
SELECT
DISTINCT flag,
count(
case
WHEN flag = 1 then user_id
ELSE NULL
END
) over(PARTITION by flag) as flag_cnt,
count(DISTINCT user_id) over() as all_cnt
FROM
(
SELECT
user_id,
create_date,
first_order_date,
CASE
WHEN datediff(create_date, first_order_date) = 1 then 1
else 0
END as flag
FROM
(
SELECT
user_id,
create_date,
min(create_date) over(partition by user_id) as first_order_date
FROM
order_info
) t1
) t2
) t3
group by
flag
HAVING
flag = 1
) t3;
前置知识
从 yyyy-MM-dd中提取年份数据 year
写的有点冗余,但是菜鸡只能写出这种sql了
步骤如下
1.构造带有年份的数据
(SELECT
sku_id,
create_date,
year(create_date) as first_year,
price,
sku_num
FROM
order_detail)t1
2.从t1中提取出价格和 商品总数量
因为select中 要查的字段必须要在group 中出现,所以提取价格用了 min() 函数,同一种商品的价格都一样
(SELECT
sku_id,
first_year,
min(price) as price,
sum(sku_num) as num
FROM
t1);t2
3.从t2表中提取出 商品总销售额,同时为了提取首年进行一个排序 row_number()
SELECT
sku_id,
first_year,
num as order_num,
num * price as order_amount,
row_number() OVER(PARTITION BY sku_id) as rn
FROM t2 ;t3
4.从t3表中提取最终数据
-- 商品首年的销售数量和销售总额
SELECT
sku_id,
first_year as year,
order_num,
order_amount
FROM
(
SELECT
sku_id,
first_year,
num as order_num,
num * price as order_amount,
row_number() OVER(PARTITION BY sku_id) as rn
FROM
(
SELECT
sku_id,
first_year,
min(price) as price,
sum(sku_num) as num
FROM
(
SELECT
sku_id,
create_date,
year(create_date) as first_year,
price,
sku_num
FROM
order_detail
) t1
GROUP by
sku_id,
first_year
)t2
) t3
where rn=1;
2022-11-16
思路
1.给sku_info多加两列 去年的开始日期 和 最后上架的时间
add_months(‘2022-01-10’, -1)表示’2022-01-10’往前的一个月
add_months(‘2022-01-01’,-12) 表示往前的12个月
SELECT
sku_id,
from_date,
name,
add_months('2022-01-10', -1) as end_date,
add_months('2022-01-01',-12) as start_date
from
sku_info;t1
2.从t1中选出时间符合的数据
SELECT
t1.sku_id,
t1.name,
od.sku_num
FROM
(
SELECT
sku_id,
from_date,
name,
add_months('2022-01-10', -1) as end_date,
add_months('2022-01-01',-12) as start_date
from
sku_info
) t1
JOIN order_detail od ON t1.sku_id = od.sku_id
where
t1.from_date < t1.end_date AND od.create_date>t1.start_date
;t2
3.根据t2表进行聚合
SELECT
DISTINCT
sku_id,
name,
sum(sku_num) over(PARTITION BY sku_id) as order_num
FROM t2;t3
4.得到最终的结果
SELECT
sku_id,
name,
order_num
FROM
(
SELECT
DISTINCT
sku_id,
name,
sum(sku_num) over(PARTITION BY sku_id) as order_num
FROM
(
SELECT
t1.sku_id,
t1.name,
od.sku_num
FROM
(
SELECT
sku_id,
from_date,
name,
add_months('2022-01-10', -1) as end_date,
add_months('2022-01-01',-12) as start_date
from
sku_info
) t1
JOIN order_detail od ON t1.sku_id = od.sku_id
where
t1.from_date < t1.end_date AND od.create_date>t1.start_date
) t2
)t3
where t3.order_num<100
思路
1.对统一用户按照登录时间进行排序
2.选出第一条数据
3.group by 分组
需要注意 登录时间是 yyyy-MM-dd hh:mm:ss格式 需要转换成yyyy-MM-dd格式用到了date_format函数
SELECT
date_format(login_ts, 'yyyy-MM-dd') as login_date_first,
count(*) as user_count
FROM
(
SELECT
user_id,
login_ts
FROM
(
SELECT
user_id,
login_ts,
logout_ts,
row_number() over(
PARTITION BY user_id
order by
login_ts
) as rn
FROM
user_login_detail
) t1
where
t1.rn = 1
) t2
GROUP by
date_format(login_ts, 'yyyy-MM-dd')
思路
1.按照商品和订单日期求和
2.对求和后的排序
3.rn=1
SELECT
sku_id,
create_date,
sum_num
FROM
(
SELECT
sku_id,
create_date,
sum_num,
row_number() OVER(
PARTITION BY sku_id
order by
sum_num desc
) as rn
FROM
(
SELECT
sku_id,
sku_num,
create_date,
sum(sku_num) over(PARTITION by sku_id, create_date) as sum_num
from
order_detail
) t1
) t2
where
rn = 1;
思路
1.求出所属品类的平均 设计知识点 floor向下取整
SELECT
category_id,
floor(sum_num / ct) as cate_avg_num
FROM
(
select
category_id,
count(DISTINCT si.sku_id) as ct,
sum(sku_num) as sum_num
from
order_detail od
left join sku_info si on od.sku_id = si.sku_id
group by
category_id
) t
品类1 有多少种商品
-- 1.求出所属品类的平均数 销售总数/平均数 求出
-- 2.查询累积销量 根据商品id进行分类
SELECT
sku_id,
name,
sum_num,
cate_avg_num as cate_avg_num
FROM
(
SELECT
si.sku_id,
si.category_id,
si.name,
t1.sum_num,
t2.cate_avg_num
FROM
sku_info si
JOIN (
SELECT
sku_id,
sum(sku_num) as sum_num
FROM
order_detail
group by
sku_id
) t1 on si.sku_id = t1.sku_id
JOIN (
SELECT
category_id,
floor(sum_num / ct) as cate_avg_num
FROM
(
select
category_id,
count(DISTINCT si.sku_id) as ct,
sum(sku_num) as sum_num
from
order_detail od
left join sku_info si on od.sku_id = si.sku_id
group by
category_id
) t
) t2 on si.category_id = t2.category_id
) t3
where
sum_num > cate_avg_num;
2022-11-24
思路
1.从用户登录明细表user_login_detail 统计出用户的总登录次数,并根据登录日期进行排序
(select
user_id,
login_ts,
count(*) OVER (PARTITION by user_id) as total_login_count,
rank() over(
PARTITION by user_id
ORDER BY
login_ts
) as rk
from
user_login_detail); t1
2.从t1表挑选出2021年的记录,并统计21年的登录次数
SELECT
user_id,
login_ts as register_date,
total_login_count,
count(*) OVER(PARTITION by user_id) as login_count_2021,
rk
FROM
t1
where year(login_ts) = '2021';t2
3.从t2表 join 订单信息表order_info ,并根据用户id进行分组 得到想要的结果
完整sql
SELECT
t3.user_id,t3.register_date,t3.total_login_count,t3.login_count_2021, count(*) as order_count_2021, sum(oi.total_amount) as order_amount_2021
FROM
(
SELECT
user_id,
date_format(register_date, 'yyyy-MM-dd') as register_date,
total_login_count,
login_count_2021
FROM
(
SELECT
user_id,
login_ts as register_date,
total_login_count,
count(*) OVER(PARTITION by user_id) as login_count_2021,
rk
FROM
(
select
user_id,
login_ts,
count(*) OVER (PARTITION by user_id) as total_login_count,
rank() over(
PARTITION by user_id
ORDER BY
login_ts
) as rk
from
user_login_detail
) t1
where
year(login_ts) = '2021'
) t2
where
rk = 1
) t3
JOIN order_info oi on t3.user_id = oi.user_id
where year(oi.create_date)='2021'
group by t3.user_id,t3.register_date,t3.total_login_count,t3.login_count_2021
思路:
1.两张表根据sku_id 进行join,找到最新的价格,一个商品会有多条修改记录
2.根据sku_id 进行日期的降序
3.选出rn=1的
SELECT
sku_id,new_price as price
FROM
(
SELECT
si.sku_id,sp.new_price,row_number() over(PARTITION by si.sku_id order BY sp.change_date DESC) as rn
FROM sku_info si
LEFT JOIN sku_price_modify_detail sp
on si.sku_id=sp.sku_id
where sp.change_date<='2021-10-01'
) t1 where rn=1;
思路
1.利用datediff 统计出期望配送日期 和下单日期差 同时对下单时间进行一个排序
2.count(*) 在hive中不能单独使用,要接一个窗口函数 over(),
sum 同理,sum 可以根据df进行统计了。
-- 首单中为即时订单的比例
SELECT
CAST (sm / all_user AS decimal(18, 2)) as percentage
FROM
(
SELECT
user_id,count() over() as all_user,sum(CASE WHEN df=0 then 1 else 0 END) OVER() as sm
FROM
(
SELECT
user_id,
order_date,
custom_date,
datediff(custom_date, order_date) as df,
row_number() OVER(
PARTITION BY user_id
order by
order_date
) as rn
FROM
delivery_info
)t1
WHERE rn=1
) t2
LIMIT 1;
2022-11-26
思路
1.对收藏表用collect_set 做一个拼接,找到用户所喜欢的所有的商品 比如[1,2,3]
SELECT
user_id,
collect_set(sku_id) All_sku_id
FROM
favor_info
group by
user_id;t2
2.friendship_info 表和 t2表关联 找到用户的好友和好友喜欢的商品
3.array_contains(All_sku_id, sku_id) 表示sku_id 是否在All_sku_id中
代码
SELECT
DISTINCT user1_id user_id,
sku_id
FROM
(
SELECT
user1_id,
All_sku_id,
user2_id,
sku_id
FROM
(
SELECT
t1.user1_id,
All_sku_id,
user2_id
FROM
friendship_info t1
JOIN (
SELECT
user_id,
collect_set(sku_id) All_sku_id
FROM
favor_info
group by
user_id
) t2 ON t1.user1_id = t2.user_id
) t3
JOIN favor_info f ON t3.user2_id = f.user_id
) t4
where
user1_id <> user2_id
AND NOT array_contains(All_sku_id, sku_id)
ORDER BY
user_id,
sku_id
SELECT
user_id,
start_date,
end_date
FROM
(
SELECT
user_id,
count(*) as ct,
min(start_date) as start_date,
max(start_date) as end_date
FROM
(
SELECT
user_id,
date_sub(start_date, rk) as ds,
start_date
FROM
(
SELECT
user_id,
date_format(login_ts, 'yyyy-MM-dd') as start_date,
row_number() over(
PARTITION by user_id
ORDER by
login_ts
) as rk
FROM
user_login_detail
) t1
) t2
group by
user_id,
ds
having
ct >= 2
) t3
SELECT
create_date,
sum(case WHEN t1.gender='男' then total_amount else 0 end) as total_amount_male,
sum(case WHEN t1.gender='女' then total_amount else 0 end) as total_amount_female
from
(
SELECT
oi.user_id,oi.create_date,oi.total_amount,ui.gender
from order_info oi
join user_info ui on oi.user_id=ui.user_id
) t1
GROUP BY create_date
SELECT
create_date,
round(total_3d, 2) as total_3d,
round(avg_3d, 2) as avg_3d
FROM
(
SELECT
create_date,
sum(sum_total_amount) over(
ORDER BY
create_date rows BETWEEN 2 PRECEDING
AND CURRENT ROW
) AS total_3d,
avg(sum_total_amount) over(
ORDER BY
create_date rows BETWEEN 2 PRECEDING
AND CURRENT ROW
) AS avg_3d
FROM
(
SELECT
create_date,
sum(total_amount) as sum_total_amount
FROM
order_info
group by
create_date
) t1
) t2
思路
1.有点偷鸡的感觉。
-- 购买过商品 1 2 但是没有购买商品3
WITH
t as(
SELECT
oi.user_id,concat_ws(",",COLLECT_SET(sku_id)) AS sku_arr
FROM
order_info oi
JOIN order_detail od on oi.order_id = od.order_id
where
od.sku_id = 1
OR od.sku_id = 2
OR od.sku_id = 3
GROUP BY oi.user_id
)
SELECT user_id from t where t.sku_arr='1,2'
思路
1.full outer join 两个表
2.if判断 相加减
WITH t as (
SELECT
sku_id,
sku_num,
create_date
from
order_detail
where
sku_id = 1
or sku_id = 2
),
t1 as (
SELECT
sku_id,
sum(sku_num) as sku_num,
create_date
FROM
t
group by
sku_id,
create_date
)
SELECT
t2.create_date,if(t2.sku_num is not NULL,t2.sku_num,0)-if(t3.sku_num is not NULL,t3.sku_num,0) as diff
FROM
(
SELECT
sku_id,sku_num,create_date
from
t1
where
sku_id = 1
)t2
FULL OUTER JOIN (
SELECT
sku_id,sku_num,create_date
from
t1
where
sku_id = 2
) t3
ON t2.create_date=t3.create_date
order by create_date
思路:
因为同一天的用户可能会下很多订单,需要把这些订单都找到,所以用到dense_rank() 排序规则是1 1 2
SELECT
user_id,
order_id,
create_date
FROM
(
SELECT
user_id,
order_id,
create_date,
dense_rank() OVER(PARTITION by user_id order by create_date desc ) as rk
FROM order_info
)t1 where rk<=3;
1.开窗 按照时间升序,
lead(login_ts, 1, “2021-10-10”) login_ts 表示要获取的字段,1表示后面第1行,"2021-10-10"表示如果后面没有了的默认值
-- 最早登陆的时间 最晚登录的时间
WITH t1 AS (
select
user_id,
login_ts,
datediff(
lead(login_ts, 1, "2021-10-10") over (
partition by user_id
order by
login_ts asc
),
login_ts
) as win
from
user_login_detail
),
t2 as (
SELECT
user_id,
win,
row_number() over(
PARTITION BY user_id
order by
win desc
) as rk
from
t1
)
SELECT
user_id,win as max_diff
FROM t2
where rk=1;
lead () 函数按照用户分区 获取下一条数据的登录登出时间
SELECT
user_id
FROM
(
SELECT
*,
lead(login_ts,1,login_ts) OVER(PARTITION BY user_id order by logout_ts) as next_log_in_time,
lead(logout_ts,1,logout_ts) OVER(PARTITION BY user_id order by logout_ts) as next_log_out_time,
lead(ip_address,1,ip_address) OVER(PARTITION BY user_id order by logout_ts) as next_ip_address
FROM
user_login_detail
) t1
WHERE ((next_log_in_time>=login_ts and next_log_in_time<=logout_ts) or (next_log_in_time<=login_ts and next_log_out_time>=logout_ts)) AND ip_address!=next_ip_address
SELECT
sku_id
FROM
(
SELECT
sku_id,
ym,
sum(total) as total
FROM
(
SELECT
sku_id,
create_date,
date_format(create_date, 'yyyy-MM') as ym,
price,
sku_num,
price * sku_num as total
from
order_detail
WHERE
sku_id = 1
or sku_id = 2
) t1
group by
sku_id,
ym
HAVING
(
sku_id = 1
and total > 21000
)
or (
sku_id = 2
and total > 10000
)
) t2
group by
sku_id
HAVING
count(*) > 1
case
when… then
when… then
else
end
SELECT
category,count(*) as cn
FROM
(
SELECT
sku_id,
CASE
WHEN sum_num <= 5000 then '冷门商品'
WHEN sum_num <= 19999 then '一般商品'
ELSE '热门商品'
end
category
FROM
(
SELECT
sku_id,
sum(sku_num) as sum_num
FROM
order_detail
group by
sku_id
) t1
)t2
GROUP BY category
with t1 as (
SELECT
od.sku_id,
od.sku_num,
si.category_id
FROM
order_detail od
join sku_info si on od.sku_id = si.sku_id
),
t2 as(
SELECT
sku_id,
category_id,
sum(sku_num) as sum_num
FROM
t1
GROUP BY
sku_id,
category_id
),
t3 as (
SELECT
sku_id,
category_id,
sum_num,
rank() OVER(
PARTITION BY category_id
order BY
sum_num DESC
)as rk
FROM
t2
)
SELECT
sku_id,category_id
FROM
t3
WHERE rk<=3
2022-12-16 还是在坚持刷sql,但是一直没更新,有几道题目也没想出怎么做