2023.3.13
--题目1:找出所有科目成绩都大于某一学科平均成绩的学生
--表结构:score(uid,subject_id,score)
--数据例:1001 01 90
select t.uid
from (select uid,
count(1) as cnt,
count(`if`(tab.score > avg_sco, tab.score, null)) as avg_ant
from (select uid,
subject_id,
score,
avg(score) over (partition by subject_id) as avg_sco
from score) tab
group by uid) t
where t.cnt = t.avg_ant
--题目2:统计出每个用户每个月的访问数和累积访问次数
--表结构:action(userId string,visitDate string,visitCount)
--数据例:u01 2017/2/22 4
select userid,
dt,
cnt current_cnt,
sum(cnt) over (partition by userid order by dt) acc_cnt
from (select userid,
dt,
sum(visitcount) as cnt
from (select userId,
date_format(regexp_replace(visitdate, '/', '-'), 'yyyy-MM') as dt,
visitcount
from action) t
group by userid, dt) tab
--题目3:有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计
--数据例:u1 a
--1)每个店铺的UV(访客数)
--2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数数据集
1:
select shop, count(distinct user_id)
from visit
group by shop;
2:
select shop,
user_id,
cnt,
rn
from (select shop,
user_id,
cnt,
rank() over (partition by shop order by t.cnt desc ) as rn
from (select shop,
user_id,
count(1) as cnt
from visit
group by shop, user_id) t) temp
where temp.rn <= 3
--题目4
--order_tab(dt,order_id,user_id,amount)
--数据样例:2017-01-01,10029028,1000003251,33.57
--1)给出 2017年每个月的订单数、用户数、总成交金额。
--2)给出2017年11月的新客数(指在11月才有第一笔订单)
1:
select substr(dt, 1, 7),
count(order_id),
count(distinct user_id),
sum(amount)
from order_tab
where substr(dt, 1, 4) = '2017'
group by substr(dt, 1, 7);
2.
select count(1) as cnt
from (select user_id,
min(dt)
from order_tab
group by user_id
having substr(min(dt), 1, 7) = '2017-11') t
--题目5
--请写出代码求得所有用户和活跃用户的总数及平均年龄,活跃用户指连续两天都有访问记录的用户
--user_age(dt string,user_id string,age int)
--数据样例:2019-02-11,test_1,23
select count(user_id) as user_cnt,
sum(t.age) / count(user_id) user_avg_age,
'total' is_active
from (select user_id, min(user_age.age) as age
from user_age
group by user_id) t
union all
select count(user_id) as act_user,
sum(age) / count(user_id) act_user_avg_age,
'active'
from (select user_id,
min(age) as age
from (select user_id,
dt,
age,
datediff(nvl(lead_dt, '9999-12-31'), dt) as diff_num
from (select user_id,
dt,
age,
lead(dt, 1) over (partition by user_id order by dt) as lead_dt
from (select user_id,
dt,
min(age) as age
from user_age
group by user_id, dt) t) t1) t2
where diff_num = 1
group by user_id) t3
--题目6
--用一条SQL语句查询出每门课都大于80分的学生姓名
--name kecheng fenshu
--数据示例:张三 语文 81
select name from score group by name having min(fenshu)>80
--题目7
--怎么把这样一个
year month amount
1991 1 1.1
1991 2 1.2
1991 3 1.3
1991 4 1.4
1992 1 2.1
1992 2 2.2
1992 3 2.3
1992 4 2.4
查成这样一个结果
year m1 m2 m3 m4
1991 1.1 1.2 1.3 1.4
1992 2.1 2.2 2.3 2.4
select year,
max(case when month = 1 then amount else null end) as m1,
max(case when month = 2 then amount else null end) as m2,
max(case when month = 3 then amount else null end) as m3,
max(case when month = 4 then amount else null end) as m4
from m_am
group by year
--题目8 有一个订单表order。已知字段有:order_id(订单ID), user_id(用户ID),amount(金额), pay_datetime(付费时间),channel_id(渠道ID),dt(分区字段)。
--order_a(order_id int,user_id int,amount double,pay_datatime timestamp,channel_id int)
--1)查询dt=‘2018-09-01‘里每个渠道的订单数,下单人数(去重),总金额
select channel_id,
count(order_id),
count(distinct user_id),
sum(amount)
from order_a
where dt = '2018-09-01'
group by channel_id;
--2)查询dt=‘2018-09-01‘里每个渠道的金额最大3笔订单。
select channel_id,
order_id,
rn
from (select channel_id,
order_id,
rank() over (partition by channel_id order by amount desc ) as rn
from order_a
where dt = '2018-09-01') t
where t.rn <= 3;
--4) 有一天发现订单数据重复,请分析原因
1.首先关系型数据库不会发生重复
2.hive ods层接受到数据有重复很正常,前提数据通道是kafka数据源
3.那么很有可能后端查询mysql业务数据重复查询,然后重复打到kafka了
4.也有可能手动使用datax等工具同步数据的时候,数据拉取限制条件在凌晨12点的情况下,>和>=没写清楚
2023.3.15
--题目1
--找出连续3天及以上减少碳排放量在100以上的用户
--id dt lowcarbon
--1001 2021-12-12 123
--1002 2021-12-12 45
--1001 2021-12-13 43
--1001 2021-12-13 45
--1001 2021-12-13 23
--1002 2021-12-14 45
--1001 2021-12-14 230
--1002 2021-12-15 45
--1001 2021-12-15 23
select id
from (select id,
dt,
cnt,
date_sub(dt, rn) as tem_dt
from (select id,
dt,
cnt,
rank() over (partition by id,dt order by cnt) as rn
from (select id, dt, sum(lowcarbon) as cnt
from table
group by id, dt
having sum(lowcarbon) > 100) t1) t2) t3
group by id, tem_dt
having count(1) >= 3
--题目2
--计算每个用户最大的连续登录天数,可以间隔一天。解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录
--id dt
--1001 2021-12-12
--1001 2021-12-12
--1001 2021-12-13
--1001 2021-12-14
--1001 2021-12-16
--1001 2021-12-19
--1001 2021-12-20
--1002 2021-12-12
--1002 2021-12-16
--1002 2021-12-17
2023.3.16
--第 4 题 打折日期交叉问题
--如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
--id stt edt
--oppo 2021-06-05 2021-06-09
--oppo 2021-06-11 2021-06-21
--vivo 2021-06-05 2021-06-15
--vivo 2021-06-09 2021-06-21
--redmi 2021-06-05 2021-06-21
--redmi 2021-06-09 2021-06-15
--redmi 2021-06-17 2021-06-26
--huawei 2021-06-05 2021-06-26
--huawei 2021-06-09 2021-06-15
--huawei 2021-06-17 2021-06-21
计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-05 到 2021-06-15,第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15 号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天
select id,
sum(if(days>=0,days+1,0)) days
from
( select id,
datediff(edt,stt) days
from
(
select id,
if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
edt
from
( select id,
stt,
edt,
max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
from test_b
)t1
)t2
)t3
group by
id;
2023.3.17
--如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数。
--id stt edt
--1001 2021-06-14 12:12:12 2021-06-14 18:12:12
--1003 2021-06-14 13:12:12 2021-06-14 16:12:12
--1004 2021-06-14 13:15:12 2021-06-14 20:12:12
--1002 2021-06-14 15:12:12 2021-06-14 16:12:12
--1005 2021-06-14 15:18:12 2021-06-14 20:12:12
--1001 2021-06-14 20:12:12 2021-06-14 23:12:12
--1006 2021-06-14 21:12:12 2021-06-14 23:15:12
--1007 2021-06-14 22:12:12 2021-06-14 23:10:12
select max(cnt)
from (select id,
t,
sum(flag) over (order by t) as cnt
from (select id, stt as t, 1 as flag
from test_c
union all
select id, edt as t, -1 as flag
from test_c) t1) t2
2023.3.17
--有一个5000万的用户文件(user_id,name,age),一个2亿记录的用户看电影的记录文件(user_id,url),根据年龄段观看电影的次数进行排序?
select age_phase,
count(1) as cnt
from (select u.user_id,
name,
age,
age_phase,
url
from (SELECT user_id,
name,
age,
CASE
WHEN age <= 10 AND age > 0 THEN '0-10'
WHEN age <= 20 AND age > 10 THEN '10-20'
WHEN age > 20 AND age <= 30 THEN '20-30'
WHEN age > 30 AND age <= 40 THEN '30-40'
WHEN age > 40 AND age <= 50 THEN '40-50'
WHEN age > 50 AND age <= 60 THEN '50-60'
WHEN age > 60 AND age <= 70 THEN '60-70'
ELSE '70以上' END as age_phase
from test4user) u
join test4log l on u.user_id = l.user_id) t
group by age_phase order by count(1) desc;