【SQL进阶】Day05:窗口函数

〇、概述

一、专用窗口函数

1、每类试卷得分前3名

【SQL进阶】Day05:窗口函数_第1张图片

 【SQL进阶】Day05:窗口函数_第2张图片

自己写出来的部分

SELECT
    tag AS tid,
    uid AS uid,
    Rank AS ranking -- 如何确定排名
FROM examination_info ei
JOIN exam_record er
USING(exam_id)
GROUP BY tid
ORDER BY MIN(score) DESC,uid ASC

答案:

select u.tag tid,u.uid,u.ranking FROM
    (SELECT *,
        row_number() over (partition by t.tag order by t.max_score desc,t.min_score desc,t.uid desc) ranking 
    FROM
        (SELECT i.tag,r.uid,max(r.score) max_score,min(r.score) min_score
        from examination_info i
        join exam_record r
        on i.exam_id=r.exam_id
        where r.score is not null
        group by i.tag,r.uid
        ) t
    ) u
WHERE u.ranking<=3

SELECT tag,uid,ranking
FROM(
    SELECT
        tag,
        uid,
        row_number() OVER (PARTITION BY tag 
                           ORDER BY tag, MAX(score) DESC, MIN(score) DESC, uid DESC)
                     AS ranking
    FROM exam_record
    JOIN examination_info USING(exam_id)
    GROUP BY tag,uid
    ) new_examrecord
WHERE ranking < 4

学到:ROW_NUMBER() OVER( PATITION BY A ORDER BYB)

2、第二快/慢用时之差大于试卷时长一半的试卷

【SQL进阶】Day05:窗口函数_第3张图片

【SQL进阶】Day05:窗口函数_第4张图片

 自己的想法

-- 查到快慢试卷
SELECT
    er.exam_id,
    -- ROW_NUMBER() OVER(PARTITION BY exam_id ORDER BY exam_id DESC)
    -- 查询最大最小值
    NTH_VALUE(time_took, 2) OVER (PARTITION BY exam_id ORDER BY time_took DESC) as max2_time_took,
FROM exam_record er
JOIN examination_info ei
USING(exam_id)

-- 从上表中查询第二快和第二慢的试卷时间之差

-- 从上表中选出时间之差小于dur/2的试卷id

答案:

选择出所有的时间(秒/60)

选择出满足条件的时间

选择出最终结果并排序

-- 步骤:先拼接,后选条件,最后选结果,每一步都要得到相应的数据
-- 最后选出符合条件的数据
-- 存在重复的现象
SELECT 
    exam_id,
    duration,
    release_time
FROM (
    -- 再选出第二快和第二慢的试卷信息
    SELECT
        DISTINCT exam_id,
        duration,
        release_time,
        NTH_VALUE(use_time,2) OVER(PARTITION BY exam_id ORDER BY use_time ASC) AS min_use_time,
        NTH_VALUE(use_time,2) OVER(PARTITION BY exam_id ORDER BY use_time DESC) AS max_use_time
    FROM (
        -- 先两表join选出所有时间
        SELECT
            er.exam_id,
            duration,
            release_time,
            TIMESTAMPDIFF(SECOND,start_time,submit_time)/60 AS use_time
        FROM exam_record er
        JOIN examination_info ei
        USING(exam_id)
        WHERE submit_time IS NOT NULL
    ) a
) b
WHERE 
    max_use_time IS NOT NULL
    AND
    min_use_time IS NOT NULL
    AND
    (max_use_time-min_use_time)>duration/2
ORDER BY exam_id DESC

3、连续两次作答试卷的最大时间窗

【SQL进阶】Day05:窗口函数_第5张图片

自己的想法

-- 在一张表中查询数据
-- 查询2021年至少有两天作答过试卷的人
-- 查询该年连续两次作答试卷的最大时间窗days_window
SELECT 
    uid,
    days_window,
    avg_exam_cnt
FROM (


)

做法1:【学习with a as(),b as ()】

LAG和LEAD

-- 在一张表中查询数据

-- 查询该年连续两次作答试卷的最大时间窗days_window
with a as 
(   -- 查询2021年至少有两天作答过试卷的人
    select uid
    from exam_record
    where year(submit_time) = 2021
    group by 1
    having count(distinct date(start_time)) >= 2
)
,b as 
(
    -- 取相关数据,具体到2021年的天(且至少两次作答)
    select er.uid,er.exam_id,date(er.start_time) as day
    from exam_record er
    join a on er.uid = a.uid
    where year(start_time) = 2021
)
,c as 
(
    -- 求窗口期
    select uid,
        datediff(day,lag(day,1) over(partition by uid order by day asc))+1 as days_window
    from (select uid,day from b group by 1,2) t
)
,d as 
(
    -- 求平均
    select uid,count(*)/(datediff(max(day),min(day))+1) as avg_exam
    from b
    group by 1
)

-- 结果 注意:前面都要用start_time作为做题日期,而不是submit_time,否则会报错
select c.uid,max(c.days_window),round(max(c.days_window)*d.avg_exam,2) as avg_exam_cnt
from c join d on c.uid=d.uid
group by 1
order by 2 desc,3 desc

方案2:

SELECT uid, days_window, round(days_window*exam_cnt/diff_days, 2) as avg_exam_cnt
FROM (
    -- 2.查询出days_window两次作答的最大时间窗以及相差的最大天数
    SELECT uid,
        count(start_time) as exam_cnt,  -- 此人作答的总试卷数
        DATEDIFF(max(start_time), min(start_time))+1 as diff_days, -- 最早一次作答和最晚一次作答的相差天数
        max(DATEDIFF(next_start_time, start_time))+1 as days_window -- 两次作答的最大时间窗
    FROM (
        -- 1.通过LEAD查询出下一次作答的时间,并得到新的属性【窗口函数、uid分区】
        SELECT uid, exam_id, start_time,
            lead(start_time) over(partition by uid ORDER BY start_time) as next_start_time -- LEAD函数可以将连续的下次作答时间拼上
        FROM exam_record
        WHERE year(start_time)=2021
    ) as t_exam_record_lead
    GROUP BY uid
) as t_exam_record_stat
WHERE diff_days>1
ORDER BY days_window DESC, avg_exam_cnt DESC

自己写出来的

-- 3、查询出最终结果
SELECT
    uid,
    days_window,
    ROUND(num*days_window/min_max,2) AS avg_exam_cnt
FROM (
    -- 2.查询出days_window两次作答的最大时间窗以及相差的最大天数
    SELECT
        uid,
        COUNT(start_time) AS num,
        DATEDIFF(MAX(start_time),MIN(start_time))+1 AS min_max, -- 最早最晚的时间差7
        MAX(DATEDIFF(next_start_time,start_time))+1 AS days_window -- 最大时间窗6【大,小】
    FROM (
        -- 1.通过LEAD查询出下一次作答的时间,并得到新的属性【窗口函数、uid分区】
        SELECT
            uid,
            exam_id,
            start_time,
            LEAD(start_time) OVER(PARTITION BY uid ORDER BY start_time ASC) AS next_start_time
        FROM exam_record er
        WHERE YEAR(start_time)=2021
    ) a
    GROUP BY uid 
) b
WHERE days_window>1
ORDER BY days_window DESC,avg_exam_cnt DESC

 4、近三个月未完成试卷数为0的用户完成情况

【SQL进阶】Day05:窗口函数_第6张图片

 【SQL进阶】Day05:窗口函数_第7张图片

思路:

-- 找每个用户的试卷作答完成数
-- 找每个用户近三个有作答记录的月份,
-- 没有试卷是未完成状态的用户【所有试卷都完成count(a)=count(b)】

 答案:dense_rank()进行排序

方式:先按日期排好序得到序号,再找前三个记录并查出来,再分组,找到全部完成的用户情况,通过聚合函数计算

-- 3.分组选出全部都完成的完成数
SELECT
    uid,
    COUNT(start_time) AS exam_complete_cnt
FROM (
    -- 2.选出近三条
    SELECT
        uid,
        start_time,
        submit_time
    FROM (
        -- 1.查询用户排序的作答记录
        SELECT
            uid,
            start_time,
            submit_time,
            DENSE_RANK() OVER(PARTITION BY uid ORDER BY DATE_FORMAT(start_time,'%Y-%m') DESC) AS ranking
        FROM exam_record er
    ) a
    where ranking<=3
) b
GROUP BY uid
HAVING COUNT(start_time)=COUNT(submit_time)
ORDER BY exam_complete_cnt DESC,uid DESC

 5、未完成率较高的50%用户近三个月答卷情况

【SQL进阶】Day05:窗口函数_第8张图片

 【SQL进阶】Day05:窗口函数_第9张图片

 【SQL进阶】Day05:窗口函数_第10张图片

思路:

-- 0.分组统计每个用户的未完成数目和总作答数目group by
-- 1.统计SQL试卷的未完成率排名(前50%)
-- 2.统计有作答记录的近三个月信息【日期排序,近三个月】
-- 3.统计六级和七级用户和每个月的信息

答案:

select uid,
       date_format(start_time, '%Y%m') as start_month,
       count(start_time) as tatol_cnt,
       count(score) as complete_cnt
from(
    select uid, start_time, score,
           dense_rank() over(partition by uid order by date_format(start_time, '%Y%m') desc) as recent_months
    from exam_record
) recent_table
where recent_months <= 3
and uid in(
     select incomplete_rate_table.uid
     from(
         select uid,
            row_number() over(order by (incomplete_cnt / total_cnt) desc, uid desc) as incomplete_rank
         from(
            select uid,
                   sum(if(score is null, 1, 0)) as incomplete_cnt,
                   count(start_time) as total_cnt
            from exam_record
            group by uid
             ) incomplete_cnt_table
     ) incomplete_rate_table join(
         select count(distinct uid) as total_user
         from exam_record
     ) t_u
     join user_info
     on incomplete_rate_table.uid = user_info.uid
     where level >= 6
     and incomplete_rank <= ceiling(total_user / 2)
)
group by uid, start_month
order by uid

6、试卷完成数同比2020年的增长率及排名变化

【SQL进阶】Day05:窗口函数_第11张图片

 【SQL进阶】Day05:窗口函数_第12张图片

 【SQL进阶】Day05:窗口函数_第13张图片

 【SQL进阶】Day05:窗口函数_第14张图片

二、聚合窗口函数

你可能感兴趣的:(sql,数据库,java,开发语言)