各种用sql挖数据需求脚本(持续更新)

sql脚本

当前工作主要是挖数据、分析数据、给领导汇报数据,因此已经练就了挖各种数据的能力:无论复杂还是简单,只要领导有需求,现有的能力基本都可以支撑。现将写过的脚本整理在此,供各位参考。
以下脚本从简单-复杂,主要参考我司现有业务数据。

表连接(join)方式见我另一篇博文:https://blog.csdn.net/weixin_42388255/article/details/116450944

一、简单-仅查询指定表内容,无需计算分类等

  1. 查询单表,单结构
-- Single-table lookup: return the listed columns for the requested vids
-- inside an inclusive date window.
-- `select` / `from` / `where` are fixed keywords; only the column list,
-- the table name and the filter values change from request to request.
select
    os,
    ver,
    ip_country,
    cname                 -- last column in the list: no trailing comma
from dwd.xxx              -- swap in whichever table the request needs
where date between "${startdate}" and "${enddate}"   -- both ends inclusive
  and vid in (${vidlist})                            -- standard vid filter
  2. 查询两个及以上的表
-- Two-table pattern: build each input as a CTE, then join the CTEs on sid.
with
A as (                    -- first input
    select
        sid,
        os,
        ver,
        ip_country,
        cname             -- last column in the list: no trailing comma
    from dwd.xxx          -- replace with the table you need
    where date between "${startdate}" and "${enddate}"
      and vid in (${vidlist})
),                        -- a comma separates consecutive CTEs

B as (                    -- second input
    select
        sid,
        os,
        ver,
        ip_country,
        cname
    from dim.xxx          -- replace with the table you need
    where date between "${startdate}" and "${enddate}"
      and vid in (${vidlist})
)                         -- no comma after the final CTE

-- Combine A and B into the result set. Add B.<column> to the list when
-- you need fields that A does not carry.
select
    A.sid,
    A.os,
    A.ver,
    A.ip_country,
    A.cname
from A
inner join B              -- the join type can be changed as required
    on A.sid = B.sid      -- join key

二、进阶-对查询到的字段进行分类/简单分析

  1. 对查询的指定字段计算均值/总和/最大值等聚合值时,如果同时输出非聚合字段,一定要 group by。
    求每分钟内最大发流用户数
-- Peak number of distinct speaking users (spk_uid) per vid and minute.
--   inner query: distinct spk_uid count for every vid / sid / minute
--   outer query: the largest of those per-sid counts for each vid / minute
-- The date parameters are quoted so the substituted value is always compared
-- as a literal (an unquoted 2021-05-01 would be evaluated as arithmetic).
select
    vid,
    date,
    hour,
    minute,
    max(spk_uid_cnt) as max_spk_uid_cnt
from (
    select
        vid,
        sid,
        date,
        hour,
        minute,
        count(distinct spk_uid) as spk_uid_cnt
    from dwd.xxx                      -- replace with the table you need
    where date >= "${startdate}"
      and date <= "${enddate}"
      and vid in (${vidlist})
    -- every non-aggregated column in the select list must be grouped
    group by vid, sid, date, hour, minute
) as per_sid_minute
group by vid, date, hour, minute
order by date, hour, minute           -- optional; ascending by default

分位函数求分位数

-- Mean and 10/25/50/75/90th percentiles of the per-cname peak user count.
--   inner query: highest peak_user_cnt seen for each cname in the window
--   outer query: average and percentile array over those per-cname peaks
-- The date parameters are quoted so the substituted value is always compared
-- as a literal (an unquoted 2021-05-01 would be evaluated as arithmetic).
select
    avg(peak_user_cnt) as peak_user_cnt_avg,
    percentile(peak_user_cnt, array(0.1, 0.25, 0.5, 0.75, 0.9)) as `峰值在线用户数分位数10_25_50_75_90`
from (
    select
        cname,
        max(peak_user_cnt) as peak_user_cnt
    from dwd.xxx
    where date >= "${startdate}"
      and date <= "${enddate}"
      and vid in (${vidlist})
    group by cname
) as per_cname
  2. 对查询的字段,分段统计。
    统计某类数据占比
-- Share of cnames in each native-host-sender bucket.
--   cname_user : one row per cname with summed total users and host senders
--   base_date  : per-cname bucket flags (each count is 0 or 1, because
--                cname_user already holds a single row per cname)
--   table_send : column-wise totals of those flags across all cnames
--   final      : each host bucket divided by the number of cnames with > 0 hosts
-- Bucket naming: x_y covers x <= value < y, except cnt_total_list_2_5, which
-- covers 2..5 inclusive so that value 5 is not lost between 2_5 and 6_10.
with cname_user as (
    select
        cname,
        sum(total_user_cnt)         as total_user_cnt,
        sum(native_host_sender_cnt) as native_host_cnt
    from dwd.xxxx
    -- parameters are quoted so the substituted dates are compared as literals
    where date >= "${startdate}"
      and date <= "${enddate}"
      and vid in (${vidlist})
    group by cname
),

base_date as (
    select
        cname,
        -- total-user buckets
        count(case when total_user_cnt >= 2  and total_user_cnt <= 5 then 1 end) as cnt_total_list_2_5,
        count(case when total_user_cnt >= 6  and total_user_cnt < 10 then 1 end) as cnt_total_list_6_10,
        count(case when total_user_cnt >= 10 and total_user_cnt < 20 then 1 end) as cnt_total_list_10_20,
        count(case when total_user_cnt >= 20 then 1 end)                         as cnt_total_list_20plus,
        count(case when total_user_cnt > 0 then 1 end)                           as cnt_total_list_all,
        sum(case when total_user_cnt > 0 then total_user_cnt end)                as sum_total_cnt,

        -- native-host buckets
        count(case when native_host_cnt = 1 then 1 end)                            as cnt_host_list_1,
        count(case when native_host_cnt = 2 then 1 end)                            as cnt_host_list_2,
        count(case when native_host_cnt = 3 then 1 end)                            as cnt_host_list_3,
        count(case when native_host_cnt = 4 then 1 end)                            as cnt_host_list_4,
        count(case when native_host_cnt = 5 then 1 end)                            as cnt_host_list_5,
        count(case when native_host_cnt >= 6  and native_host_cnt < 10 then 1 end) as cnt_host_list_6_10,
        count(case when native_host_cnt >= 10 and native_host_cnt < 20 then 1 end) as cnt_host_list_10_20,
        count(case when native_host_cnt >= 20 then 1 end)                          as cnt_host_list_20plus,
        count(case when native_host_cnt > 0 then 1 end)                            as cnt_host_list_all,
        sum(case when native_host_cnt > 0 then native_host_cnt end)                as sum_host_cnt
    from cname_user
    group by cname
),

table_send as (
    select
        sum(cnt_total_list_2_5)    as cnt_total_list_2_5,
        sum(cnt_total_list_6_10)   as cnt_total_list_6_10,
        sum(cnt_total_list_10_20)  as cnt_total_list_10_20,
        sum(cnt_total_list_20plus) as cnt_total_list_20plus,
        sum(cnt_total_list_all)    as cnt_total_list_all,
        sum(sum_total_cnt)         as sum_total_cnt,

        sum(cnt_host_list_1)       as cnt_host_list_1,
        sum(cnt_host_list_2)       as cnt_host_list_2,
        sum(cnt_host_list_3)       as cnt_host_list_3,
        sum(cnt_host_list_4)       as cnt_host_list_4,
        sum(cnt_host_list_5)       as cnt_host_list_5,
        sum(cnt_host_list_6_10)    as cnt_host_list_6_10,
        sum(cnt_host_list_10_20)   as cnt_host_list_10_20,
        sum(cnt_host_list_20plus)  as cnt_host_list_20plus,
        sum(cnt_host_list_all)     as cnt_host_list_all,
        sum(sum_host_cnt)          as sum_host_cnt
    from base_date
)

-- nullif turns a zero denominator into NULL, so the ratio is NULL instead of
-- a division-by-zero error on engines that raise one.
select
    cnt_host_list_1      / nullif(cnt_host_list_all, 0) as ratio_host_list_1,
    cnt_host_list_2      / nullif(cnt_host_list_all, 0) as ratio_host_list_2,
    cnt_host_list_3      / nullif(cnt_host_list_all, 0) as ratio_host_list_3,
    cnt_host_list_4      / nullif(cnt_host_list_all, 0) as ratio_host_list_4,
    cnt_host_list_5      / nullif(cnt_host_list_all, 0) as ratio_host_list_5,
    cnt_host_list_6_10   / nullif(cnt_host_list_all, 0) as ratio_host_list_6_10,
    cnt_host_list_10_20  / nullif(cnt_host_list_all, 0) as ratio_host_list_10_20,
    cnt_host_list_20plus / nullif(cnt_host_list_all, 0) as ratio_host_list_20plus
from table_send


  3. 目前最复杂的脚本之一
    统计某个版本的客户在不同网络条件下的卡顿率情况
-- Video / audio freeze ratio per vid and ver, split by network-quality bucket.
--   vid_info           : every distinct (sid, ver, vid) in the date window
--   vid_info2          : (sid, ver, vid) rows whose sid has no record with
--                        spk_screen_share_type_desc <> 'UNKNOWN'
--   s2lv_info          : per date / hour / minute / sid, the max, avg and
--                        non-null sample count of delay, jitter95 and lostRatio
--                        taken from rows with name = 's2lv'
--   video_freeze_600ms : per minute / sid / spk_uid sums of the 600ms freeze columns
--   audio_freeze_200ms : per minute / sid / spk_uid sums of the 200ms freeze columns
--   base_table         : buckets every row by loss and by delay + jitter95,
--                        then aggregates freeze time and freeze ratio per bucket
-- Identifiers that start with a digit are backtick-quoted so they are always
-- parsed as names. Date parameters are quoted everywhere so the substituted
-- value is compared as a literal.
with vid_info as (
    select distinct sid, ver, vid
    from xxxx
    where date between "${startdate}" and "${enddate}"
),

vid_info2 as (
    select distinct A1.sid, ver, vid
    from (
        select distinct sid, ver, vid
        from xxxx
        where date between "${startdate}" and "${enddate}"
    ) as A1
    -- anti join: keep only sids that never appear on the right-hand side
    left anti join (
        select distinct date, sid
        from dwd.xxxx
        where date between "${startdate}" and "${enddate}"
          and spk_screen_share_type_desc <> 'UNKNOWN'
    ) as B
        on A1.sid = B.sid
),

s2lv_info as (
    select
        date,
        from_unixtime(floor(ts), 'HH') as hour,
        from_unixtime(floor(ts), 'mm') as minute,
        sid,
        max(delay)            as delay_max,
        max(jitter95)         as jitter95_max,
        max(`400lostRatio`)   as lost400_max,
        avg(delay)            as delay_avg,
        avg(jitter95)         as jitter95_avg,
        avg(`400lostRatio`)   as lost400_avg,
        count(delay)          as delay_cnt,
        count(jitter95)       as jitter95_cnt,
        count(`400lostRatio`) as `400lostRatio_cnt`
    from (
        select /*+ broadcast(table_vid) */
            date,
            ts,
            sid,
            voqaStat.delay     as delay,
            voqaStat.jitter95  as jitter95,
            voqaStat.lostRatio as `400lostRatio`
        from xxxx table_voqa
        -- NOTE(review): this anti join removes every sid present in vid_info,
        -- while base_table later inner-joins these rows to vid_info2. If both
        -- CTEs read the same source table the final result is empty; confirm
        -- the real tables / filters behind the anonymised names differ.
        left anti join (
            select sid
            from vid_info
        ) as table_vid
            on table_voqa.sid = table_vid.sid
        where table_voqa.date >= "${startdate}"
          and table_voqa.date <= "${enddate}"
          and table_voqa.name = 's2lv'
    ) as voqa_rows
    -- positional: 1 = date, 2 = hour expression, 3 = minute expression, 4 = sid
    group by 1, 2, 3, 4
),

video_freeze_600ms as (
    select
        date,
        hour,
        minute,
        sid,
        spk_uid,
        sum(`600ms_freeze_ms`) as `600ms_freeze_ms`,
        sum(`600ms_total_ms`)  as `600ms_total_ms`
    from dwd.xxxx
    where date >= "${startdate}"
      and date <= "${enddate}"
    group by date, hour, minute, sid, spk_uid
),

audio_freeze_200ms as (
    select
        date,
        hour,
        minute,
        sid,
        spk_uid,
        sum(`200ms_freeze_ms`) as `200ms_audio_freeze`,
        sum(`200ms_total_ms`)  as `200ms_audio_total`
    from dwd.xxx
    where date >= "${startdate}"
      and date <= "${enddate}"
    group by date, hour, minute, sid, spk_uid
),

base_table as (
    select
        vid,
        ver,
        -- bucket by the per-minute maximum loss; NULL or negative -> 'unusual'
        (case
            when lost400_max = 0                        then 'lost400max_0'
            when lost400_max > 0  and lost400_max <= 10 then 'lost400max_0_10'
            when lost400_max > 10 and lost400_max <= 20 then 'lost400max_10_20'
            when lost400_max > 20 and lost400_max <= 30 then 'lost400max_20_30'
            when lost400_max > 30 and lost400_max <= 40 then 'lost400max_30_40'
            when lost400_max > 40 and lost400_max <= 50 then 'lost400max_40_50'
            when lost400_max > 50 and lost400_max <= 60 then 'lost400max_50_60'
            when lost400_max > 60 and lost400_max <= 70 then 'lost400max_60_70'
            when lost400_max > 70 and lost400_max <= 80 then 'lost400max_70_80'
            when lost400_max > 80                       then 'lost400max_80plus'
            else 'unusual'
        end) as lost400max_type,
        -- bucket by the per-minute average loss
        (case
            when lost400_avg = 0                        then 'lost400avg_0'
            when lost400_avg > 0  and lost400_avg <= 10 then 'lost400avg_0_10'
            when lost400_avg > 10 and lost400_avg <= 20 then 'lost400avg_10_20'
            when lost400_avg > 20 and lost400_avg <= 30 then 'lost400avg_20_30'
            when lost400_avg > 30 and lost400_avg <= 40 then 'lost400avg_30_40'
            when lost400_avg > 40 and lost400_avg <= 50 then 'lost400avg_40_50'
            when lost400_avg > 50 and lost400_avg <= 60 then 'lost400avg_50_60'
            when lost400_avg > 60 and lost400_avg <= 70 then 'lost400avg_60_70'
            when lost400_avg > 70 and lost400_avg <= 80 then 'lost400avg_70_80'
            when lost400_avg > 80                       then 'lost400avg_80plus'
            else 'unusual'
        end) as lost400avg_type,
        -- bucket by max delay + max jitter95; the numeric prefix keeps the
        -- labels in bucket order for 1..9 when sorted as text
        (case
            when delay_jitter95_summax > 0    and delay_jitter95_summax <= 100  then '1delay_jitter95_summax_0_100'
            when delay_jitter95_summax > 100  and delay_jitter95_summax <= 200  then '2delay_jitter95_summax_100_200'
            when delay_jitter95_summax > 200  and delay_jitter95_summax <= 400  then '3delay_jitter95_summax_200_400'
            when delay_jitter95_summax > 400  and delay_jitter95_summax <= 800  then '4delay_jitter95_summax_400_800'
            when delay_jitter95_summax > 800  and delay_jitter95_summax <= 1200 then '5delay_jitter95_summax_800_1200'
            when delay_jitter95_summax > 1200 and delay_jitter95_summax <= 1600 then '6delay_jitter95_summax_1200_1600'
            when delay_jitter95_summax > 1600 and delay_jitter95_summax <= 2000 then '7delay_jitter95_summax_1600_2000'
            when delay_jitter95_summax > 2000 and delay_jitter95_summax <= 2500 then '8delay_jitter95_summax_2000_2500'
            when delay_jitter95_summax > 2500 and delay_jitter95_summax <= 3000 then '9delay_jitter95_summax_2500_3000'
            when delay_jitter95_summax > 3000                                   then '10delay_jitter95_summax_3000plus'
            else 'unusual'
        end) as delay_jitter95_summaxtype,
        -- bucket by avg delay + avg jitter95
        (case
            when delay_jitter95_sumavg > 0    and delay_jitter95_sumavg <= 100  then '1delay_jitter95_sumavg_0_100'
            when delay_jitter95_sumavg > 100  and delay_jitter95_sumavg <= 200  then '2delay_jitter95_sumavg_100_200'
            when delay_jitter95_sumavg > 200  and delay_jitter95_sumavg <= 400  then '3delay_jitter95_sumavg_200_400'
            when delay_jitter95_sumavg > 400  and delay_jitter95_sumavg <= 800  then '4delay_jitter95_sumavg_400_800'
            when delay_jitter95_sumavg > 800  and delay_jitter95_sumavg <= 1200 then '5delay_jitter95_sumavg_800_1200'
            when delay_jitter95_sumavg > 1200 and delay_jitter95_sumavg <= 1600 then '6delay_jitter95_sumavg_1200_1600'
            when delay_jitter95_sumavg > 1600 and delay_jitter95_sumavg <= 2000 then '7delay_jitter95_sumavg_1600_2000'
            when delay_jitter95_sumavg > 2000 and delay_jitter95_sumavg <= 2500 then '8delay_jitter95_sumavg_2000_2500'
            when delay_jitter95_sumavg > 2500 and delay_jitter95_sumavg <= 3000 then '9delay_jitter95_sumavg_2500_3000'
            when delay_jitter95_sumavg > 3000                                   then '10delay_jitter95_sumavg_3000plus'
            else 'unusual'
        end) as delay_jitter95_sumavgtype,
        count(distinct sid, date, hour, minute) as sid_cnt,
        sum(`600ms_freeze_ms`) as `600ms_freeze_ms`,
        sum(`600ms_total_ms`)  as `600ms_total_ms`,
        -- nullif: a zero denominator yields NULL rather than an error
        sum(`600ms_freeze_ms`) / nullif(sum(`600ms_total_ms`), 0) as `600ms_video_freeze_ratio`,
        sum(`200ms_audio_freeze`) as `200ms_audio_freeze`,
        sum(`200ms_audio_total`)  as `200ms_audio_total`,
        sum(`200ms_audio_freeze`) / nullif(sum(`200ms_audio_total`), 0) as `200ms_audio_freeze_ratio`
    from (
        select distinct
            F.date,
            F.hour,
            F.minute,
            F.sid,
            coalesce(v.spk_uid, a.spk_uid) as spk_uid,
            F.ver,
            F.vid,
            F.delay_max,
            F.delay_avg,
            F.jitter95_max,
            F.jitter95_avg,
            F.lost400_max,
            F.lost400_avg,
            (F.delay_max + F.jitter95_max) as delay_jitter95_summax,
            -- average delay + average jitter95, mirroring the max variant above
            (F.delay_avg + F.jitter95_avg) as delay_jitter95_sumavg,
            v.`600ms_freeze_ms`,
            v.`600ms_total_ms`,
            a.`200ms_audio_freeze`,
            a.`200ms_audio_total`
        from (
            select
                s.*,
                i.ver,
                i.vid
            from s2lv_info as s
            inner join vid_info2 as i
                on s.sid = i.sid
            -- keep only minutes with at least 25 non-null samples of each metric
            where s.delay_cnt >= 25
              and s.jitter95_cnt >= 25
              and s.`400lostRatio_cnt` >= 25
        ) as F
        -- NOTE(review): both freeze CTEs hold one row per spk_uid but are
        -- joined to F only on sid / date / hour / minute, so a minute with
        -- several speakers pairs every video row with every audio row.
        -- Confirm this is intended, or combine the two on spk_uid first.
        left join video_freeze_600ms as v
            on  F.sid    = v.sid
            and F.date   = v.date
            and F.hour   = v.hour
            and F.minute = v.minute
        left join audio_freeze_200ms as a
            on  F.sid    = a.sid
            and F.date   = a.date
            and F.hour   = a.hour
            and F.minute = a.minute
    ) as per_minute
    -- positional: vid, ver and the four bucket labels defined above
    group by 1, 2, 3, 4, 5, 6
)

select * from base_table


你可能感兴趣的:(每日一贴,agora,sql,数据库,mysql,数据分析,数据挖掘)