hive 常用函数

1.分位数

  percentile_approx(DOUBLE col, p [, B])     Returns an approximate pth percentile of a numeric column (including floating point types) in the group

  含义: 在col列中返回p%的分位数

  select 
  percentile_approx( arr_id , 0.5 )
  from 
  (
    select
      arr_id
    from
      (
        select
          array(1, 2, 3, 4, 5, 6, 7, 8, 9, 1000) as arr
      ) a lateral view explode(arr) tt as arr_id
  ) a 

hive 常用函数_第1张图片

2.url解析

解码url: URL转码,encodeURI,encodeURIComponent — 在线工具
解析json: 在线JSON校验格式化工具(Be JSON)

上报:
:path: /log/web?0000171612422098683https%3A%2F%2Fwww.bilibili.com%2Fbangumi%2Flist%2Fsl17662%3Ffrom_spmid%3Dpgc.cinema-tab.0.0%26intentFrom%3D15%26native.theme%3D1%26navhide%3D1%26share_medium%3Dandroid%26share_plat%3Dandroid%26share_source%3DCOPY%26share_tag%3Ds_i%26timestamp%3D1612343552%26unique_k%3DHKz3gy|666.49.selfDef.click_unfollow||1612422098000|0|0|980x1743|2|{%22event%22:%22click_unfollow%22,%22value%22:{%22sl_id%22:%2217662%22,%22season_id%22:35582,%22season_type%22:3,%22mid%22:36865977},%22bsource%22:%22share_source_copy_link%22}|{}|https%3A%2F%2Fm.bilibili.com%2F|FF7C614F-2539-A3E3-1AE8-E4265F23111B03042infoc|zh-CN|null


msg:
{%22event%22:%22click_season%22,%22value%22:{%22sl_id%22:%2218665%22,%22season_id%22:32429,%22season_type%22:2,%22mid%22:179753067},%22bsource%22:%22%22}

解码url:  https://www.sojson.com/encodeurl.html
解析json: https://www.bejson.com/
解析msg 字段:
select
*
,get_json_object(msg, '$.event') as event1
,get_json_object(msg, '$.value') as value1
,get_json_object(get_json_object(msg, '$.value'), '$.sl_id')  as sub_value_sl
,get_json_object(get_json_object(msg, '$.value'), '$.season_id')  as sub_value_season
,get_json_object(get_json_object(msg, '$.value'), '$.season_type')  as sub_value_season_type
,get_json_object(get_json_object(msg, '$.value'), '$.mid')  as sub_value_mid
from
(
select '{"event":"click_season","value":{"sl_id":"18665","season_id":32429,"season_type":2,"mid":179753067},"bsource":""}'  as msg  union all
select '{"event":"unload","value":{"enter":1612292714266,"leave":1612292745642},"bsource":""}'  as msg  union all
select '{"event":"show_hover","value":{"sl_id":"20233","season_id":33987,"season_type":2,"mid":172278354},"bsource":""}'  as msg  union all
select '{"event":"click_season","value":{"sl_id":"17834","season_id":34565,"season_type":2,"mid":271658710},"bsource":"search_baidu"}'  as msg  union all
select '{"event":"show_hover","value":{"sl_id":"17834","season_id":28274,"season_type":2,"mid":281264988},"bsource":""}'  as msg  
)  a
  
limit 2000


select
*,
reflect("java.net.URLDecoder", "decode", msg, "UTF-8")  as url
,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.event') as event_type
,get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value') as event_value
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.sl_id')  as sub_value_sl
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_id')  as sub_value_season
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.season_type')  as sub_value_season_type
,get_json_object(get_json_object(reflect("java.net.URLDecoder", "decode", msg, "UTF-8"), '$.value'), '$.mid')  as sub_value_mid
from
  b_dwd.dwd_flow_web_click_l_d
WHERE
  log_date between '<%= log_date  %>'  and '<%= log_date %>' 
  and   substr(spm_id,1,6)= '666.49'
  
  limit 2000

3.正则 

 滑动验证页面

select
*,
a.from_spmid rlike concat('^','dynamic') ,
a.from_spmid rlike concat('^','dt+') ,
a.from_spmid rlike '^dt'
from
(
select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
select 'out_open_deeplink_gdtlh-and-yf-31yaz-17776718' as from_spmid union ALL
select 'out_open_deeplink_gdtlh-and-yf-31yaz-18253662' as from_spmid union ALL
select 'ott-platform.ott-dynamic.0.0' as from_spmid union ALL
select 'dynamic.video-tab.0.0' as from_spmid union ALL
select 'default-value' as from_spmid
) a

3.hsql 间隔日期补数

方法1:窗口函数

with  tmp_table as (
  select dt ,city ,  val 
  from 
  (
    select  '20220101' as dt ,'上海' as city ,  100  val union all 
    select  '20220102' as dt ,'上海' as city ,  null val union all 
    select  '20220104' as dt ,'上海' as city ,  null val union all 
    select  '20220106' as dt ,'上海' as city ,  200  val 
  ) a 
) 

,tmp_a as (
select dt ,city ,  val ,
case when val  is null then 0 else 1 end val1  -- 将空值设置为0,非空值设置为1
from tmp_table
),
tmp_b as (
select dt ,city ,  val ,val1,
sum(val1) over(partition by '1' order by dt) as cnt_val1 -- 区分 空和费控分空分组 
from tmp_a
)
select dt ,city ,  val ,val1,cnt_val1,
max(val) over(partition by cnt_val1 order by dt) as fill_val  -- 使用开窗函数按照分组求最大值
from tmp_b


hive 常用函数_第2张图片
方法2:udf 

4. 用户大会员在期的天数
with tmp_dt as (
  select id,sdt,edt
  from 
  (
  select 1 id, '2022-02-02' as sdt, '2022-02-07' as edt union all 
  select 1 id, '2022-02-07' as sdt, '2022-02-18' as edt union all
  select 1 id, '2022-02-10' as sdt, '2022-02-23' as edt 
  ) a 
)
select 
id,base_line,max(dt) dt 
,datediff(max(dt),base_line)+1 as days
from 
(

select id ,dt,max(base_f) over (PARTITION BY id ORDER BY dt) base_line
from 
(
  select id ,dt,flag,sum(flag) over(partition by id order by concat(dt,flag) desc) as sm 
  ,case when sum(flag) over(partition by id order by concat(dt,flag) desc)=0 then dt else 0 end as base_f
  from 
  (
  select id,sdt as dt,1 as flag
  from tmp_dt
  union all 
  select id,edt,-1 as flag
  from tmp_dt
  ) a 
) b
) c
group by id ,base_line

hive 常用函数_第3张图片

5.取每个mid同时播放的设备数分布。

已经整理好buvid、mid、开始播放时间、结束播放时间。主要背景是想看下现在“共享账号”的用户量级有多少

https://berserker.bilibili.co/?URL=http://berserker.bilibili.co/#/adhoc?sqlId=100670985

with 
  ogv_play as (
    select mid,buvid,season_id,epid,log_date,
      arch_play_timestamp as stime,
      arch_play_timestamp+total_duration as etime
    from bili_ogv.dwd_flow_ogv_play_app_timed_i_d
    where log_date between '20230226' and '20230226'  and season_id >0 and total_duration>0  

    union 
    select mid,buvid,season_id,epid,log_date,
      arch_play_timestamp as stime,
      arch_play_timestamp+played_duration as etime
    from bili_ogv.dwd_flow_ogv_play_web_timed_i_d
    where log_date between '20230226' and '20230226' and season_id >0 and played_duration>0  
  ),
  vip_user as (
        select mid,log_date
        from bili_ogv.dim_user_full_d 
        where log_date between '20230226' and '20230226'  and is_vip_user = 1  
  ),

  vip_play_pay_ep as (
    select a.mid,a.buvid,a.epid,a.stime,a.etime ,a.log_date
    from ogv_play a 
    join vip_user b 
    on a.mid = b.mid  and a.log_date = b.log_date
    join (
      select epid,log_date
      from bili_ogv.dim_ep_av_full_d 
      where log_date between '20230226' and '20230226' and ep_status not in (2,14)
    ) c 
    on a.epid = c.epid and a.log_date = c.log_date
  )


-- mid同时播放的设备数分布
select 
 i.buvid_cnt
 ,count(*)  as mid_cnt   
from
(
select 
-- mid 的设备数去重 buvid
   j.mid
  ,count(distinct j.buvid ) as buvid_cnt  
from
(

select  
-- 删除重复进入播放记录, 保留同时进入播放的记录
    h.log_date
   ,h.mid
   ,h.buvid
   ,h.epid
   ,h.in_out_time
   ,h.label 
   ,h.num -- 每个mid 同时在线计数
   ,h.in_out_time_rank  -- 每个mid 播放开始时间排序 
   ,h.num1  -- 用户进入次数计数
   ,h.diff 
   ,h.max_num 
   ,h.max_inc_rank -- 升序的最大值
   ,h.max_inc_rank_for_row -- 升序的最大值存放在每一行
   ,h.acc_by_mid_num1  --  每个mid重复进入播放计数
from 
(

select
    g.log_date
   ,g.mid
   ,g.buvid
   ,g.epid
   ,g.in_out_time
   ,g.label 
   ,g.num -- 每个mid 同时在线计数
   ,g.in_out_time_rank  -- 每个mid 播放开始时间排序 
   ,g.num1  -- 用户进入次数计数
   ,g.diff 
   ,g.max_num 
   ,g.max_inc_rank -- 升序的最大值
   ,g.max_inc_rank_for_row -- 升序的最大值存放在每一行
   ,sum(1)  over(partition by mid,num1 )  as acc_by_mid_num1  --  每个mid重复进入播放计数
from
(

select
-- 得到同时在线num 的升序部分 
    f.log_date
   ,f.mid
   ,f.buvid
   ,f.epid
   ,f.in_out_time
   ,f.label 
   ,f.num -- 每个mid 同时在线计数
   ,f.in_out_time_rank  -- 每个mid 播放开始时间排序 
   ,f.num1  -- 用户进入次数计数
   ,f.diff 
   ,f.max_num 
   ,f.max_inc_rank -- 升序的最大值
   ,f.max_inc_rank_for_row -- 升序的最大值存放在每一行
from
(

select 
    e.log_date
   ,e.mid
   ,e.buvid
   ,e.epid
   ,e.in_out_time
   ,e.label 
   ,e.num -- 每个mid 同时在线计数
   ,e.in_out_time_rank
   ,e.num1  -- 用户进入次数计数
   ,e.diff 
   ,e.max_num 
   ,e.max_inc_rank -- 升序的最大值
   ,max(e.max_inc_rank) over(partition by e.mid)  max_inc_rank_for_row -- 升序的最大值存放在每一行
from 
(

select 
    d.log_date
   ,d.mid
   ,d.buvid
   ,d.epid
   ,d.in_out_time
   ,d.label 
   ,d.num  -- 每个mid 同时在线计数
   ,d.in_out_time_rank
   ,d.num1  -- 用户进入次数计数
   ,d.diff 
   ,d.max_num 
   ,if(d.max_num=d.num,d.in_out_time_rank,0)  as max_inc_rank -- 升序的最大值
from
(

select 
    c.log_date
   ,c.mid
   ,c.buvid
   ,c.epid
   ,c.in_out_time
   ,c.label 
   ,c.num   -- 每个mid 同时在线计数
   ,c.in_out_time_rank
   ,c.num1 -- 用户进入次数计数
   ,c.diff 
   ,first_value(c.num) over(partition by c.mid order by c.num desc ) as max_num 
from 
(

 select
       b.log_date
      ,b.mid
      ,b.buvid
      ,b.epid
      ,b.in_out_time
      ,b.label 
      ,b.num  -- 每个mid 同时在线计数
      ,b.in_out_time_rank
      ,sum(b.label1)  over(partition by b.mid  order by b.in_out_time )  as num1  -- 用户进入次数计数
      ,b.in_out_time_rank-b.num as  diff 
 from
 (
   select 
       a.log_date
      ,a.mid
      ,a.buvid
      ,a.epid
      ,a.in_out_time
      ,a.label
       
      ,sum(a.label)  over(partition by a.mid  order by a.in_out_time )  as num -- 每个mid 同时在线计数
      ,row_number() over(partition by  a.mid  order by a.in_out_time ) as in_out_time_rank
      ,if(a.label=-1 ,0,a.label) as label1  -- 退出分组
    from 
    (
       -- 开始时间
       select 
         log_date
         ,mid
         ,buvid
         ,epid
         ,stime  as   in_out_time
         ,1      as   label
       from vip_play_pay_ep
       -- where mid = 57706 -- 1645967203
       union all 
       -- 结束时间
       select 
         log_date
         ,mid
         ,buvid
         ,epid
         ,etime   as   in_out_time
         ,-1      as   label
       from vip_play_pay_ep
      -- where mid = 57706 -- 1645967203
    ) a
   ) b 
  
  ) c 
  
  ) d 
  
  ) e 
  
  ) f 
  where  f.in_out_time_rank between 1 and  f.max_inc_rank_for_row
) g 
  
) h 
where h.acc_by_mid_num1 =1   --  每个mid重复进入播放计数

) j 
group by j.mid

) i 
group by i.buvid_cnt

思路:

1.每个mid 同时在线计数

2.同时播放数提取升序部分

3.删除重复进入播放部分

hive 常用函数_第4张图片

5. 提取一个字符串中重复的item_id 后缀数字

select 
regexp_extract(sub_str,'item_id\\":\\"([0-9]+)(\\")',1)
FROM 
(
  select '{"item_infos":[{"item_id":"701969"},{"item_id":"701965"}]},{"item_infos":[{"item_id":"701964"},{"item_id":"701963"}]}' as  string_test
) a 
lateral view explode(split(string_test,',')) t as sub_str 

hive 常用函数_第5张图片

你可能感兴趣的:(大数据,hive,hadoop,数据仓库)