hive的一些指标计算

已知一个点击行为,求在这之前该用户的点击路径

已知点(DDD)
--collect_list()//不去重

-- Build one time-ordered event list per user for the two-day window.
-- collect_list keeps duplicates and (fed sorted input) preserves click order,
-- both of which matter when the path is later cut at event 'DDD';
-- the original collect_set deduplicated and gave an unordered array.
create table ev_1 as
select
    tb.user_id,
    collect_list(tb.event_id) as ev_list
from (
    select
        user_id,
        event_id,
        time
    from user_event
    -- partition pruning first, then the precise millisecond-timestamp filter
    where day >= '20200305' and day <= '20200306'
      and from_unixtime(cast(time / 1000 as bigint), 'yyyy-MM-dd') >= '2020-03-05'
      and from_unixtime(cast(time / 1000 as bigint), 'yyyy-MM-dd') <= '2020-03-06'
    -- sort within each user so collect_list sees events in click order;
    -- the original "group by user_id" over non-aggregated columns was invalid
    distribute by user_id
    sort by user_id, time
) tb
group by tb.user_id

按照时间戳排序后列转行,找array中DDD
并把ev_list转为字符串分割
-- Cut each user's concatenated event string at the first occurrence of 'DDD',
-- yielding the click path up to and including that event.
select
    tb2.user_id,
    -- Hive has no index() builtin; instr() is the correct function.
    -- instr() is 1-based, so the path ends at instr(...) + length('DDD') - 1
    -- (the original "+ length" overshot by one character).
    substr(tb2.str_list, 1, instr(tb2.str_list, 'DDD') + length('DDD') - 1) as ev_path,
    tb2.lst_size
from (
    select
        user_id,
        concat_ws(',', ev_list) as str_list,
        size(ev_list) as lst_size
    from ev_1
    -- only users whose event list actually contains the target event
    where array_contains(ev_list, 'DDD')
) tb2

用户连续签到天数(取每个用户的天数最大值)

列名:id,连续签到天数,开始时间,结束时间

-- Longest consecutive sign-in streak per user (id, streak days, start, end).
-- Classic "date minus row_number" trick: every day of one unbroken run maps
-- to the same anchor date (date2), so grouping by it isolates each streak.
select
    userid,
    diff,
    starttime,
    endtime
from (
    select
        userid,
        diff,
        starttime,
        endtime,
        -- keep only each user's longest streak, per the requirement
        row_number() over (partition by userid order by diff desc) as rn
    from (
        select
            userid,
            date2,
            -- n consecutive days span datediff = n - 1, hence the + 1
            datediff(max(sign_date), min(sign_date)) + 1 as diff,
            -- min/max are reliable; the original collect_set array was
            -- unordered, so qujian[0] was not guaranteed to be the start
            min(sign_date) as starttime,
            max(sign_date) as endtime
        from (
            select
                userid,
                sign_date,
                date_sub(sign_date, seq) as date2
            from (
                select
                    userid,
                    create_time as sign_date,
                    -- "seq" avoids shadowing the reserved word rank
                    row_number() over (partition by userid order by create_time) as seq
                from ads.day_xchz
                where date = '201911'
            ) a
        ) b
        group by userid, date2
    ) c
) d
where rn = 1

用户埋点中,数据解析

埋点事件中属性值清洗,事件值中解析出视频播放量,最后和视频表结合
-- Daily video-play event log, one partition per day; the raw attribute
-- string (attr_v) is retained alongside the parsed fields for debugging.
CREATE TABLE day_play_log (
    usernickname STRING,
    videoid      STRING,
    videoplay    STRING,
    attr_v       STRING,
    cuid         STRING,
    ct           STRING
)
PARTITIONED BY (time STRING);

SHOW PARTITIONS day_play_log;
--取当天视频播放量最大的id(历史数据取9月23号数据)
-- Parse the comma-separated event attribute into positional fields and load
-- one day's video-detail events into day_play_log.
-- Field layout of the split attribute: [0] nickname, [1] video id, [3] plays.
INSERT INTO TABLE day_play_log
PARTITION (time = '2019-09-23')
SELECT
    parsed.ssmcs[0] usernickname,
    parsed.ssmcs[1] videoid,
    parsed.ssmcs[3] videoplay,
    parsed.attr_v,
    parsed.cuid,
    parsed.ct
FROM (
    SELECT
        split(attr_v, ',') AS ssmcs,
        attr_v,
        cuid,
        ct
    FROM dwd_user_evt_attr
    -- partition pruning plus an exact event-timestamp (ms) day check
    WHERE date = '20190923'
      AND FROM_UNIXTIME(cast(ct / 1000 AS BIGINT), 'yyyy-MM-dd') = '2019-09-23'
      AND eid = '视频详情'
      AND attr = '视频信息'
) parsed;

用户昵称,视频id,播放量,视频详情长段(每天视频的最大播放量)
-- Per-day table holding, for each video, the row with that day's highest
-- play count (populated by the dedup INSERT below).
CREATE TABLE day_video_play (
    usernickname STRING,
    videoid      STRING,
    videoplay    STRING,
    attr_v       STRING
)
PARTITIONED BY (time STRING);

SHOW PARTITIONS day_video_play;

-- Keep, per video, the row carrying the day's highest play count.
INSERT INTO TABLE day_video_play
PARTITION (time='2019-09-23')
select tmp.usernickname, tmp.videoid, tmp.videoplay, tmp.attr_v
from (
    select
        tb1.usernickname,
        tb1.videoid,
        tb1.videoplay,
        tb1.attr_v,
        -- videoplay is stored as STRING: order numerically, otherwise
        -- '9' sorts above '10' lexicographically and the wrong row wins
        row_number() over (partition by tb1.videoid order by cast(tb1.videoplay as bigint) desc) as rn
    from day_play_log tb1
    where time = '2019-09-23'
) tmp
where tmp.rn = 1;

用户注册后,6天内有登陆判定

6天(d)=518400000 毫秒(ms)
1天(d)=86400000毫秒(ms)

-- Flag users who logged in during the +1-day .. +6-day window after
-- registration (86400000 ms = 1 day, 518400000 ms = 6 days).
-- NOTE(review): the CASE mixes BIGINT 0 with a formatted timestamp STRING;
-- Hive will coerce the column to STRING — confirm downstream readers expect
-- the literal '0' for users that did not join.
-- NOTE(review): a.r_user_id, a.relative_user_id and a.ruser_id look like three
-- spellings of the same column — verify against the ods.ods_tb schema.
-- NOTE(review): first_request requires requesttime >= CREATE_TIME + 1 day, so
-- a login on the registration day itself does NOT count — confirm intended.
create table tmp_nginx_log_2 as
select a.r_user_id,
-- registration time formatted to seconds; 0 when the user row did not join
case when tb.CREATE_TIME is null then 0 else from_unixtime(cast(tb.CREATE_TIME/1000 as bigint)) end as CREATE_TIME,
-- 1 = first request fell inside the [+1 day, +6 days] window
case when (b.requesttime>=tb.CREATE_TIME+86400000 and b.requesttime<=tb.CREATE_TIME+518400000) then 1 else 0 end as first_request
from ods.ods_tb a
left join ods.ods_user tb on a.relative_user_id=tb.OP_ID
left join ads.tmp_nginx_log b on a.ruser_id=b.userid
-- registrations between 2019-07-21 and 2019-08-14 (inclusive)
where from_unixtime(cast(a.create_time/1000 as bigint),'yyyy-MM-dd')>='2019-07-21'
and from_unixtime(cast(a.create_time/1000 as bigint),'yyyy-MM-dd')<='2019-08-14'

清洗用户业务埋点,求第一次业务点击的点击行为

day_C=`date -d yesterday +%F`

-- Count, per event, how many users' very FIRST click of the day (by request
-- time) was that event, and stamp the run date.
INSERT INTO TABLE day_table
select event, count(1) number, '$day_C'
from (
    select event, userid
    from (
        -- rank each user's clicks chronologically; rn = 1 is the first click
        select
            tb3.event,
            tb3.userid,
            row_number() over (partition by tb3.userid order by tb3.requesttime) as rn
        from (
            -- keep only events present in the event-name dimension table;
            -- explicit INNER JOIN replaces the original implicit comma join
            select tb1.*
            from day_event tb1
            inner join day_eventname tb2
                on tb1.event = tb2.ev_name
            where tb1.dt = '$month_day'
              and tb2.dt = '$month_day'
        ) tb3
    ) tb4
    where rn = 1
) tb5
group by event

常用的统计方法

PV:(页面访问量)

-- PV (page views): one count per request row carrying a non-null userid.
SELECT count(userid)
FROM requesttime
WHERE time = ''

UV:独立访客,PV的去重版

-- UV (unique visitors): PV with deduplication on userid.
SELECT count(DISTINCT userid)
FROM requesttime
WHERE time = ''

特殊埋点的点击次数,点击人数
-- Click count and unique-clicker count for a hand-picked set of event ids
-- (fill in the id literals per campaign).
SELECT
    count(userid),
    count(DISTINCT userid)
FROM event
WHERE event_id IN ('', '', '', '', '')

广告事件点击次数人数,按照广告类型和渠道分类
-- Ad click count and clicker count, broken down by ad type and channel.
SELECT
    类型,
    渠道,
    count(userid),
    count(DISTINCT userid)
FROM 广告表
GROUP BY 类型, 渠道

Hive中总体标准差的计算有两个函数可以使用,分别是stddev函数和stddev_pop函数:
-- Population standard deviation: stddev and stddev_pop are synonyms in Hive.
select
  stddev_pop(feature1) as std_feature1,
  -- trailing comma removed: it made the statement a syntax error
  stddev(feature3) as std_feature3
from
  iris;

样本标准差使用stddev_samp方法:
-- Sample standard deviation (divides by n - 1).
select
  -- trailing comma removed: it made the statement a syntax error
  stddev_samp(feature1) as std_feature1
from
  iris;

中位数整数
select percentile(feature1,0.5) as median_feature1 from iris;

中位数浮点数
select percentile_approx(feature1,0.5) as median_feature1 from iris;

用户最近一次请求时间

# Date variables interpolated into the hive -e strings below:
#   month_day — yesterday, compact YYYYMMDD (matches the date= partition key)
#   day_C     — yesterday, YYYY-MM-DD (matches the time= partition key)
#   day_D     — the day before yesterday, YYYY-MM-DD
month_day=`date -d "-1 day" +%Y%m%d`
day_C=`date -d "-1day" +%F`
day_D=`date -d "-2day" +%F`

# Load yesterday's distinct active user ids into day_nginx_user.
# The length filter keeps 36- or 32-character ids only — presumably UUIDs
# with/without dashes, dropping malformed values; confirm against the log
# producer. (SQL is inside the shell string and is left byte-identical.)
hive -e"
INSERT INTO TABLE day_nginx_user
PARTITION (time='$day_C') 
select distinct userid
from user_nginx_log
where date='$month_day' and (length(userid)=36 or length(userid)=32);
"

# Roll forward each user's latest request time: union yesterday's running
# maxima with today's active users (the day_nginx_user partition value
# doubles as the request date), then keep max(log_time) per user.
# NOTE(review): "day_useir" looks like a typo for "day_user", but both the
# read and the write use the same spelling, so it is self-consistent.
hive -e"
INSERT INTO TABLE day_useir_requesttime_max 
PARTITION (date='$day_C')
select tmp.userid,max(tmp.log_time) as log_time from
(
select userid,log_time
from day_useir_requesttime_max
where date='$day_D'
union all
select userid,time as log_time
from day_nginx_user
where time='$day_C'
) tmp
group by tmp.userid;"

 

你可能感兴趣的:(hive)