已知一个点击行为,求在这之前该用户的点击路径
已知点(DDD)
-- Build each user's time-ordered click path for the window.
-- collect_list() keeps duplicates; collect_set() dedups AND gives no
-- ordering guarantee, so it cannot represent a click path. Rows are
-- sorted per user by event time before aggregation so collect_list
-- preserves click order.
create table ev_1 as
select tb.user_id,
       collect_list(tb.event_id) as ev_list
from (
    select user_id, event_id, time
    from user_event
    -- partition pruning first (day), then the exact epoch-ms filter
    where day >= '20200305' and day <= '20200306'
      and from_unixtime(cast(time / 1000 as bigint), 'yyyy-MM-dd') >= '2020-03-05'
      and from_unixtime(cast(time / 1000 as bigint), 'yyyy-MM-dd') <= '2020-03-06'
    distribute by user_id
    sort by user_id, time
) tb
group by tb.user_id;
按照时间戳排序后列转行,找array中DDD
并把ev_list转为字符串分割
-- Click path up to and including the target event 'DDD'.
-- Hive has no index() string-search UDF: instr(str, substr) returns the
-- 1-based position of the first match (rows are pre-filtered by
-- array_contains, so a match always exists here).
select tb2.user_id,
       -- end position = start of 'DDD' + its length - 1, so the substring
       -- stops exactly at the end of 'DDD' (no trailing comma picked up)
       substr(tb2.str_list, 1, instr(tb2.str_list, 'DDD') + length('DDD') - 1) as ev_path,
       tb2.lst_size
from (
    select user_id,
           concat_ws(',', ev_list) as str_list,
           size(ev_list) as lst_size
    from ev_1
    where array_contains(ev_list, 'DDD')
) tb2;
用户连续签到天数(取每个用户的天数最大值)
列名:id,连续签到天数,开始时间,结束时间
-- Longest consecutive sign-in streak per user.
-- Trick: date_sub(sign_in_date, row_number) is constant within one
-- consecutive run, so grouping by it isolates each streak.
-- NOTE(review): assumes at most one row per user per calendar day;
-- duplicate same-day sign-ins would break the row_number offset — confirm
-- upstream dedup.
select userid, diff, starttime, endtime
from (
    select userid, diff, starttime, endtime,
           -- keep only each user's longest streak, per the requirement
           row_number() over (partition by userid order by diff desc) as rn
    from (
        select userid,
               -- a streak spanning N calendar days has datediff = N-1,
               -- so +1 yields the number of sign-in days
               datediff(max(Time), min(Time)) + 1 as diff,
               -- min/max are order-safe, unlike indexing into collect_set
               -- (which guarantees neither order nor element position)
               min(Time) as starttime,
               max(Time) as endtime
        from (
            select userid, Time, date_sub(Time, rk) as date2
            from (
                select userid,
                       create_time as Time,
                       -- alias rk, not "rank" (reserved word in newer Hive)
                       row_number() over (partition by userid order by create_time) as rk
                from ads.day_xchz
                where date = '201911'
            ) t1
        ) t2
        group by userid, date2
    ) t3
) t4
where rn = 1;
用户埋点中,数据解析
埋点事件中属性值清洗,事件值中解析出视频播放量,最后和视频表结合
-- Raw per-play log parsed out of the tracking-event attribute string,
-- partitioned by day.
create table day_play_log (
    usernickname string,
    videoid      string,
    videoplay    string,
    attr_v       string,
    cuid         string,
    ct           string
) partitioned by (time string);

show partitions day_play_log;

-- Load one day of parsed play events (historical backfill: 2019-09-23).
-- attr_v is a comma-separated attribute string; fields 0/1/3 are taken as
-- nickname, video id and play count respectively.
-- NOTE(review): index 2 is skipped — presumably an unused field of attr_v;
-- verify against the tracking spec.
INSERT INTO TABLE day_play_log
PARTITION (time='2019-09-23')
select
    parsed.fields[0] as usernickname,
    parsed.fields[1] as videoid,
    parsed.fields[3] as videoplay,
    parsed.attr_v,
    parsed.cuid,
    parsed.ct
from (
    select
        split(attr_v, ',') as fields,
        attr_v,
        cuid,
        ct
    from dwd_user_evt_attr
    where date = '20190923'
      and FROM_UNIXTIME(cast(ct / 1000 as BIGINT), 'yyyy-MM-dd') = '2019-09-23'
      and eid = '视频详情'
      and attr = '视频信息'
) parsed;
用户昵称,视频id,播放量,视频详情长段(每天视频的最大播放量)
-- One row per video per day: the record carrying that video's maximum
-- play count on 2019-09-23.
create table day_video_play (usernickname string,videoid string,videoplay string,attr_v string) partitioned by (time string);
show partitions day_video_play;
INSERT INTO TABLE day_video_play
PARTITION (time='2019-09-23')
select tmp.usernickname, tmp.videoid, tmp.videoplay, tmp.attr_v
from (
    select tb1.usernickname, tb1.videoid, tb1.videoplay, tb1.attr_v,
           -- videoplay is stored as STRING; compare numerically, otherwise
           -- the ordering is lexicographic ('9' > '10')
           row_number() over (partition by tb1.videoid
                              order by cast(tb1.videoplay as bigint) desc) as rn
    from day_play_log tb1
    where time = '2019-09-23'
) tmp
where tmp.rn = 1;
用户注册后,6天内有登录判定
6天(d)=518400000 毫秒(ms)
1天(d)=86400000毫秒(ms)
-- Flag users who logged in within 6 days after registration.
-- NOTE(review): three different spellings of what looks like the same join
-- key appear below (a.r_user_id, a.relative_user_id, a.ruser_id) — confirm
-- which column actually exists in ods.ods_tb before relying on this table.
create table tmp_nginx_log_2 as
select a.r_user_id,
-- CREATE_TIME is emitted as 0 (a number) when missing but as a formatted
-- timestamp string otherwise — NOTE(review): mixed types in one column;
-- confirm the downstream consumer tolerates this.
case when tb.CREATE_TIME is null then 0 else from_unixtime(cast(tb.CREATE_TIME/1000 as bigint)) end as CREATE_TIME,
-- first_request = 1 when the request falls between 1 and 6 days
-- (86400000 ms .. 518400000 ms) after registration.
-- NOTE(review): the lower bound excludes day 0 — "within 6 days of
-- registration" would usually start at CREATE_TIME itself; confirm intent.
case when (b.requesttime>=tb.CREATE_TIME+86400000 and b.requesttime<=tb.CREATE_TIME+518400000) then 1 else 0 end as first_request
from ods.ods_tb a
left join ods.ods_user tb on a.relative_user_id=tb.OP_ID
left join ads.tmp_nginx_log b on a.ruser_id=b.userid
-- registration window: 2019-07-21 .. 2019-08-14 (create_time is epoch ms)
where from_unixtime(cast(a.create_time/1000 as bigint),'yyyy-MM-dd')>='2019-07-21'
and from_unixtime(cast(a.create_time/1000 as bigint),'yyyy-MM-dd')<='2019-08-14'
清洗用户业务埋点,求第一次业务点击的点击行为
# yesterday, dash format, e.g. 2019-09-23
day_C=`date -d yesterday +%F`
# NOTE(review): $month_day is referenced below but not defined in this
# snippet — presumably set elsewhere in the job script; confirm.
INSERT INTO TABLE day_table
select event, count(1) as number, '$day_C'
from (
    -- each user's first tracked business click of the day (rn = 1)
    select event, userid,
           row_number() over (partition by userid order by requesttime) as rn
    from (
        -- keep only events whose name is registered in day_eventname;
        -- explicit INNER JOIN replaces the error-prone comma join
        select tb1.*
        from day_event tb1
        inner join day_eventname tb2
            on tb1.event = tb2.ev_name
        where tb1.dt = '$month_day'
          and tb2.dt = '$month_day'
    ) tb3
) tb4
where rn = 1
group by event;
常用的统计方法
PV:(页面访问量)
-- PV (page views): every request row counts once; fill in the target time.
select count(userid)
from requesttime
where time=''
UV:独立访客,PV的去重版
-- UV (unique visitors): PV deduplicated by user id.
select count(distinct userid)
from requesttime
where time=''
特殊埋点的点击次数,点击人数
-- click count vs. distinct clickers for a hand-picked set of tracking
-- events; fill the in-list with the event ids of interest.
select count(userid),count(distinct userid)
from event
where event_id in ('','','','','')
广告事件点击次数人数,按照广告类型和渠道分类
-- ad clicks / distinct clickers broken down by ad type and channel.
select 类型,渠道,count(userid),count(distinct userid)
from 广告表
group by 类型,渠道
Hive中总体标准差的计算有两个函数可以使用,分别是stddev函数和stddev_pop函数:
-- Population standard deviation; stddev and stddev_pop are equivalent.
-- Trailing comma before FROM removed: it is a syntax error in Hive.
select
    stddev_pop(feature1) as std_feature1,
    stddev(feature3) as std_feature3
from
    iris;
样本标准差使用stddev_samp方法:
-- Sample standard deviation.
-- Trailing comma before FROM removed: it is a syntax error in Hive.
select
    stddev_samp(feature1) as std_feature1
from
    iris;
中位数整数
-- percentile() only accepts integral columns and returns the exact median.
select percentile(feature1,0.5) as median_feature1 from iris;
中位数浮点数
-- percentile_approx() works on DOUBLE columns; the result is approximate.
select percentile_approx(feature1,0.5) as median_feature1 from iris;
用户最近一次请求时间
# yesterday in partition format, e.g. 20190923
month_day=`date -d "-1 day" +%Y%m%d`
# yesterday / the day before in dash format, e.g. 2019-09-23 / 2019-09-22
day_C=`date -d "-1day" +%F`
day_D=`date -d "-2day" +%F`
# Step 1: distinct users seen in yesterday's nginx log.
# NOTE(review): the length-32/36 filter presumably keeps plain vs
# hyphenated UUIDs and drops malformed ids — confirm the id format.
hive -e"
INSERT INTO TABLE day_nginx_user
PARTITION (time='$day_C')
select distinct userid
from user_nginx_log
where date='$month_day' and (length(userid)=36 or length(userid)=32);
"
# Step 2: rolling "latest request time" per user — union yesterday's
# snapshot of this same table with the users seen yesterday (their
# log_time is yesterday's partition value) and keep the max per user.
# NOTE(review): table name "day_useir_requesttime_max" looks like a typo
# of "user", but it is spelled consistently, so it is left untouched here.
hive -e"
INSERT INTO TABLE day_useir_requesttime_max
PARTITION (date='$day_C')
select tmp.userid,max(tmp.log_time) as log_time from
(
select userid,log_time
from day_useir_requesttime_max
where date='$day_D'
union all
select userid,time as log_time
from day_nginx_user
where time='$day_C'
) tmp
group by tmp.userid;"