登录表 t_login: user_id,login_time,login_ip
需求:
1)、 加工出一张用户首末次登录的信息表。
user_id,first_login_time,first_login_ip,last_login_time,last_login_ip2)、 新增一个字段--距离上次登录的时间间隔(sub_dur)
用户id,访问时间,访问国家
s001|100|en
s001|120|cn
s001|45|ls
s001|34|ca
s002|38|en
s002|150|cn
s002|90|ls
s003|90|ls
s003|33|ca
s004|39|zwCREATE TABLE `tb_login`(
`stu_id` string,
`login_time` int,
`cname` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'load data local inpath '/home/xxxx/tmp/tb_login.txt' overwrite into table tb_login;
hive> select * from tb_login;
OK
s001 100 en
s001 120 cn
s001 45 ls
s001 34 ca
s002 38 en
s002 150 cn
s002 90 ls
s003 90 ls
s003 33 ca
s004 39 zw
Time taken: 0.356 seconds, Fetched: 10 row(s)
spark-sql> select * from tb_login;
s001 100 en
s001 120 cn
s001 45 ls
s001 34 math
s002 38 en
s002 150 cn
s002 90 ls
s003 90 ls
s003 33 math
s004 39 zw
select stu_id,first_value(login_time) over(partition by stu_id order by login_time asc) first_login_time,first_value(cname) over(partition by stu_id order by login_time asc) first_cname
from tb_login;
s001 34 ca
s001 34 ca
s001 34 ca
s001 34 ca
s002 38 en
s002 38 en
s002 38 en
s003 33 ca
s003 33 ca
s004 39 zwselect stu_id,last_value(login_time) over(partition by stu_id order by login_time asc) last_login_time,last_value(cname) over(partition by stu_id order by login_time asc) last_cname
from tb_login;
s001 34 ca
s001 45 ls
s001 100 en
s001 120 cn
s002 38 en
s002 90 ls
s002 150 cn
s003 33 ca
s003 90 ls
s004 39 zwselect stu_id,last_value(login_time) over(partition by stu_id order by login_time desc) last_login_time,last_value(cname) over(partition by stu_id order by login_time desc) last_cname
from tb_login;
s001 120 cn
s001 100 en
s001 45 ls
s001 34 ca
s002 150 cn
s002 90 ls
s002 38 en
s003 90 ls
s003 33 ca
s004 39 zwselect stu_id
,first_value(login_time) over(partition by stu_id order by login_time asc) first_login_time,first_value(cname) over(partition by stu_id order by login_time asc) first_cname
,last_value(login_time) over(partition by stu_id order by login_time asc) last_login_time,last_value(cname) over(partition by stu_id order by login_time asc) last_cname
from tb_login;s001 34 math 34 math
s001 34 math 45 ls
s001 34 math 100 en
s001 34 math 120 cn
s002 38 en 38 en
s002 38 en 90 ls
s002 38 en 150 cn
s003 33 math 33 math
s003 33 math 90 ls
s004 39 zw 39 zwselect stu_id
,last_value(login_time) over(partition by stu_id order by login_time asc) first_login_time,first_value(cname) over(partition by stu_id order by login_time asc) first_cname
,last_value(login_time) over(partition by stu_id order by login_time desc) last_login_time,first_value(cname) over(partition by stu_id order by login_time desc) last_cname
from tb_logins001 120 ca 120 cn
s001 100 ca 100 cn
s001 45 ca 45 cn
s001 34 ca 34 cn
s002 150 en 150 cn
s002 90 en 90 cn
s002 38 en 38 cn
s003 90 ca 90 ls
s003 33 ca 33 ls
s004 39 zw 39 zw
select stu_id
,first_value(login_time) over(partition by stu_id order by login_time asc) first_login_time,first_value(cname) over(partition by stu_id order by login_time asc) first_cname
,first_value(login_time) over(partition by stu_id order by login_time desc) last_login_time,first_value(cname) over(partition by stu_id order by login_time desc) last_cname
from tb_logins001 34 ca 120 cn
s001 34 ca 120 cn
s001 34 ca 120 cn
s001 34 ca 120 cn
s002 38 en 150 cn
s002 38 en 150 cn
s002 38 en 150 cn
s003 33 ca 90 ls
s003 33 ca 90 ls
s004 39 zw 39 zwselect distinct stu_id
,first_value(login_time) over(partition by stu_id order by login_time asc) first_login_time,first_value(cname) over(partition by stu_id order by login_time asc) first_cname
,first_value(login_time) over(partition by stu_id order by login_time desc) last_login_time,first_value(cname) over(partition by stu_id order by login_time desc) last_cname
from tb_login;s001 34 ca 120 cn
s002 38 en 150 cn
s003 33 ca 90 ls
s004 39 zw 39 zw
根据hive中groupby和distinct区别以及性能比较,能够distinct的地方,最好换成 group by:都会在map阶段count,但reduce阶段,distinct只有一个, group by 可以有多个进行并行聚合,所以group by会快。
select stu_id,first_login_time,first_cname,last_login_time,last_cname
from
(
select stu_id
,first_value(login_time) over(partition by stu_id order by login_time asc) first_login_time,first_value(cname) over(partition by stu_id order by login_time asc) first_cname
,first_value(login_time) over(partition by stu_id order by login_time desc) last_login_time,first_value(cname) over(partition by stu_id order by login_time desc) last_cname
from tb_login
)tb_final
group by stu_id,first_login_time,first_cname,last_login_time,last_cname;s001 34 ca 120 cn
s002 38 en 150 cn
s003 33 ca 90 ls
s004 39 zw 39 zw
select stu_id,login_time,cname
,row_number() over(partition by stu_id order by login_time asc) as rk
,lag(login_time,1) over(partition by stu_id order by login_time asc) last_login_time
,login_time-lag(login_time,1) over(partition by stu_id order by login_time asc) sub_dur
from tb_login;OK
s001 34 ca 1 NULL NULL
s001 45 ls 2 34 11
s001 100 en 3 45 55
s001 120 cn 4 100 20
s002 38 en 1 NULL NULL
s002 90 ls 2 38 52
s002 150 cn 3 90 60
s003 33 ca 1 NULL NULL
s003 90 ls 2 33 57
s004 39 zw 1 NULL NULL
Time taken: 32.27 seconds, Fetched: 10 row(s)select stu_id,login_time,cname
,row_number() over(partition by stu_id order by login_time asc) as rk
,lag(login_time,1,0) over(partition by stu_id order by login_time asc) last_login_time
,login_time-lag(login_time,1,0) over(partition by stu_id order by login_time asc) sub_dur
from tb_login;
s001 34 ca 1 0 34
s001 45 ls 2 34 11
s001 100 en 3 45 55
s001 120 cn 4 100 20
s002 38 en 1 0 38
s002 90 ls 2 38 52
s002 150 cn 3 90 60
s003 33 ca 1 0 33
s003 90 ls 2 33 57
s004 39 zw 1 0 39Time taken: 48.621 seconds, Fetched: 10 row(s)
select stu_id,login_time,cname
,row_number() over(partition by stu_id order by login_time asc) as rk
,lag(login_time,1,0) over(partition by stu_id order by login_time asc) last_login_time
,login_time-lag(login_time,1,login_time) over(partition by stu_id order by login_time asc) sub_dur
from tb_login;s001 34 ca 1 0 0
s001 45 ls 2 34 11
s001 100 en 3 45 55
s001 120 cn 4 100 20
s002 38 en 1 0 0
s002 90 ls 2 38 52
s002 150 cn 3 90 60
s003 33 ca 1 0 0
s003 90 ls 2 33 57
s004 39 zw 1 0 0
参考:http://lxw1234.com/archives/2015/04/190.htm
https://www.cnblogs.com/wqbin/p/11050970.html