有如下登录信息:
userId day
1 2019-05-01
1 2019-05-02
1 2019-05-03
1 2019-05-04
1 2019-05-05
1 2019-05-06
1 2019-05-07
1 2019-05-08
…
2 2019-05-01
2 2019-05-02
2 2019-05-03
2 2019-05-04
2 2019-05-06
2 2019-05-07
2 2019-05-08
2 2019-05-10
2 2019-05-12
1.求连续登录超过5天的用户
2.求每个用户的连续登录最长天数
-- Login log table: a user id plus the login day.
-- `day` is kept as a string in 'yyyy-MM-dd' form, the format used by the sample rows.
-- `if not exists` lets the script be re-run when the table is already present.
create table if not exists userlog (
    userId int,
    day    string
)
row format delimited
fields terminated by '\t';

-- Loads the tab-separated local file into the table. Without OVERWRITE the rows are
-- appended, so running this statement twice stores every login twice.
load data local inpath '/data/userlog' into table userlog;
2.针对需求1,我们可以分析得知,如果连续登录超过5天(这里按"至少连续登录5天"理解),那么该用户按日期排序后的数据中必然存在连续5行、日期依次相差1天的记录(前提是同一用户同一天只有一条记录)。既然日期连续,这5个日期就构成公差为1天的等差数列,所以只需要判断按日期排序后连续五行中的第一行和最后一行的差值是否恰好为4天
-- Requirement 1: users who logged in on at least 5 consecutive calendar days.
-- Idea: order each user's distinct login days; if the day 4 rows back is exactly
-- 4 days earlier, those 5 rows are 5 consecutive days.
-- datediff counts calendar days (a gap of 4 days is what a timestamp difference of
-- 4*24*60*60 = 345600 seconds expresses), and it does not depend on the session
-- time zone or on daylight-saving shifts.
select sc.userId
from (
    select
        d.userId,
        -- Gap in days to the same user's login 4 rows earlier; NULL for the first 4 rows.
        datediff(d.day, lag(d.day, 4) over (partition by d.userId order by d.day)) as rm
    from (
        -- One row per user per day, so repeated logins on one day cannot fill the 5-row window.
        select userId, day
        from userlog
        group by userId, day
    ) d
) sc
where sc.rm = 4
group by sc.userId;
求出时间差值-------345600代表4x24x60x60秒(即4天)
-- Step view: for every login row, the gap in seconds between this row's day and the
-- same user's day 4 rows earlier (rows ordered by day). NULL for a user's first 4 rows.
select
    userId,
    unix_timestamp(day, 'yyyy-MM-dd')
        - unix_timestamp(lag(day, 4) over (partition by userid order by day), 'yyyy-MM-dd') as rm
from userlog
+---------+---------+--+
| 1 | NULL |
| 1 | NULL |
| 1 | NULL |
| 1 | NULL |
| 1 | 345600 |
| 1 | 345600 |
| 1 | 345600 |
| 1 | 345600 |
| 2 | NULL |
| 2 | NULL |
| 2 | NULL |
| 2 | NULL |
| 2 | 432000 |
| 2 | 432000 |
| 2 | 432000 |
| 2 | 518400 |
| 2 | 518400 |
+---------+---------+--+
最终结果:
+------------+--+
| sc.userid |
+------------+--+
| 1 |
+------------+--+
3.针对需求2,我们可以认真观察数据源发现:把每个用户的登录日期按升序排列并编上行号后,如果若干行的"日期减去行号"得到的差值相同,就代表这些行是连续登录的日期。所以我们只要按用户统计相同差值的个数,然后求其最大值,即是该用户连续登录的最大天数
userId day 行标 差值
1 2019-05-01 1 2019-04-30
1 2019-05-02 2 2019-04-30
1 2019-05-03 3 2019-04-30
1 2019-05-04 4 2019-04-30
1 2019-05-05 5 2019-04-30
1 2019-05-06 6 2019-04-30
1 2019-05-07 7 2019-04-30
1 2019-05-08 8 2019-04-30
.....
2 2019-05-01 1 2019-04-30
2 2019-05-02 2 2019-04-30
2 2019-05-03 3 2019-04-30
2 2019-05-04 4 2019-04-30
2 2019-05-06 5 2019-05-01
2 2019-05-07 6 2019-05-01
2 2019-05-08 7 2019-05-01
2 2019-05-10 8 2019-05-02
2 2019-05-12 9 2019-05-03
...
整合后的执行语句:
-- Requirement 2: longest run of consecutive login days for every user.
-- Idea: within a user, consecutive dates minus their 1-based row number all land on
-- the same anchor date, so each distinct anchor identifies one streak.
-- The full date is used (not just the day-of-month), so streaks that cross a month
-- or year boundary are handled as well.
select
    ssc.userid,
    max(ssc.total) as max_total
from (
    -- Length of each streak. count(grp) skips rows whose anchor is NULL (NULL day).
    select
        sc.userid,
        sc.grp,
        count(sc.grp) as total
    from (
        select
            d.userid,
            date_sub(d.day, row_number() over (partition by d.userid order by d.day)) as grp
        from (
            -- One row per user per day, so repeated logins on one day are counted once.
            select userid, day
            from userlog
            group by userid, day
        ) d
    ) sc
    group by sc.userid, sc.grp
) ssc
group by ssc.userid;
求差值
-- Step view: day-of-month (the last two characters of `day`) minus the row's dense rank
-- within the user, ordered by day. Rows of one user that share the same value belong to
-- the same run of consecutive days.
-- Only the day-of-month takes part in the subtraction, so the values are meaningful
-- only while a user's rows stay inside a single month.
select
    userid,
    cast(substr(day, -2) as int)
        - dense_rank() over (partition by userid order by day) as day
from userlog
+---------+------+--+
| userid | day |
+---------+------+--+
| 1 | 0 |
| 1 | 0 |
| 1 | 0 |
| 1 | 0 |
| 1 | 0 |
| 1 | 0 |
| 1 | 0 |
| 1 | 0 |
| 2 | 0 |
| 2 | 0 |
| 2 | 0 |
| 2 | 0 |
| 2 | 1 |
| 2 | 1 |
| 2 | 1 |
| 2 | 2 |
| 2 | 3 |
+---------+------+--+
统计相同差值的个数
-- Step view: every login day annotated with the size of the streak it belongs to.
-- grp = login date minus its 1-based row number within the user; consecutive dates
-- share one grp value. The full date is used, so month boundaries do not break streaks.
select
    sc.userid,
    -- All rows of a (userid, grp) partition are one streak; the window counts them.
    count(sc.grp) over (partition by sc.userid, sc.grp) as total
from (
    select
        d.userid,
        date_sub(d.day, row_number() over (partition by d.userid order by d.day)) as grp
    from (
        -- One row per user per day, so repeated logins on one day are counted once.
        select userid, day
        from userlog
        group by userid, day
    ) d
) sc;
+------------+--------+--+
| sc.userid | total |
+------------+--------+--+
| 1 | 8 |
| 1 | 8 |
| 1 | 8 |
| 1 | 8 |
| 1 | 8 |
| 1 | 8 |
| 1 | 8 |
| 1 | 8 |
| 2 | 4 |
| 2 | 4 |
| 2 | 4 |
| 2 | 4 |
| 2 | 3 |
| 2 | 3 |
| 2 | 3 |
| 2 | 1 |
| 2 | 1 |
+------------+--------+--+
按用户分组,求每个用户的最大连续登录天数
-- Final step: longest run of consecutive login days for every user.
-- Within a user, consecutive dates minus their 1-based row number all land on the same
-- anchor date, so each distinct anchor identifies one streak; the largest streak wins.
-- The full date is used (not just the day-of-month), so streaks that cross a month
-- or year boundary are handled as well.
select
    ssc.userid,
    max(ssc.total) as max_total
from (
    -- Length of each streak. count(grp) skips rows whose anchor is NULL (NULL day).
    select
        sc.userid,
        sc.grp,
        count(sc.grp) as total
    from (
        select
            d.userid,
            date_sub(d.day, row_number() over (partition by d.userid order by d.day)) as grp
        from (
            -- One row per user per day, so repeated logins on one day are counted once.
            select userid, day
            from userlog
            group by userid, day
        ) d
    ) sc
    group by sc.userid, sc.grp
) ssc
group by ssc.userid;
+-------------+------------+--+
| ssc.userid | max_total |
+-------------+------------+--+
| 1 | 8 |
| 2 | 4 |
+-------------+------------+--+