建立一个表,来存储每天新增的数据(分区表)
统计每天的活跃用户(日活)(需要用户的ip,用户的账号,用户访问时间最早的一条url和时间)
统计每天的新增用户(日新)
2019-08-15号的数据:
192.168.33.6,hunter,2019-08-15 10:30:20,/a
192.168.33.7,hunter,2019-08-15 10:30:26,/b
192.168.33.6,jack,2019-08-15 10:30:27,/a
192.168.33.8,tom,2019-08-15 10:30:28,/b
192.168.33.9,rose,2019-08-15 10:30:30,/b
192.168.33.10,julia,2019-08-15 10:30:40,/c
2019-08-16号的数据:
192.168.33.16,hunter,2019-08-16 10:30:20,/a
192.168.33.18,jerry,2019-08-16 10:30:30,/b
192.168.33.26,jack,2019-08-16 10:30:40,/a
192.168.33.18,polo,2019-08-16 10:30:50,/b
192.168.33.39,nissan,2019-08-16 10:30:53,/b
192.168.33.39,nissan,2019-08-16 10:30:55,/a
192.168.33.39,nissan,2019-08-16 10:30:58,/c
192.168.33.20,ford,2019-08-16 10:30:54,/c
2019-08-17号的数据:
192.168.33.46,hunter,2019-08-17 10:30:21,/a
192.168.43.18,jerry,2019-08-17 10:30:22,/b
192.168.43.26,tom,2019-08-17 10:30:23,/a
192.168.53.18,bmw,2019-08-17 10:30:24,/b
192.168.63.39,benz,2019-08-17 10:30:25,/b
192.168.33.25,haval,2019-08-17 10:30:30,/c
192.168.33.10,julia,2019-08-17 10:30:40,/c
-- Raw access log loaded from comma-separated text files.
-- Partitioned by day so that each day's file goes into its own partition.
create table weblog (
    ip       string,
    name     string,
    datetime string,
    url      string
)
partitioned by (day string)
row format delimited
    fields terminated by ',';
-- Load each day's local log file into the weblog partition for that day.
load data local inpath '/root/test/log15'
    into table weblog partition (day = '2019-08-15');

load data local inpath '/root/test/log16'
    into table weblog partition (day = '2019-08-16');

load data local inpath '/root/test/log17'
    into table weblog partition (day = '2019-08-17');
show partitions weblog;
+-----------------+
| partition |
+-----------------+
| day=2019-08-15 |
| day=2019-08-16 |
| day=2019-08-17 |
+-----------------+
select * from weblog;
+----------------+--------------+----------------------+-------------+-------------+
| weblog.ip | weblog.name | weblog.datetime | weblog.url | weblog.day |
+----------------+--------------+----------------------+-------------+-------------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a | 2019-08-15 |
| 192.168.33.7 | hunter | 2019-08-15 10:30:26 | /b | 2019-08-15 |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a | 2019-08-15 |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b | 2019-08-15 |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b | 2019-08-15 |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c | 2019-08-15 |
| 192.168.33.16 | hunter | 2019-08-16 10:30:20 | /a | 2019-08-16 |
| 192.168.33.18 | jerry | 2019-08-16 10:30:30 | /b | 2019-08-16 |
| 192.168.33.26 | jack | 2019-08-16 10:30:40 | /a | 2019-08-16 |
| 192.168.33.18 | polo | 2019-08-16 10:30:50 | /b | 2019-08-16 |
| 192.168.33.39 | nissan | 2019-08-16 10:30:53 | /b | 2019-08-16 |
| 192.168.33.39 | nissan | 2019-08-16 10:30:55 | /a | 2019-08-16 |
| 192.168.33.39 | nissan | 2019-08-16 10:30:58 | /c | 2019-08-16 |
| 192.168.33.20 | ford | 2019-08-16 10:30:54 | /c | 2019-08-16 |
| 192.168.33.46 | hunter | 2019-08-17 10:30:21 | /a | 2019-08-17 |
| 192.168.43.18 | jerry | 2019-08-17 10:30:22 | /b | 2019-08-17 |
| 192.168.43.26 | tom | 2019-08-17 10:30:23 | /a | 2019-08-17 |
| 192.168.53.18 | bmw | 2019-08-17 10:30:24 | /b | 2019-08-17 |
| 192.168.63.39 | benz | 2019-08-17 10:30:25 | /b | 2019-08-17 |
| 192.168.33.25 | haval | 2019-08-17 10:30:30 | /c | 2019-08-17 |
| 192.168.33.10 | julia | 2019-08-17 10:30:40 | /c | 2019-08-17 |
+----------------+--------------+----------------------+-------------+-------------+
统计每天的活跃用户(日活)(需要用户的ip,用户的账号,用户访问时间最早的一条url和时间)
(1)排序:
-- Number each user's 2019-08-15 records by access time, earliest first.
-- The window column is intentionally left unaliased here.
select
    ip,
    name,
    datetime,
    url,
    row_number() over (partition by name order by datetime)
from weblog
where day = '2019-08-15';
+----------------+---------+----------------------+------+----------------------+
| ip | name | datetime | url | row_number_window_0 |
+----------------+---------+----------------------+------+----------------------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a | 1 |
| 192.168.33.7 | hunter | 2019-08-15 10:30:26 | /b | 2 |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a | 1 |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c | 1 |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b | 1 |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b | 1 |
+----------------+---------+----------------------+------+----------------------+
(2)统计每个用户当天第一次访问的记录
-- Keep only the earliest 2019-08-15 record of every user (rn = 1).
with ranked as (
    select
        ip,
        name,
        datetime,
        url,
        row_number() over (partition by name order by datetime) as rn
    from weblog
    where day = '2019-08-15'
)
select
    ip,
    name,
    datetime,
    url
from ranked
where rn = 1;
+----------------+---------+----------------------+------+
| ip | name | datetime | url |
+----------------+---------+----------------------+------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b |
+----------------+---------+----------------------+------+
(3)写入active表
-- Daily-active table: for every day, the first access record of each user.
create table active(ip string,name string,datetime string,url string) partitioned by (day string);

-- "insert overwrite" instead of "insert into": the partition content is fully
-- derived from weblog, so re-running this statement for the same day must
-- replace the partition rather than append a second copy of every row.
insert overwrite table active partition (day = '2019-08-15')
select ip,name,datetime,url from (
    -- rn = 1 marks the earliest record of each user on that day
    select ip,name,datetime,url,row_number()over(partition by name order by datetime) as rn
    from weblog
    where day = '2019-08-15'
) tmp where rn = 1;
select * from active;
+----------------+--------------+----------------------+-------------+-------------+
| active.ip | active.name | active.datetime | active.url | active.day |
+----------------+--------------+----------------------+-------------+-------------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a | 2019-08-15 |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a | 2019-08-15 |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c | 2019-08-15 |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b | 2019-08-15 |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b | 2019-08-15 |
+----------------+--------------+----------------------+-------------+-------------+
show partitions active;
+-----------------+
| partition |
+-----------------+
| day=2019-08-15 |
+-----------------+
统计每天的新增用户(日新)
-- Names of the users recorded so far; filled from new_user further below.
create table history_user (
    name string
);

-- Daily new users; "like weblog" gives it the same definition as weblog.
create table new_user
    like weblog;
-- Preview: pair every 2019-08-15 active user with its history_user entry.
-- h.name comes back NULL when the user is not in history_user.
select
    a.ip,
    a.name,
    a.datetime,
    a.url,
    h.name
from active a
left join history_user h
    on a.name = h.name
where a.day = '2019-08-15';
+----------------+---------+----------------------+--------+---------+
| a.ip | a.name | a.datetime | a.url | h.name |
+----------------+---------+----------------------+--------+---------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a | NULL |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a | NULL |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c | NULL |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b | NULL |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b | NULL |
+----------------+---------+----------------------+--------+---------+
-- Preview of the 2019-08-15 new users: active users without a history_user match.
select
    ip,
    name,
    datetime,
    url
from (
    select
        a.ip,
        a.name,
        a.datetime,
        a.url,
        h.name as hn          -- NULL when the user has no history_user row
    from active a
    left join history_user h
        on a.name = h.name
    where a.day = '2019-08-15'
) tmp
where hn is null;
+----------------+---------+----------------------+------+
| ip | name | datetime | url |
+----------------+---------+----------------------+------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b |
+----------------+---------+----------------------+------+
将在历史表中匹配不到(h.name 为 null)的日活用户记录添加到日新表中
-- New users of 2019-08-15: active users with no match in history_user.
-- The day has to be 2019-08-15 in all three places below. It is the only
-- partition active has at this point, it is the day used by the preview
-- queries before this statement, and it is the set of users shown in the
-- history_user listing that follows. The statements previously said
-- 2019-08-16, duplicating the later 2019-08-16 round.
insert into table new_user partition (day = '2019-08-15')
select ip,name,datetime,url from (
    select a.ip,a.name,a.datetime,a.url,h.name as hn
    from active a left join history_user h
    on a.name = h.name
    where a.day = '2019-08-15'
) tmp where hn is null;
将日新表的记录写入历史表中
-- Append that day's new users to history_user.
insert into table history_user
select name from new_user where day = '2019-08-15';
select * from history_user;
+--------------------+
| history_user.name |
+--------------------+
| hunter |
| jack |
| julia |
| rose |
| tom |
+--------------------+
-- New users of 2019-08-16: active users that have no history_user match.
insert into table new_user partition (day = '2019-08-16')
select
    a.ip,
    a.name,
    a.datetime,
    a.url
from active a
left join history_user h
    on a.name = h.name
where a.day = '2019-08-16'
  and h.name is null;

-- Append the 2019-08-16 new users to history_user.
insert into table history_user
select name
from new_user
where day = '2019-08-16';
最终结果:
select * from history_user;
+--------------------+
| history_user.name |
+--------------------+
| hunter |
| jack |
| julia |
| rose |
| tom |
| ford |
| jerry |
| nissan |
| polo |
| benz |
| bmw |
| haval |
+--------------------+
select * from new_user;
+----------------+----------------+----------------------+---------------+---------------+
| new_user.ip | new_user.name | new_user.datetime | new_user.url | new_user.day |
+----------------+----------------+----------------------+---------------+---------------+
| 192.168.33.6 | hunter | 2019-08-15 10:30:20 | /a | 2019-08-15 |
| 192.168.33.6 | jack | 2019-08-15 10:30:27 | /a | 2019-08-15 |
| 192.168.33.10 | julia | 2019-08-15 10:30:40 | /c | 2019-08-15 |
| 192.168.33.9 | rose | 2019-08-15 10:30:30 | /b | 2019-08-15 |
| 192.168.33.8 | tom | 2019-08-15 10:30:28 | /b | 2019-08-15 |
| 192.168.33.20 | ford | 2019-08-16 10:30:54 | /c | 2019-08-16 |
| 192.168.33.18 | jerry | 2019-08-16 10:30:30 | /b | 2019-08-16 |
| 192.168.33.39 | nissan | 2019-08-16 10:30:53 | /b | 2019-08-16 |
| 192.168.33.18 | polo | 2019-08-16 10:30:50 | /b | 2019-08-16 |
| 192.168.63.39 | benz | 2019-08-17 10:30:25 | /b | 2019-08-17 |
| 192.168.53.18 | bmw | 2019-08-17 10:30:24 | /b | 2019-08-17 |
| 192.168.33.25 | haval | 2019-08-17 10:30:30 | /c | 2019-08-17 |
+----------------+----------------+----------------------+---------------+---------------+
定时每天执行一次求前一天的日活、日新
先求日活插入日活表,再求日新插入日新表,再更新历史用户表。
#!/bin/bash
# Daily job: compute the daily-active and daily-new users for one day, then
# add the new users to the history table.
#
# Usage: <script> [YYYY-MM-DD]
#   Without an argument the previous calendar day is processed, as before.
#
# Each step reads what the previous step wrote, so the job stops at the first
# failing Hive call instead of continuing on incomplete data.

# Optional first argument selects the day; default is yesterday (GNU date).
date_str=${1:-$(date -d '-1 day' +'%Y-%m-%d')}

# The value is interpolated into HQL below, so accept nothing but YYYY-MM-DD.
if [[ ! $date_str =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
    echo "日期格式错误: $date_str (应为 YYYY-MM-DD)" >&2
    exit 1
fi

echo "准备处理 $date_str 的数据..."
hive_exec=/usr/local/big_data/hive/bin/hive

# Run one HQL string through the Hive CLI and abort the job if it fails.
#   $1 - step label used in the error message
#   $2 - HQL text
run_hql() {
    local step=$1
    local hql=$2
    if ! "$hive_exec" -e "$hql"; then
        echo "$date_str 的${step}处理失败,终止执行" >&2
        exit 1
    fi
}

# Daily active users: each user's earliest record of the day (rn = 1).
# "insert overwrite" makes a re-run replace the partition instead of appending
# a second copy of every row; the content is fully derived from weblog.
HQL_user_active_day="
insert overwrite table test1.active partition (day = \"$date_str\")
select ip,name,datetime,url from (
select ip,name,datetime,url,row_number()over(partition by name order by datetime) as rn from test1.weblog where day = \"$date_str\") tmp where rn = 1;"
run_hql "日活" "$HQL_user_active_day"

# Daily new users: active users of the day with no match in history_user.
# This stays "insert into": once history_user contains the day's users, a
# re-run selects no rows, and an overwrite would then empty the partition.
# NOTE(review): if the job died after this step but before the history update,
# a re-run appends the same users again — clean the partition by hand first.
HQL_user_new_day="
insert into table test1.new_user partition (day = \"$date_str\")
select ip,name,datetime,url from (
select a.ip,a.name,a.datetime,a.url,h.name as hn
from test1.active a left join test1.history_user h
on a.name = h.name
where a.day = \"$date_str\"
) tmp where hn is null;"
run_hql "日新" "$HQL_user_new_day"

# History update: append the day's new users, skipping names that are already
# in history_user so that a re-run does not add them a second time.
HQL_user_history_update="
insert into table test1.history_user
select n.name
from test1.new_user n left join test1.history_user h
on n.name = h.name
where n.day = \"$date_str\" and h.name is null;"
run_hql "历史表更新" "$HQL_user_history_update"

echo "$date_str 的数据处理完成!"