个人博客:DoubleFJ の Blog
ClickHouse 官方文档
文中举例均为简单示例,根据业务自行扩展。
实际业务场景中很多会需要统计 pv uv 值,比如页面的 pv uv 值,商品、店铺的 pv uv 值……
这边举一个简单的例子,比如说统计商品访问的 pv uv:
CREATE TABLE IF NOT EXISTS test.product_operation
(
ts_date_time DateTime COMMENT '事件触发时间',
gen_date_time DateTime COMMENT '入库时间',
product_id UInt64 COMMENT '商品ID',
user_id UInt64 COMMENT '用户ID',
operation UInt16 COMMENT '商品操作类型(1:访问,2:购买,3:收藏)'
) ENGINE = MergeTree()
ORDER BY (toDate(ts_date_time), operation);
像这种数据量可能很大,又需要预先聚合计算以减小获取耗时的,我们添加 物化视图 来帮助我们做这个事。
CREATE MATERIALIZED VIEW IF NOT EXISTS test.product_operation_day_pv_uv_mv
ENGINE = AggregatingMergeTree()
ORDER BY (ts_date, product_id, operation)
AS
SELECT toDate(ts_date_time) as ts_date,
product_id,
operation,
sumState(1) as pv,
uniqState(user_id, ts_date) as uv
FROM test.product_operation
GROUP BY ts_date, product_id, operation;
CREATE MATERIALIZED VIEW IF NOT EXISTS test.product_operation_total_pv_uv_mv
ENGINE = AggregatingMergeTree()
ORDER BY (product_id, operation)
AS
SELECT product_id,
operation,
sumState(1) as pv,
uniqState(user_id) as uv
FROM test.product_operation
GROUP BY product_id, operation;
INSERT INTO test.product_operation(ts_date_time, gen_date_time, product_id, user_id, operation)
VALUES (toDateTime('2022-01-26 10:10:10'), now(), 1, 1, 1)
忽略时区问题导致的时间显示
SELECT * FROM test.product_operation ORDER BY gen_date_time DESC;
ts_date_time |
gen_date_time |
product_id |
user_id |
operation |
2022-01-26 02:10:10 |
2022-01-26 03:54:36 |
1 |
1 |
1 |
SELECT ts_date, product_id, operation, sumMerge(pv) AS pv, uniqMerge(uv) AS uv
FROM test.product_operation_day_pv_uv_mv GROUP BY ts_date, product_id, operation;
ts_date |
product_id |
operation |
pv |
uv |
2022-01-26 |
1 |
1 |
1 |
1 |
SELECT product_id, operation, sumMerge(pv) AS pv, uniqMerge(uv) AS uv
FROM test.product_operation_total_pv_uv_mv GROUP BY product_id, operation;
product_id |
operation |
pv |
uv |
1 |
1 |
1 |
1 |
INSERT INTO test.product_operation(ts_date_time, gen_date_time, product_id, user_id, operation)
VALUES (toDateTime('2022-01-26 23:10:10'), now(), 1, 1, 1);
ts_date |
product_id |
operation |
pv |
uv |
2022-01-26 |
1 |
1 |
2 |
1 |
product_id |
operation |
pv |
uv |
1 |
1 |
2 |
1 |
INSERT INTO test.product_operation(ts_date_time, gen_date_time, product_id, user_id, operation)
VALUES (toDateTime('2022-01-26 23:20:10'), now(), 1, 2, 1);
ts_date |
product_id |
operation |
pv |
uv |
2022-01-25 |
1 |
1 |
3 |
2 |
product_id |
operation |
pv |
uv |
1 |
1 |
3 |
2 |
INSERT INTO test.product_operation(ts_date_time, gen_date_time, product_id, user_id, operation)
VALUES (toDateTime('2022-01-27 10:20:10'), now(), 1, 1, 1),
(toDateTime('2022-01-27 11:20:10'), now(), 1, 2, 1);
ts_date |
product_id |
operation |
pv |
uv |
2022-01-26 |
1 |
1 |
3 |
2 |
2022-01-27 |
1 |
1 |
2 |
2 |
product_id |
operation |
pv |
uv |
1 |
1 |
5 |
2 |
综上,实现不难,clickhouse 丰富的聚合函数可以节省很多操作。
还可以用 Kafka引擎 做数据的增量同步,结合视图。