SSB(Star Schema Benchmark,星型模式基准测试)是由马萨诸塞大学波士顿分校的研究人员提出的,基于现实商业应用的数据模型,用于评估决策支持类应用的性能。本文用它来测试云原生ClickHouse的性能。
SSB基准测试包括:
1个事实表:lineorder
4个维度表:customer,part,date,supplier
13条标准SQL查询测试语句:统计查询、多表关联、sum、复杂条件、group by、order by等组合方式。
# Fetch and build the SSB data generator.
$ git clone https://github.com/vadimtk/ssb-dbgen.git
$ cd ssb-dbgen
$ make
# Generate one .tbl file per table: -s is the scale factor, -T selects the table
# (c = customer, l = lineorder, p = part, s = supplier, d = date).
# NOTE(review): the scale factors differ per table, and the 30,000,000 customer
# rows reported further down look like -s 1000 rather than -s 10 -- confirm
# which value was actually used.
$ ./dbgen -s 10 -T c
# lineorder is the large fact table; choose -s with care.
$ ./dbgen -s 10 -T l
$ ./dbgen -s 1000 -T p
$ ./dbgen -s 1000 -T s
$ ./dbgen -s 10000 -T d
首先解决clickhouse-client默认不支持多行输入(建表语句一换行就会被当作语句结束)的问题:
启动clickhouse-client时加上-m(即--multiline)参数即可,此时每条语句以分号结束。
# -m turns on multi-line mode: a statement is sent only once it ends with ';'.
clickhouse-client -m --host 10.43.237.127 --user clickhouse_operator --password clickhouse_operator_password
注意:建库,建表时要加上on cluster子句。每次连接ClickHouse时都会随机进入其中一个Pod,如果建表语句没有加on cluster,表只会建在当前连接的Pod上,不会自动同步到其他Pod。另外,退出客户端后Pod不会自动销毁,除非手动删除。
# List all pods in the ckk8s namespace:
kubectl get pod -n ckk8s
# Open a shell inside a specific pod (replace podName with a real pod name).
# The "--" separates kubectl's own flags from the command run in the pod;
# the form without it is deprecated.
kubectl exec -it -n ckk8s podName -- /bin/bash
创建表:
-- Create the benchmark database across cluster1, then make it the current database.
CREATE DATABASE ssb ON CLUSTER cluster1;
USE ssb;
-- Customer dimension table, sorted by customer key.
CREATE TABLE customer ON CLUSTER cluster1
(
    C_CUSTKEY     UInt32,
    C_NAME        String,
    C_ADDRESS     String,
    C_CITY        LowCardinality(String),
    C_NATION      LowCardinality(String),
    C_REGION      LowCardinality(String),
    C_PHONE       String,
    C_MKTSEGMENT  LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY C_CUSTKEY;
-- Fact table: one row per order line, partitioned by order year and
-- sorted by (order date, order key).
CREATE TABLE lineorder ON CLUSTER cluster1
(
    LO_ORDERKEY       UInt32,
    LO_LINENUMBER     UInt8,
    LO_CUSTKEY        UInt32,
    LO_PARTKEY        UInt32,
    LO_SUPPKEY        UInt32,
    LO_ORDERDATE      Date,
    LO_ORDERPRIORITY  LowCardinality(String),
    LO_SHIPPRIORITY   UInt8,
    LO_QUANTITY       UInt8,
    LO_EXTENDEDPRICE  UInt32,
    LO_ORDTOTALPRICE  UInt32,
    LO_DISCOUNT       UInt8,
    LO_REVENUE        UInt32,
    LO_SUPPLYCOST     UInt32,
    LO_TAX            UInt8,
    LO_COMMITDATE     Date,
    LO_SHIPMODE       LowCardinality(String)
)
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY);
-- Part dimension table, sorted by part key.
CREATE TABLE part ON CLUSTER cluster1
(
    P_PARTKEY    UInt32,
    P_NAME       String,
    P_MFGR       LowCardinality(String),
    P_CATEGORY   LowCardinality(String),
    P_BRAND      LowCardinality(String),
    P_COLOR      LowCardinality(String),
    P_TYPE       LowCardinality(String),
    P_SIZE       UInt8,
    P_CONTAINER  LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY P_PARTKEY;
-- Supplier dimension table, sorted by supplier key.
CREATE TABLE supplier ON CLUSTER cluster1
(
    S_SUPPKEY  UInt32,
    S_NAME     String,
    S_ADDRESS  String,
    S_CITY     LowCardinality(String),
    S_NATION   LowCardinality(String),
    S_REGION   LowCardinality(String),
    S_PHONE    String
)
ENGINE = MergeTree
ORDER BY S_SUPPKEY;
-- Date dimension table, sorted by date key.
-- NOTE(review): no load command for this table appears in this walkthrough;
-- confirm whether date.tbl is meant to be imported.
CREATE TABLE date ON CLUSTER cluster1
(
    D_DATEKEY           Date32,
    D_DATE              String,
    D_DAYOFWEEK         String,
    D_MONTH             String,
    D_YEAR              UInt32,
    D_YEARMONTHNUM      UInt32,
    D_YEARMONTH         String,
    D_DAYNUMINWEEK      UInt8,
    D_DAYNUMINMONTH     UInt8,
    D_DAYNUMINYEAR      UInt32,
    D_MONTHNUMINYEAR    UInt8,
    D_WEEKNUMINYEAR     UInt8,
    D_SELLINGSEASON     String,
    D_LASTDAYINWEEKFL   String,
    D_LASTDAYINMONTHFL  String,
    D_HOLIDAYFL         String,
    D_WEEKDAYFL         String
)
ENGINE = MergeTree
ORDER BY D_DATEKEY;
# Bulk-load each generated .tbl file into the matching ssb table as CSV,
# in the order customer, part, supplier, lineorder.
for tbl in customer part supplier lineorder; do
    clickhouse-client -m -h 10.43.237.127 -u clickhouse_operator --password clickhouse_operator_password \
        --query "INSERT INTO ssb.${tbl} FORMAT CSV" < "${tbl}.tbl"
done
文件大小:
[root@p64001v data]# ll
总用量 10404124
-rw-r--r-- 1 root root 3345252364 10月 21 20:11 customer.tbl
-rw-r--r-- 1 root root 6910512767 10月 21 20:12 lineorder.tbl
-rw-r--r-- 1 root root 201396298 10月 21 20:12 part.tbl
-rw-r--r-- 1 root root 196652980 10月 21 20:12 supplier.tbl
[root@p64001v data]# ls -lh
总用量 10G
-rw-r--r-- 1 root root 3.2G 10月 21 20:11 customer.tbl
-rw-r--r-- 1 root root 6.5G 10月 21 20:12 lineorder.tbl
-rw-r--r-- 1 root root 193M 10月 21 20:12 part.tbl
-rw-r--r-- 1 root root 188M 10月 21 20:12 supplier.tbl
数据量(条):
customer:30000000
lineorder:59986052
part:2000000
supplier:2000000
-- Raise the per-query memory limit (value is in bytes) before the large join below.
SET max_memory_usage = 20000000000;

-- Denormalise the star schema: join every lineorder row with its customer,
-- supplier and part attributes into one wide table, using the same
-- partitioning and sort key as lineorder.
-- NOTE(review): with ON CLUSTER each node presumably runs this SELECT against
-- its own local ssb.* tables, so a node that did not receive the CSV load
-- would end up with an empty lineorder_flat -- confirm.
CREATE TABLE lineorder_flat ON CLUSTER cluster1
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY)
AS
SELECT
    -- lineorder (fact) columns
    l.LO_ORDERKEY       AS LO_ORDERKEY,
    l.LO_LINENUMBER     AS LO_LINENUMBER,
    l.LO_CUSTKEY        AS LO_CUSTKEY,
    l.LO_PARTKEY        AS LO_PARTKEY,
    l.LO_SUPPKEY        AS LO_SUPPKEY,
    l.LO_ORDERDATE      AS LO_ORDERDATE,
    l.LO_ORDERPRIORITY  AS LO_ORDERPRIORITY,
    l.LO_SHIPPRIORITY   AS LO_SHIPPRIORITY,
    l.LO_QUANTITY       AS LO_QUANTITY,
    l.LO_EXTENDEDPRICE  AS LO_EXTENDEDPRICE,
    l.LO_ORDTOTALPRICE  AS LO_ORDTOTALPRICE,
    l.LO_DISCOUNT       AS LO_DISCOUNT,
    l.LO_REVENUE        AS LO_REVENUE,
    l.LO_SUPPLYCOST     AS LO_SUPPLYCOST,
    l.LO_TAX            AS LO_TAX,
    l.LO_COMMITDATE     AS LO_COMMITDATE,
    l.LO_SHIPMODE       AS LO_SHIPMODE,
    -- customer columns
    c.C_NAME            AS C_NAME,
    c.C_ADDRESS         AS C_ADDRESS,
    c.C_CITY            AS C_CITY,
    c.C_NATION          AS C_NATION,
    c.C_REGION          AS C_REGION,
    c.C_PHONE           AS C_PHONE,
    c.C_MKTSEGMENT      AS C_MKTSEGMENT,
    -- supplier columns
    s.S_NAME            AS S_NAME,
    s.S_ADDRESS         AS S_ADDRESS,
    s.S_CITY            AS S_CITY,
    s.S_NATION          AS S_NATION,
    s.S_REGION          AS S_REGION,
    s.S_PHONE           AS S_PHONE,
    -- part columns
    p.P_NAME            AS P_NAME,
    p.P_MFGR            AS P_MFGR,
    p.P_CATEGORY        AS P_CATEGORY,
    p.P_BRAND           AS P_BRAND,
    p.P_COLOR           AS P_COLOR,
    p.P_TYPE            AS P_TYPE,
    p.P_SIZE            AS P_SIZE,
    p.P_CONTAINER       AS P_CONTAINER
FROM ssb.lineorder AS l
INNER JOIN ssb.customer AS c
    ON c.C_CUSTKEY = l.LO_CUSTKEY
INNER JOIN ssb.supplier AS s
    ON s.S_SUPPKEY = l.LO_SUPPKEY
INNER JOIN ssb.part AS p
    ON p.P_PARTKEY = l.LO_PARTKEY;
-- Profit (revenue minus supply cost) per order year, supplier nation and part
-- category, for 1997-1998 orders where both customer and supplier are in
-- AMERICA and the part manufacturer is MFGR#1 or MFGR#2.
SELECT
    toYear(LO_ORDERDATE) AS year,
    S_NATION,
    P_CATEGORY,
    sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM lineorder_flat
WHERE C_REGION = 'AMERICA'
    AND S_REGION = 'AMERICA'
    AND year IN (1997, 1998)
    AND P_MFGR IN ('MFGR#1', 'MFGR#2')
GROUP BY
    year,
    S_NATION,
    P_CATEGORY
ORDER BY
    year ASC,
    S_NATION ASC,
    P_CATEGORY ASC;
由于我的CK是部署在K8S上,所以在CK上创建表最好是创建分布式表和本地表两张表。如果只创建本地表,查询本地表时数据会不全。
-- Local replicated table, created on every node of cluster1.
-- The first engine argument is the table's path in ZooKeeper / ClickHouse Keeper
-- and must be unique per table; the second is the replica name. {shard} and
-- {replica} are macros substituted on each node.
-- NOTE(review): database fy is not created anywhere in this walkthrough --
-- create it first if it does not already exist.
CREATE TABLE IF NOT EXISTS fy.test_table ON CLUSTER cluster1
(
    id            UInt32,
    sku_id        String,
    total_amount  Decimal(16,2),
    create_time   Datetime
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/fy.test_table', '{replica}')
PARTITION BY toYYYYMMDD(create_time)
PRIMARY KEY (id)
ORDER BY (id, sku_id);
其中if not exists fy.test_table中的表名,与ZooKeeper路径/{shard}/fy.test_table中的fy.test_table建议保持一致。两者并非必须一致,但/{shard}/fy.test_table这一路径必须保证每张表都不相同,因此直接使用"库名.表名"作为路径的最后一段即可。
-- Distributed table over the local fy.test_table tables of cluster1.
-- Engine arguments: cluster name, database, local table, sharding expression.
CREATE TABLE IF NOT EXISTS fy.test_table_all ON CLUSTER cluster1
(
    id            UInt32,
    sku_id        String,
    total_amount  Decimal(16,2),
    create_time   Datetime
)
ENGINE = Distributed(cluster1, fy, test_table, hiveHash(sku_id));
Distributed中的四项分别代表集群名,数据库名,本地表名,分片方法。
我这里是在集群内节点进行的。一般使用分布式表进行查询,使用本地表查询只返回当前节点的数据,使用分布式表查询返回所有节点上符合要求的数据。
clickhouse一般不删除数据,删除的成本太高,都是直接增加数据。一般也不更新数据:alter table update/delete(mutation)不能直接对Distributed分布式表执行,只能对本地表执行。在分布式环境中,可以给语句加上on cluster子句让各节点的本地表都执行;如果所用版本不支持,则需要手动在每个节点上更新/删除数据。