0.运行环境:
centos 7.6
clickhouse 20.4.4.4
RAM:3G
磁盘:vmware 虚拟磁盘 60G
物理磁盘:NVME-SSD
1.SSB概述
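概述: SSB(Star Schema Benchmark,星型模式基准测试)是由 TPC-H 改造而来的数据仓库基准:一张事实表 lineorder 加 customer、supplier、part、date 四张维度表,共 4 组 13 条查询。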
2.SSB操作步骤:
1.安装需要的软件:
yum -y install gcc gcc-c++ make cmake git
2.下载代码:
git clone https://github.com/vadimtk/ssb-dbgen.git
3.编译生成数据:
cd ssb-dbgen
make
$ ./dbgen -s 10 -T c
$ ./dbgen -s 10 -T l
$ ./dbgen -s 10 -T p
$ ./dbgen -s 10 -T s
$ ./dbgen -s 10 -T d
说明:
c--customer.tbl
p--part.tbl
s--supplier.tbl
d--date.tbl
l--lineorder.tbl
上面的表数据可以用如下的命令一次性生成:(for all SSBM tables)
$ ./dbgen -s 10 -T a
-- 查看生成的数据:
# du -sh *.tbl
32M customer.tbl
272K date.tbl
6.5G lineorder.tbl
77M part.tbl
1.9M supplier.tbl
4.在clickhouse client中执行表定义的脚本:
-- 创建表:
-- The import commands, the OPTIMIZE statement and the system.parts query in
-- this document all address database `ssb`, so create it and switch to it
-- before running any DDL.
CREATE DATABASE IF NOT EXISTS ssb;
USE ssb;
-- Customer dimension table, sorted by its key C_CUSTKEY.
-- City / nation / region / market-segment columns are declared as
-- LowCardinality(String); name, address and phone stay plain String.
CREATE TABLE customer
(
    C_CUSTKEY    UInt32,
    C_NAME       String,
    C_ADDRESS    String,
    C_CITY       LowCardinality(String),
    C_NATION     LowCardinality(String),
    C_REGION     LowCardinality(String),
    C_PHONE      String,
    C_MKTSEGMENT LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (C_CUSTKEY);
-- Fact table: one row per order line.
-- Parts are partitioned by the year of LO_ORDERDATE and sorted by
-- (LO_ORDERDATE, LO_ORDERKEY).
CREATE TABLE lineorder
(
    -- order line identity and references to customer / part / supplier
    LO_ORDERKEY      UInt32,
    LO_LINENUMBER    UInt8,
    LO_CUSTKEY       UInt32,
    LO_PARTKEY       UInt32,
    LO_SUPPKEY       UInt32,
    -- order attributes
    LO_ORDERDATE     Date,
    LO_ORDERPRIORITY LowCardinality(String),
    LO_SHIPPRIORITY  UInt8,
    -- measures
    LO_QUANTITY      UInt8,
    LO_EXTENDEDPRICE UInt32,
    LO_ORDTOTALPRICE UInt32,
    LO_DISCOUNT      UInt8,
    LO_REVENUE       UInt32,
    LO_SUPPLYCOST    UInt32,
    LO_TAX           UInt8,
    -- shipping attributes
    LO_COMMITDATE    Date,
    LO_SHIPMODE      LowCardinality(String)
)
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY);
-- Part dimension table, sorted by its key P_PARTKEY.
-- Every descriptive attribute except P_NAME is LowCardinality(String).
CREATE TABLE part
(
    P_PARTKEY   UInt32,
    P_NAME      String,
    P_MFGR      LowCardinality(String),
    P_CATEGORY  LowCardinality(String),
    P_BRAND     LowCardinality(String),
    P_COLOR     LowCardinality(String),
    P_TYPE      LowCardinality(String),
    P_SIZE      UInt8,
    P_CONTAINER LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY P_PARTKEY;
-- Supplier dimension table, sorted by its key S_SUPPKEY.
-- City / nation / region are LowCardinality(String); the rest are plain String.
CREATE TABLE supplier
(
    S_SUPPKEY UInt32,
    S_NAME    String,
    S_ADDRESS String,
    S_CITY    LowCardinality(String),
    S_NATION  LowCardinality(String),
    S_REGION  LowCardinality(String),
    S_PHONE   String
)
ENGINE = MergeTree
ORDER BY S_SUPPKEY;
5.将生成的数据导入数据库:
导入数据:
$ clickhouse-client --database ssb --query "INSERT INTO customer FORMAT CSV" < customer.tbl
$ clickhouse-client --database ssb --query "INSERT INTO part FORMAT CSV" < part.tbl
$ clickhouse-client --database ssb --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
$ clickhouse-client --database ssb --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
6.生成测试数据:将“星型模式”转换为非规范化的“平面模式”
-- Raise max_memory_usage before running the three-way join below.
SET max_memory_usage = 30000000000;
-- Build the denormalized ("flat") table: each lineorder row is inner-joined to
-- its customer, supplier and part rows, and the result is stored with the same
-- partitioning and sort key as lineorder.
-- The dimension keys (C_CUSTKEY, S_SUPPKEY, P_PARTKEY) are not selected; the
-- join conditions make them equal to the LO_*KEY columns already present.
CREATE TABLE lineorder_flat
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY)
AS
SELECT
    -- columns from the fact table
    l.LO_ORDERKEY      AS LO_ORDERKEY,
    l.LO_LINENUMBER    AS LO_LINENUMBER,
    l.LO_CUSTKEY       AS LO_CUSTKEY,
    l.LO_PARTKEY       AS LO_PARTKEY,
    l.LO_SUPPKEY       AS LO_SUPPKEY,
    l.LO_ORDERDATE     AS LO_ORDERDATE,
    l.LO_ORDERPRIORITY AS LO_ORDERPRIORITY,
    l.LO_SHIPPRIORITY  AS LO_SHIPPRIORITY,
    l.LO_QUANTITY      AS LO_QUANTITY,
    l.LO_EXTENDEDPRICE AS LO_EXTENDEDPRICE,
    l.LO_ORDTOTALPRICE AS LO_ORDTOTALPRICE,
    l.LO_DISCOUNT      AS LO_DISCOUNT,
    l.LO_REVENUE       AS LO_REVENUE,
    l.LO_SUPPLYCOST    AS LO_SUPPLYCOST,
    l.LO_TAX           AS LO_TAX,
    l.LO_COMMITDATE    AS LO_COMMITDATE,
    l.LO_SHIPMODE      AS LO_SHIPMODE,
    -- columns from customer
    c.C_NAME           AS C_NAME,
    c.C_ADDRESS        AS C_ADDRESS,
    c.C_CITY           AS C_CITY,
    c.C_NATION         AS C_NATION,
    c.C_REGION         AS C_REGION,
    c.C_PHONE          AS C_PHONE,
    c.C_MKTSEGMENT     AS C_MKTSEGMENT,
    -- columns from supplier
    s.S_NAME           AS S_NAME,
    s.S_ADDRESS        AS S_ADDRESS,
    s.S_CITY           AS S_CITY,
    s.S_NATION         AS S_NATION,
    s.S_REGION         AS S_REGION,
    s.S_PHONE          AS S_PHONE,
    -- columns from part
    p.P_NAME           AS P_NAME,
    p.P_MFGR           AS P_MFGR,
    p.P_CATEGORY       AS P_CATEGORY,
    p.P_BRAND          AS P_BRAND,
    p.P_COLOR          AS P_COLOR,
    p.P_TYPE           AS P_TYPE,
    p.P_SIZE           AS P_SIZE,
    p.P_CONTAINER      AS P_CONTAINER
FROM lineorder AS l
INNER JOIN customer AS c
    ON c.C_CUSTKEY = l.LO_CUSTKEY
INNER JOIN supplier AS s
    ON s.S_SUPPKEY = l.LO_SUPPKEY
INNER JOIN part AS p
    ON p.P_PARTKEY = l.LO_PARTKEY;
0 rows in set. Elapsed: 169.826 sec. Processed 61.11 million rows, 2.63 GB
(359.82 thousand rows/s., 15.51 MB/s.)
Clickhouse> OPTIMIZE TABLE ssb.lineorder_flat FINAL ;
OPTIMIZE TABLE ssb.lineorder_flat FINAL ;
OPTIMIZE TABLE ssb.lineorder_flat FINAL
7.查询导入的数据信息:
-- Per-table storage summary for database 'ssb', aggregated over the active
-- parts listed in system.parts. The outer query only renders the byte counts
-- computed by the inner query in human-readable form.
SELECT
    database,
    table,
    formatReadableSize(size)                    AS size,
    formatReadableSize(bytes_on_disk)           AS bytes_on_disk,
    formatReadableSize(data_uncompressed_bytes) AS data_uncompressed_bytes,
    formatReadableSize(data_compressed_bytes)   AS data_compressed_bytes,
    compress_rate,
    rows,
    days,
    formatReadableSize(avgDaySize)              AS avgDaySize
FROM
(
    SELECT
        database,
        table,
        sum(bytes)                   AS size,
        sum(rows)                    AS rows,
        min(min_date)                AS min_date,
        max(max_date)                AS max_date,
        sum(bytes_on_disk)           AS bytes_on_disk,
        sum(data_uncompressed_bytes) AS data_uncompressed_bytes,
        sum(data_compressed_bytes)   AS data_compressed_bytes,
        -- compressed bytes as a percentage of uncompressed bytes
        (data_compressed_bytes / data_uncompressed_bytes) * 100 AS compress_rate,
        -- span between the earliest and latest part dates
        max_date - min_date          AS days,
        -- NOTE(review): when max_date equals min_date, days is 0 and this
        -- division produces inf; the sample output renders it as "inf YiB".
        size / (max_date - min_date) AS avgDaySize
    FROM system.parts
    WHERE active
      AND database = 'ssb'
    GROUP BY
        database,
        table
);
┌─database─┬─table──────────┬─size───────┬─bytes_on_disk─┬─data_uncompressed_bytes─┬─data_compressed_bytes─┬──────compress_rate─┬─────rows─┬─days─┬─avgDaySize─┐
│ ssb │ supplier │ 771.98 KiB │ 771.98 KiB │ 1.11 MiB │ 771.02 KiB │ 67.91314318351702 │ 20000 │ 0 │ inf YiB │
│ ssb │ part │ 13.79 MiB │ 13.79 MiB │ 19.59 MiB │ 13.76 MiB │ 70.2197570555349 │ 800000 │ 0 │ inf YiB │
│ ssb │ customer │ 11.50 MiB │ 11.50 MiB │ 16.89 MiB │ 11.49 MiB │ 67.99832662134101 │ 300000 │ 0 │ inf YiB │
│ ssb │ lineorder_flat │ 5.19 GiB │ 5.19 GiB │ 9.70 GiB │ 5.18 GiB │ 53.379435463024095 │ 59986052 │ 2405 │ 2.21 MiB │
└──────────┴────────────────┴────────────┴───────────────┴─────────────────────────┴───────────────────────┴────────────────────┴──────────┴──────┴────────────┘
行数:
Clickhouse> select count(1) from lineorder;
┌─count(1)─┐
│ 59986052 │
└──────────┘
1 rows in set. Elapsed: 0.009 sec.
Clickhouse> select count(1) from lineorder_flat;
┌─count(1)─┐
│ 59986052 │
└──────────┘
1 rows in set. Elapsed: 0.002 sec.
3.SSB测试脚本:
3.1.1
-- Q1.1: sum of LO_EXTENDEDPRICE * LO_DISCOUNT over orders dated in 1993
-- with a discount of 1..3 and a quantity below 25.
SELECT
    sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM lineorder_flat
WHERE toYear(LO_ORDERDATE) = 1993
  AND LO_DISCOUNT BETWEEN 1 AND 3
  AND LO_QUANTITY < 25;
┌───────revenue─┐
│ 4472807765583 │
└───────────────┘
1 rows in set. Elapsed: 0.174 sec. Processed 9.11 million rows, 72.86 MB (52.48 million rows/s., 419.87 MB/s.)
3.1.2
-- Q1.2: same measure as Q1.1, restricted to January 1994 (toYYYYMM = 199401),
-- discount 4..6 and quantity 26..35.
SELECT
    sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM lineorder_flat
WHERE toYYYYMM(LO_ORDERDATE) = 199401
  AND LO_DISCOUNT BETWEEN 4 AND 6
  AND LO_QUANTITY BETWEEN 26 AND 35;
┌──────revenue─┐
│ 965049065847 │
└──────────────┘
1 rows in set. Elapsed: 0.035 sec. Processed 786.43 thousand rows, 6.29 MB (22.67 million rows/s., 181.33 MB/s.)
3.1.3
-- Q1.3: same measure as Q1.1, restricted to ISO week 6 of 1994,
-- discount 5..7 and quantity 26..35.
SELECT
    sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM lineorder_flat
WHERE toISOWeek(LO_ORDERDATE) = 6
  AND toYear(LO_ORDERDATE) = 1994
  AND LO_DISCOUNT BETWEEN 5 AND 7
  AND LO_QUANTITY BETWEEN 26 AND 35;
┌──────revenue─┐
│ 261680925983 │
└──────────────┘
1 rows in set. Elapsed: 0.011 sec. Processed 212.99 thousand rows, 1.64 MB (20.16 million rows/s., 155.64 MB/s.)
Clickhouse> select distinct LO_DISCOUNT from lineorder_flat order by LO_DISCOUNT ;
Clickhouse> select distinct LO_DISCOUNT from lineorder_flat order by LO_DISCOUNT ;
SELECT DISTINCT LO_DISCOUNT
FROM lineorder_flat
ORDER BY LO_DISCOUNT ASC
┌─LO_DISCOUNT─┐
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
│ 10 │
└─────────────┘
11 rows in set. Elapsed: 0.079 sec. Processed 59.99 million rows, 59.99 MB (755.22 million rows/s., 755.22 MB/s.)
第二部分
3.2 第二组查询(Q2.1 ~ Q2.3,以下小标题为查询编号)
2.1
-- Q2.1: LO_REVENUE summed per order year and brand, for parts in category
-- 'MFGR#12' supplied from region 'AMERICA'.
SELECT
    sum(LO_REVENUE),
    toYear(LO_ORDERDATE) AS year,
    P_BRAND
FROM lineorder_flat
WHERE P_CATEGORY = 'MFGR#12'
  AND S_REGION = 'AMERICA'
GROUP BY
    year,
    P_BRAND
ORDER BY
    year,
    P_BRAND;
2.2
-- Q2.2: LO_REVENUE summed per order year and brand, for brands in the string
-- range 'MFGR#2221' .. 'MFGR#2228' supplied from region 'ASIA'.
SELECT
    sum(LO_REVENUE),
    toYear(LO_ORDERDATE) AS year,
    P_BRAND
FROM lineorder_flat
WHERE P_BRAND >= 'MFGR#2221'
  AND P_BRAND <= 'MFGR#2228'
  AND S_REGION = 'ASIA'
GROUP BY
    year,
    P_BRAND
ORDER BY
    year,
    P_BRAND;
2.3
-- Q2.3: LO_REVENUE summed per order year for the single brand 'MFGR#2239'
-- supplied from region 'EUROPE'.
SELECT
    sum(LO_REVENUE),
    toYear(LO_ORDERDATE) AS year,
    P_BRAND
FROM lineorder_flat
WHERE P_BRAND = 'MFGR#2239'
  AND S_REGION = 'EUROPE'
GROUP BY
    year,
    P_BRAND
ORDER BY
    year,
    P_BRAND;
3.3 第三组查询(Q3.1 ~ Q3.4,以下小标题为查询编号)
3.1
-- Q3.1: revenue per (customer nation, supplier nation, order year) where both
-- customer and supplier are in region 'ASIA', for years 1992..1997.
-- Results are ordered by year ascending, then revenue descending.
SELECT
    C_NATION,
    S_NATION,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE)      AS revenue
FROM lineorder_flat
WHERE C_REGION = 'ASIA'
  AND S_REGION = 'ASIA'
  AND year >= 1992
  AND year <= 1997
GROUP BY
    C_NATION,
    S_NATION,
    year
ORDER BY
    year ASC,
    revenue DESC;
3.2
-- Q3.2: revenue per (customer city, supplier city, order year) where both
-- customer and supplier are in nation 'UNITED STATES', for years 1992..1997.
SELECT
    C_CITY,
    S_CITY,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE)      AS revenue
FROM lineorder_flat
WHERE C_NATION = 'UNITED STATES'
  AND S_NATION = 'UNITED STATES'
  AND year >= 1992
  AND year <= 1997
GROUP BY
    C_CITY,
    S_CITY,
    year
ORDER BY
    year ASC,
    revenue DESC;
3.3
-- Q3.3: revenue per (customer city, supplier city, order year) where both
-- cities are one of 'UNITED KI1' / 'UNITED KI5', for years 1992..1997.
SELECT
    C_CITY,
    S_CITY,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE)      AS revenue
FROM lineorder_flat
WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5')
  AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5')
  AND year >= 1992
  AND year <= 1997
GROUP BY
    C_CITY,
    S_CITY,
    year
ORDER BY
    year ASC,
    revenue DESC;
3.4
-- Q3.4: same city filter as Q3.3, but restricted to December 1997
-- (toYYYYMM = 199712) instead of a year range.
SELECT
    C_CITY,
    S_CITY,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE)      AS revenue
FROM lineorder_flat
WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5')
  AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5')
  AND toYYYYMM(LO_ORDERDATE) = 199712
GROUP BY
    C_CITY,
    S_CITY,
    year
ORDER BY
    year ASC,
    revenue DESC;
3.4 第四组查询(Q4.1 ~ Q4.3,以下小标题为查询编号)
4.1
-- Q4.1: profit (LO_REVENUE - LO_SUPPLYCOST, summed) per order year and
-- customer nation, where customer and supplier are both in region 'AMERICA'
-- and the part manufacturer is 'MFGR#1' or 'MFGR#2'.
SELECT
    toYear(LO_ORDERDATE)                AS year,
    C_NATION,
    sum(LO_REVENUE - LO_SUPPLYCOST)     AS profit
FROM lineorder_flat
WHERE C_REGION = 'AMERICA'
  AND S_REGION = 'AMERICA'
  AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2')
GROUP BY
    year,
    C_NATION
ORDER BY
    year ASC,
    C_NATION ASC;
4.2
-- Q4.2: profit per (order year, supplier nation, part category) for years
-- 1997 and 1998, where customer and supplier are both in region 'AMERICA'
-- and the part manufacturer is 'MFGR#1' or 'MFGR#2'.
SELECT
    toYear(LO_ORDERDATE)                AS year,
    S_NATION,
    P_CATEGORY,
    sum(LO_REVENUE - LO_SUPPLYCOST)     AS profit
FROM lineorder_flat
WHERE C_REGION = 'AMERICA'
  AND S_REGION = 'AMERICA'
  AND (year = 1997 OR year = 1998)
  AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2')
GROUP BY
    year,
    S_NATION,
    P_CATEGORY
ORDER BY
    year ASC,
    S_NATION ASC,
    P_CATEGORY ASC;
4.3
-- Q4.3: profit per (order year, supplier city, part brand) for years 1997 and
-- 1998, where the supplier nation is 'UNITED STATES' and the part category is
-- 'MFGR#14'.
SELECT
    toYear(LO_ORDERDATE)                AS year,
    S_CITY,
    P_BRAND,
    sum(LO_REVENUE - LO_SUPPLYCOST)     AS profit
FROM lineorder_flat
WHERE S_NATION = 'UNITED STATES'
  AND (year = 1997 OR year = 1998)
  AND P_CATEGORY = 'MFGR#14'
GROUP BY
    year,
    S_CITY,
    P_BRAND
ORDER BY
    year ASC,
    S_CITY ASC,
    P_BRAND ASC;
4.执行过程报错信息:
-- 报错信息:
↗ Progress: 13.78 million rows, 598.81 MB (397.87 thousand rows/s., 17.29 MB/s.) █████████████████████████████████ 22%Received exception from server (version 20.4.4):
Code: 241. DB::Exception: Received from localhost:9000. DB::Exception: Memory limit (total) exceeded: would use 2.63 GiB (attempt to allocate chunk of 4502632 bytes), maximum: 2.63 GiB.
0 rows in set. Elapsed: 35.138 sec. Processed 13.78 million rows, 598.81 MB (392.05 thousand rows/s., 17.04 MB/s.)
SELECT count(1)
FROM lineorder_flat
┌─count(1)─┐
│ 12060260 │
└──────────┘
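解决办法: SET max_memory_usage = 30000000000;
将此值由 20000000000 --> 30000000000
注意: 报错信息中的 “Memory limit (total) exceeded ... maximum: 2.63 GiB” 指的是服务器级的总内存上限(约为本机 3G 内存的 90%,对应服务端配置 max_server_memory_usage),而 max_memory_usage 是单个查询的上限;若调大 max_memory_usage 后仍然报同样的错误,需要增加机器内存或调整 max_server_memory_usage(待确认)。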
-- 磁盘空间不足:
↗ Progress: 36.92 million rows, 1.59 GB (373.84 thousand rows/s., 16.14 MB/s.) █████████████████████████████████████████████████████████████████████████████████████████▊ 59%Received exception from server (version 20.4.4):
Code: 243. DB::Exception: Received from localhost:9000. DB::Exception: Cannot reserve 118.10 MiB, not enough space.
0 rows in set. Elapsed: 99.118 sec. Processed 36.92 million rows, 1.59 GB (372.45 thousand rows/s., 16.08 MB/s.)
解决办法:
扩充磁盘或者删除不用的数据
参考:
https://clickhouse.tech/docs/en/getting-started/example-datasets/star-schema/
https://github.com/vadimtk/ssb-dbgen