0、结论
(1)原理
ClickHouse默认使用LZ4压缩。除此之外,ClickHouse还支持字段(列)级别的压缩设置:可以根据每一列的数据类型和数据特征,为其指定不同的CODEC,理论上可以提高压缩率和性能。
测试2种压缩codec:
- Delta:在timestamp上使用
- Gorilla:在float、integer等数值类型的列上使用(本测试中用于Distance、ArrDelay两个Int32列;注意Gorilla原本是针对缓慢变化的浮点数序列设计的)
(2)结论:
与默认的LZ4相比,使用上述CODEC后,写入、压缩、查询性能均没有明显提升,部分方面(例如使用Delta时的写入速度)甚至弱于默认的LZ4。
1、准备数据
$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar
$ tar xvf ontime.tar -C [CLICKHOUSE-DATA-PATH]
$ sudo service clickhouse-server restart
构造测试要用的基准数据(执行前需先按"2、写入"中"(2)基准"的建表语句创建datasets.compressBase表):
-- Populate the baseline table from the ontime dataset (flights after 2005).
-- The last expression builds an epoch-milliseconds value: midnight of FlightDate
-- (seconds * 1000) plus DepTime treated as HHMM, i.e. hours * 3,600,000 ms
-- and minutes * 60,000 ms.
INSERT INTO datasets.compressBase
SELECT
    FlightDate,
    FlightNum,
    OriginAirportID,
    DestAirportID,
    Distance,
    ArrDelay,
    toUnixTimestamp(concat(toString(FlightDate), ' 00:00:00')) * 1000
        + intDiv(DepTime, 100) * 3600000
        + modulo(DepTime, 100) * 60000
FROM datasets.ontime
WHERE Year > 2005;
2、写入
(1)测试方式
-- Write benchmark statement. [TABLE] is a placeholder, not valid SQL: substitute the
-- table under test before running. The source of the copy is always datasets.compressBase.
INSERT INTO [TABLE] SELECT * FROM datasets.compressBase
(2)基准
-- Baseline table: no column declares a CODEC, so every column uses the default
-- compression.
-- NOTE(review): `timestamp` is Int32, but the load statement in section 1 stores
-- epoch milliseconds (toUnixTimestamp(...) * 1000 + ...), roughly 1e12 for flights
-- after 2005, which is outside the Int32 range. Confirm whether Int64 was intended.
CREATE TABLE datasets.compressBase
(
    `FlightDate`      Date,
    `FlightNum`       String,
    `OriginAirportID` String,
    `DestAirportID`   String,
    `Distance`        Int32,
    `ArrDelay`        Int32,
    `timestamp`       Int32
)
ENGINE = MergeTree()
PARTITION BY FlightDate
ORDER BY (timestamp, OriginAirportID)
SETTINGS index_granularity = 8192
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 37.551 sec | 2.18 million rows/s., 119.32 MB/s. |
| 2 | 41.833 sec | 1.96 million rows/s., 107.11 MB/s. |
| 3 | 39.941 sec | 2.05 million rows/s., 112.18 MB/s. |
(3)Gorilla
-- Gorilla variant: same columns, engine, partitioning and sort key as
-- datasets.compressBase, except that Distance and ArrDelay declare CODEC(Gorilla).
-- NOTE(review): ClickHouse documents Gorilla for slowly changing floating-point
-- series; here it is applied to Int32 columns. Newer server versions may reject
-- this unless allow_suspicious_codecs is enabled. Confirm against the target version.
CREATE TABLE datasets.gorillaCodec
(
    `FlightDate`      Date,
    `FlightNum`       String,
    `OriginAirportID` String,
    `DestAirportID`   String,
    `Distance`        Int32 CODEC(Gorilla),
    `ArrDelay`        Int32 CODEC(Gorilla),
    `timestamp`       Int32
)
ENGINE = MergeTree()
PARTITION BY FlightDate
ORDER BY (timestamp, OriginAirportID)
SETTINGS index_granularity = 8192
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 39.328 sec | 2.09 million rows/s., 113.93 MB/s. |
| 2 | 44.405 sec | 1.85 million rows/s., 100.90 MB/s. |
| 3 | 42.132 sec | 1.95 million rows/s., 106.35 MB/s. |
(4)Delta
-- Delta variant: same columns, engine, partitioning and sort key as
-- datasets.compressBase, except that `timestamp` declares CODEC(Delta).
-- NOTE(review): per the ClickHouse codec documentation, Delta is a preparation
-- step rather than a compressor, and an explicit CODEC(...) replaces the default
-- codec instead of chaining with it. CODEC(Delta, LZ4) may be the fairer
-- comparison against the LZ4 baseline. Confirm.
CREATE TABLE datasets.deltaCodec
(
    `FlightDate`      Date,
    `FlightNum`       String,
    `OriginAirportID` String,
    `DestAirportID`   String,
    `Distance`        Int32,
    `ArrDelay`        Int32,
    `timestamp`       Int32 CODEC(Delta)
)
ENGINE = MergeTree()
PARTITION BY FlightDate
ORDER BY (timestamp, OriginAirportID)
SETTINGS index_granularity = 8192
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 55.676 sec | 1.10 million rows/s., 40.46 MB/s. |
| 2 | 53.192 sec | 1.16 million rows/s., 42.35 MB/s. |
| 3 | 56.626 sec | 1.09 million rows/s., 39.78 MB/s. |
(5)ALL
-- Combined variant: Distance and ArrDelay declare CODEC(Gorilla) and `timestamp`
-- declares CODEC(Delta); everything else matches datasets.compressBase.
-- NOTE(review): `all` is also an SQL keyword. If the parser rejects the
-- unquoted name, quote it with backticks. Confirm against the target version.
CREATE TABLE datasets.all
(
    `FlightDate`      Date,
    `FlightNum`       String,
    `OriginAirportID` String,
    `DestAirportID`   String,
    `Distance`        Int32 CODEC(Gorilla),
    `ArrDelay`        Int32 CODEC(Gorilla),
    `timestamp`       Int32 CODEC(Delta)
)
ENGINE = MergeTree()
PARTITION BY FlightDate
ORDER BY (timestamp, OriginAirportID)
SETTINGS index_granularity = 8192
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 54.830 sec | 1.12 million rows/s., 41.08 MB/s. |
| 2 | 54.722 sec | 1.12 million rows/s., 41.16 MB/s. |
| 3 | 60.001 sec | 1.02 million rows/s., 37.54 MB/s. |
3、存储
-- Compressed on-disk size of each test table.
-- Only active parts are counted: system.parts also lists inactive parts (for
-- example those superseded by merges) until they are removed, and including them
-- would inflate the totals. The database filter keeps same-named tables in other
-- databases out of the result. Output columns are unchanged: the sum, then table.
SELECT
    sum(data_compressed_bytes),
    table
FROM system.parts
WHERE active
  AND database = 'datasets'
  AND table IN ('compressBase', 'gorillaCodec', 'deltaCodec', 'all')
GROUP BY table
4、查询
(1)测试方式
-- Read benchmark statement. [TABLE] is a placeholder, not valid SQL: substitute the
-- table under test before running.
SELECT
    sum(Distance) AS all_distince,
    FlightNum
FROM [TABLE]
WHERE (OriginAirportID LIKE '15%' OR OriginAirportID LIKE '13%')
  AND DestAirportID LIKE '13%'
GROUP BY
    FlightNum,
    OriginAirportID
(2)基准
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 1.299 sec | 46.50 million rows/s., 453.19 MB/s. |
| 2 | 1.279 sec | 47.23 million rows/s., 460.34 MB/s. |
| 3 | 1.161 sec | 52.04 million rows/s., 507.19 MB/s. |
(3)Gorilla
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 1.512 sec | 39.96 million rows/s., 389.49 MB/s. |
| 2 | 1.135 sec | 53.22 million rows/s., 518.69 MB/s. |
| 3 | 1.266 sec | 47.71 million rows/s., 464.98 MB/s. |
(4)delta
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 1.252 sec | 48.26 million rows/s., 470.32 MB/s. |
| 2 | 1.184 sec | 51.04 million rows/s., 497.46 MB/s. |
| 3 | 1.147 sec | 52.66 million rows/s., 513.24 MB/s. |
(5)all
| 序号 | 耗时 | 速度 |
| --- | --- | --- |
| 1 | 1.315 sec | 45.93 million rows/s., 447.60 MB/s. |
| 2 | 1.297 sec | 46.57 million rows/s., 453.83 MB/s. |
| 3 | 1.190 sec | 50.78 million rows/s., 494.94 MB/s. |