EXPLAIN [AST | SYNTAX | PLAN | PIPELINE | TABLE OVERRIDE] [setting = value, ...]
[
SELECT ... |
tableFunction(...) [COLUMNS (...)] [ORDER BY ...] [PARTITION BY ...] [PRIMARY KEY] [SAMPLE BY ...] [TTL ...]
]
[FORMAT ...]
select
EXPLAIN AST SELECT 1;
┌─explain───────────────────────────┐
│ SelectWithUnionQuery (children 1) │
│ ExpressionList (children 1) │
│ SelectQuery (children 1) │
│ ExpressionList (children 1) │
│ Literal UInt64_1 │
└───────────────────────────────────┘
EXPLAIN AST ALTER TABLE t1 DELETE WHERE date = today();
┌─explain────────────────────────────┐
│ AlterQuery t1 (children 2) │
│ ExpressionList (children 1) │
│ AlterCommand DELETE (children 1) │
│ Function equals (children 1) │
│ ExpressionList (children 2) │
│ Identifier date │
│ Function today (children 1) │
│ ExpressionList │
│ Identifier t1 │
└────────────────────────────────────┘
EXPLAIN SYNTAX SELECT * FROM system.numbers AS a, system.numbers AS b, system.numbers AS c;
┌─explain────────────────────────────┐
│ SELECT │
│ `--a.number` AS `a.number`, │
│ `--b.number` AS `b.number`, │
│ number AS `c.number` │
│ FROM │
│ ( │
│ SELECT │
│ number AS `--a.number`, │
│ b.number AS `--b.number` │
│ FROM system.numbers AS a │
│ CROSS JOIN system.numbers AS b │
│ ) AS `--.s` │
│ CROSS JOIN system.numbers AS c │
└────────────────────────────────────┘
EXPLAIN SELECT sum(number) FROM numbers(10) GROUP BY number % 4;
┌─explain───────────────────────────────────────────────────────────────────────┐
│ Expression ((Projection + Before ORDER BY)) │
│ Aggregating │
│ Expression (Before GROUP BY) │
│ SettingQuotaAndLimits (Set limits and quota after reading from storage) │
│ ReadFromStorage (SystemNumbers) │
└───────────────────────────────────────────────────────────────────────────────┘
EXPLAIN PIPELINE SELECT sum(number) FROM numbers_mt(100000) GROUP BY number % 4;
┌─explain───────────────────────┐
│ (Expression) │
│ ExpressionTransform │
│ (Aggregating) │
│ AggregatingTransform │
│ (Expression) │
│ ExpressionTransform │
│ (SettingQuotaAndLimits) │
│ (ReadFromStorage) │
│ Limit │
│ Numbers 0 → 1 │
└───────────────────────────────┘
-- Create a test table; index_granularity = 16 keeps the mark count visible at small scale
CREATE TABLE ttt (i Int64) ENGINE = MergeTree() ORDER BY i SETTINGS index_granularity = 16, write_final_mark = 0;
-- Insert 128 rows of sample data
INSERT INTO ttt SELECT number FROM numbers(128);
-- Merge parts so the estimate below reflects a single part
OPTIMIZE TABLE ttt;
-- EXPLAIN ESTIMATE reports parts/rows/marks that would be read, without running the query
EXPLAIN ESTIMATE SELECT * FROM ttt;
┌─database─┬─table─┬─parts─┬─rows─┬─marks─┐
│ test │ ttt │ 1 │ 128 │ 8 │
└──────────┴───────┴───────┴──────┴───────┘
-- Create the source table on the remote MySQL server first
CREATE TABLE test.tbl (
id INT PRIMARY KEY,
created DATETIME DEFAULT now()
);
-- Preview which columns a TABLE OVERRIDE clause would use, without creating anything
EXPLAIN TABLE OVERRIDE mysql('127.0.0.1:3306', 'test', 'tbl2', 'root', '123456')
PARTITION BY toYYYYMM(assumeNotNull(created));
┌─explain─────────────────────────────────────────────────┐
│ PARTITION BY uses columns: `created` Nullable(DateTime) │
└─────────────────────────────────────────────────────────┘
-- ReplacingMergeTree keyed on create_time: on merge, rows with equal sorting key
-- are deduplicated, keeping the one with the largest create_time
CREATE TABLE t_type (
    id UInt32,
    sku_id String,
    total_amount Decimal(16,2),
    create_time Int32
) ENGINE = ReplacingMergeTree(create_time)
-- toDate() conversion is required here; partitioning on the raw Int32 raises an error
PARTITION BY toYYYYMMDD(toDate(create_time))
PRIMARY KEY (id)
ORDER BY (id, sku_id);
-- Create a table with a Nullable column to demonstrate NULL propagation
CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog;
-- Insert data: one row with NULL, one without
INSERT INTO t_null VALUES (1, NULL), (2, 3);
-- Query: x + y is NULL for the first row (arithmetic with NULL yields NULL)
SELECT x + y FROM t_null;
-- 官方案例 hits_v1 表
...
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
...
-- 官方案例 visits_v1 表
...
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)
...
写入过快报错,报错信息
\1. Code: 252, e.displayText() = DB::Exception: Too many parts (304). Merges are processing significantly slower than inserts
\2. Code: 241, e.displayText() = DB::Exception: Memory limit (for query) exceeded: would use 9.37 GiB (attempt to allocate chunk of 301989888 bytes), maximum: 9.31 GiB
配置项主要在 config.xml 或 users.xml 中, 基本上都在 users.xml 里
config.xml 配置项:https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings/
users.xml 配置项:https://clickhouse.com/docs/en/operations/settings/settings/
CPU 资源:
配置 | 描述 |
---|---|
background_pool_size | 后台线程池的大小,merge 线程就是在该线程池中执行,该线程池不仅仅是给 merge 线程用的,默认值 16,建议改成 cpu 个数的 2 倍(线程数)。 |
background_schedule_pool_size | 执行后台任务(复制表、Kafka 流、DNS 缓存更新)的线程数。默认 128,建议改成 cpu 个数的 2 倍(线程数)。 |
background_distributed_schedule_pool_size | 设置为分布式发送执行后台任务的线程数,默认 16,建议改成 cpu 个数的 2 倍(线程数)。 |
max_concurrent_queries | 最大并发处理的请求数(包含 select,insert 等),默认值 100,推荐 150(不够再加)~300。 |
max_threads | 设置单个查询所能使用的最大 cpu 个数,默认是 cpu 核数。 |
配置 | 描述 |
---|---|
max_memory_usage | 此参数在 users.xml 中,表示单次 Query 占用内存最大值,该值可以设置的比较大,这样可以提升集群查询的上限。保留一点给 OS,比如 128G 内存的机器,设置为 100GB。 |
max_bytes_before_external_group_by | 一般按照 max_memory_usage 的一半设置内存,当 group 使用内存超过阈值后会刷新到磁盘进行。因为 clickhouse 聚合分两个阶段:查询并及建立中间数据、合并中间数据,结合上一项,建议 50GB。 |
max_bytes_before_external_sort | 当 order by 已使用 max_bytes_before_external_sort 内存就进行溢写磁盘(基于磁盘排序),如果不设置该值,那么当内存不够时直接抛错,设置了该值 order by 可以正常完成,但是速度相对存内存来说肯定要慢点(实测慢的非常多,无法接受)。 |
max_table_size_to_drop | 此参数在 config.xml 中,应用于需要删除表或分区的情况,默认是50GB,意思是如果删除 50GB 以上的分区表会失败。建议修改为 0,这样不管多大的分区表都可以删除。 |
# Download the official dataset tarballs
wget https://datasets.clickhouse.com/hits/partitions/hits_v1.tar
wget https://datasets.clickhouse.com/visits/partitions/visits_v1.tar
# Extract directly into the ClickHouse data directory
tar -xvf hits_v1.tar -C /var/lib/clickhouse
tar -xvf visits_v1.tar -C /var/lib/clickhouse
# Fix ownership so the clickhouse service user can read the extracted files
chown -R clickhouse:clickhouse /var/lib/clickhouse/data/datasets
chown -R clickhouse:clickhouse /var/lib/clickhouse/metadata/datasets
# Verify the import: row count plus column count from system.columns
clickhouse-client --password 123456 --query "select count(*), (select count(*) from system.columns where database = 'datasets' and table = 'hits_v1') from datasets.hits_v1"
clickhouse-client --password 123456 --query "select count(*), (select count(*) from system.columns where database = 'datasets' and table = 'visits_v1') from datasets.visits_v1"
官方的 tar 包,包含了建库、建表语句、数据内容,这种方式不需要手动建库、建表,最方便。
explain syntax select a.UserID, b.VisitID, a.URL, b.UserID from hits_v1 as a left join (select UserID, UserID as uId, VisitID from visits_v1) as b using (UserID) limit 3;
-- 返回优化语句
┌─explain───────────────┐
│ SELECT │
│ UserID, │
│ VisitID, │
│ URL, │
│ b.UserID │
│ FROM hits_v1 AS a │
│ ALL LEFT JOIN │
│ ( │
│ SELECT │
│ UserID, │
│ VisitID │
│ FROM visits_v1 │
│ ) AS b USING (UserID) │
│ LIMIT 3 │
└───────────────────────┘
-- EXPLAIN SYNTAX shows the rewritten query: HAVING on a non-aggregated key column
explain syntax select UserID from hits_v1 group by UserID having UserID = '8585742290196126178';
-- Outer WHERE over a subquery
explain syntax select * from (select UserID from visits_v1) where UserID = '8585742290196126178';
-- Outer WHERE over a UNION ALL of subqueries
explain syntax select * from (
select * from (select UserID from visits_v1)
union all
select * from (select UserID from visits_v1)
) where UserID = '8585742290196126178';
explain syntax select sum(UserID * 2) from visits_v1;
-- 优化后的语句
┌─explain────────────────┐
│ SELECT sum(UserID) * 2 │
│ FROM visits_v1 │
└────────────────────────┘
explain syntax select sum(UserID * 2), max(VisitID), max(UserID) from visits_v1 group by UserID;
-- 优化后的语句
┌─explain──────────────┐
│ SELECT │
│ sum(UserID) * 2, │
│ max(VisitID), │
│ UserID │
│ FROM visits_v1 │
│ GROUP BY UserID │
└──────────────────────┘
explain syntax select * from visits_v1 order by UserID asc, UserID asc, VisitID asc, VisitID asc;
-- 优化后的语句
┌─explain───────────────────────────────────┐
│ SELECT │
│ ...... │
│ FROM visits_v1 │
│ ORDER BY │
│ UserID ASC, │
│ VisitID ASC │
└───────────────────────────────────────────┘
explain syntax select * from visits_v1 limit 3 by VisitID, VisitID limit 10;
-- 优化后的语句
┌─explain───────────────────────────────────┐
│ SELECT │
│ ...... │
│ FROM visits_v1 │
│ LIMIT 3 BY VisitID │
│ LIMIT 10 │
└───────────────────────────────────────────┘
explain syntax select a.UserID, a.UserID, b.VisitID, a.URL, b.UserID from hits_v1 as a left join visits_v1 as b using(UserID, UserID);
-- 优化后的语句
┌─explain─────────────────────────────────────┐
│ SELECT │
│ UserID, │
│ UserID, │
│ VisitID, │
│ URL, │
│ b.UserID │
│ FROM hits_v1 AS a │
│ ALL LEFT JOIN visits_v1 AS b USING (UserID) │
└─────────────────────────────────────────────┘
-- 统计各个表使用disk的情况
explain syntax with (select sum(bytes) from system.parts where active) as total_disk_usage
select (sum(bytes) / total_disk_usage) * 100 as table_disk_usage, table from system.parts group by table order by table_disk_usage desc limit 10;
-- 优化后的语句
┌─explain─────────────────────────────────────────────────────────────────────────┐
│ WITH identity(_CAST(0, 'Nullable(UInt64)')) AS total_disk_usage │
│ SELECT │
│ (sum(bytes_on_disk AS bytes) / total_disk_usage) * 100 AS table_disk_usage, │
│ table │
│ FROM system.parts │
│ GROUP BY table │
│ ORDER BY table_disk_usage DESC │
│ LIMIT 10 │
└─────────────────────────────────────────────────────────────────────────────────┘
explain syntax select number = 1 ? 'hello' : (number = 2 ? 'world' : 'hehe') from numbers(10) settings optimize_if_chain_to_multiif = 1;
-- 优化后的语句
┌─explain──────────────────────────────────────────────────────────┐
│ SELECT multiIf(number = 1, 'hello', number = 2, 'world', 'hehe') │
│ FROM numbers(10) │
│ SETTINGS optimize_if_chain_to_multiif = 1 │
└──────────────────────────────────────────────────────────────────┘
-- For the comparison below, first disable automatic WHERE-to-PREWHERE rewriting
set optimize_move_to_prewhere = 0;
-- Plain WHERE: the filter is applied after reading the selected columns
select WatchID,
JavaEnable,
Title,
GoodEvent,
EventTime,
EventDate,
CounterID,
ClientIP,
ClientIP6,
RegionID,
UserID,
CounterClass,
OS,
UserAgent,
URL,
Referer,
URLDomain,
RefererDomain,
Refresh,
IsRobot,
RefererCategories,
URLCategories,
URLRegions,
RefererRegions,
ResolutionWidth,
ResolutionHeight,
ResolutionDepth,
FlashMajor,
FlashMinor,
FlashMinor2
from datasets.hits_v1 where UserID='3198390223272470366';
-- PREWHERE variant: the filter column is read first, and only matching rows
-- read the remaining columns (reduces I/O compared with plain WHERE)
select WatchID,
JavaEnable,
Title,
GoodEvent,
EventTime,
EventDate,
CounterID,
ClientIP,
ClientIP6,
RegionID,
UserID,
CounterClass,
OS,
UserAgent,
URL,
Referer,
URLDomain,
RefererDomain,
Refresh,
IsRobot,
RefererCategories,
URLCategories,
URLRegions,
RefererRegions,
ResolutionWidth,
ResolutionHeight,
ResolutionDepth,
FlashMajor,
FlashMinor,
FlashMinor2
from datasets.hits_v1 prewhere UserID='3198390223272470366';
-- Same predicate selecting a single column, for comparison
select UserID from datasets.hits_v1 where UserID = '3198390223272470366';
-- Sampled query (requires the table to declare SAMPLE BY)
select Title, count(*) as PageViews from hits_v1
sample 0.1 -- reads ~10% of the data; an absolute row count also works
where CounterID = 57
group by Title
order by PageViews desc limit 1000;
-- Same query without sampling, for comparison
select Title, count(*) as PageViews from hits_v1
where CounterID = 57
group by Title
order by PageViews desc limit 1000;
-- Anti-pattern: SELECT * reads every column of a wide table
select * from datasets.hits_v1;
-- Preferred: list only the columns actually needed
select WatchID,
JavaEnable,
Title,
GoodEvent,
EventTime,
EventDate,
CounterID,
ClientIP,
ClientIP6,
RegionID,
UserID
from datasets.hits_v1;
-- Same column list, additionally filtered on EventDate (the partition key per the
-- hits_v1 definition above), which limits the partitions scanned
select WatchID,
JavaEnable,
Title,
GoodEvent,
EventTime,
EventDate,
CounterID,
ClientIP,
ClientIP6,
RegionID,
UserID
from datasets.hits_v1
where EventDate='2014-03-23';
-- Preferred: filter before sorting and bound the result with LIMIT
select UserID, Age from hits_v1 where CounterID = 57 order by Age desc limit 1000;
-- Anti-pattern: unbounded global sort over the whole table
select UserID, Age from hits_v1 order by Age desc;
-- Anti-pattern: computing derived values per row in the query
select Income, Age, Income/Age as IncRate from datasets.hits_v1;
-- Preferred: fetch raw columns; compute in the client, or store a precomputed column
select Income, Age from datasets.hits_v1;
-- Anti-pattern: exact distinct count (rewritten to uniqExact, see output below)
select count(distinct rand()) from hits_v1;
explain syntax select count(distinct rand()) from hits_v1;
┌─explain──────────────────┐
│ SELECT uniqExact(rand()) │
│ FROM hits_v1 │
└──────────────────────────┘
-- Preferred: uniqCombined gives an approximate distinct count at much lower cost
-- (fix: the statement was missing its terminating semicolon, which would make a
-- multi-statement script swallow the following CREATE TABLE)
select uniqCombined(rand()) from datasets.hits_v1;
-- Build a small table (10k rows) for the join-order demonstrations
CREATE TABLE visits_v2 ENGINE = CollapsingMergeTree(Sign)
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)
SAMPLE BY intHash32(UserID)
SETTINGS index_granularity = 8192
as select * from visits_v1 limit 10000;
-- Build an empty sink table for the join results (WHERE 1=0 copies schema only),
-- so the console is not flooded with result rows
CREATE TABLE hits_v2 ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID)
SETTINGS index_granularity = 8192
as select * from hits_v1 where 1=0;
-- Preferred: use IN when the join is only a membership filter
insert into hits_v2 select a.* from hits_v1 a where a.CounterID in (select CounterID from visits_v1);
-- Anti-pattern: a JOIN used only as a filter
insert into hits_v2 select a.* from hits_v1 a left join visits_v1 b on a.CounterID = b.CounterID;
-- Small table on the right-hand side of the join
insert into table hits_v2 select a.* from hits_v1 a left join visits_v2 b on a.CounterID = b.CounterID;
-- Large table on the right-hand side, for comparison
insert into table hits_v2 select a.* from visits_v2 b left join hits_v1 a on a.CounterID = b.CounterID;
-- Compare the rewritten queries: filter on the left table vs. on the right table
explain syntax select a.* from hits_v1 a left join visits_v2 b on a.CounterID = b.CounterID having a.EventDate = '2014-03-17';
explain syntax select a.* from hits_v1 a left join visits_v2 b on a.CounterID = b.CounterID having b.StartDate = '2014-03-17';
insert into hits_v2 select a.* from hits_v1 a left join visits_v2 b on a.CounterID=b.CounterID where a.EventDate = '2014-03-17';
insert into hits_v2 select a.* from (select * from hits_v1 where EventDate = '2014-03-17') a left join visits_v2 b on a.CounterID = b.CounterID;