Source: notes on the Atguigu (尚硅谷) ClickHouse tutorial
ClickHouse is a column-oriented DBMS open-sourced by Russia's Yandex in 2016. It is written in C++, built for online analytical processing (OLAP), and can generate analytical reports in real time from SQL queries.
Advantages of columnar storage:
- Aggregations such as count and sum are much faster than with row-oriented storage.
- Because every value in a column has the same type, the data compresses well.
ClickHouse uses an LSM-Tree-style structure: imported data is written sequentially and is immutable once written, which exploits the disk's sequential throughput.
Data is divided into multiple partitions, and each partition into multiple index granules (index granularity), so a single query can use all of the machine's CPU cores: extreme parallel processing.
Because one query already occupies many cores, ClickHouse is poorly suited to serving many concurrent queries.
Single-table queries outperform join queries.
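To make the workload concrete, here is a minimal sketch of the kind of single-table aggregation ClickHouse is built for (the hits table and its columns are hypothetical):

```sql
-- Reads only the one referenced column, scanning it in parallel on all cores.
select toYYYYMM(event_date) as month,
       count() as pv
from hits
group by month
order by month;
```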
Preparation:
Turn off the firewall.
Lift the open-file limits:
[atguigu@hadoop102 ~]$ sudo vim /etc/security/limits.conf
* soft nofile 65536
* hard nofile 65536
* soft nproc 131072
* hard nproc 131072
[atguigu@hadoop102 ~]$ sudo vim /etc/security/limits.d/20-nproc.conf
* soft nofile 65536
* hard nofile 65536
* soft nproc 131072
* hard nproc 131072
Then sync the files to all 3 machines.
Install dependencies:
sudo yum install -y libtool
sudo yum install -y *unixODBC*
Disable SELinux:
[atguigu@hadoop102 ~]$ sudo vim /etc/selinux/config
SELINUX=disabled
Sync to all 3 machines (takes effect after a reboot).
Install:
Official site:
Download address:
rpm -ivh *.rpm
sudo rpm -qa | grep clickhouse   // check what was installed
sudo vim /etc/clickhouse-server/config.xml
Data path: <path>/var/lib/clickhouse/</path>
Log path: <log>/var/log/clickhouse-server/clickhouse-server.log</log>
External access: <listen_host>::</listen_host> allows connections from hosts other than localhost.
Sync to all 3 machines.
Start:
systemctl start clickhouse-server
/etc/init.d/clickhouse-server start
Disable start on boot:
systemctl disable clickhouse-server
Connect:
clickhouse-client -m
clickhouse-client --host=192.168.1.151 --port=9000 -u default --password
// -m enables multi-line input
Start the server manually (without systemd):
nohup clickhouse-server --config-file=/etc/clickhouse-server/config.xml &
Type | Notes |
---|---|
Integer | Int8/16/32/64, UInt8/16/32/64 |
Floating point | Float32 (float), Float64 (double) |
Boolean | none; use UInt8 restricted to 0 or 1 |
Decimal | Decimal32(s), Decimal64(s), Decimal128(s), equivalent to Decimal(9,s), Decimal(18,s), Decimal(38,s): 9/18/38 significant digits, s of them after the decimal point |
String | String, FixedString(N) |
Enum | Enum8/Enum16. create table t(x Enum8('hello'=1,'world'=2)) engine=TinyLog; insert into t values('hello'),('world'),('hello'); select * from t; select cast(x,'Int8') from t; // only 'hello' or 'world' can be stored |
Date/time | Date (YYYY-MM-DD), Datetime (YYYY-MM-DD hh:mm:ss), Datetime64 (YYYY-MM-DD hh:mm:ss.fraction, sub-second) |
Array | Array(T). select array(1,2) as x, toTypeName(x); |
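A small sketch tying several of these types together (the table name t_types is made up for illustration):

```sql
create table t_types
(
    id    UInt32,
    flag  UInt8,              -- stand-in for a boolean: only 0 or 1
    price Decimal(16, 2),
    name  String,
    tags  Array(String),
    dt    Datetime
) engine = TinyLog;

insert into t_types values (1, 1, 99.90, 'demo', ['a','b'], '2020-06-01 12:00:00');

select id, price, tags, toTypeName(tags) from t_types;   -- Array(String)
```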
Engine | Notes |
---|---|
TinyLog | Saved on disk as one file per column; no indexes, hence no concurrency support. For small tables with little data, and for everyday practice and testing. create table t(id String, name String) engine=TinyLog; |
Memory | Data lives in memory and is lost when the server restarts; no index support, but simple queries are extremely fast. For testing, and for scenarios that need very high performance on a dataset that is not too large (up to roughly 100 million rows). |
MergeTree | Supports indexes and partitions. |
create table t_order_mt(
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
) engine =MergeTree
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id);
// order by is required. The primary key must be a prefix of the order by columns: if order by is (id, sku_id), the primary key must be id or (id, sku_id).
insert into t_order_mt values
(101,'sku_001',1000.00,'2020-06-01 12:00:00') ,
(102,'sku_002',2000.00,'2020-06-01 11:00:00'),
(102,'sku_004',2500.00,'2020-06-01 12:00:00'),
(102,'sku_002',2000.00,'2020-06-01 13:00:00'),
(102,'sku_002',12000.00,'2020-06-01 13:00:00'),
(102,'sku_002',600.00,'2020-06-02 12:00:00');
// Merge manually; otherwise an automatic merge happens some 10 to 15 minutes later
optimize table xxxx final;
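Until a merge runs, a repeated insert lands in a new data part of the same partition, so a plain select returns the rows grouped by part rather than fully merged; a quick sketch against the t_order_mt table above:

```sql
-- Re-insert one row: it goes into a fresh part.
insert into t_order_mt values
(101,'sku_001',1000.00,'2020-06-01 12:00:00');

select * from t_order_mt;           -- rows appear in separate blocks, one per part
optimize table t_order_mt final;    -- force the merge now
select * from t_order_mt;           -- one part per partition (MergeTree itself never deduplicates)
```

Next, a secondary (data-skipping) index: GRANULARITY 5 means one skip-index entry summarizes 5 primary-index granules.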
create table t_order_mt2(
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime,
INDEX a total_amount TYPE minmax GRANULARITY 5
) engine =MergeTree
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id, sku_id);
insert into t_order_mt2 values
(101,'sku_001',1000.00,'2020-06-01 12:00:00') ,
(102,'sku_002',2000.00,'2020-06-01 11:00:00'),
(102,'sku_004',2500.00,'2020-06-01 12:00:00'),
(102,'sku_002',2000.00,'2020-06-01 13:00:00'),
(102,'sku_002',12000.00,'2020-06-01 13:00:00'),
(102,'sku_002',600.00,'2020-06-02 12:00:00');
clickhouse-client --send_logs_level=trace <<< 'select * from t_order_mt2 where total_amount > toDecimal32(900., 2)';
TTL (time to live of data)
Supported time units: SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR
create table t_order_mt3(
id UInt32,
sku_id String,
total_amount Decimal(16,2) TTL create_time+interval 10 SECOND,
create_time Datetime
) engine =MergeTree
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id, sku_id);
insert into t_order_mt3 values
(106,'sku_001',1000.00,'2020-06-12 22:52:30'),
(107,'sku_002',2000.00,'2020-06-12 22:52:30'),
(110,'sku_003',600.00,'2020-06-13 12:00:00');
optimize table t_order_mt3 final;
// the total_amount value is cleared 10 seconds after create_time (applied when a merge runs)
Table-level TTL (the whole row expires):
alter table t_order_mt3 MODIFY TTL create_time + INTERVAL 10 SECOND;
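Table-level TTL can also be declared when the table is created; a minimal sketch (the table name t_ttl_demo is made up):

```sql
create table t_ttl_demo
(
    id          UInt32,
    create_time Datetime
) engine = MergeTree
order by id
TTL create_time + INTERVAL 10 SECOND;   -- whole rows expire 10s after create_time
```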
Engine | Notes |
---|---|
ReplacingMergeTree | Inherits from MergeTree and adds deduplication. Dedup happens only during merges, whose timing is unpredictable. If the table is partitioned, dedup happens only inside a partition, never across partitions. The dedup key is the order by columns; the engine argument (create_time here) is the version column, and the row with the largest version wins. |
create table t_order_rmt(
id UInt32,
sku_id String,
total_amount Decimal(16,2) ,
create_time Datetime
) engine =ReplacingMergeTree(create_time)
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id, sku_id);
insert into t_order_rmt values
(101,'sku_001',1000.00,'2020-06-01 12:00:00') ,
(102,'sku_002',2000.00,'2020-06-01 11:00:00'),
(102,'sku_004',2500.00,'2020-06-01 12:00:00'),
(102,'sku_002',2000.00,'2020-06-01 13:00:00'),
(102,'sku_002',12000.00,'2020-06-01 13:00:00'),
(102,'sku_002',600.00,'2020-06-02 12:00:00');
select * from t_order_rmt;
OPTIMIZE TABLE t_order_rmt FINAL;
select * from t_order_rmt;
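If you cannot wait for a background merge, the FINAL modifier deduplicates at read time (exact, but slower):

```sql
select * from t_order_rmt final;
```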
Engine | Notes |
---|---|
SummingMergeTree | Pre-aggregation: during merges, rows with the same order by key are collapsed into one row whose designated numeric column (total_amount here) holds the sum. As with dedup, this happens only at merge time and only within a partition. |
create table t_order_smt(
id UInt32,
sku_id String,
total_amount Decimal(16,2) ,
create_time Datetime
) engine =SummingMergeTree(total_amount)
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id );
insert into t_order_smt values
(101,'sku_001',1000.00,'2020-06-01 12:00:00'),
(102,'sku_002',2000.00,'2020-06-01 11:00:00'),
(102,'sku_004',2500.00,'2020-06-01 12:00:00'),
(102,'sku_002',2000.00,'2020-06-01 13:00:00'),
(102,'sku_002',12000.00,'2020-06-01 13:00:00'),
(102,'sku_002',600.00,'2020-06-02 12:00:00');
select * from t_order_smt;
OPTIMIZE TABLE t_order_smt FINAL;
select * from t_order_smt;
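Since merge timing is not guaranteed, a freshly inserted row may not have been folded in yet, so queries should still aggregate explicitly instead of trusting one row per key:

```sql
select id, sku_id, sum(total_amount)
from t_order_smt
group by id, sku_id;
```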
When designing such a pre-aggregation table, unique keys and serial numbers can be dropped; every remaining column should be a dimension, a measure, or a timestamp.
insert
insert into [table_name] values(…),(….)
insert into [table_name] select a,b,c from [table_name_2]
update and delete (implemented as mutations: the affected data parts are discarded and rewritten rather than changed in place, so these are heavy operations)
alter table t_order_smt delete where sku_id ='sku_001';
alter table t_order_smt update total_amount=toDecimal32(2000.00,2) where id=102;
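Mutations execute asynchronously; progress can be watched in the system.mutations system table (is_done becomes 1 when a mutation finishes):

```sql
select mutation_id, command, is_done
from system.mutations
where is_done = 0;
```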
Queries
- Subqueries and CTEs (the with clause) are supported; JOIN works, but join results are not cached, so repeated identical SQL re-runs the join every time.
- Window functions are still experimental.
- User-defined functions are not supported.
- group by adds with rollup / with cube / with totals for subtotals and grand totals, as the queries below show.
alter table t_order_mt delete where 1=1;
insert into t_order_mt values
(101,'sku_001',1000.00,'2020-06-01 12:00:00'),
(101,'sku_002',2000.00,'2020-06-01 12:00:00'),
(103,'sku_004',2500.00,'2020-06-01 12:00:00'),
(104,'sku_002',2000.00,'2020-06-01 12:00:00'),
(105,'sku_003',600.00,'2020-06-02 12:00:00'),
(106,'sku_001',1000.00,'2020-06-04 12:00:00'),
(107,'sku_002',2000.00,'2020-06-04 12:00:00'),
(108,'sku_004',2500.00,'2020-06-04 12:00:00'),
(109,'sku_002',2000.00,'2020-06-04 12:00:00'),
(110,'sku_003',600.00,'2020-06-01 12:00:00');
------------------------------------
// with rollup: drops dimensions from right to left, producing subtotals at each level
select id, sku_id, sum(total_amount) from t_order_mt
group by id, sku_id with rollup;
// with cube: subtotals for every combination of dimensions (right to left, then left to right)
select id, sku_id, sum(total_amount) from t_order_mt
group by id, sku_id with cube;
// with totals: only the grouped rows plus one grand-total row
select id, sku_id, sum(total_amount) from t_order_mt
group by id, sku_id with totals;
alter
// Add a column
alter table tableName add column newcolname String after col1;
// Change a column's type
alter table tableName modify column newcolname String;
// Drop a column
alter table tableName drop column newcolname;
// Export data
clickhouse-client --query "select * from t_order_mt where create_time='2020-06-01 12:00:00'" --format CSVWithNames > /opt/module/data/rs1.csv
Replication: for high availability of the data. Configure the ZooKeeper quorum in /etc/clickhouse-server/config.d/metrika.xml:
<yandex>
    <zookeeper-servers>
        <node index="1">
            <host>hadoop102</host>
            <port>2181</port>
        </node>
        <node index="2">
            <host>hadoop103</host>
            <port>2181</port>
        </node>
        <node index="3">
            <host>hadoop104</host>
            <port>2181</port>
        </node>
    </zookeeper-servers>
</yandex>
// then sync to all 3 machines
<zookeeper incl="zookeeper-servers" optional="true" />
<include_from>/etc/clickhouse-server/config.d/metrika.xml</include_from>
// add the two lines above to /etc/clickhouse-server/config.xml so metrika.xml takes effect
// then sync to all 3 machines and restart:
clickhouse restart
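// On hadoop102 (hence the replica name rep_102):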
create table t_order_rep2 (
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
) engine =ReplicatedMergeTree('/clickhouse/table/01/t_order_rep','rep_102')
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id);
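// On hadoop103: same table and zk_path, but a different replica name: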
create table t_order_rep2 (
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
) engine =ReplicatedMergeTree('/clickhouse/table/01/t_order_rep','rep_103')
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id);
// insert on one machine; every replica of the table then has the data
insert into t_order_rep2 values
(101,'sku_001',1000.00,'2020-06-01 12:00:00'),
(102,'sku_002',2000.00,'2020-06-01 12:00:00'),
(103,'sku_004',2500.00,'2020-06-01 12:00:00'),
(104,'sku_002',2000.00,'2020-06-01 12:00:00'),
(105,'sku_003',600.00,'2020-06-02 12:00:00');
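To confirm the replication, query the table on the other replica (e.g. on hadoop103); the same five rows should come back:

```sql
select * from t_order_rep2;
```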
ReplicatedMergeTree(zk_path, replica_name)
zk_path: /clickhouse/table/{shard}/{table_name}
shard: with a single shard, just use 01
replica_name: replicas of the same shard must not share a name.
Sharding: horizontal scale-out of the data.
metrika.xml
/etc/clickhouse-server/config.d/metrika.xml
shard is a data shard; replica is a copy of that shard.
The number of <shard> elements is the number of shards, and each shard keeps as many copies as it has <replica> elements.
<yandex>
    <remote_servers>
        <gmall_cluster>
            <shard>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>hadoop101</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>hadoop102</host>
                    <port>9000</port>
                </replica>
            </shard>
            <shard>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>hadoop103</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>hadoop104</host>
                    <port>9000</port>
                </replica>
            </shard>
            <shard>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>hadoop105</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>hadoop106</host>
                    <port>9000</port>
                </replica>
            </shard>
        </gmall_cluster>
    </remote_servers>
    <zookeeper-servers>
        <node index="1">
            <host>hadoop102</host>
            <port>2181</port>
        </node>
        <node index="2">
            <host>hadoop103</host>
            <port>2181</port>
        </node>
        <node index="3">
            <host>hadoop104</host>
            <port>2181</port>
        </node>
    </zookeeper-servers>
    <macros>
        <shard>01</shard>
        <replica>rep_1_1</replica>
    </macros>
</yandex>
// then sync to every machine in the cluster, remembering to change <macros> (shard / replica) on each host
vim /etc/clickhouse-server/config.d/metrika-shard.xml
Reference it from /etc/clickhouse-server/config.xml:
<zookeeper incl="zookeeper-servers" optional="true" />
<include_from>/etc/clickhouse-server/config.d/metrika-shard.xml</include_from>
// sync to the cluster
clickhouse restart
ps -ef | grep click
create table st_order_mt on cluster gmall_cluster (
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
) engine
=ReplicatedMergeTree('/clickhouse/tables/{shard}/st_order_mt','{replica}')
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id);
show tables
=================
// Distributed table
Engine | Notes |
---|---|
Distributed | Distributed table: Distributed(cluster_name, db_name, local_table_name, sharding_key). The sharding key must be an integer; wrap other types with hiveHash(), or use rand(). |
create table st_order_mt_all2 on cluster gmall_cluster
(
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
)engine = Distributed(gmall_cluster,default, st_order_mt,hiveHash(sku_id));
-----------------------
insert into st_order_mt_all2 values
(201,'sku_001',1000.00,'2020-06-01 12:00:00') ,
(202,'sku_002',2000.00,'2020-06-01 12:00:00'),
(203,'sku_004',2500.00,'2020-06-01 12:00:00'),
(204,'sku_002',2000.00,'2020-06-01 12:00:00'),
(205,'sku_003',600.00,'2020-06-02 12:00:00');
SELECT * FROM st_order_mt_all2; // query the distributed table: rows from every shard
select * from st_order_mt; // query the local table: only this shard's rows
<dependency>
    <groupId>com.github.housepower</groupId>
    <artifactId>clickhouse-native-jdbc</artifactId>  <!-- alternative: the official clickhouse-jdbc -->
    <version>2.6.0</version>
</dependency>
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
object SparkCoreAndClickHouse {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("ClickHouse")
    val spark: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
    // The native driver talks to TCP port 9000; the official jar
    // (ru.yandex.clickhouse.ClickHouseDriver) uses HTTP port 8123 instead.
    val url = "jdbc:clickhouse://10.10.10.113:9000"
    val driver = "com.github.housepower.jdbc.ClickHouseDriver"
    // Read the default.user table over JDBC.
    val df = spark.read
      .format("jdbc")
      .option("driver", driver)
      .option("url", url)
      .option("user", "default")
      .option("password", "")
      .option("dbtable", "default.user")
      .load()
    df.show()
    // Write the same rows back, appending to default.user.
    val pro = new java.util.Properties
    pro.put("driver", driver)
    df.write
      .mode(SaveMode.Append)
      .option("batchsize", "20000")
      .option("isolationLevel", "NONE")
      .option("numPartitions", "1")
      .jdbc(url, "default.user", pro)
    // Stop the Spark session.
    spark.close()
  }
}