add partition
添加分区时,可以为该分区单独指定Bucket数量。
mysql>
mysql> create table if not exists test_db.range_tb (
    -> -- Single-column range partitioning example on an aggregate-model table.
    -> -- Key columns: together they form the aggregate key declared below.
    ->     user_id         largeint    not null comment '用户id',
    ->     date            date        not null comment '数据插入日期',
    ->     timestamp       datetime    not null comment '数据插入时间戳',
    ->     city            varchar(20)          comment '城市',
    ->     age             smallint             comment '年龄',
    ->     sex             tinyint              comment '性别',
    -> -- Value columns: the keyword after the type (replace / sum / max / min)
    -> -- is the aggregation applied when rows share the same key.
    ->     last_visit_date datetime    replace  default '1970-01-01 00:00:00' comment '用户最后一次访问时间',
    ->     cost            bigint      sum      default '0' comment '用户总消费',
    ->     max_dwell_time  int         max      default '0' comment '用户最大停留时间',
    ->     min_dwell_time  int         min      default '0' comment '用户最小停留时间'
    -> )
    -> engine = olap
    -> aggregate key(user_id, date, timestamp, city, age, sex)
    -> partition by range(date)
    -> (
    -> -- One partition per month. The upper bound is exclusive and is the first
    -> -- day of the month following the one in the partition name, so
    -> -- p202107 holds rows with date < 2021-08-01, and so on.
    ->     partition p202107 values less than ('2021-08-01'),
    ->     partition p202108 values less than ('2021-09-01'),
    ->     partition p202109 values less than ('2021-10-01')
    -> )
    -> distributed by hash(user_id) buckets 10
    -> properties
    -> (
    ->     'replication_num' = '3',
    ->     'storage_medium' = 'SSD',
    ->     'storage_cooldown_time' = '2022-01-01 00:00:00'
    -> );
Query OK, 0 rows affected (0.30 sec)
mysql>
mysql>
mysql> create table if not exists test_db.multi_range_tb (
    -> -- Multi-column range partitioning example: partitions are bounded by
    -> -- (date, user_id) tuples instead of a single date value.
    -> -- Key columns.
    ->     `user_id`         largeint    not null comment '用户id',
    ->     `date`            date        not null comment '数据插入日期',
    ->     `timestamp`       datetime    not null comment '数据插入时间戳',
    ->     `city`            varchar(20)          comment '城市',
    ->     `age`             smallint             comment '年龄',
    ->     `sex`             tinyint              comment '性别',
    -> -- Value columns, each with its aggregation keyword.
    ->     `last_visit_date` datetime    replace  default '1970-01-01 00:00:00' comment '用户最后一次访问时间',
    ->     `cost`            bigint      sum      default '0' comment '用户总消费',
    ->     `max_dwell_time`  int         max      default '0' comment '用户最大停留时间',
    ->     `min_dwell_time`  int         min      default '0' comment '用户最小停留时间'
    -> )
    -> engine = olap
    -> aggregate key(`user_id`, `date`, `timestamp`, `city`, `age`, `sex`)
    -> partition by range(`date`, `user_id`)
    -> (
    -> -- Upper bounds are exclusive tuples; the last partition gives only the
    -> -- date component of its bound.
    ->     partition p202107_1000 values less than ('2021-08-01', '1000'),
    ->     partition p202108_2000 values less than ('2021-09-01', '2000'),
    ->     partition p202109_all  values less than ('2021-10-01')
    -> )
    -> distributed by hash(`user_id`) buckets 10
    -> properties
    -> (
    ->     'replication_num' = '3',
    ->     'storage_medium' = 'SSD',
    ->     'storage_cooldown_time' = '2022-01-01 00:00:00'
    -> );
Query OK, 0 rows affected (0.07 sec)
mysql>
分区结果为:
p202107_1000: [(MIN_VALUE, MIN_VALUE), ("2021-08-01", "1000"))
p202108_2000: [("2021-08-01", "1000"), ("2021-09-01", "2000"))
p202109_all:  [("2021-09-01", "2000"), ("2021-10-01", MIN_VALUE))
插入数据落入分区的情况如下:
数据 --> 分区
2021-07-01, 200 --> p202107_1000
2021-07-01, 2000 --> p202107_1000
2021-08-01, 100 --> p202107_1000
2021-08-01, 2000 --> p202108_2000
2021-08-15, 5000 --> p202108_2000
2021-09-01, 2000 --> p202109_all
2021-09-10, 1 --> p202109_all
2021-10-01, 1000 --> 无法导入
2021-11-01, 1000 --> 无法导入
List分区:0.14.0版本尚未支持。以下内容只是根据最新master版本的文档整理的记录,并未实际操作验证。
-- Single-column list partitioning example: each partition enumerates the
-- city values it accepts.
create table if not exists test_db.list_tb (
    -- Key columns.
    user_id         largeint    not null comment '用户id',
    date            date        not null comment '数据插入日期',
    timestamp       datetime    not null comment '数据插入时间戳',
    -- city is the list-partition column, so it is declared not null: the
    -- Doris release these notes target is expected to reject a nullable
    -- list-partition column (confirm against the version in use).
    city            varchar(20) not null comment '城市',
    age             smallint             comment '年龄',
    sex             tinyint              comment '性别',
    -- Value columns, each with its aggregation keyword.
    last_visit_date datetime    replace  default '1970-01-01 00:00:00' comment '用户最后一次访问时间',
    cost            bigint      sum      default '0' comment '用户总消费',
    max_dwell_time  int         max      default '0' comment '用户最大停留时间',
    min_dwell_time  int         min      default '0' comment '用户最小停留时间'
)
engine = olap
aggregate key(user_id, date, timestamp, city, age, sex)
partition by list(city)
(
    partition p_cn  values in ('Beijing', 'Shanghai', 'Hong Kong'),
    partition p_usa values in ('New York', 'San Francisco'),
    partition p_jp  values in ('Tokyo')
)
distributed by hash(user_id) buckets 10
properties
(
    'replication_num' = '3',
    'storage_medium' = 'SSD',
    'storage_cooldown_time' = '2022-01-01 00:00:00'
);
-- Multi-column list partitioning example: each partition enumerates the
-- (user_id, city) tuples it accepts.
-- The table is named multi_list_tb so that it does not collide with the
-- single-column example test_db.list_tb; with "if not exists" a reused name
-- would make this statement a silent no-op.
create table if not exists test_db.multi_list_tb (
    -- Key columns. user_id and city are the list-partition columns and are
    -- both declared not null: the Doris release these notes target is
    -- expected to reject nullable list-partition columns (confirm against
    -- the version in use).
    user_id         largeint    not null comment '用户id',
    date            date        not null comment '数据插入日期',
    timestamp       datetime    not null comment '数据插入时间戳',
    city            varchar(20) not null comment '城市',
    age             smallint             comment '年龄',
    sex             tinyint              comment '性别',
    -- Value columns, each with its aggregation keyword.
    last_visit_date datetime    replace  default '1970-01-01 00:00:00' comment '用户最后一次访问时间',
    cost            bigint      sum      default '0' comment '用户总消费',
    max_dwell_time  int         max      default '0' comment '用户最大停留时间',
    min_dwell_time  int         min      default '0' comment '用户最小停留时间'
)
engine = olap
aggregate key(user_id, date, timestamp, city, age, sex)
partition by list(user_id, city)
(
    partition p1_city values in (('1', 'Beijing'), ('1', 'Shanghai')),
    partition p2_city values in (('2', 'Beijing'), ('2', 'Shanghai')),
    partition p3_city values in (('3', 'Beijing'), ('3', 'Shanghai'))
)
distributed by hash(user_id) buckets 10
properties
(
    'replication_num' = '3',
    'storage_medium' = 'SSD',
    'storage_cooldown_time' = '2022-01-01 00:00:00'
);
分区结果为:
p1_city: [("1", "Beijing"), ("1", "Shanghai")]
p2_city: [("2", "Beijing"), ("2", "Shanghai")]
p3_city: [("3", "Beijing"), ("3", "Shanghai")]
插入数据落入分区的情况如下:
数据 ---> 分区
1, Beijing ---> p1_city
1, Shanghai ---> p1_city
2, Shanghai ---> p2_city
3, Beijing ---> p3_city
1, Tianjin ---> 无法导入
4, Beijing ---> 无法导入
通过 show data 命令查看,结果除以副本数,即为表的数据量。
添加分区示例(可以为新分区单独指定属性,例如副本数):
alter table multi_range_tb add partition p202110_3000 values less than ('2021-11-01', '3000') ('replication_num' = '1');
默认的表ENGINE是olap,由Doris负责管理。其它表引擎如mysql、es等,Doris只负责元数据映射,不储存数据,以便进行数据读取