#1. Download the Kafka release tarball
wget https://dlcdn.apache.org/kafka/3.2.0/kafka_2.13-3.2.0.tgz
#2. Extract the tarball (note: the original comment said "parse"; it means extract)
$ tar -xzf kafka_2.13-3.2.0.tgz
$ cd kafka_2.13-3.2.0
#3. Start ZooKeeper in the background (this Kafka version still requires ZooKeeper unless running in KRaft mode)
$ nohup bin/zookeeper-server-start.sh config/zookeeper.properties &
#4. Start the Kafka broker in the background
$ nohup bin/kafka-server-start.sh config/server.properties &
#5. Create the "test" topic that the Doris Routine Load job below will consume from
$ bin/kafka-topics.sh --create --topic test --bootstrap-server localhost:9092
# Create the target database for the Kafka -> Doris pipeline
create database kafka_doris;
# Switch the session to the new database
use kafka_doris;
# Create the clicklog table.
# Doris Duplicate-Key model: rows sharing (clickTime, type) are all kept as-is
# (no aggregation/dedup); the key columns only define sort order.
# BUCKETS 1 and a single replica are demo settings — tune for production.
# NOTE(review): camelCase column names (clickTime) are kept because the
# Routine Load COLUMNS mapping below matches JSON fields by these exact names.
CREATE TABLE IF NOT EXISTS kafka_doris.clicklog
(
`clickTime` DATETIME NOT NULL COMMENT "点击时间",
`type` VARCHAR(20) NOT NULL COMMENT "点击类型",
`id` VARCHAR(100) COMMENT "唯一id",
`user` VARCHAR(100) COMMENT "用户名称",
`city` VARCHAR(50) COMMENT "所在城市"
)
DUPLICATE KEY(`clickTime`, `type`)
DISTRIBUTED BY HASH(`type`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
创建有正则表达式的Routine Load任务:
# Routine Load job: continuously consume JSON messages from Kafka topic "test"
# into kafka_doris.clicklog.
# - With "format"="json", source fields are matched to the listed column names.
# - `user` is NOT taken from the message; it is derived from `type` via
#   regexp_extract: capture group 2 = the lowercase run after the literal 'C'
#   (e.g. type "AbCdE" -> user "d").
# - `city` is never mapped, so it loads as NULL.
# Batch properties: flush at most every 5 s, 300000 rows, or 200 MB
# (209715200 bytes); strict_mode=false lets non-conforming values become NULL
# instead of failing the row.
# Consumer group "doris" starts, by default, from OFFSET_END of the topic.
CREATE ROUTINE LOAD kafka_doris.load_from_kafka ON clicklog
COLUMNS(clickTime,id,type,user=regexp_extract(type, '([[:lower:]]+)C([[:lower:]]+)', 2))
PROPERTIES
(
"desired_concurrent_number"="1",
"max_batch_interval" = "5",
"max_batch_rows" = "300000",
"max_batch_size" = "209715200",
"strict_mode" = "false",
"format" = "json"
)
FROM KAFKA
(
"kafka_broker_list" = "127.0.0.1:9092",
"kafka_topic" = "test",
"property.group.id" = "doris"
);
其中user=regexp_extract(type, '([[:lower:]]+)C([[:lower:]]+)', 2)将type字段中的数据按照正则表达式函数抽取到user字段。
mysql> SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2);
+-------------------------------------------------------------+
| regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2) |
+-------------------------------------------------------------+
| d                                                           |
+-------------------------------------------------------------+
1 row in set (0.00 sec)
具体Doris 的regexp_extract函数参照: https://doris.apache.org/zh-CN/docs/sql-manual/sql-functions/string-functions/regexp/regexp_extract?_highlight=regexp_extract
通过kafka-console-producer向kafka集群发送数据。
[root@17a5da45700b kafka_2.13-3.2.0]# ./bin/kafka-console-producer.sh --topic test --bootstrap-server localhost:9092
>
>{"id":"1","user":"user","type":"AbCdE","clickTime":"2022-06-17 01:08:21"}
数据验证:
mysql> select * from clicklog;
+---------------------+---------+------+------+------+
| clickTime | type | id | user | city |
+---------------------+---------+------+------+------+
| 2022-06-17 01:08:21 | AbCdE | 1 | d | NULL |
+---------------------+---------+------+------+------+
可以看到user字段的值是对type执行正则表达式【user=regexp_extract(type, '([[:lower:]]+)C([[:lower:]]+)', 2)】抽取的结果。
任务查看:
mysql> SHOW ALL ROUTINE LOAD FOR load_from_kafka \G;
*************************** 1. row ***************************
Id: 1884278
Name: load_from_kafka_test
CreateTime: 2022-08-23 13:16:38
PauseTime: NULL
EndTime: NULL
DbName: default_cluster:kafka_doris
TableName: clicklog
State: RUNNING
DataSourceType: KAFKA
CurrentTaskNum: 1
JobProperties: {"timezone":"Europe/London","send_batch_parallelism":"1","load_to_single_tablet":"false","maxBatchSizeBytes":"209715200","exec_mem_limit":"2147483648","strict_mode":"false","jsonpaths":"","currentTaskConcurrentNum":"1","fuzzy_parse":"false","partitions":"*","columnToColumnExpr":"clickTime,id,type,user=regexp_extract(`type`, '([[:lower:]]+)C([[:lower:]]+)', 2)","maxBatchIntervalS":"5","whereExpr":"*","dataFormat":"json","precedingFilter":"*","mergeType":"APPEND","format":"json","json_root":"","deleteCondition":"*","desireTaskConcurrentNum":"1","maxErrorNum":"0","strip_outer_array":"false","execMemLimit":"2147483648","num_as_string":"false","maxBatchRows":"300000"}
DataSourceProperties: {"topic":"test","currentKafkaPartitions":"0","brokerList":"127.0.0.1:9092"}
CustomProperties: {"group.id":"doris","kafka_default_offsets":"OFFSET_END"}
Statistic: {"receivedBytes":71,"runningTxns":[],"errorRows":0,"committedTaskNum":7,"loadedRows":1,"loadRowsRate":0,"abortedTaskNum":0,"errorRowsAfterResumed":0,"totalRows":1,"unselectedRows":0,"receivedBytesRate":0,"taskExecuteTimeMs":35621}
Progress: {"0":"49"}
Lag: {"0":0}
ReasonOfStateChanged:
ErrorLogUrls:
OtherMsg:
1 row in set (0.00 sec)
ERROR:
No query specified
(上面的 ERROR 由 \G 之后多余的分号引起,属于无害提示,去掉分号即可消除,不影响查询结果。)
从任务状态(State: RUNNING)可以看出,Routine Load 任务正在正常运行,已成功导入 1 行数据(loadedRows: 1,errorRows: 0)。
最后宣传下我的书:《Spark内核和应用实战》,可以购买我的新书。
京东地址: https://item.jd.com/13613302.html