Packaging the latest Hudi master branch code
a. Run install at the project root
mvn clean install -DskipTests -Dscala-2.12 -Pflink-bundle-shade-hive1
b. Then build packaging/hudi-flink-bundle
mvn clean install -DskipTests -Dhadoop.version=2.6.0-cdh5.12.1 -Dscala-2.12 -Pflink-bundle-shade-hive1
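If the build succeeds, the bundle jar is written to the module's Maven target directory; a quick sanity check (the exact version suffix depends on the branch you built):
ls packaging/hudi-flink-bundle/target/hudi-flink-bundle_2.12-*.jar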
Preparation
a. Start a Flink YARN session cluster
./bin/yarn-session.sh -s 2 -jm 2048 -tm 5120 -nm FlinkAndHudi -qu root.media -at Flink -d
b. Export the Hadoop environment variables
export HADOOP_CONF_DIR=/home/.../hadoopconf
export HADOOP_CLASSPATH=`hadoop classpath`
Flink SQL lake ingestion workflow
a. Create a mylib directory with the following jars (a setup sketch follows the list):
flink-sql-connector-kafka_2.12-1.14.2.jar
hudi-flink-bundle_2.12-*.jar (built above)
hadoop-mapreduce-client-core-2.7.3.jar
If these jars are placed directly under the lib directory, they can easily cause class conflicts, hence the separate mylib directory.
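A minimal setup sketch, assuming the two connector jars were downloaded to /path/to (hypothetical location) and HUDI_HOME points at the Hudi source tree built above:
mkdir mylib
cp /path/to/flink-sql-connector-kafka_2.12-1.14.2.jar mylib/
cp /path/to/hadoop-mapreduce-client-core-2.7.3.jar mylib/
cp $HUDI_HOME/packaging/hudi-flink-bundle/target/hudi-flink-bundle_2.12-*.jar mylib/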
b. Start the SQL client against the YARN session:
./bin/sql-client.sh embedded -l mylib -s yarn-session shell
c. Configure the SQL client:
set execution.result-mode=tableau;
or
set execution.result-mode=changelog;
set execution.checkpointing.interval=5sec;
d. Create the Kafka source table
CREATE TABLE KafkaTable (
uid VARCHAR(20),
name VARCHAR(10),
`partition` VARCHAR(20)
) WITH (
'connector' = 'kafka',
'topic' = 'up_test',
'properties.bootstrap.servers' = '',
'properties.group.id' = 'fsql_mytest',
'scan.startup.mode' = 'latest-offset',
'format' = 'csv'
);
e. Create the Hudi table
CREATE TABLE HudiTable(
uid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
name VARCHAR(10),
`partition` VARCHAR(20)
)
PARTITIONED BY (`partition`)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://dc2/user/*',
'table.type' = 'MERGE_ON_READ',
'read.streaming.enabled' = 'true',
'read.streaming.start-commit' = '20210316134557',
'read.streaming.check-interval' = '1',
'compaction.trigger.strategy' = 'num_or_time',
'compaction.delta_commits' = '1',
'compaction.delta_seconds' = '30'
);
f. Write the Kafka stream into the Hudi table
insert into HudiTable select * from KafkaTable;
Notes:
read.streaming.enabled set to true means the table data is read in streaming mode
read.streaming.check-interval sets how often (in seconds) the source checks for new commits; it is set to 1 second above
By default, each successful checkpoint triggers a commit and 5 delta commits trigger a compaction; the table above overrides this with compaction.delta_commits = 1 and compaction.delta_seconds = 30
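To exercise the pipeline, a few CSV rows can be pushed to the up_test topic with the console producer that ships with Kafka (a sketch; the broker address and sample values are placeholders):
./bin/kafka-console-producer.sh --broker-list ip:port --topic up_test
u001,alice,par1
u002,bob,par2
Each line is one CSV record in the column order uid,name,partition of KafkaTable.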
Query Hudi to verify the data has been ingested
a. Batch read
By default, a Hudi table is read as a batch query, i.e. the latest snapshot of the dataset is read and returned.
CREATE TABLE HudiTable6(
uid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
name VARCHAR(10),
`partition` VARCHAR(20)
)
PARTITIONED BY (`partition`)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://dc2/user/*',
'table.type' = 'MERGE_ON_READ'
);
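A plain SELECT in the SQL client then returns the latest snapshot of the table defined above:
select * from HudiTable6;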
b. Streaming read
CREATE TABLE HudiTable8(
uid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
name VARCHAR(10),
`partition` VARCHAR(20)
)
PARTITIONED BY (`partition`)
WITH (
'connector' = 'hudi',
'path' = 'hdfs://dc2/user/*',
'table.type' = 'MERGE_ON_READ',
'read.streaming.enabled' = 'true',
'read.start-commit' = 'earliest'
);
Note:
Start commit time in the format 'yyyyMMddHHmmss'; use 'earliest' to consume from the earliest commit
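To verify the streaming read, run a continuous query; the job keeps running and prints rows as new commits arrive (with the tableau result mode set earlier):
select * from HudiTable8;
To start from a specific instant instead of the earliest commit, set read.start-commit to a concrete timestamp, e.g. '20220401000000' (hypothetical value).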
Integrating Hudi with the Flink DataStream API and deleting data from Hudi
a. The --record-key-field and --source-ordering-field parameters are described and defined in the source class org.apache.hudi.streamer.FlinkStreamerConfig; if they are not set explicitly, the defaults are taken, which leads to errors
b. Required jars
flink-connector-kafka_2.12-1.14.2.jar
kafka-clients-2.4.1.jar
c. Submit the Streamer job
./bin/flink run -c org.apache.hudi.streamer.HoodieFlinkStreamer -m yarn-cluster -yat MAPREDUCE -yqu media -ynm 164812123 -yjm 2048 -ytm 3072 -p 4 -ys 2 mylib/hudi-flink-bundle_2.12-0.11.0-SNAPSHOT.jar \
--kafka-bootstrap-servers ip:port,... \
--kafka-topic up_test \
--kafka-group-id fstreamer_mytest \
--checkpoint-interval 3000 \
--target-base-path hdfs://dc2/user/* \
--record-key-field uid \
--table-type MERGE_ON_READ \
--target-table t1 \
--partition-path-field partition \
--source-ordering-field partition \
--source-avro-schema-path hdfs://dc2/user/*/t1_schema.avsc
d. Send JSON-format records; when _hoodie_is_deleted is true, the corresponding record is deleted. The Avro schema (t1_schema.avsc) must include the _hoodie_is_deleted field (example messages follow the schema):
{
  "type": "record",
  "name": "t1",
  "fields": [
    { "name": "uid", "type": "string" },
    { "name": "name", "type": "string" },
    { "name": "partition", "type": "string" },
    { "name": "_hoodie_is_deleted", "type": "boolean", "default": false }
  ]
}
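For illustration, Kafka messages matching this schema could look like the following (hypothetical values); the second one deletes the record with uid u001 because _hoodie_is_deleted is true:
{"uid": "u001", "name": "alice", "partition": "par1", "_hoodie_is_deleted": false}
{"uid": "u001", "name": "alice", "partition": "par1", "_hoodie_is_deleted": true}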