This article only provides a complete example of Flink 1.12.2 reading from Kafka and writing to Hive. For detailed explanations, see "Several Ways for Flink 1.12.2 to Stream into HDFS (Hive)" and "Two Ways for Flink 1.12.2 to Read from Kafka".
Contents
1 Dependencies
2 Code
3 Deploying to the cluster
3.1 Package and upload to the cluster
3.2 Submit to the cluster remotely from the IDE
1 Dependencies

<dependencies>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.12.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.thrift</groupId>
        <artifactId>libfb303</artifactId>
        <version>0.9.3</version>
        <type>pom</type>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-hive_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-metastore</artifactId>
        <version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-hadoop-compatibility_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
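The coordinates above reference the ${flink.version}, ${hive.version}, and ${hadoop.version} properties, which must be defined in the POM. A minimal sketch: the Flink version follows the article title, the Hive version follows the config path used later in the code, and the Hadoop version is an assumption that should be set to match your cluster.

<properties>
    <flink.version>1.12.2</flink.version>
    <hive.version>3.1.2</hive.version>
    <!-- assumed; use the Hadoop version of your own cluster -->
    <hadoop.version>3.1.3</hadoop.version>
</properties>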
2 Code

package com.kafkatest.consumer;

import com.kafkatest.KafkaProperties;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

import java.time.Duration;

// Flink consumer
public class KafkaFlinkSQLConsumerDemo3 {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");

        // 1. Create the HiveCatalog
        String name = "myhive";                       // catalog name
        String defaultDatabase = "flink_onhive_test"; // default database
        // Directory containing the Hive configuration; hive-site.xml must be placed there.
        // Currently only the local file system is supported.
        String hiveConfDir = "/kafkatest/hiveconf";
        //String hiveConfDir = "/home/dingyi/hive-3.1.2/conf";
        HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // When creating a StreamTableEnvironment from a StreamExecutionEnvironment,
        // checkpointing must be enabled on the StreamExecutionEnvironment.
        env.enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Create the table environment for the Hive integration
        EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .useBlinkPlanner()  // required by the Hive integration
                .inStreamingMode()  // run in streaming mode
                .build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
        Configuration configuration = tableEnv.getConfig().getConfiguration();
        configuration.setString("table.exec.hive.fallback-mapred-reader", "true");
        // If using TableEnvironment directly instead, checkpointing has to be set
        // on the table environment's own configuration:
        //TableEnvironment tableEnv = TableEnvironment.create(settings);
        //configuration.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE);
        //configuration.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(2));

        // 2. Register the HiveCatalog
        tableEnv.registerCatalog(name, hive);

        // 3. Make the HiveCatalog "myhive" the catalog of the current session
        tableEnv.useCatalog(name);
        tableEnv.useDatabase(defaultDatabase);

        // Use the default dialect for the Kafka source table
        tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);

        // 4. Create a source table backed by Kafka; once created, it listens to Kafka
        //    and feeds the incoming data into Flink
        tableEnv.executeSql("drop table if exists t_KafkaMsgSourceTable");
        tableEnv.executeSql("CREATE TABLE IF NOT EXISTS t_KafkaMsgSourceTable ("
                + "ip STRING"
                + ",msg STRING"
                + ",ts BIGINT" // raw 13-digit (millisecond) timestamp
                + ",ts3 AS TO_TIMESTAMP(FROM_UNIXTIME(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))" // Flink TIMESTAMP(3)
                + ",WATERMARK FOR ts3 AS ts3 - INTERVAL '15' SECOND" // allow up to 15 s of lateness
                + ")"
                + " WITH ("
                + " 'connector' = 'kafka',"
                + " 'topic' = '" + KafkaProperties.TOPIC + "',"
                + " 'properties.bootstrap.servers' = '" + KafkaProperties.KAFKA_SERVER_URL + ":" + KafkaProperties.KAFKA_SERVER_PORT0 + "',"
                + " 'properties.group.id' = 'kafkaflinkhivedemo',"
                + " 'scan.startup.mode' = 'latest-offset'," // or earliest-offset
                + " 'format' = 'json'"
                + ")");

        // Switch to the Hive dialect for the sink table
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // 5. Create the sink table in Hive
        tableEnv.executeSql("drop table if exists t_kafkaMsg2hiveTable");
        tableEnv.executeSql("CREATE TABLE IF NOT EXISTS t_kafkaMsg2hiveTable ("
                + "ip STRING,"
                + "msg STRING"
                + ")"
                + " PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES ("
                + " 'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00'," // pattern used by the partition-time extractor
                + " 'sink.partition-commit.trigger'='partition-time'," // commit trigger: "process-time" or "partition-time"
                + " 'sink.partition-commit.delay'='0s'," // commit delay
                + " 'sink.partition-commit.policy.kind'='metastore,success-file'" // commit policies
                + ")");

        // 6. Pipe the source table into the sink table
        tableEnv.executeSql("INSERT INTO t_kafkaMsg2hiveTable "
                + "SELECT ip,msg,DATE_FORMAT(ts3, 'yyyy-MM-dd'), DATE_FORMAT(ts3, 'HH') FROM t_KafkaMsgSourceTable");
    }
}
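The KafkaProperties class imported above is not shown in the original article; a minimal sketch of what it could look like, where the topic name, broker host, and port are placeholder assumptions:

package com.kafkatest;

// Connection constants for the Kafka source; all values below are assumptions.
public class KafkaProperties {
    public static final String TOPIC = "test_topic";           // assumed topic name
    public static final String KAFKA_SERVER_URL = "localhost"; // assumed broker host
    public static final int KAFKA_SERVER_PORT0 = 9092;         // assumed broker port
}

Given the source table schema, each Kafka record is expected to be a JSON object such as {"ip":"10.0.0.1","msg":"hello","ts":1618888888000}. Once the job is running and a partition has been committed, the result can be checked with an ordinary Hive query, e.g. SELECT * FROM t_kafkaMsg2hiveTable WHERE dt='2021-04-20' AND hr='10' (the partition values here are just examples).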
3 Deploying to the cluster

3.1 Package and upload to the cluster

Configure the build section of the POM as described in the official documentation:
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.1</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <artifactSet>
                            <excludes>
                                <exclude>com.google.code.findbugs:jsr305</exclude>
                                <exclude>org.slf4j:*</exclude>
                                <exclude>log4j:*</exclude>
                            </excludes>
                        </artifactSet>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <mainClass>com.kafkatest.consumer.KafkaFlinkSQLConsumerDemo3</mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
It is best to set all the dependencies to provided scope, then package the job and upload the jar manually through Flink's web UI to run it on the cluster.
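For example, a Flink dependency can be marked as provided so that it is supplied by the cluster at runtime instead of being bundled into the fat jar (shown here for one of the dependencies from the list above):

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>${flink.version}</version>
    <scope>provided</scope>
</dependency>

The jar itself is then built with the standard mvn clean package.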
3.2 Submit to the cluster remotely from the IDE

To publish the job to the Flink cluster directly from the IDE, change the way the execution environment is created from getExecutionEnvironment to createRemoteEnvironment, i.e. replace the statement
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
with
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("ip", port);
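When submitting this way, the client also needs to ship the packaged job jar to the cluster; createRemoteEnvironment accepts jar paths as additional varargs. A minimal sketch, where the host, port, and jar path are placeholder assumptions:

// Connect to a remote JobManager and ship the packaged job jar along with the submission.
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment(
        "192.168.1.100",                      // JobManager host (assumed)
        8081,                                 // JobManager REST port (assumed)
        "target/kafkatest-1.0-SNAPSHOT.jar"); // path to the packaged job jar (assumed)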