A Complete Example of Reading from Kafka and Writing to Hive with Flink 1.12.2

This article only provides a complete example of Flink 1.12.2 reading from Kafka and writing to Hive. For detailed explanations, see "Several Ways to Stream-Write to HDFS (Hive) with Flink 1.12.2" and "Two Ways to Read from Kafka with Flink 1.12.2".

Contents

1 Dependencies

2 Code

3 Deploying to the Cluster

3.1 Package and Upload to the Cluster

3.2 Remote Submission from the IDE


1 Dependencies





<dependencies>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.12.0</version>
    </dependency>

    <!-- Kafka connector -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- JSON format for the Kafka source -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- Hive connector -->
    <dependency>
        <groupId>org.apache.thrift</groupId>
        <artifactId>libfb303</artifactId>
        <version>0.9.3</version>
        <type>pom</type>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-hive_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- Table API -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- Hive -->
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>${hive.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-metastore</artifactId>
        <version>${hive.version}</version>
    </dependency>

    <!-- Hadoop -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-hadoop-compatibility_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
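The dependencies above reference the properties ${flink.version}, ${hive.version}, and ${hadoop.version}, which must be defined in the POM. A minimal properties block might look like the following; the Flink version comes from this article, while the Hive and Hadoop versions are assumptions to be aligned with your cluster (the commented-out path in the code below suggests Hive 3.1.2):

<properties>
    <flink.version>1.12.2</flink.version>
    <!-- assumed versions; match these to your cluster -->
    <hive.version>3.1.2</hive.version>
    <hadoop.version>3.1.3</hadoop.version>
</properties>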

2 Code

package com.kafkatest.consumer;

import com.kafkatest.KafkaProperties;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

import java.time.Duration;

// Flink consumer
public class KafkaFlinkSQLConsumerDemo3 {

    public static void main(String[] args) throws Exception {

        System.setProperty("HADOOP_USER_NAME", "root");

        // 1. Create the HiveCatalog
        String name = "myhive";                        // catalog name
        String defaultDatabase = "flink_onhive_test";  // default database
        String hiveConfDir = "/kafkatest/hiveconf";    // directory containing hive-site.xml; currently only the local file system is supported
        //String hiveConfDir = "/home/dingyi/hive-3.1.2/conf";

        HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // When creating a StreamTableEnvironment from a StreamExecutionEnvironment,
        // checkpointing must be enabled on the StreamExecutionEnvironment.
        env.enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Create the streaming environment for Hive integration
        EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .useBlinkPlanner()  // required for Hive
                .inStreamingMode()  // streaming mode
                .build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
        Configuration configuration = tableEnv.getConfig().getConfiguration();
        configuration.setString("table.exec.hive.fallback-mapred-reader", "true");

        // If using a plain TableEnvironment instead, set checkpointing on the TableEnvironment directly:
        //TableEnvironment tableEnv = TableEnvironment.create(settings);
        //configuration.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE);
        //configuration.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(2));

        // 2. Register the HiveCatalog
        tableEnv.registerCatalog(name, hive);
        // 3. Make HiveCatalog "myhive" the current session's catalog
        tableEnv.useCatalog(name);
        tableEnv.useDatabase(defaultDatabase);

        // Set the SQL dialect
        tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);

        // 4. Create the source table backed by Kafka; once created, it listens to Kafka
        //    and makes the data available to Flink.
        tableEnv.executeSql("drop table if exists t_KafkaMsgSourceTable");
        tableEnv.executeSql("CREATE TABLE IF NOT EXISTS t_KafkaMsgSourceTable ("
                + "ip STRING"
                + ",msg STRING"
                + ",ts BIGINT" // raw 13-digit (millisecond) timestamp
                + ",ts3 AS TO_TIMESTAMP(FROM_UNIXTIME(ts / 1000, 'yyyy-MM-dd HH:mm:ss'))" // Flink TIMESTAMP(3)
                + ",WATERMARK FOR ts3 AS ts3 - INTERVAL '15' SECOND" // allow up to 15s lateness
                + ")"
                + " WITH ("
                + " 'connector' = 'kafka',"
                + " 'topic' = '" + KafkaProperties.TOPIC + "',"
                + " 'properties.bootstrap.servers' = '" + KafkaProperties.KAFKA_SERVER_URL + ":" + KafkaProperties.KAFKA_SERVER_PORT0 + "',"
                + " 'properties.group.id' = 'kafkaflinkhivedemo',"
                + " 'scan.startup.mode' = 'latest-offset'," // or earliest-offset
                + " 'format' = 'json'"
                + ")");

        // Switch to the Hive dialect
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // 5. Create the sink table in Hive
        tableEnv.executeSql("drop table if exists t_kafkaMsg2hiveTable");
        tableEnv.executeSql("CREATE TABLE IF NOT EXISTS t_kafkaMsg2hiveTable ("
                + "ip STRING,"
                + "msg STRING"
                + ")"
                + " PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES ("
                + " 'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00'," // pattern the partition time extractor uses to build a timestamp
                + " 'sink.partition-commit.trigger'='partition-time'," // commit trigger: "process-time" or "partition-time"
                + " 'sink.partition-commit.delay'='0s'," // commit delay
                + " 'sink.partition-commit.policy.kind'='metastore,success-file'" // commit policy
                + ")");

        // 6. Pipe the source table into the Hive table
        tableEnv.executeSql("INSERT INTO t_kafkaMsg2hiveTable "
                + "SELECT ip,msg,DATE_FORMAT(ts3, 'yyyy-MM-dd'), DATE_FORMAT(ts3, 'HH') FROM t_KafkaMsgSourceTable");
    }
}
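The code above imports a com.kafkatest.KafkaProperties constants class that this article does not show. A minimal sketch, with placeholder values you would replace with your own topic and broker address, might look like:

package com.kafkatest;

// Hypothetical constants class referenced by the consumer demo;
// the actual values depend on your Kafka deployment.
public class KafkaProperties {
    public static final String TOPIC = "kafkatest";            // placeholder topic name
    public static final String KAFKA_SERVER_URL = "localhost"; // placeholder broker host
    public static final int KAFKA_SERVER_PORT0 = 9092;         // placeholder broker port
}

Since the source table declares 'format' = 'json', each Kafka record should be a JSON object matching the declared columns, e.g. {"ip":"10.0.0.1","msg":"hello","ts":1618390000000}, where ts is a 13-digit millisecond timestamp. Once the job has committed a partition (metastore entry plus success file), the output can be checked from the Hive side; the partition values here are just examples:

SHOW PARTITIONS t_kafkaMsg2hiveTable;
SELECT * FROM t_kafkaMsg2hiveTable WHERE dt = '2021-04-01' AND hr = '10' LIMIT 10;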

 

3 Deploying to the Cluster

3.1 Package and Upload to the Cluster

Configure the build section of the POM as shown in the official documentation:







<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.1</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <artifactSet>
                            <excludes>
                                <exclude>com.google.code.findbugs:jsr305</exclude>
                                <exclude>org.slf4j:*</exclude>
                                <exclude>log4j:*</exclude>
                            </excludes>
                        </artifactSet>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <mainClass>com.kafkatest.consumer.KafkaFlinkSQLConsumerDemo3</mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
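With the plugin in place, running mvn clean package produces the shaded jar under target/, which is the file to upload in the next step.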

It is best to set all the dependencies to provided scope; after packaging, upload the jar manually through the Flink web UI and run it on the cluster.
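Marking a dependency as provided keeps it out of the shaded jar, since the Flink distribution on the cluster already ships those classes. For example:

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>${flink.version}</version>
    <scope>provided</scope>
</dependency>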

3.2 Remote Submission from the IDE

Change the code to create a remote execution environment instead of the default one, and you can submit the job to the Flink cluster directly from the IDE; that is, replace the first statement below with the createRemoteEnvironment call.

// local (default) environment:
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// remote environment:
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("ip", port);
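Note that createRemoteEnvironment(String host, int port, String... jarFiles) also accepts the paths of jar files to ship to the cluster, which is usually needed so the remote TaskManagers can load your classes. The host, port, and jar path below are placeholders:

StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment(
        "jobmanager-host",       // placeholder JobManager address
        8081,                    // placeholder port (the REST port by default)
        "target/kafkatest.jar"); // placeholder path to the packaged job jar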

 
