While debugging Flink Table API queries against Hive data I ran into quite a few pitfalls, especially because the Hive data is stored on Tencent Cloud COS and the query runs across clusters, so all kinds of dependency and environment problems had to be sorted out. The code and pom.xml below have been debugged successfully and run both locally and on the cluster on YARN. For a local run you need to add the program argument "dev" in IDEA; for the cluster run on YARN no argument is needed.
package com.bigdata.etl

import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment}
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

object FlinkTableTest extends App {

  // The local IDEA environment needs the Hadoop user set to hadoop
  System.setProperty("HADOOP_USER_NAME", "hadoop")

  val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build()
  println(settings.isStreamingMode)
  val stenv = TableEnvironment.create(settings)

  // Query the metadata of the default catalog
  stenv.executeSql("show catalogs").print()
  stenv.useCatalog("default_catalog")
  stenv.executeSql("show databases").print()
  stenv.executeSql("select 1").print()
  println("-----fengexian--------------")

  val name = "hive"
  val defaultDatabase = "odl"
  var hiveConfDir = ""
  var hadoopConf = ""
  val hiveVersion = "2.3.6"
  if (args.nonEmpty) {
    // Local IDEA run (program argument "dev"): point to local copies of the Hive/Hadoop config files
    hiveConfDir = "/Users/duzhixin/Documents/flink-hive-conf"
    hadoopConf = "/Users/duzhixin/Documents/flink-hive-conf"
  } else {
    // Cluster run: the Hadoop configuration is picked up from the environment
    hiveConfDir = "/usr/local/service/hive/conf"
  }

  // Pass the Hive and Hadoop configuration in here. The local IDEA environment must specify both the Hive
  // and the Hadoop config directories; on the cluster the Hadoop config does not need to be specified.
  val hive = new HiveCatalog(name, defaultDatabase, hiveConfDir, hadoopConf, hiveVersion)
  hive.getHiveConf.set("streaming-source.enable", "true")
  stenv.getConfig.getConfiguration.setString("streaming-source.enable", "true")
  stenv.getConfig.getConfiguration.setString("table.exec.hive.infer-source-parallelism.max", "10000")
  stenv.getConfig.getConfiguration.setString("table.exec.hive.infer-source-parallelism", "true")
  stenv.registerCatalog("hive", hive)

  // set the HiveCatalog as the current catalog of the session
  stenv.useCatalog("hive")

  // Run SQL against the Hive catalog: metadata first, then a few data queries
  stenv.executeSql("show databases").print()
  stenv.executeSql("show tables").print()
  stenv.executeSql("select 1 from test.app limit 1").print()
  stenv.executeSql("select * from odl.tb_book where dt='2021-06-05' limit 10").print()
  stenv.executeSql("select * from odl.dadian where dt='2021-06-05' limit 10").print()
}
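As a side note, the same Hive catalog can also be registered with a SQL DDL statement instead of constructing a HiveCatalog object in code. This is only a minimal sketch for Flink 1.13, reusing the catalog name, default database, conf dir and Hive version from the code above; roughly:

// Sketch only: register the Hive catalog via DDL.
// Requires flink-connector-hive and hive-exec on the classpath, same as the programmatic version.
stenv.executeSql(
  """CREATE CATALOG hive WITH (
    |  'type' = 'hive',
    |  'default-database' = 'odl',
    |  'hive-conf-dir' = '/usr/local/service/hive/conf',
    |  'hive-version' = '2.3.6'
    |)""".stripMargin)
stenv.useCatalog("hive")

For a local run you would additionally point 'hadoop-conf-dir' at the local Hadoop config directory, mirroring the hadoopConf variable in the code above.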
pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>
    <groupId>org.jiashu</groupId>
    <artifactId>flink-dw</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>spring-plugin</id>
            <url>https://repo.spring.io/plugins-release/</url>
        </repository>
    </repositories>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <spark.version>2.4.3</spark.version>
        <scala.version>2.12</scala.version>
        <!-- names of further version properties were lost: 2.8.2, 1.2.14, 9.2.5.v20141112, 2.17, 1.8, 1.2.0 -->
        <hive.version>2.3.6</hive.version>
        <flink.version>1.13.0</flink.version>
    </properties>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <finalName>flink-dw</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.12</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <classesDirectory>target/classes/</classesDirectory>
                    <!-- the original configuration also contained the value ".", tag name lost -->
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-dependency-plugin</artifactId>
                <executions>
                    <execution>
                        <id>copy-dependencies</id>
                        <phase>package</phase>
                        <goals>
                            <goal>copy-dependencies</goal>
                        </goals>
                        <configuration>
                            <!-- the original configuration also contained two "jar" values, tag names lost -->
                            <outputDirectory>${project.build.directory}/lib</outputDirectory>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>2.4</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.1</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-dependency-plugin</artifactId>
                    <version>2.8</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>2.4</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-core-asl</artifactId>
            <version>1.9.13</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_2.12</artifactId>
            <version>1.13.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.12</artifactId>
            <version>1.13.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>hadoop-util</groupId>
            <artifactId>hadoop-util</artifactId>
            <version>0.3.0</version>
        </dependency>
        <dependency>
            <groupId>com.qcloud</groupId>
            <artifactId>cos_api</artifactId>
            <version>5.6.42</version>
        </dependency>
        <dependency>
            <groupId>com.qcloud</groupId>
            <artifactId>qcloud-java-sdk</artifactId>
            <version>2.0.1</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.11</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
            <version>1.19</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.11</version>
        </dependency>
    </dependencies>
</project>
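With this pom, a plain mvn clean package should produce target/flink-dw.jar (the finalName above) and, through the copy-dependencies execution of maven-dependency-plugin, copy the runtime dependencies into target/lib. When submitting on YARN, the Hive/Hadoop/COS related jars from that directory still have to be visible on the Flink classpath (for example by placing them under Flink's lib/ directory), otherwise the HiveCatalog or the COS file system cannot be loaded at runtime.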