yarn-cluster mode can also be used when submitting from outside the cluster (e.g. from a gateway/edge node).
The last line of the spark-submit command carries the program arguments (args); after repeated testing, passing the Kerberos-related files to the application this way proved to work in yarn-cluster mode. The files involved are:
krb5.conf
Must be placed at the same path on every node of the YARN cluster.
client.keytab
Must be placed at the same path on the submitting node and on every node of the YARN cluster; the containers use it to connect to the Kerberos-secured HDFS cluster.
kafka_client_jaas.conf
Must be placed at the same path on every node of the YARN cluster; the containers use it to connect to the Kerberos-secured Kafka cluster.
spark-submit --master yarn \
--deploy-mode cluster \
--class com.liubin.spark.kerberos.HiveSinkDemoKerberosYarnCluster \
--keytab /tmp/client.keytab \
--principal [email protected] \
--conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=/tmp/kafka_client_jaas.conf" \
spark-example-1.0.0.jar \
/tmp/krb5.conf [email protected] /tmp/client.keytab /tmp/kafka_client_jaas.conf
The following files need to be copied from the Kerberos-secured cluster into the project's resources directory (they must be packaged into the jar):
core-site.xml
hdfs-site.xml
yarn-site.xml
hive-site.xml
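A quick way to confirm these XML files actually end up on the classpath is to load them through Hadoop's Configuration and print a couple of well-known properties. This is only a minimal sanity-check sketch; the property keys are standard Hadoop/Hive settings, and the expected values depend on your cluster.

import org.apache.hadoop.conf.Configuration

object ClasspathConfCheck {
  def main(args: Array[String]): Unit = {
    // core-site.xml on the classpath is picked up automatically;
    // hdfs-site.xml and hive-site.xml are added explicitly, as the job below does for hive-site.xml.
    val conf = new Configuration()
    conf.addResource("hdfs-site.xml")
    conf.addResource("hive-site.xml")

    // If the files were packaged correctly, these print cluster values rather than local defaults.
    println(conf.get("fs.defaultFS"))
    println(conf.get("hive.metastore.uris"))
  }
}

The KafkaSink utility class and the streaming job itself follow.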
package com.liubin.spark.kerberos

import java.util.concurrent.Future

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

import scala.collection.JavaConversions._

/**
  * Broadcast-friendly Kafka producer wrapper: KafkaProducer itself is not serializable,
  * so only the factory function is shipped to the executors and the producer is created
  * lazily on first use.
  */
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {

  lazy val producer = createProducer()

  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {

  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      // close the producer cleanly when the executor JVM shuts down
      sys.addShutdownHook {
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }

  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
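As a quick usage sketch of KafkaSink (hedged: ssc and lines stand for an existing StreamingContext and a DStream of strings, and the producer settings are placeholders), the sink is broadcast once on the driver and each executor creates its own producer lazily on the first send:

import org.apache.kafka.common.serialization.StringSerializer

val producerProps = new java.util.Properties()
producerProps.setProperty("bootstrap.servers", "node1:9092") // placeholder broker list
producerProps.setProperty("key.serializer", classOf[StringSerializer].getName)
producerProps.setProperty("value.serializer", classOf[StringSerializer].getName)

val sink = ssc.sparkContext.broadcast(KafkaSink[String, String](producerProps))

lines.foreachRDD { rdd =>
  rdd.foreach { msg =>
    // runs on the executors; the wrapped producer is created once per executor JVM
    sink.value.send("kafkaSink", msg)
  }
}

The full streaming job below broadcasts the sink in exactly the same way, with the additional Kerberos-related producer settings.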
package com.liubin.spark.kerberos

import java.util.Properties

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.UserGroupInformation
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * author : liubin
  * date : 2019/5/8
  * Description : Use the Kafka DStream utility classes to write data into Hive[kerberos],
  * with Kafka[kerberos] as the data source.
  */
object HiveSinkDemoKerberosYarnCluster {

  // kafka conf
  val sinkTopic = "kafkaSink"
  val sourceTopic = "kafkaSource"
  val bootstrapServers = "node1:9092,node2:9092,node3:9092"
  val autoOffsetReset = "latest"
  val groupId = "test-kerberos"
  // kerberos conf
  val krb5Debug = "true"

  def main(args: Array[String]): Unit = {
    // kerberos file paths and principal are passed in as program arguments (see the spark-submit command above)
    val krb5Path = args(0)
    val principal = args(1)
    val keytab = args(2)
    val kafkaKerberos = args(3)

    // set global kerberos conf
    System.setProperty("java.security.krb5.conf", krb5Path)
    System.setProperty("sun.security.krb5.debug", krb5Debug)
    System.setProperty("java.security.auth.login.config", kafkaKerberos)

    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    val session = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
    // enable Hive dynamic partitioning
    session.sqlContext.setConf("hive.exec.dynamic.partition", "true")
    // set dynamic partition mode to non-strict
    session.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    // merge small files
    session.sqlContext.setConf("hive.merge.sparkfiles", "true")

    val ssc = new StreamingContext(session.sparkContext, Seconds(5))

    // kafka source config
    val kafkaParams = Map[String, Object](
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "bootstrap.servers" -> bootstrapServers,
      "enable.auto.commit" -> (true: java.lang.Boolean),
      "auto.offset.reset" -> autoOffsetReset,
      // the following settings are required in a kerberos environment
      "security.protocol" -> "SASL_PLAINTEXT",
      "sasl.kerberos.service.name" -> "kafka",
      "sasl.mechanism" -> "GSSAPI"
    )

    // kafka sink config
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", bootstrapServers)
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        // the following settings are required in a kerberos environment
        p.setProperty("security.protocol", "SASL_PLAINTEXT")
        p.setProperty("sasl.mechanism", "GSSAPI")
        p.setProperty("sasl.kerberos.service.name", "kafka")
        p
      }
      ssc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }

    // log in to the Kerberos-secured HDFS/Hive cluster with the principal and keytab
    val hadoopConf = new Configuration()
    hadoopConf.addResource("hive-site.xml")
    UserGroupInformation.setConfiguration(hadoopConf)
    UserGroupInformation.loginUserFromKeytab(principal, keytab)

    // kafka data source
    val kafkaDStream = KafkaSource.createDirectStream[String, String](ssc, sourceTopic, kafkaParams)
    try {
      kafkaDStream.foreachRDD(rdd => {
        // processing and Hive-sink logic is omitted here; see the sketch after this listing
      })
    } catch {
      case e: RuntimeException => e.printStackTrace()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
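The foreachRDD body is left empty above because the processing depends on the record schema. As a hedged sketch of what typically goes there, reusing the names from the job above (session, kafkaDStream, kafkaProducer, sinkTopic) and assuming, hypothetically, that each Kafka value is a JSON document to be appended to an existing partitioned Hive table test.kafka_events:

kafkaDStream.foreachRDD { rdd =>
  if (!rdd.isEmpty()) {
    import session.implicits._
    // Assumed (hypothetical) schema: each record value is a JSON string.
    val df = session.read.json(rdd.map(_.value()).toDS())
    // Dynamic-partition append into an existing Hive table (hypothetical name).
    df.write.mode("append").insertInto("test.kafka_events")
    // Optionally forward the raw values to the Kerberos-secured sink topic.
    rdd.foreach(record => kafkaProducer.value.send(sinkTopic, record.value()))
  }
}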