SparkStreaming reading and writing Kerberos HDFS (Yarn Cluster)

Yarn cluster mode also works when the job is submitted from outside the cluster (e.g. from an edge/gateway node).

Submit command

The last line of the spark-submit command carries the application args; after several attempts, passing the Kerberos-related files this way turned out to be the workable approach in yarn cluster mode. Three files are involved:

  1. krb5.conf

    Must be placed at the same path on every node of the YARN cluster.

  2. client.keytab

    Must be placed at the same path on the submitting node and on every node of the YARN cluster; the containers use it to connect to the Kerberos-secured HDFS cluster.

  3. kafka_client_jaas.conf

    Must be placed at the same path on every node of the YARN cluster; the containers use it to connect to the Kerberos-secured Kafka cluster (a typical example is sketched right after this list).
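For reference, a kafka_client_jaas.conf for this setup might look like the sketch below; the keytab path and principal are simply the ones used in the submit command further down, so adjust them to your own environment.

KafkaClient {
  com.sun.security.auth.module.Krb5LoginModule required
  useKeyTab=true
  storeKey=true
  keyTab="/tmp/client.keytab"
  principal="[email protected]";
};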

spark-submit --master yarn \
--deploy-mode cluster \
--class com.liubin.spark.kerberos.HdfsSinkDemoKerberosYarnCluster \
--keytab /tmp/client.keytab \
--principal [email protected] \
--conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=/tmp/kafka_client_jaas.conf" \
spark-example-1.0.0.jar \
/tmp/krb5.conf [email protected] /tmp/client.keytab /tmp/kafka_client_jaas.conf

Resources files

The following files need to be copied from the Kerberos cluster into the project's resources directory (they must be packaged into the jar):

core-site.xml
hdfs-site.xml
yarn-site.xml
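
With these XML files packaged into the jar they end up on the container classpath, which is how the HA nameservice (e.g. hdfs://HAname) gets resolved inside the containers. As a minimal sanity-check sketch (ResourceConfCheck is a hypothetical helper, not part of the original project):

import org.apache.hadoop.conf.Configuration

object ResourceConfCheck {
  def main(args: Array[String]): Unit = {
    // core-site.xml is picked up from the classpath automatically;
    // hdfs-site.xml is added once the HDFS client classes initialize.
    val conf = new Configuration()
    // prints the default filesystem defined in core-site.xml, e.g. hdfs://HAname
    println(conf.get("fs.defaultFS"))
  }
}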
Util class
package com.liubin.spark.kerberos

import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

import scala.collection.JavaConversions._

// Serializable wrapper around KafkaProducer: the producer is created lazily on each executor
// from the factory function, so a KafkaSink instance can safely be broadcast.
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {

  lazy val producer = createProducer()

  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {

  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      sys.addShutdownHook {
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }

  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
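
A minimal usage sketch for this class (ssc, dstream, producerConfig and sinkTopic are placeholders for a StreamingContext, a DStream[String], the producer Properties and the target topic; they are not defined here):

// broadcast the sink once from the driver
val kafkaSink = ssc.sparkContext.broadcast(KafkaSink[String, String](producerConfig))

dstream.foreachRDD { rdd =>
  rdd.foreachPartition { records =>
    // the lazily created producer is reused on each executor across batches
    records.foreach(record => kafkaSink.value.send(sinkTopic, record))
  }
}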
Main code
package com.liubin.spark.kerberos

import java.util.Properties

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.UserGroupInformation
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * author : liubin
  * date : 2019/5/8
  * Description : Use the Kafka DStream utility classes to store data into HDFS [HA, Kerberos]; the data source is Kafka [Kerberos].
  */
object HdfsSinkDemoKerberosYarnCluster {

  // kafka conf
  val sinkTopic = "kafkaSink"
  val sourceTopic = "kafkaSource"
  val bootstrapServers = "node1:9092,node2:9092,node3:9092"
  val autoOffsetReset = "latest"
  val groupId = "test-kerberos"

  // kerberos conf
  val krb5Debug = "true"

  def main(args: Array[String]): Unit = {

    // kerberos conf, read from the spark-submit args
    val krb5Path = args(0)
    val principal = args(1)
    val keytab = args(2)
    val kafkaKerberos = args(3)

    // set global kerberos conf
    System.setProperty("java.security.krb5.conf", krb5Path)
    System.setProperty("sun.security.krb5.debug", krb5Debug)
    System.setProperty("java.security.auth.login.config", kafkaKerberos)

    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    val session = SparkSession.builder().config(conf).getOrCreate()
    val ssc = new StreamingContext(session.sparkContext, Seconds(5))

    // kafka source config
    val kafkaParams = Map[String, Object](
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "bootstrap.servers" -> bootstrapServers,
      "enable.auto.commit" -> (true: java.lang.Boolean),
      "auto.offset.reset" -> autoOffsetReset,
      // the following settings are required in a Kerberos environment
      "security.protocol" -> "SASL_PLAINTEXT",
      "sasl.kerberos.service.name" -> "kafka",
      "sasl.mechanism" -> "GSSAPI"
    )

    // kafka sink config
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", bootstrapServers)
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        // the following settings are required in a Kerberos environment
        p.setProperty("security.protocol", "SASL_PLAINTEXT")
        p.setProperty("sasl.mechanism", "GSSAPI")
        p.setProperty("sasl.kerberos.service.name", "kafka")
        p
      }
      ssc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }
      
    // HDFS config: core-site.xml / hdfs-site.xml are loaded from the classpath resources
    val hadoopConf = new Configuration()

    // perform Kerberos authentication
    UserGroupInformation.setConfiguration(hadoopConf)
    UserGroupInformation.loginUserFromKeytab(principal, keytab)

    // kafka data source (KafkaSource is the author's helper class; a possible sketch is given after this listing)
    val kafkaDStream = KafkaSource.createDirectStream[String, String](ssc, sourceTopic, kafkaParams)

    try {
      kafkaDStream.foreachRDD(rdd => {
        rdd.saveAsTextFile("hdfs://HAname/user/spark/data")
      })
    } catch {
      case e: RuntimeException => e.printStackTrace()
    }

    ssc.start()
    ssc.awaitTermination()

  }
}
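
KafkaSource.createDirectStream used above is a helper from the same package that this post does not show. A minimal sketch of what such a helper might look like on top of spark-streaming-kafka-0-10 (the object name and signature are assumptions inferred from the call site):

package com.liubin.spark.kerberos

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

// Hypothetical helper: wraps KafkaUtils.createDirectStream for a single topic.
object KafkaSource {
  def createDirectStream[K, V](ssc: StreamingContext,
                               topic: String,
                               kafkaParams: Map[String, Object]): InputDStream[ConsumerRecord[K, V]] = {
    KafkaUtils.createDirectStream[K, V](
      ssc,
      PreferConsistent,
      Subscribe[K, V](Seq(topic), kafkaParams)
    )
  }
}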
