Seamless integration of Spark Streaming 2.0.0 with Kafka

Kafka is a distributed publish-subscribe messaging system, essentially a message queue whose data is persisted to disk (introducing Kafka is not the focus of this article, so we won't dwell on it). Kafka has plenty of use cases, for example as a buffer queue between asynchronous systems. A very common design looks like this: write data such as logs into Kafka for durable storage, have another service consume that data and perform business-level analysis, then write the results to HBase or HDFS. Because this design is so common, big-data stream-processing frameworks such as Storm already integrate seamlessly with Kafka, and Spark, as a newer contender, likewise provides native Kafka support.

This article is a hands-on walkthrough of Spark Streaming + Kafka: first the project's Maven pom.xml, then the Scala code for a streaming job that consumes a Kafka topic starting from explicitly specified offsets.



<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>sprakStream</groupId>
	<artifactId>sprakStream</artifactId>
	<version>0.0.1-SNAPSHOT</version>

	<dependencies>
		<!-- Spark core -->
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.11</artifactId>
			<version>2.0.0</version>
			<scope>provided</scope>
		</dependency>
		<!-- Spark SQL -->
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.11</artifactId>
			<version>2.0.0</version>
			<scope>provided</scope>
		</dependency>
		<!-- Spark Streaming -->
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-streaming_2.11</artifactId>
			<version>2.0.0</version>
			<scope>provided</scope>
		</dependency>
		<!-- Spark MLlib -->
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-mllib_2.11</artifactId>
			<version>2.0.0</version>
			<scope>provided</scope>
		</dependency>
		<!-- Kafka 0.10 integration for Spark Streaming -->
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
			<version>2.0.0</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hbase</groupId>
			<artifactId>hbase-client</artifactId>
			<version>1.2.1</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.hbase</groupId>
			<artifactId>hbase-server</artifactId>
			<version>1.2.1</version>
			<scope>provided</scope>
		</dependency>

		<dependency>
			<groupId>redis.clients</groupId>
			<artifactId>jedis</artifactId>
			<version>2.8.0</version>
			<scope>provided</scope>
		</dependency>

		<dependency>
			<groupId>org.postgresql</groupId>
			<artifactId>postgresql</artifactId>
			<version>9.4-1202-jdbc4</version>
			<scope>provided</scope>
		</dependency>

		<dependency>
			<groupId>net.sf.json-lib</groupId>
			<artifactId>json-lib</artifactId>
			<version>2.2.3</version>
		</dependency>

		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-pool2</artifactId>
			<version>2.2</version>
		</dependency>
	</dependencies>

	<build>
		<sourceDirectory>${basedir}/src/main/scala</sourceDirectory>
		<testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory>
		<resources>
			<resource>
				<directory>${basedir}/src/main/resources</directory>
			</resource>
		</resources>
		<testResources>
			<testResource>
				<directory>${basedir}/src/test/resources</directory>
			</testResource>
		</testResources>
		<plugins>
			<plugin>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>3.1</version>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
				</configuration>
			</plugin>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-shade-plugin</artifactId>
				<version>2.2</version>
				<configuration>
					<createDependencyReducedPom>true</createDependencyReducedPom>
				</configuration>
				<executions>
					<execution>
						<phase>package</phase>
						<goals>
							<goal>shade</goal>
						</goals>
						<configuration>
							<artifactSet>
								<includes>
									<include>*:*</include>
								</includes>
							</artifactSet>
							<filters>
								<filter>
									<artifact>*:*</artifact>
									<excludes>
										<exclude>META-INF/*.SF</exclude>
										<exclude>META-INF/*.DSA</exclude>
										<exclude>META-INF/*.RSA</exclude>
									</excludes>
								</filter>
							</filters>
							<transformers>
								<transformer
									implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
									<resource>reference.conf</resource>
								</transformer>
								<transformer
									implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
									<resource>log4j.properties</resource>
								</transformer>
							</transformers>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>

With the dependencies in place, here is the streaming job itself (KafkaExampleOffset.scala):

package com.sprakStream.demo

import java.util.Properties
import java.util.regex.Matcher

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.ConsumerStrategies
import org.apache.spark.streaming.kafka010.LocationStrategies
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.TaskContext
import com.sprakStream.util.AppConstant
import com.sprakStream.bean.IpMapper
import com.sprakStream.util.CommUtil
import kafka.common.TopicAndPartition
import com.logger.util.LoggerUtil

object KafkaExampleOffset {

  def main(args: Array[String]): Unit = {

    //val conf = new SparkConf()
    //val sc = new SparkContext()
    // Home environment
    //    System.setProperty("spark.sql.warehouse.dir", "D:\\tools\\spark-2.0.0-bin-hadoop2.6");
    //    System.setProperty("hadoop.home.dir", "D:\\tools\\hadoop-2.6.0");
    // Office environment
    System.setProperty("spark.sql.warehouse.dir", "D:\\DevelopTool\\spark-2.0.0-bin-hadoop2.6");
    println("Initialization succeeded...")
    val url = "jdbc:postgresql://172.16.12.190:5432/dataex_tmp"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "issing")
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val ssc = new StreamingContext(conf, Seconds(2))
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()
    // Utilities is a small helper object from the project (not shown here); it configures logging
    // and supplies a regular expression for parsing raw Apache access-log lines.
    val util = Utilities
    util.setupLogging()
    val pattern = util.apacheLogPattern()
    // hostname:port for Kafka brokers, not Zookeeper  
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> AppConstant.KAFKA_HOST,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example",
      "enable.auto.commit" -> (false: java.lang.Boolean)
      // "auto.offset.reset" -> "latest"   // reset to the latest offset when no committed offset exists (default)
      // "auto.offset.reset" -> "earliest" // reset to the earliest available offset
      // "auto.offset.reset" -> "none"     // throw an exception if no previous offset exists for the consumer group
      )
    // List of topics you want to listen for from Kafka  
    val topics = List(AppConstant.KAFKA_TOPIC).toSet

    /**
     * Kafka offset management
     *
     * Start reading Kafka data from explicitly specified offsets.
     * Note: the direct stream hands each record to Spark exactly once per batch; end-to-end
     * exactly-once processing additionally requires storing the offsets atomically with the
     * processed results (a sketch of committing offsets back to Kafka follows this object).
     * By supplying the offsets you last recorded, a restarted streaming job picks up where it
     * previously stopped instead of replaying the whole topic.
     */
    // Observed behaviour: only partitions listed in the offsets map are consumed, and records
    // are consumed partition by partition.
    // 5000L: the trailing L marks a Long literal; consumption starts at offset 5000 of each partition.
    val offsets = Map[TopicPartition, Long](
      new TopicPartition(AppConstant.KAFKA_TOPIC, 0) -> 5000L,
      new TopicPartition(AppConstant.KAFKA_TOPIC, 1) -> 5000L,
      new TopicPartition(AppConstant.KAFKA_TOPIC, 2) -> 5000L)
    // Create the Kafka input stream with KafkaUtils.createDirectStream(...);
    // the consumer configuration comes from kafkaParams.
    val line = KafkaUtils.createDirectStream(
      ssc,
      PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets))
    // Process the data
    line.foreachRDD(mess => {
      // Grab the offset ranges of this batch
      val offsetsList = mess.asInstanceOf[HasOffsetRanges].offsetRanges
      mess.foreachPartition(lines => {
        lines.foreach(line => {
          // Offset range handled by the current partition; record it here if offsets need to be persisted
          val o: OffsetRange = offsetsList(TaskContext.get.partitionId)
          println("++++++++++++++++++++ record the offset here ++++++++++++++++++++")
          //println("--topic::" + o.topic + "--partition:" + o.partition + "--fromOffset:" + o.fromOffset + "--untilOffset:" + o.untilOffset)
          // Business-level processing of each record goes here
          println("The kafka line is " + line)
          LoggerUtil.loggerToBuffer(line.toString())
        })
      })
    })
    // Kick it off  
    ssc.checkpoint("/user/root/spark/checkpoint")
    ssc.start()
    ssc.awaitTermination() // blocks until the streaming job is stopped
    println("KafkaExample finished.................................")
  }
}
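
The hard-coded starting offset of 5000 per partition is fine for experiments, but a production job usually commits each batch's offset ranges back to Kafka once the processing has finished, so that a restarted job resumes from the last commit rather than a fixed position. Below is a minimal sketch of that pattern (the object name KafkaCommitOffsetSketch is invented for illustration); it assumes the same kafkaParams and topics as above and uses the CanCommitOffsets API that ships with spark-streaming-kafka-0-10.

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object KafkaCommitOffsetSketch {
  def run(ssc: StreamingContext, topics: Set[String], kafkaParams: Map[String, Object]): Unit = {
    // Without explicit starting offsets, the stream resumes from the consumer group's
    // committed offsets (or falls back to auto.offset.reset).
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

    stream.foreachRDD { rdd =>
      // Capture the offset ranges before any transformation loses the Kafka RDD type.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      rdd.foreachPartition { records =>
        records.foreach(record =>
          println(s"partition=${record.partition} offset=${record.offset} value=${record.value}"))
      }

      // Commit only after the work above has completed; the commit itself is asynchronous.
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
  }
}
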
object SQLContextSingleton2 {
  @transient private var instance: SQLContext = _
  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
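
SQLContextSingleton2 is not used by the job above; it is the usual lazily-initialized singleton that lets you run SQL inside foreachRDD without creating a new SQLContext for every batch. A possible usage, sketched here with an invented temporary-view name kafka_lines, would look like this:

    line.foreachRDD(rdd => {
      val sqlContext = SQLContextSingleton2.getInstance(rdd.sparkContext)
      import sqlContext.implicits._

      // One column holding the raw Kafka message value of each record in the batch
      val df = rdd.map(record => record.value).toDF("value")
      df.createOrReplaceTempView("kafka_lines")
      sqlContext.sql("SELECT count(*) AS cnt FROM kafka_lines").show()
    })
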

Note: the code above has been tested and works; adapt it as needed. If you have any questions, please leave a comment!

