Spark Streaming: processing Kafka data and writing the results to MySQL

Spark Streaming reads data from Kafka and writes the stream-processing results to MySQL.
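The program below assumes Spark 2.x with the Kafka 0.10 direct-stream integration and a MySQL JDBC driver on the classpath. In an sbt build that amounts to roughly the following dependencies (the versions are illustrative assumptions; match them to your cluster):

// build.sbt (sketch; adjust artifact versions to your Spark and Scala installation)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"            % "2.4.8" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.8",
  "mysql"            %  "mysql-connector-java"       % "5.1.49"
)

The complete program: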

// Spark Streaming reads data from Kafka and writes the stream-processing results to MySQL
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import java.sql.{PreparedStatement,Connection,DriverManager}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, Minutes}
import org.apache.log4j.Logger
import org.apache.log4j.Level

object KafkaWordCount {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
        Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
        val kafkaParams = Map[String, Object](
            "bootstrap.servers" -> "localhost:9092",
            "key.deserializer" -> classOf[StringDeserializer],
            "value.deserializer" -> classOf[StringDeserializer],
            "group.id" -> "use_a_separate_group_id_for_each_stream",
            "auto.offset.reset" -> "latest",
            "enable.auto.commit" -> (false: java.lang.Boolean))
                
        // val numThreads = 3  // number of partitions per topic (leftover from the receiver-based API, not needed here)
        // val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
        val sparkConf = new SparkConf().setAppName("ZkWordCount")
        val ssc = new StreamingContext(sparkConf, Seconds(10))
        // Checkpointing is required because reduceByKeyAndWindow is used with an inverse function below
        ssc.checkpoint("hdfs://nameservice1/test/checkpoint")
        val topics = Array("zklog")
        val stream = KafkaUtils.createDirectStream[String, String](
            ssc,
            PreferConsistent,
            Subscribe[String, String](topics, kafkaParams)
         )
        
        // val line = stream.map(record => (record.key, record.value))
        val lines = stream.map(record => record.value)
        val words = lines.flatMap(_.split(","))
        val pair = words.map(x => (x, 1))
        // Word counts over a 1-minute window that slides every 10 seconds; the inverse function (_ - _)
        // subtracts the batches leaving the window, which is why a checkpoint directory is needed
        val wordCounts = pair.reduceByKeyAndWindow(_ + _, _ - _, Minutes(1), Seconds(10), 3)
        wordCounts.print()
        // Save each windowed count RDD of the DStream to MySQL
        wordCounts.foreachRDD(rdd => {
            def func(records: Iterator[(String, Int)]): Unit = {
                var conn: Connection = null
                var stmt: PreparedStatement = null
                try {
                    val url = "jdbc:mysql://172.xx.xx.xx:3306/bigdata"
                    val user = "admin"
                    val password = "admin"
                    conn = DriverManager.getConnection(url, user, password)
                    // Prepare the statement once per partition and reuse it for every record,
                    // so no statements are leaked
                    val sql = "insert into zklog(information,count) values (?,?)"
                    stmt = conn.prepareStatement(sql)
                    records.foreach(p => {
                        stmt.setString(1, p._1.trim)
                        stmt.setInt(2, p._2)
                        stmt.executeUpdate()
                    })
                } catch {
                    case e: Exception => e.printStackTrace()
                } finally {
                    if (stmt != null) {
                        stmt.close()
                    }
                    if (conn != null) {
                        conn.close()
                    }
                }
            }

            val repartitionedRDD = rdd.repartition(3)
            repartitionedRDD.foreachPartition(func)
        })
        ssc.start()
        ssc.awaitTermination()
    }
}
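The job expects the target table to already exist in the bigdata database: a zklog table with a text-valued information column and an integer count column, matching the INSERT statement above. Issuing one executeUpdate per record keeps the example simple; for larger windows it is usually worth batching the inserts within each partition. A minimal sketch of that variant, reusing the connection settings assumed above (the helper name batchedWrite is made up for illustration):

import java.sql.DriverManager

// Sketch: batch the per-partition inserts with JDBC addBatch/executeBatch
// instead of issuing one executeUpdate per record
def batchedWrite(records: Iterator[(String, Int)]): Unit = {
  val conn = DriverManager.getConnection(
    "jdbc:mysql://172.xx.xx.xx:3306/bigdata", "admin", "admin")
  try {
    val stmt = conn.prepareStatement("insert into zklog(information,count) values (?,?)")
    records.foreach { case (word, count) =>
      stmt.setString(1, word.trim)
      stmt.setInt(2, count)
      stmt.addBatch()        // queue the row locally
    }
    stmt.executeBatch()      // send all queued rows in one round trip
    stmt.close()
  } finally {
    conn.close()
  }
}

Swapping repartitionedRDD.foreachPartition(func) for repartitionedRDD.foreachPartition(batchedWrite) is enough to use it.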
