Spark Streaming job: read from Kafka and write to MySQL

1. Background

Consume Kafka messages with Spark Streaming's direct approach, enable checkpointing so the driver can recover after a failure, and write the resulting word counts to MySQL.
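The key pattern is StreamingContext.getOrCreate with a checkpoint directory, so a restarted driver rebuilds its state from the checkpoint instead of re-creating the context. A minimal runnable sketch of just that pattern is shown below (object name, checkpoint path, and the socket placeholder stream are illustrative only; the full program in section 3 builds the Kafka direct stream and the MySQL output instead):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch only: createContext runs on the first start; on restart the
// StreamingContext (including its DStream graph) is recovered from the checkpoint.
object CheckpointSkeleton {
  def createContext(checkpointDir: String): StreamingContext = {
    val conf = new SparkConf().setAppName("CheckpointSkeleton").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint(checkpointDir)
    // Placeholder input/output so the sketch has an output operation;
    // the real job defines the Kafka direct stream and MySQL write here.
    ssc.socketTextStream("localhost", 9999).count().print()
    ssc
  }

  def main(args: Array[String]): Unit = {
    val checkpointDir = "/tmp/checkpoint-demo" // placeholder path
    val ssc = StreamingContext.getOrCreate(checkpointDir, () => createContext(checkpointDir))
    ssc.start()
    ssc.awaitTermination()
  }
}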

2. Send data to Kafka

package com.bigdata.kafka;

import java.util.Properties;
import java.util.concurrent.TimeUnit;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class KafkaSend {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "dn1:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // Produce one message per second to the topic
        String topic = "ss";
        Producer<String, String> producer = new KafkaProducer<String, String>(props);
        for (int i = 1; i <= 10000; i++) {
            String value = "value_" + i;
            ProducerRecord<String, String> msg = new ProducerRecord<String, String>(topic, value);
            producer.send(msg);
            try {
                Thread.sleep(1000);
                System.out.println(msg.toString());
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        System.out.println("send message over.");
        producer.close(100, TimeUnit.MILLISECONDS);
    }
}

3. Spark Streaming consumer

package com.bigdata.sparkstreaming

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer

/**
  * Spark Streaming reads from Kafka and writes the word counts to MySQL
  */
object KafkaDirectWordCount2Mysql {


  def createContext(brokers: String, topics: String, checkpointDirectory: String)
  : StreamingContext = {
    // Only executed on the first run; if the application fails and is restarted,
    // the StreamingContext is recovered from the checkpoint and this function is not called again
    val sparkConf = new SparkConf().setAppName("KafkaDirectWordCount2Mysql").setMaster("local[2]")

    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.checkpoint(checkpointDirectory)

    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet
    )
    messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      rdd.foreachPartition(p => {
        val paramsList = ListBuffer[ParamsList]()
        val jdbcWrapper = JDBCWrapper.getInstance()
        while (p.hasNext) {
          val record = p.next()
          val paramsListTmp = new ParamsList
          paramsListTmp.word = record._1
          paramsListTmp.count = record._2
          paramsList += paramsListTmp
        }
        jdbcWrapper.doBatch("INSERT INTO spark_streaming_test(word,count) VALUES(?,?)", paramsList)
      })
    })
    ssc
  }

  def main(args: Array[String]): Unit = {

    if (args.length != 3) {
      System.err.println("Usage: KafkaDirectWordCount   ")
      System.exit(1)
    }
    val Array(brokers, topics, checkpointDirectory) = args
    val ssc = StreamingContext.getOrCreate(checkpointDirectory, () => {
      createContext(brokers, topics, checkpointDirectory)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

4. MySQL connection pool and batch insert

package com.bigdata.sparkstreaming

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet, SQLException}
import java.util.concurrent.LinkedBlockingDeque
import scala.collection.mutable.ListBuffer

object JDBCWrapper {
  private var jdbcInstance: JDBCWrapper = _

  def getInstance(): JDBCWrapper = {
    synchronized {
      if (jdbcInstance == null) {
        jdbcInstance = new JDBCWrapper()
      }
    }
    jdbcInstance
  }
}

class JDBCWrapper {
  // Size of the connection pool
  val POOL_SIZE: Int = 10

  val dbConnectionPool = new LinkedBlockingDeque[Connection](POOL_SIZE)

  // Load the MySQL JDBC driver
  try {
    Class.forName("com.mysql.jdbc.Driver")
  } catch {
    case e: ClassNotFoundException => e.printStackTrace()
  }

  // Pre-create the connections and put them into the pool
  for (i <- 0 until POOL_SIZE) {
    try {
      val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "root")
      dbConnectionPool.put(conn)
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

  // Take a connection from the pool, waiting until one becomes available
  def getConnection(): Connection = synchronized {
    while (dbConnectionPool.size() == 0) {
      try {
        Thread.sleep(20)
      } catch {
        case e: InterruptedException => e.printStackTrace()
      }
    }
    dbConnectionPool.poll()
  }


  /**
    * Batch insert
    *
    * @param sqlText    SQL statement with "?" placeholders
    * @param paramsList parameter list
    * @return Array[Int] update counts returned by executeBatch
    */
  def doBatch(sqlText: String, paramsList: ListBuffer[ParamsList]): Array[Int] = {
    val conn: Connection = getConnection()
    var ps: PreparedStatement = null
    var result: Array[Int] = null

    try {
      conn.setAutoCommit(false)
      ps = conn.prepareStatement(sqlText)

      for (parameters <- paramsList) {
        println("word_count\t" + parameters)
        // bind word (1) and count (2)
        ps.setObject(1, parameters.word)
        ps.setObject(2, parameters.count)
        ps.addBatch()
      }
      result = ps.executeBatch()
      conn.commit()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (ps != null) try {
        ps.close()
      } catch {
        case e: SQLException => e.printStackTrace()
      }
      // Return the connection to the pool instead of closing it
      if (conn != null) try {
        dbConnectionPool.put(conn)
      } catch {
        case e: InterruptedException => e.printStackTrace()
      }
    }
    result
  }
}

class ParamsList extends Serializable {
  var word: String = _
  var count: Int = _

  override def toString = s"ParamsList($word,$count)"
}
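
To sanity-check the MySQL write path without running Spark, doBatch can be driven directly. A minimal standalone sketch (the object name and sample data are made up for illustration; it assumes the same test database and spark_streaming_test table as above):

package com.bigdata.sparkstreaming

import scala.collection.mutable.ListBuffer

// Hypothetical smoke test for the batch-insert path (not part of the streaming job)
object JDBCWrapperSmokeTest {
  def main(args: Array[String]): Unit = {
    // Build a couple of sample (word, count) parameter objects
    val params = ListBuffer[ParamsList]()
    for ((w, c) <- Seq(("hello", 3), ("world", 5))) {
      val p = new ParamsList
      p.word = w
      p.count = c
      params += p
    }
    // Insert them with the same SQL the streaming job uses
    val result = JDBCWrapper.getInstance()
      .doBatch("INSERT INTO spark_streaming_test(word,count) VALUES(?,?)", params)
    println("update counts: " + (if (result != null) result.mkString(",") else "insert failed"))
  }
}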

5. Run

pom dependencies (spark.version and scala.version are assumed to be defined in the pom's <properties> section; the artifacts below target Scala 2.11 and the Kafka 0.8 integration)

         <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-core_2.11artifactId>
            <version>${spark.version}version>
        dependency>
        <dependency>
            <groupId>org.scala-langgroupId>
            <artifactId>scala-libraryartifactId>
            <version>${scala.version}version>
        dependency>
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-streaming_2.11artifactId>
            <version>${spark.version}version>
        dependency>
        <dependency>
            <groupId>org.apache.sparkgroupId>
            <artifactId>spark-streaming-kafka-0-8_2.11artifactId>
            <version>${spark.version}version>
        dependency>       
        <dependency>
            <groupId>mysqlgroupId>
            <artifactId>mysql-connector-javaartifactId>
            <version>5.1.38version>
        dependency>

Table creation statement

CREATE TABLE `spark_streaming_test` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `word` varchar(50) DEFAULT NULL,
  `count` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=86 DEFAULT CHARSET=utf8;

Run the main method of KafkaDirectWordCount2Mysql with the arguments: dn1:9092 ss /Users/xxx/checkpoint
