Spark Streaming: reading Kafka messages with the direct approach, setting a checkpoint, and writing the results to MySQL
package com.bigdata.kafka;

import java.util.Properties;
import java.util.concurrent.TimeUnit;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class KafkaSend {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "dn1:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // The producer sends one message per second to this topic
        String topic = "ss";
        Producer<String, String> producer = new KafkaProducer<String, String>(props);
        for (int i = 1; i <= 10000; i++) {
            String value = "value_" + i;
            ProducerRecord<String, String> msg = new ProducerRecord<String, String>(topic, value);
            producer.send(msg);
            try {
                Thread.sleep(1000);
                System.out.println(msg.toString());
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        System.out.println("send message over.");
        producer.close(100, TimeUnit.MILLISECONDS);
    }
}
package com.bigdata.sparkstreaming

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

/**
 * Spark Streaming reads from Kafka (direct approach) and writes the word counts to MySQL.
 */
object KafkaDirectWordCount2Mysql {

  def createContext(brokers: String, topics: String, checkpointDirectory: String): StreamingContext = {
    // This function only runs the first time the application starts. If the application
    // fails and is restarted, the context is recovered from the checkpoint and this
    // function is not executed again.
    val sparkConf = new SparkConf().setAppName("KafkaDirectWordCount2Mysql").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.checkpoint(checkpointDirectory)

    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet
    )

    messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      rdd.foreachPartition(p => {
        val paramsList = ListBuffer[ParamsList]()
        val jdbcWrapper = JDBCWrapper.getInstance()
        while (p.hasNext) {
          val record = p.next()
          val paramsListTmp = new ParamsList
          paramsListTmp.word = record._1
          paramsListTmp.count = record._2
          paramsList += paramsListTmp
        }
        jdbcWrapper.doBatch("INSERT INTO spark_streaming_test(word,count) VALUES(?,?)", paramsList)
      })
    })
    ssc
  }

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println("Usage: KafkaDirectWordCount2Mysql <brokers> <topics> <checkpointDirectory>")
      System.exit(1)
    }
    val Array(brokers, topics, checkpointDirectory) = args
    val ssc = StreamingContext.getOrCreate(checkpointDirectory, () => {
      createContext(brokers, topics, checkpointDirectory)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
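With the direct approach, Kafka offsets are tracked by Spark Streaming itself and stored in the checkpoint rather than in ZooKeeper. If you also want to see which offsets each micro-batch consumed, a small helper like the sketch below can be added to createContext and called on the messages stream defined above. The helper name and the println format are illustrative only; the HasOffsetRanges/OffsetRange types come from the same spark-streaming-kafka-0-8 package used above.

import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.HasOffsetRanges

// Illustrative helper: print the Kafka offset range consumed by each micro-batch.
// It must be called on the stream returned by createDirectStream itself (e.g. `messages`),
// because only those RDDs carry offset range information.
def logOffsetRanges(stream: InputDStream[(String, String)]): Unit = {
  stream.foreachRDD { rdd =>
    rdd.asInstanceOf[HasOffsetRanges].offsetRanges.foreach { o =>
      println(s"topic=${o.topic} partition=${o.partition} from=${o.fromOffset} until=${o.untilOffset}")
    }
  }
}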
package com.bigdata.sparkstreaming

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet, SQLException}
import java.util.concurrent.LinkedBlockingDeque

import scala.collection.mutable.ListBuffer

object JDBCWrapper {
  private var jdbcInstance: JDBCWrapper = _

  def getInstance(): JDBCWrapper = {
    synchronized {
      if (jdbcInstance == null) {
        jdbcInstance = new JDBCWrapper()
      }
    }
    jdbcInstance
  }
}

class JDBCWrapper {
  // Size of the connection pool
  val POOL_SIZE: Int = 10
  val dbConnectionPool = new LinkedBlockingDeque[Connection](POOL_SIZE)

  try {
    Class.forName("com.mysql.jdbc.Driver")
  } catch {
    case e: ClassNotFoundException => e.printStackTrace()
  }

  for (i <- 0 until POOL_SIZE) {
    try {
      val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "root")
      dbConnectionPool.put(conn)
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

  def getConnection(): Connection = synchronized {
    while (0 == dbConnectionPool.size()) {
      try {
        Thread.sleep(20)
      } catch {
        case e: InterruptedException => e.printStackTrace()
      }
    }
    dbConnectionPool.poll()
  }

  /**
   * Batch insert.
   *
   * @param sqlText    the SQL statement
   * @param paramsList the list of parameter objects
   * @return Array[Int] with the update count of each statement in the batch
   */
  def doBatch(sqlText: String, paramsList: ListBuffer[ParamsList]): Array[Int] = {
    val conn: Connection = getConnection()
    var ps: PreparedStatement = null
    var result: Array[Int] = null
    try {
      conn.setAutoCommit(false)
      ps = conn.prepareStatement(sqlText)
      for (parameters <- paramsList) {
        println("word_count\t" + parameters)
        // word, count
        ps.setObject(1, parameters.word)
        ps.setObject(2, parameters.count)
        ps.addBatch()
      }
      result = ps.executeBatch()
      conn.commit()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (ps != null) try {
        ps.close()
      } catch {
        case e: SQLException => e.printStackTrace()
      }
      // Return the connection to the pool
      if (conn != null) try {
        dbConnectionPool.put(conn)
      } catch {
        case e: InterruptedException => e.printStackTrace()
      }
    }
    result
  }
}

class ParamsList extends Serializable {
  var word: String = _
  var count: Int = _

  override def toString = s"ParamsList($word,$count)"
}
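For completeness, JDBCWrapper.doBatch can also be exercised outside of Spark. Below is a minimal, hypothetical standalone test (the object name is made up for illustration; it assumes the same local MySQL database and the spark_streaming_test table created below):

package com.bigdata.sparkstreaming

import scala.collection.mutable.ListBuffer

// Hypothetical standalone check of the batch-insert wrapper defined above.
object JDBCWrapperTest {
  def main(args: Array[String]): Unit = {
    val params = ListBuffer[ParamsList]()
    val p = new ParamsList
    p.word = "hello"
    p.count = 3
    params += p
    val result = JDBCWrapper.getInstance()
      .doBatch("INSERT INTO spark_streaming_test(word,count) VALUES(?,?)", params)
    if (result != null) println("update counts: " + result.mkString(","))
  }
}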
POM dependencies
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.11</artifactId>
  <version>${spark.version}</version>
</dependency>
<dependency>
  <groupId>org.scala-lang</groupId>
  <artifactId>scala-library</artifactId>
  <version>${scala.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming_2.11</artifactId>
  <version>${spark.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
  <version>${spark.version}</version>
</dependency>
<dependency>
  <groupId>mysql</groupId>
  <artifactId>mysql-connector-java</artifactId>
  <version>5.1.38</version>
</dependency>
Table creation DDL
CREATE TABLE `spark_streaming_test` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`word` varchar(50) DEFAULT NULL,
`count` int(11) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Run the main method of KafkaDirectWordCount2Mysql with the arguments: dn1:9092 ss /Users/xxx/checkpoint
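Once a few batches have been processed, the inserted rows can be checked directly against MySQL. Below is a minimal verification sketch using plain JDBC; it reuses the jdbc:mysql://localhost:3306/test URL and root/root credentials from JDBCWrapper, and the object name is illustrative only:

import java.sql.DriverManager

// Hypothetical check: print the most recent word counts written by the streaming job.
object VerifyResults {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "root")
    try {
      val rs = conn.createStatement()
        .executeQuery("SELECT word, `count` FROM spark_streaming_test ORDER BY id DESC LIMIT 10")
      while (rs.next()) {
        println(rs.getString("word") + "\t" + rs.getInt("count"))
      }
    } finally {
      conn.close()
    }
  }
}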