Read data from Kafka, run an aggregation, and write the offsets and the aggregated results to MySQL together.
MySQL is a relational database that supports transactions; a transaction guarantees that the aggregated results and the offsets are either both written successfully or both rolled back.
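The core of the idea is a single JDBC transaction that covers both tables. Below is a minimal sketch of that pattern, assuming the connection settings and table names used later in this post; the object name _TransactionSketch is only for illustration, and the two upserts are left as comments because the full job further down implements them.
import java.sql.{Connection, DriverManager}

object _TransactionSketch {
  def writeBatchTransactionally(): Unit = {
    val conn: Connection = DriverManager.getConnection(
      "jdbc:mysql://localhost:3306/sql_01?characterEncoding=utf8", "root", "123456")
    try {
      conn.setAutoCommit(false) //start the transaction
      //upsert the aggregated results into word_counts (the full job below does this)
      //upsert the Kafka offsets into word_offsets
      conn.commit() //both writes become visible atomically
    } catch {
      case e: Exception =>
        conn.rollback() //a failure rolls back results and offsets together
        throw e
    } finally {
      conn.close()
    }
  }
}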
-- Tables created in MySQL for the data read from Kafka
-- 1. The aggregated results
CREATE TABLE word_counts(
word VARCHAR(255) NOT NULL PRIMARY KEY,
counts INT
);
-- Test: on a duplicate key the upsert adds to the existing count
insert into word_counts (word,counts) values('a',10) on duplicate key update counts = counts + 10;
insert into word_counts (word,counts) values('a',5) on duplicate key update counts = counts + 5;
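-- Quick check of the upsert behaviour: after the two statements above, 'a' holds 10 + 5 = 15
SELECT word, counts FROM word_counts WHERE word = 'a';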
-- 2. The offsets
CREATE TABLE word_offsets(
app_gid VARCHAR(255) NOT NULL,
topic_partition VARCHAR(255) NOT NULL,
offset BIGINT, -- Kafka offsets are 64-bit, so BIGINT rather than INT
PRIMARY KEY (app_gid,topic_partition) -- composite primary key
);
-- Test: on a duplicate key the upsert overwrites the stored offset
insert into word_offsets (app_gid,topic_partition,offset) values('wc_g001','t1_0',10) on duplicate key update offset = 10;
insert into word_offsets (app_gid,topic_partition,offset) values('wc_g001','t1_0',12) on duplicate key update offset = 12;
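-- Quick check: the second upsert overwrites the first, so the row holds offset = 12
SELECT app_gid, topic_partition, offset FROM word_offsets WHERE app_gid = 'wc_g001';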
import java.sql.{Connection, DriverManager, PreparedStatement}
import com.mysql.jdbc.Driver
import com.sparkstreaming.utils._00_OffsetUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object _04_SparkStreaming_KafkaToMySQL {
def main(args: Array[String]): Unit = {
//Create the StreamingContext with a 5-second batch interval
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("warn")
val ssc: StreamingContext = new StreamingContext(sc,Seconds(5))
//Kafka consumer parameters (kafkaParams)
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "centos01:9092,centos02:9092,centos03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "g0018",
"auto.offset.reset" -> "earliest" ,//"latest"
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("producer_01")
//Query the historical offsets from MySQL first; if any exist, the consumer resumes from them instead of relying on Kafka-committed offsets
val map = _00_OffsetUtils.queryHistoryOffsetFromMySQL(this.getClass.getSimpleName, "g0018")
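//PreferConsistent spreads partitions evenly across the executors; passing the offsets map to Subscribe makes the consumer start from the MySQL-stored offsets (an empty map falls back to auto.offset.reset)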
val dStream: DStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams,map)
)
dStream.foreachRDD(rdd => {
if(!rdd.isEmpty()){
val reduced: RDD[(String, Int)] = rdd.map(_.value()).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
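//collect() brings the small aggregated result back to the driver so that one JDBC transaction can cover all partitions of the batch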
val res: Array[(String, Int)] = reduced.collect()
var conn: Connection = null
var ps: PreparedStatement = null
//Register the JDBC driver
try {
Class.forName(classOf[Driver].getName) //force-load the MySQL driver class (JDBC 4+ drivers also register themselves automatically)
//Open the connection
conn = DriverManager.getConnection(
"jdbc:mysql://localhost:3306/sql_01?characterEncoding=utf8",
"root",
"123456")
//Start the transaction (turn off auto-commit)
conn.setAutoCommit(false)
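//Upsert the batch result: a new word is inserted, an existing word has its running total incremented by this batch's count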
ps = conn.prepareStatement("insert into word_counts (word,counts) values(?,?) " +
"on duplicate key update counts = counts + ?")
for (elem <- res) {
val word: String = elem._1
val cnt: Int = elem._2
ps.setString(1,word)
ps.setInt(2,cnt)
ps.setInt(3,cnt)
ps.execute()
}
//Close the first statement, then write the offsets to MySQL in the same transaction
ps.close()
ps = conn.prepareStatement("insert into word_offsets (app_gid,topic_partition,offset) " +
"values(?,?,?) on duplicate key update offset = ?")
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
println("-----------------------------------------------")
for (elem <- offsetRanges) {
val partition = elem.partition
val topic = elem.topic
val untilOffset: Long = elem.untilOffset //keep the 64-bit Kafka offset, do not truncate to Int
val app_gid = this.getClass.getSimpleName + "_" + "g0018"
val topic_partition = topic + "_" + partition
ps.setString(1,app_gid)
ps.setString(2,topic_partition)
ps.setLong(3,untilOffset)
ps.setLong(4,untilOffset)
println("topic: " + elem.topic + ", partition: " + elem.partition +
", untilOffset: " + elem.untilOffset)
ps.execute()
}
//Commit the transaction: results and offsets become visible atomically
conn.commit()
} catch {
case e:Exception =>
//Roll back the transaction so neither the results nor the offsets are written
if (conn != null) {
conn.rollback()
}
//Rethrow: the failed batch propagates through awaitTermination() and stops the job
throw e
} finally {
if (ps != null){
ps.close()
}
if(conn != null){
conn.close()
}
}
}
})
ssc.start()
ssc.awaitTermination()
}
}
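On a restart, queryHistoryOffsetFromMySQL (below) reads word_offsets and the direct stream resumes from the stored offsets instead of falling back to auto.offset.reset. The stored state can be inspected at any time with the tables defined above:
-- Current per-partition offsets and running word counts
SELECT app_gid, topic_partition, offset FROM word_offsets;
SELECT word, counts FROM word_counts ORDER BY counts DESC;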
import java.sql.{Connection, DriverManager}
import org.apache.kafka.common.TopicPartition
import scala.collection.mutable
object _00_OffsetUtils {
//Query the historical offsets from MySQL and return them in the shape ConsumerStrategies.Subscribe expects: Map[TopicPartition, Long]
def queryHistoryOffsetFromMySQL(appName: String, groupId: String): Map[TopicPartition, Long] = {
val historyOffset: mutable.HashMap[TopicPartition, Long] = new mutable.HashMap[TopicPartition, Long]()
val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/sql_01?characterEncoding=utf8",
"root",
"123456")
val ps2 = conn.prepareStatement("select topic_partition,offset from word_offsets where " +
"app_gid = ?")
ps2.setString(1,appName + "_" + groupId)
val resultSet = ps2.executeQuery()
while (resultSet.next()){
val topicPartition = resultSet.getString(1)
val offset: Long = resultSet.getLong(2)
//topic_partition was stored as "<topic>_<partition>"; split on the last underscore so topic names containing underscores still parse correctly
val idx = topicPartition.lastIndexOf("_")
val topic = topicPartition.substring(0, idx)
val partition = topicPartition.substring(idx + 1).toInt
historyOffset.put(new TopicPartition(topic, partition), offset)
}
resultSet.close()
ps2.close()
conn.close()
historyOffset.toMap
}
}
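A quick way to see what the utility returns (a hypothetical demo object, not part of the job; with the test rows inserted above, app_gid 'wc_g001' resolves to topic t1, partition 0, offset 12):
import org.apache.kafka.common.TopicPartition

object _00_OffsetUtilsDemo {
  def main(args: Array[String]): Unit = {
    val offsets: Map[TopicPartition, Long] =
      _00_OffsetUtils.queryHistoryOffsetFromMySQL("wc", "g001")
    //with the test data above this prints: t1-0 -> 12
    offsets.foreach { case (tp, offset) =>
      println(s"${tp.topic()}-${tp.partition()} -> $offset")
    }
  }
}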