Spark Streaming is still a widely used stream-processing engine, and it has good API support for both Kafka and Elasticsearch. Below is a Kafka-to-ES pipeline I put together in a test environment.
A few key points first.
Versions of the tools involved:
scala:2.11.8
spark:2.3.4
kafka:0.10.1.0
elasticsearch:7.0.0
The Maven configuration is as follows:
<properties>
    <scala.version>2.11.8</scala.version>
    <spark.version>2.3.4</spark.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-spark-20_2.11</artifactId>
        <version>7.0.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.49</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
</dependencies>
The Kafka offsets are persisted in MySQL; the offset table is created like this:
CREATE TABLE `kafka_offset` (
  `topic` varchar(255) NOT NULL,
  `groupid` varchar(128) NOT NULL,
  `partition` int(11) NOT NULL,
  `fromoffset` bigint(20) DEFAULT NULL,
  `untiloffset` bigint(20) DEFAULT NULL,
  PRIMARY KEY (`topic`,`groupid`,`partition`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8
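Once the job has been running for a while, you can check what has been saved with a simple query (the 'test' group id here just matches the one used later in the code):
-- Inspect the saved offsets for one consumer group
SELECT `topic`, `partition`, `fromoffset`, `untiloffset`
FROM `kafka_offset`
WHERE `groupid` = 'test';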
JDBCUtil.scala
package com.test
import java.math.BigDecimal
import java.sql._
import org.apache.kafka.common.TopicPartition
import org.apache.spark.sql.Row
import org.apache.spark.streaming.kafka010.OffsetRange
import scala.language.postfixOps
object JDBCUtil {
/*
JDBC connection settings:
1. JDBC URL
2. Username
3. Password
*/
val url="jdbc:mysql://localhost:3306/test?characterEncoding=utf-8&autoReconnect=true&failOverReadOnly=false&useSSL=false&rewriteBatchedStatements=true"
val username="user"
val password="12345"
/*
Get a connection
*/
def getConnection:Connection={
Class.forName("com.mysql.jdbc.Driver")
DriverManager.getConnection(url,username,password)
}
/*
Close the connection
*/
def closeConnection(conn:Connection): Unit ={
if(conn!=null)
conn.close()
}
/*
Close the prepared statement
*/
def closePreparedStatement(prepareStatement:PreparedStatement): Unit ={
if(prepareStatement!=null)
prepareStatement.close()
}
def closeResultSet(rs:ResultSet):Unit={
if(rs!=null)
rs.close()
}
/**
* Query the Kafka offsets saved for a consumer group
* @param groupid
* @return
*/
def selectKafkaOffset(groupid:String):Map[TopicPartition,Long]={
val conn:Connection=getConnection
val sql="select * from `kafka_offset` where `groupid`='"+groupid+"' "
val pstmt:PreparedStatement=conn.prepareStatement(sql)
val rs:ResultSet=pstmt.executeQuery()
var map=Map[TopicPartition,Long]()
while(rs.next()){
map+=(new TopicPartition(rs.getString("topic"),rs.getInt("partition")) -> rs.getLong("untiloffset"))
}
closePreparedStatement(pstmt)
closeResultSet(rs)
closeConnection(conn)
map
}
/**
* Save (upsert) the Kafka offsets via REPLACE INTO, keyed by topic/groupid/partition
* @param offsetRanges
* @param groupid
*/
def replaceKafkaOffset(offsetRanges:scala.Array[OffsetRange],groupid:String):Unit={
val conn:Connection=getConnection
val sql="replace into `kafka_offset`(`topic`,`groupid`,`partition`,`fromoffset`,`untiloffset`) values (?,?,?,?,?)"
val preparedStatement:PreparedStatement=conn.prepareStatement(sql)
for(or <- offsetRanges){
preparedStatement.setString(1,or.topic)
preparedStatement.setString(2,groupid)
preparedStatement.setInt(3,or.partition)
preparedStatement.setLong(4,or.fromOffset)
preparedStatement.setLong(5,or.untilOffset)
preparedStatement.addBatch()
}
preparedStatement.executeBatch()
closePreparedStatement(preparedStatement)
closeConnection(conn)
}
}
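Before wiring the utility into the streaming job, it can be smoke-tested on its own. The sketch below is illustration only: JDBCUtilSmokeTest is a made-up object in the same com.test package and the offset values are fake. It writes two ranges for group "test" and reads them back.
package com.test

import org.apache.spark.streaming.kafka010.OffsetRange

object JDBCUtilSmokeTest {
  def main(args: Array[String]): Unit = {
    // Write two fake offset ranges for group "test" ...
    val ranges = Array(
      OffsetRange("topic1", 0, 0L, 100L),
      OffsetRange("topic1", 1, 0L, 250L)
    )
    JDBCUtil.replaceKafkaOffset(ranges, "test")
    // ... then read them back and print what was stored
    JDBCUtil.selectKafkaOffset("test").foreach { case (tp, until) =>
      println(s"${tp.topic()}-${tp.partition()} -> $until")
    }
  }
}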
KafkaToES.scala
package com.test
import com.alibaba.fastjson.JSON
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import org.elasticsearch.spark.rdd.EsSpark
import scala.collection.JavaConverters._
object KafkaToES {
// Check whether the offsets stored in MySQL are out of range (older than the earliest offset Kafka still retains)
def getCheckedOffset(topics:Set[String],kafkaParams:Map[String, Object],fromdbOffset:Map[TopicPartition,Long]):Map[TopicPartition,Long]={
val kc=new KafkaConsumer[String,String](kafkaParams.asJava)
val beginOffsetMap=scala.collection.mutable.Map[TopicPartition,Long]()
for (topic <- topics) {
kc.partitionsFor(topic).asScala.foreach(partitionInfo => {
val topicPartition=new TopicPartition(topic,partitionInfo.partition())
kc.assign(Seq(topicPartition).asJava)
kc.seekToBeginning(Seq(topicPartition).asJava)
beginOffsetMap+=(topicPartition -> kc.position(topicPartition))
})
}
kc.close()
fromdbOffset.map(f => {
val beginOffset=beginOffsetMap.getOrElse(f._1,0L)
if(beginOffset > f._2){
(f._1,beginOffset)
}else{
f
}
})
}
def run():Unit={
val conf=new SparkConf().setAppName("KafkaToES")
// Automatically create the ES index if it does not exist
conf.set("es.index.auto.create", "true")
// ES node addresses
conf.set("es.nodes","ip1,ip2")
// ES username/password; omit these if security is not enabled
conf.set("es.net.http.auth.user", "user")
conf.set("es.net.http.auth.pass", "12345")
// Use Kryo serialization
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// Executor overhead (off-heap) memory; increase it for larger data volumes
conf.set("spark.executor.memoryOverhead","2048")
// Keep each batch from fetching too many Kafka records; the cap per batch is
// number of Kafka partitions * batch interval (seconds) * maxRatePerPartition
conf.set("spark.streaming.backpressure.enabled","true")
conf.set("spark.streaming.kafka.maxRatePerPartition","1000")
val sc=new SparkContext(conf)
val streamContext=new StreamingContext(sc,Seconds(10))
// Consumer group id and the topics to read
val groupid="test"
val topics=Set("topic1","topic2")
// Kafka consumer parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "ip1:9092,ip2:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean),
// Maximum amount of data fetched per partition; adjust to the available memory
"max.partition.fetch.bytes" -> "2097152"
)
// Load the saved offsets from MySQL
val fromdbOffset:Map[TopicPartition,Long]=JDBCUtil.selectKafkaOffset(groupid)
var kafkaDStream:InputDStream[ConsumerRecord[String, String]]=null
if(fromdbOffset.isEmpty){
// First start: MySQL has no offset information yet, so subscribe normally
kafkaDStream=KafkaUtils.createDirectStream[String, String](
streamContext,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
)
}else{
// MySQL has offset data: validate it and consume from those positions
val checkedOffset=getCheckedOffset(topics,kafkaParams,fromdbOffset)
kafkaDStream=KafkaUtils.createDirectStream[String,String](
streamContext,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Assign[String,String](checkedOffset.keys,kafkaParams,checkedOffset)
)
}
// Process the consumed data
kafkaDStream.foreachRDD(rdd => {
// Capture the offset ranges of this batch
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val jsonRDD=rdd.map(_.value()).map(l =>{
// Business logic goes here
val jsonObject=JSON.parseObject(l)
// Take the "message" field from the Kafka record; parse further fields if needed and put the results back into jsonObject
val s=jsonObject.get("message").toString
jsonObject.toJSONString
})
// Settings for the ES bulk write
val mapConf = Map(
// Target ES index
("es.resource" , "spark-test"),
// Bulk-write tuning; adjust to your workload
("es.batch.size.bytes" , "10mb"),
("es.batch.size.entries" , "50000"),
("es.batch.write.retry.count", "10"),
("es.batch.write.retry.wait", "3000")
)
// Write to ES
EsSpark.saveJsonToEs(jsonRDD,mapConf)
// Commit the offsets to MySQL
JDBCUtil.replaceKafkaOffset(offsetRanges,groupid)
})
// Start the streaming job
streamContext.start()
streamContext.awaitTermination()
}
def main(args: Array[String]): Unit = {
run()
}
}
A couple of problems I ran into:
1. org.apache.kafka.clients.consumer.OffsetOutOfRangeException: Offsets out of range with no configured reset policy for partitions
Fix: use the offset-validation method above (getCheckedOffset), or simply switch to a new group id, in which case part of the data will be lost.
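This exception usually means the offsets in MySQL point at data Kafka has already deleted. A minimal diagnostic sketch (OffsetDiagnostics and findExpiredOffsets are hypothetical names; it assumes kafka-clients 0.10.1+, where KafkaConsumer.beginningOffsets is available) that lists the affected partitions, e.g. findExpiredOffsets(kafkaParams, JDBCUtil.selectKafkaOffset("test")):
package com.test

import scala.collection.JavaConverters._
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition

object OffsetDiagnostics {
  // Returns the partitions whose offset saved in MySQL is older than the earliest
  // offset the broker still retains (the situation that triggers this exception)
  def findExpiredOffsets(kafkaParams: Map[String, Object],
                         saved: Map[TopicPartition, Long]): Map[TopicPartition, Long] = {
    val kc = new KafkaConsumer[String, String](kafkaParams.asJava)
    try {
      // beginningOffsets is available in kafka-clients 0.10.1+
      val earliest = kc.beginningOffsets(saved.keys.toSeq.asJava).asScala
      saved.filter { case (tp, offset) => earliest.get(tp).exists(_.longValue() > offset) }
    } finally {
      kc.close()
    }
  }
}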
2. Container killed by YARN for exceeding memory limits. 16.9 GB of 16 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead
Fix: at first I raised spark.yarn.executor.memoryOverhead as the message suggests, but the logs showed that spark.executor.memoryOverhead is the setting actually taking effect. This is likely a Spark version issue: spark.yarn.executor.memoryOverhead was deprecated in favor of spark.executor.memoryOverhead in Spark 2.3.
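For reference, here is roughly how the job might be submitted on YARN with that setting passed on the command line (the jar name and memory sizes are placeholders, not values from this project):
spark-submit \
  --class com.test.KafkaToES \
  --master yarn \
  --deploy-mode cluster \
  --executor-memory 4g \
  --conf spark.executor.memoryOverhead=2048 \
  kafka-to-es.jar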