Spark Streaming Part 5: Custom ZooKeeper-based offset management for Spark Streaming
Enough talk, straight to the code. The KafkaZKManager class below reads the previously committed offsets from ZooKeeper when the stream starts, and writes each batch's untilOffset back once processing is done.
package com.iflytek.offset
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{KafkaUtils, OffsetRange}
/**
* Maintains Kafka offsets in ZooKeeper.
* Applicable version: spark-streaming-kafka-0-10
*
* @param zkServers ZooKeeper connection string
*/
class KafkaZKManager(zkServers : String) {
// Create the Curator client for ZooKeeper
val zkClient = {
val client = CuratorFrameworkFactory
.builder
.connectString(zkServers)
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
// .namespace("kafka") // optional: isolate the session under a namespace
.build()
client.start()
client
}
val _base_path_of_kafka_offset = "/kafka/offsets" // base path under which offsets are stored
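// Resulting znode layout, one leaf node per partition (the stored value is the next offset to consume):
//   /kafka/offsets/<groupName>/<topic>/<partition>  ->  "12345"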
/**
* Get the offsets already consumed by the consumer group for the given topics
* (i.e. the starting offsets for this run).
* @param topics    topic list
* @param groupName consumer group
* @return
*/
def getFromOffset(topics: Array[String], groupName:String):Map[TopicPartition, Long] = {
// Difference between Kafka 0.8 and 0.10: 0.10 uses TopicPartition, 0.8 used TopicAndPartition
var fromOffset: Map[TopicPartition, Long] = Map()
for (topic <- topics) {
println("========topic: " + topic + "========")
// Read the offsets saved in ZK as the starting position of the DStream.
// If the path does not exist, create it and let the DStream start from scratch.
val zkTopicPath = s"${_base_path_of_kafka_offset}/${groupName}/${topic}"
// Make sure the path exists
checkZKPathExists(zkTopicPath)
// The children of the topic node are its partitions
val children = zkClient.getChildren().forPath(zkTopicPath)
// Iterate over the partitions
import scala.collection.JavaConversions._
for (p <- children) {
// Read each child node's data, i.e. the stored offset
val offsetData = zkClient.getData().forPath(s"${zkTopicPath}/${p}")
// println("-------offsetData: " + new String(offsetData) + "-------")
// Convert the offset to Long
val offset = java.lang.Long.valueOf(new String(offsetData)).toLong
fromOffset += (new TopicPartition(topic, Integer.parseInt(p)) -> offset)
}
}
println("+++++++++fromOffset: "+fromOffset+"+++++++++")
fromOffset
}
/**
* Check whether the ZK path exists; create it (including parent nodes) if it does not.
* @param path
* @return
*/
def checkZKPathExists(path: String)={
if (zkClient.checkExists().forPath(path) == null) {
zkClient.create().creatingParentsIfNeeded().forPath(path)
}
}
/**
* Save or update the offsets after a batch has been processed.
* @param offsetRange
* @param groupName
*/
def storeOffsets(offsetRange: Array[OffsetRange], groupName:String) = {
for (o <- offsetRange){
val zkPath = s"${_base_path_of_kafka_offset}/${groupName}/${o.topic}/${o.partition}"
// Make sure the path exists
checkZKPathExists(zkPath)
// Write (or update) this partition's offset; untilOffset is the next offset to consume
// println("---Writing offset to ZK---Topic: " + o.topic + ", Partition: " + o.partition + ", Offset: " + o.untilOffset)
zkClient.setData().forPath(zkPath, o.untilOffset.toString.getBytes())
}
}
}
object KafkaZKManager{
def createDirectStream(ssc:StreamingContext,
zkManager:KafkaZKManager,
kafkaParams: Map[String,Object],
topics:Array[String],
groupID:String):InputDStream[ConsumerRecord[String, String]]={
// Offsets are maintained manually in ZooKeeper
// val zkManager = new KafkaZKManager(zkServer)
val fromOffsets = zkManager.getFromOffset(topics,groupID)
// Create the input stream
if (fromOffsets.nonEmpty) {
KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams, fromOffsets))
} else {
// No offsets in ZK yet: first consumption of these topics
println("Consuming for the first time, topics: " + topics.mkString(","))
KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams))
}
}
}
package com.iflytek.offset
import java.sql.{Connection, DriverManager, Statement}
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
object KafkaZKManagerStreaming2 {
def main(args: Array[String]): Unit = {
Logger.getRootLogger.setLevel(Level.ERROR)
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
val sparkSession: SparkSession = SparkSession.builder()
.appName("xx2")
.master("local[2]")
.config("spark.testing.memory", "471859200")
.config("spark.steaming.kafka.maxRatePerPartition","10000")
.config("spark.sql.streaming.schemaInference", "true")
.getOrCreate()
val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(3))
val topics = Array("pd_ry_txjl")
val kafkaServer="cdh01:9092"
val zkServer="cdh01:2181"
val groupID="xytest1222"
val kafkaParams = Map[String,Object](
"bootstrap.servers" -> kafkaServer,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupID,
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> "false"
)
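// enable.auto.commit is false because offsets are committed to ZooKeeper by KafkaZKManager
// after each batch; auto.offset.reset only takes effect when no offsets are stored in ZK yet.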
val zkManager = new KafkaZKManager(zkServer)
val stream: InputDStream[ConsumerRecord[String, String]] = KafkaZKManager.createDirectStream(
ssc,
zkManager,
kafkaParams,
topics,
groupID
)
val schema : StructType = {
StructType(Seq(
StructField("jlId", StringType, true),
StructField("xqbm", StringType, true),
StructField("xqmc", StringType, true),
StructField("fx", IntegerType, true),
StructField("txsj", TimestampType, true),
StructField("rybm", StringType, true),
StructField("rymc", StringType, true)))
}
stream.foreachRDD(rdd=>{
val rddCache: RDD[String] = rdd.map(_.value()).cache() // the RDD must be cached here, otherwise an error occurs when it is written out
val frameAll= rdd2Dataset(sparkSession,rddCache,schema)
frameAll.createOrReplaceTempView("frameAll")
sparkSession.sqlContext.cacheTable("frameAll")
val distancted: Dataset[Row] = frameAll.dropDuplicates(Array("rybm","xqbm"))
sendMysql2(distancted)
sparkSession.sqlContext.uncacheTable("frameAll")
rddCache.unpersist(true)
// After all output operations have finished, get the offsets and write them to ZK.
// If the unioned/windowed offsets cannot be obtained here, move this into a separate foreachRDD.
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
zkManager.storeOffsets(offsetRanges,groupID)
})
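// Offsets are written to ZK only after the batch's output has been produced, so a failure
// between the MySQL writes and storeOffsets means the batch is reprocessed on restart
// (at-least-once semantics).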
ssc.start()
ssc.awaitTermination()
}
def sendMysql2(expected:DataFrame):Unit={
if(!expected.rdd.isEmpty()){
expected.foreachPartition(fp=>{
var conn: Connection = null
var st: Statement = null
val jdbcUrl = "jdbc:mysql://cdh01:3306/xytest?useUnicode=true&characterEncoding=utf8"
val user = "root"
val password = "com/iflytek"
val rq = new SimpleDateFormat("yyyyMMdd").format(new Date().getTime)
try {
conn = DriverManager.getConnection(jdbcUrl, user, password)
st = conn.createStatement()
conn.setAutoCommit(false)
// Create the daily table once per partition instead of once per row
st.execute(
s"""
|CREATE TABLE IF NOT EXISTS expected_${rq}(
| `id` int(11) NOT NULL AUTO_INCREMENT,
| `jlid` varchar(255) DEFAULT NULL,
| `xqmc` varchar(255) DEFAULT NULL,
| `rymc` varchar(255) DEFAULT NULL,
| `txsj` varchar(255) DEFAULT NULL,
| PRIMARY KEY (`id`)
|) ENGINE=InnoDB DEFAULT CHARSET=utf8
""".stripMargin)
fp.foreach(f => {
// Insert one record; read the columns by name so they line up with the INSERT column list
val sql =
s"""
|insert into expected_${rq}
|(jlid,xqmc,rymc,txsj)
|values
|('${f.getAs[String]("jlId")}','${f.getAs[String]("xqmc")}','${f.getAs[String]("rymc")}','${f.getAs[java.sql.Timestamp]("txsj")}')
""".stripMargin
st.addBatch(sql)
})
st.executeBatch()
conn.commit()
} catch {
case e: Exception => e.printStackTrace()
} finally {
if (st != null)
st.close()
if (conn != null)
conn.close()
}
})
}
}
def rdd2Dataset(sparkSession: SparkSession,rdd:RDD[String],schema : StructType): Dataset[Row] ={
import org.apache.spark.sql.functions._
import sparkSession.implicits._
val ds: Dataset[String] = sparkSession.createDataset(rdd)
val frameAll: Dataset[Row] = {
ds.select(get_json_object(col("value").cast("string"), "$.body") as ("body"))
.select(get_json_object(col("body"), "$.body") as ("body"))
.select(from_json(col("body"), schema) as ("parsed_value"))
.select(
col("parsed_value").getItem("jlId") as ("jlId"),
col("parsed_value").getItem("xqbm") as ("xqbm"),
col("parsed_value").getItem("xqmc") as ("xqmc"),
col("parsed_value").getItem("fx").cast("string").cast("int") as ("fx"),
from_unixtime((col("parsed_value").getItem("txsj").cast("double")/1000), "yyyy-MM-dd HH:mm:ss").cast("TIMESTAMP") as ("txsj"),
col("parsed_value").getItem("rybm") as ("rybm"),
col("parsed_value").getItem("rymc") as ("rymc"))
.filter("jlId is not null")
}
frameAll
}
}
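For reference, the parsing chain in rdd2Dataset assumes the Kafka value is a JSON document whose body field contains another escaped JSON document, whose own body field holds the actual record (hence the two consecutive get_json_object("$.body") calls). The snippet below is only a sketch of that assumed shape; the SampleMessage object and all field values are invented for illustration.
object SampleMessage {
// The innermost record, matching the schema defined in the driver above
val innerRecord: String =
"""{"jlId":"jl001","xqbm":"xq01","xqmc":"xq-name","fx":1,"txsj":1577808000000,"rybm":"ry01","rymc":"ry-name"}"""
// Each wrapping layer stores the previous layer as an escaped JSON string under a "body" key,
// which is what the two get_json_object("$.body") calls unwrap.
def wrapAsBody(s: String): String =
"{\"body\":\"" + s.replace("\\", "\\\\").replace("\"", "\\\"") + "\"}"
// What a producer would put on the Kafka topic
val kafkaValue: String = wrapAsBody(wrapAsBody(innerRecord))
}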
The pom is as follows:
<properties>
<spark.version>2.3.2</spark.version>
<scala.version>2.11.8</scala.version>
<hbase.version>1.2.1</hbase.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-yarn_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.31</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.jolbox</groupId>
<artifactId>bonecp</artifactId>
<version>0.8.0.RELEASE</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.13</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.hadoop</groupId>-->
<!--<artifactId>hadoop-client</artifactId>-->
<!--<version>2.7.2</version>-->
<!--</dependency>-->
<!-- The guava version must match the hadoop version -->
<!--<dependency>-->
<!--<groupId>com.google.guava</groupId>-->
<!--<artifactId>guava</artifactId>-->
<!--<version>18.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<id>compile-scala</id>
<phase>compile</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>test-compile-scala</id>
<phase>test-compile</phase>
<goals>
<goal>add-source</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>2.11.8</scalaVersion>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<compilerArgs>
<arg>-extdirs</arg>
<arg>${project.basedir}/lib</arg>
</compilerArgs>
</configuration>
</plugin>
</plugins>
</build>