Spark Streaming Part 4: Dynamically Updating a Broadcast Variable in Spark Streaming for Blacklist Filtering
Once a variable has been broadcast, later changes to the original variable are not reflected in the broadcast value. Suppose the blacklist is loaded from MySQL and that data changes afterwards: how do we keep the broadcast variable up to date? Let's skip the small talk and get straight to the code.
BroadcastWrapper is used to update the broadcast variable dynamically:
package com.iflytek.sparking.算子
import java.io.{ObjectInputStream, ObjectOutputStream}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.StreamingContext
import scala.reflect.ClassTag
//Update the broadcast variable inside the DStream's transform/foreachRDD through a wrapper
/*
When we broadcast a large configuration/lookup table, every node can read the
configuration entries while computing its tasks. If the configuration changes
at runtime, how do we push the new table to the worker nodes?
(For streaming jobs in particular, restarting the application is very costly.)
A broadcast variable is read-only and cannot be modified in place, so the only
option is to unpersist the old broadcast variable and then broadcast a new one.
*/
case class BroadcastWrapper[T:ClassTag](
@transient private val ssc:StreamingContext,
@transient private val oldValue:T)
{
@transient private var broadcastV=ssc.sparkContext.broadcast(oldValue)
// blocking: whether to block until the old broadcast's blocks have been deleted
def update(newValue : T,blocking:Boolean=false):Unit={
broadcastV.unpersist(blocking)
broadcastV=ssc.sparkContext.broadcast(newValue)
}
def value:T=broadcastV.value
private def writeObject(out:ObjectOutputStream):Unit={
out.writeObject(broadcastV)
}
private def readObject(in :ObjectInputStream):Unit={
broadcastV=in.readObject().asInstanceOf[Broadcast[T]]
}
}
/*
Updating the broadcast variable through the wrapper lets us refresh a large
configuration dynamically instead of restarting the job.
The general pattern looks like this:
// define
val yourBroadcast = BroadcastWrapper[yourType](ssc, yourValue)
yourStream.transform(rdd => {
  // periodically refresh the broadcast variable
  if (System.currentTimeMillis - someTime > Conf.updateFreq) {
    yourBroadcast.update(newValue, true)
  }
  // do something else
})
*/
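To make the mechanics of update() concrete, here is a minimal standalone sketch of the same unpersist-then-rebroadcast pattern; the master URL, app name and sample data are placeholders for illustration only, not part of the original job:
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
object RebroadcastSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[2]", "rebroadcast-sketch")
    // initial broadcast of the blacklist
    var blacklist: Broadcast[Set[String]] = sc.broadcast(Set("userA"))
    val rdd = sc.parallelize(Seq("userA", "userB", "userC"))
    println(rdd.filter(x => blacklist.value.contains(x)).collect().mkString(",")) // userA
    // the "update": remove the old broadcast blocks, then broadcast the new value
    blacklist.unpersist(blocking = false)
    blacklist = sc.broadcast(Set("userB"))
    println(rdd.filter(x => blacklist.value.contains(x)).collect().mkString(",")) // userB
    sc.stop()
  }
}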
MysqlPool is used to fetch the blacklist data from MySQL:
package com.iflytek.kafkaManager
import java.sql.Connection
import java.text.SimpleDateFormat
import java.util.Date
import com.mchange.v2.c3p0.ComboPooledDataSource
class MysqlPool extends Serializable{
private val cpds:ComboPooledDataSource=new ComboPooledDataSource(true)
// private val jdbcUrl = "jdbc:mysql://localhost:3306/xytest?useUnicode=true&characterEncoding=utf8&autoReconnect=true&rewriteBatchedStatements=TRUE&useSSL=false";
private val jdbcUrl = "jdbc:mysql://localhost:3306/xytest?useUnicode=true&characterEncoding=utf8";
private val user = "root"
private val password = "123456"
private val driver = "com.mysql.jdbc.Driver"
private val rq=new SimpleDateFormat("yyyyMMdd").format(new Date().getTime)
try{
cpds.setJdbcUrl(jdbcUrl)
cpds.setDriverClass(driver)
cpds.setUser(user)
cpds.setPassword(password)
cpds.setMaxPoolSize(200) //maximum number of connections
cpds.setMinPoolSize(20) //minimum number of connections
cpds.setAcquireIncrement(5) //number of connections acquired per increment
cpds.setMaxStatements(180) //maximum number of cached statements
cpds.setMaxIdleTime(25000) //maximum idle time of a connection, in seconds
// cpds.setPreferredTestQuery(s"select id from word_count_20191126 where 1=2")
cpds.setIdleConnectionTestPeriod(18000) //how often idle connections are tested, in seconds
}catch {
case e:Exception=>e.printStackTrace()
}
// Connections are not closed here; that is done inside foreachRDD
def getConnection:Connection={
try{
return cpds.getConnection
}catch {
case ex:Exception=> ex.printStackTrace()
null
}
}
}
object MysqlManager{
var mysqlManager:MysqlPool=_
def getMysqlManager:MysqlPool={
synchronized{
if (mysqlManager==null){
mysqlManager=new MysqlPool
}
mysqlManager
}
}
}
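One typical way to use the pool from a streaming job is sketched below. The DStream, the word_count table and its columns are illustrative assumptions; the point is that the connection is obtained per partition and closed in the finally block, as noted in the comment above:
import com.iflytek.kafkaManager.MysqlManager
import org.apache.spark.streaming.dstream.DStream
object MysqlSinkSketch {
  // Hypothetical helper: persist each partition of a (word, count) stream through the pool
  def saveToMysql(wordCounts: DStream[(String, Long)]): Unit = {
    wordCounts.foreachRDD { rdd =>
      rdd.foreachPartition { partition =>
        if (partition.nonEmpty) {
          val conn = MysqlManager.getMysqlManager.getConnection
          val statement = conn.createStatement()
          try {
            conn.setAutoCommit(false)
            partition.foreach { case (word, count) =>
              // word_count(word, cnt) is an assumed table layout
              statement.addBatch(s"insert into word_count(word, cnt) values ('$word', $count)")
            }
            statement.executeBatch()
            conn.commit()
          } finally {
            statement.close()
            conn.close()
          }
        }
      }
    }
  }
}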
Concrete usage:
package com.iflytek.sparking.业务
import java.text.SimpleDateFormat
import java.util.Date
import com.alibaba.fastjson.JSON
import com.iflytek.kafkaManager.MysqlManager
import com.iflytek.sparking.算子.BroadcastWrapper
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.CarbonSession._
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
/*
Requirement:
Use Spark Streaming to filter blacklist data in real time.
*/
object 动态黑名单 {
val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS")
private var listBroadcastWrapper: BroadcastWrapper[mutable.HashSet[String]] = null
def main(args: Array[String]): Unit = {
Logger.getRootLogger.setLevel(Level.ERROR)
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
val warehouse = "hdfs://cdh01:8020/user/hive/warehouse/carbon.store"
val sparkSession = SparkSession.builder()
.appName("xx")
.master("local[2]")
.config("spark.testing.memory","471859200")
.getOrCreateCarbonSession(warehouse)
@transient
val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(3))
ssc.checkpoint("hdfs://cdh01:8020/user/hive/warehouse/checkpointed/sdf")
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "cdh01:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "xx001",
"auto.offset.reset" -> "latest",//earliest latest
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("pd_ry_txjl")
val stream: InputDStream[ConsumerRecord[String, String]] =
KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams))
val kv: DStream[(String, String)] = stream.map(record => (record.key, record.value))
val value: DStream[String] = stream.map(_.value())
val mapDS = value.map(x => {
val dataFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val nObject = JSON.parseObject(x)
val bodyObject1 = nObject.getJSONObject("body")
val bodyObject2 = bodyObject1.getJSONObject("body")
val xqbm = bodyObject2.get("id").toString
val xqmc = bodyObject2.get("name").toString
(xqbm,xqmc)
})
val xqbmSet: mutable.HashSet[String] = getXqbmFromMysql
listBroadcastWrapper = BroadcastWrapper(ssc, xqbmSet)
val fliter: DStream[(String, String)] = mapDS.transform(rdd => {
  // transform's function runs on the driver once per batch, so the broadcast
  // variable can be refreshed here; doing the update inside the filter closure
  // would run on the executors, where the StreamingContext is not available
  val current_time = sdf.format(new Date())
  val minute = current_time.substring(14, 16).toLong
  if (minute % 2 == 0) {
    listBroadcastWrapper.update(getXqbmFromMysql, true) // refresh the broadcast blacklist while the minute value is even
  }
  val wrapper = listBroadcastWrapper // local val so the closure ships the wrapper (and its Broadcast), not the enclosing object
  rdd.filter(x => wrapper.value.contains(x._1))
})
fliter.print()
ssc.start()
ssc.awaitTermination()
}
def getXqbmFromMysql():mutable.HashSet[String]={
val perTime=System.currentTimeMillis()
val sql="select distinct(name) from user"
val conn=MysqlManager.getMysqlManager.getConnection
val statement = conn.createStatement()
try {
val rs = statement.executeQuery(sql)
val words = mutable.HashSet[String]()
while (rs.next()) {
words += rs.getString("name")
}
words
} catch {
case e:Exception=>e.printStackTrace()
mutable.HashSet[String]() // the return type is mutable.HashSet[String], so return an empty set if the query above throws
} finally {
statement.close()
conn.close
}
}
}
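As an alternative to the even-minute check inside main above, the refresh can be driven by the elapsed time since the last update, in the spirit of the Conf.updateFreq sketch shown earlier. A minimal variant that would replace the transform block; updateFreqMs and lastUpdate are illustrative names, not part of the original code:
// Hypothetical variant: refresh the broadcast blacklist at a fixed interval (driver side)
val updateFreqMs = 2 * 60 * 1000L // assumed refresh interval: 2 minutes
var lastUpdate = System.currentTimeMillis()
val fliter: DStream[(String, String)] = mapDS.transform(rdd => {
  if (System.currentTimeMillis() - lastUpdate > updateFreqMs) {
    listBroadcastWrapper.update(getXqbmFromMysql, true)
    lastUpdate = System.currentTimeMillis()
  }
  val wrapper = listBroadcastWrapper
  rdd.filter(x => wrapper.value.contains(x._1))
})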
The pom is as follows:
<properties>
<spark.version>2.3.2</spark.version>
<scala.version>2.11.8</scala.version>
<hbase.version>1.2.1</hbase.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-yarn_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.31</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.jolbox</groupId>
<artifactId>bonecp</artifactId>
<version>0.8.0.RELEASE</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.13</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.hadoop</groupId>-->
<!--<artifactId>hadoop-client</artifactId>-->
<!--<version>2.7.2</version>-->
<!--</dependency>-->
<!-- the guava version must match the hadoop version -->
<!--<dependency>-->
<!--<groupId>com.google.guava</groupId>-->
<!--<artifactId>guava</artifactId>-->
<!--<version>18.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<id>compile-scala</id>
<phase>compile</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>test-compile-scala</id>
<phase>test-compile</phase>
<goals>
<goal>add-source</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>2.11.8</scalaVersion>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<compilerArgs>
<arg>-extdirs</arg>
<arg>${project.basedir}/lib</arg>
</compilerArgs>
</configuration>
</plugin>
</plugins>
</build>