Spark Streaming MakeData: generate test data and send it to Kafka
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import scala.collection.mutable.ListBuffer
import scala.util.Random

// Produce test data into Kafka
object MakeDataDemo {
  def main(args: Array[String]): Unit = {
    // Kafka producer parameters
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    val words = List[String]("zhangsan", "lisi", "wangwu", "spark", "sql", "Streaming", "xiaowang", "xiaohong")
    while (true) {
      // Build a line of 1 to 6 randomly chosen words
      val lines = new ListBuffer[String]
      for (i <- 1 to (1 + new Random().nextInt(6))) {
        lines.append(words(new Random().nextInt(8)))
      }
      val data = new ProducerRecord[String, String]("sparktest", lines.mkString(" ")) // topic name: sparktest
      producer.send(data)
      Thread.sleep(500)
      // print(data)
    }
  }
}
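Each record sent to the sparktest topic is a single space-separated line of 1 to 6 words drawn at random from the words list (for example a value such as lisi spark wangwu). To sanity-check that data is arriving before starting the streaming job, the topic can be tailed from the command line, assuming the standard Kafka console scripts are available on one of the brokers:
kafka-console-consumer.sh --bootstrap-server 192.168.58.201:9092 --topic sparktest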
Create the corresponding MySQL tables
mysql> create table wordcount (word varchar(200) primary key, counts int(11));
mysql> create table stopwords (word varchar(200) primary key);
mysql> insert into stopwords values('wangwu');
mysql> truncate table wordcount;
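Once the streaming job below is running, the accumulated word counts can be checked at any time with a simple query, for example:
mysql> select * from wordcount order by counts desc;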
Spark Streaming: read the Kafka data, filter out stop words, and write the word counts to MySQL
package SparkTest.SparkStreaming

import java.sql.{Connection, PreparedStatement, ResultSet}
import SparkTest.SparkStreaming.StopWords
import SparkTest.util.DBUtil
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import scala.collection.mutable.ListBuffer

object SparkStreamingkafkaDemo {
  def main(args: Array[String]): Unit = {
    // Create SparkConf
    val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getName)
    // Create StreamingContext with a 5-second batch interval
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-demo",
      "auto.offset.reset" -> "latest"
      // "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Read from Kafka and create the DStream
    val topics = Array("sparktest")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    /* // Filtering, approach 1: hard-code the stop word inside transform
    val result = stream.map(_.value).flatMap(_.split(" ")).transform(rdd => {
      val stopwords = new ListBuffer[String]()
      stopwords.append("wangwu")
      rdd.filter(!_.contains(stopwords(0)))
    }) */
    /* // Filtering, approach 2: filter directly on the literal
    val result = stream.map(_.value).flatMap(_.split(" ")).filter(!_.contains("wangwu")) */
    // Filtering, approach 3: load the stop words from the StopWords object
    val result = stream.map(_.value).flatMap(_.split(" ")).transform(rdd => {
      val stopwords = StopWords.getWords() // StopWords approach 1 or 2
      //val stopwords = StopWords.getWord() // StopWords approach 3
      rdd.filter(word => {
        !stopwords.contains(word)
      })
    })
    result.map((_, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      // foreachRDD: write each batch's counts to MySQL
      rdd.foreachPartition(it => {
        val con = DBUtil.getConnection()
        val pstmt = con.prepareStatement("insert into wordcount values(?,?) ON DUPLICATE KEY UPDATE counts=counts+?")
        try {
          it.foreach(line => {
            pstmt.setString(1, line._1)
            pstmt.setInt(2, line._2)
            pstmt.setInt(3, line._2)
            pstmt.addBatch()
          })
          pstmt.executeBatch()
        } catch {
          case e: Exception => e.printStackTrace()
        } finally {
          pstmt.close()
          con.close()
        }
      })
    })
    // stream.map(_.value).print()
    // Start the job
    ssc.start()
    ssc.awaitTermination()
  }
}
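Because word is the primary key of wordcount, the ON DUPLICATE KEY UPDATE clause turns each insert into an upsert: the first time a word appears it is inserted with that batch's count, and in later batches the existing row is incremented. For instance, if one 5-second batch contains spark three times, the statement effectively runs:
insert into wordcount values('spark', 3) ON DUPLICATE KEY UPDATE counts=counts+3;
so counts accumulate across batches rather than being overwritten.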
DBUtil: MySQL connection class
package SparkTest.util

import java.sql.{Connection, DriverManager}

object DBUtil {
  def getConnection(): Connection = {
    Class.forName("com.mysql.jdbc.Driver")
    DriverManager.getConnection("jdbc:mysql://192.168.58.203/testdb", "root", "123")
  }
}
StopWords: stop-word filter class
package SparkTest.SparkStreaming

import java.sql.{Connection, PreparedStatement, ResultSet}
import SparkTest.util.DBUtil
import scala.collection.mutable.ListBuffer

object StopWords {
  // Approach 1: the stop words are kept in the database
  val stopword = new ListBuffer[String]()
  var con: Connection = null
  var pstmt: PreparedStatement = null
  var rs: ResultSet = null
  try {
    con = DBUtil.getConnection()
    pstmt = con.prepareStatement("select * from stopwords") // read the stop words from MySQL
    // mysql> create table stopwords (word varchar(200) primary key);
    // mysql> insert into stopwords values('wangwu');
    rs = pstmt.executeQuery()
    while (rs.next()) {
      stopword.append(rs.getString("word"))
    }
  } catch {
    case e: Exception => e.printStackTrace()
  } finally {
    rs.close()
    pstmt.close()
    con.close()
  }

  /*
  // Approach 2: the stop words are kept in a local file
  val lines = scala.io.Source.fromFile("data/stopword.txt").getLines() // read the file
  // put "wangwu" into data/stopword.txt
  lines.foreach(line => {
    stopword.append(line)
  }) */

  def getWords(): ListBuffer[String] = {
    stopword
  }

  /*
  // Approach 3: the stop word is set manually
  val words = "wangwu"
  def getWord(): String = {
    words
  } */

  def main(args: Array[String]): Unit = {
    // val stopWord = new StopWord("wangwu")
    // print(stopWord.getWord()) // approach 1
    // println(getWords())
  }
}
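Note that with approach 1 the stop-word list is loaded only once, when the StopWords object is first initialized; rows inserted into the stopwords table while the job is running are not picked up until the application restarts. If per-batch refreshing were needed, the query would have to be issued inside the transform call instead.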
OffSetDemo: for each batch, print which topic and partition were read and the from/until offsets
package SparkTest.SparkStreaming

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

// For each batch: which topic and partition were read, and from which offset to which offset
object OffSetDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getName)
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-demo",
      "auto.offset.reset" -> "latest"
      // "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("sparktest")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.foreachRDD { rdd =>
      // Array of OffsetRange, one element per Kafka partition in this batch
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition { iter =>
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
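OffSetDemo only prints the offset ranges. If the goal is also to store the consumer's progress back in Kafka once a batch has been processed (so that a restart with the same group.id resumes where it left off), the kafka010 integration exposes CanCommitOffsets. A minimal sketch, assuming the same stream as above and enable.auto.commit set to false in kafkaParams:
// additionally: import org.apache.spark.streaming.kafka010.CanCommitOffsets
stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // commit the consumed ranges back to Kafka once this batch's work is done
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}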