spark-2.2.0
kafka_2.11-2.3.0
zookeeper-3.5.5
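
The two listings below need spark-streaming-kafka-0-10 and the Kafka client library on the classpath. A minimal build.sbt sketch matching the versions listed above (the exact version numbers and Scala 2.11.12 are assumptions, not taken from the original project):

// hypothetical build.sbt sketch; any Scala 2.11.x works with Spark 2.2.0
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  // Spark core and streaming, matching spark-2.2.0 above
  "org.apache.spark" %% "spark-core"                 % "2.2.0",
  "org.apache.spark" %% "spark-streaming"            % "2.2.0",
  // direct-stream integration for Kafka 0.10+ brokers
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0",
  // producer/consumer client, matching kafka_2.11-2.3.0 above
  "org.apache.kafka" %  "kafka-clients"              % "2.3.0"
)
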
package doc
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import scala.util.Random
/**
* @Author huangwei
* @Date 19-10-15
* @Comments
**/
object KafkaProducer extends App {
  // target topic
  val topic = "KafkaOperation"
  // broker list
  val brokers = "localhost:9091,localhost:9092,localhost:9093"
  // random generator, used to add a little jitter between sends
  val rnd = new Random()
  // producer configuration
  val props = new Properties()
  // brokers
  props.put("bootstrap.servers", brokers)
  // client name
  props.put("client.id", "kafkaGenerator")
  // key/value serializers
  props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  // create the Kafka producer
  val producer = new KafkaProducer[String, String](props)
  val t = System.currentTimeMillis() // start time, used for the throughput report below

  val nameAddrs = Map("bob" -> "shanghai#200000", "amy" -> "beijing#100000", "alice" -> "shanghai#200000",
    "tom" -> "beijing#100000", "lulu" -> "hangzhou#310000", "nick" -> "shanghai#200000")
  val namePhones = Map("bob" -> "15700079421", "amy" -> "18700079458", "alice" -> "17730076427",
    "tom" -> "16700379451", "lulu" -> "18800074423", "nick" -> "14400033426")
  // send one name -> address record per entry, tagged with record type 0
  for (nameAddr <- nameAddrs) {
    val data = new ProducerRecord[String, String](topic, nameAddr._1, s"${nameAddr._1}\t${nameAddr._2}\t0")
    producer.send(data)
    if (rnd.nextInt(100) < 50) Thread.sleep(rnd.nextInt(10))
  }
  // send one name -> phone record per entry, tagged with record type 1
  for (namePhone <- namePhones) {
    val data = new ProducerRecord[String, String](topic, namePhone._1, s"${namePhone._1}\t${namePhone._2}\t1")
    producer.send(data)
    if (rnd.nextInt(100) < 50) Thread.sleep(rnd.nextInt(10))
  }
  // send() is asynchronous, so flush before measuring throughput
  producer.flush()
  println("sent per second: " + (nameAddrs.size + namePhones.size) * 1000 / (System.currentTimeMillis() - t))
  producer.close()
}
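
producer.send() above is fire-and-forget. A minimal sketch, reusing the producer and data values from the listing above, of confirming delivery with the standard kafka-clients Callback:

import org.apache.kafka.clients.producer.{Callback, RecordMetadata}

producer.send(data, new Callback {
  override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {
    if (exception != null) exception.printStackTrace() // delivery failed
    else println(s"delivered to ${metadata.topic()}-${metadata.partition()} @ offset ${metadata.offset()}")
  }
})
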
package doc
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* @Author huangwei
* @Date 19-10-16
* @Comments
**/
object KafkaOperation {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SparkStreaming-Kafka")
      .setMaster("local[*]")
      .set("spark.streaming.kafka.maxRatePerPartition", "10")
    val ssc = new StreamingContext(conf, Seconds(3))

    // consumer parameters for the direct (receiver-less) Kafka stream
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9091,localhost:9092,localhost:9093", // broker addresses
      "key.deserializer" -> classOf[StringDeserializer], // key/value deserializers
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "kafkaOperationGroup", // consumer group
      "auto.offset.reset" -> "latest", // start from the latest offset
      "enable.auto.commit" -> (false: java.lang.Boolean) // disable auto-commit of offsets
    )
    val kafkaDirectStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](List("KafkaOperation"), kafkaParams)
    )
    // split the received Kafka records into a name -> address DStream (record type 0)
    val nameAddStream = kafkaDirectStream.map(_.value).filter(record => {
      val tokens = record.split("\t")
      tokens(2).toInt == 0
    }).map(record => {
      val tokens = record.split("\t")
      (tokens(0), tokens(1))
    })
    // and a name -> phone DStream (record type 1)
    val namePhoneStream = kafkaDirectStream.map(_.value).filter(record => {
      val tokens = record.split("\t")
      tokens(2).toInt == 1
    }).map(record => {
      val tokens = record.split("\t")
      (tokens(0), tokens(1))
    })
    // join the two streams on the name key and format the result
    val nameAddPhoneStream = nameAddStream.join(namePhoneStream).map(
      record => {
        s"name: ${record._1}, address: ${record._2._1}, phone: ${record._2._2}"
      }
    )
    nameAddPhoneStream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
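
enable.auto.commit is false in kafkaParams and nothing above commits offsets, so on restart the consumer group simply falls back to auto.offset.reset. A minimal sketch of the spark-streaming-kafka-0-10 pattern for committing offsets manually, assuming the kafkaDirectStream value from the listing above:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

kafkaDirectStream.foreachRDD { rdd =>
  // grab the offset ranges before any shuffle/repartition of the RDD
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // commit asynchronously back to Kafka once the batch has been handled
  kafkaDirectStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}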