GenerateWord.java (produces random words to Kafka)
package day14;
/**
 * Creates a producer that emits a random UUID key and a random lowercase letter as the value.
 * Used to feed the real-time streaming word count whose results are stored in Redis.
*/
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import java.util.Properties;
import java.util.Random;
import java.util.UUID;
public class GenerateWord {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // List of Kafka brokers
        props.setProperty("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092");
        // Key and value serializers; records must be serialized because they are sent over the network
        props.setProperty("key.serializer", StringSerializer.class.getName());
        props.setProperty("value.serializer", StringSerializer.class.getName());
        // Create a producer client instance
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);
        while (true) {
            Thread.sleep(500);
            String key = UUID.randomUUID().toString();
            int value = new Random().nextInt(26) + 97; // code point of a random letter 'a'..'z'
            char word = (char) value;
            ProducerRecord<String, String> record =
                    new ProducerRecord<>("wordcount", key, String.valueOf(word));
            kafkaProducer.send(record);
            System.out.println("record = " + record);
        }
    }
}
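To check that records are actually landing on the topic before wiring up Spark, a plain consumer can tail it. The following is a minimal sketch, assuming the same brokers and the wordcount topic; the object name VerifyWordTopic and the group id verify_group are made up for this example and are not part of the project.

import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer
import scala.collection.JavaConverters._

object VerifyWordTopic {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092")
    props.setProperty("key.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("value.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("group.id", "verify_group") // throwaway group id, for checking only
    props.setProperty("auto.offset.reset", "earliest")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("wordcount"))
    while (true) {
      val records = consumer.poll(1000) // poll(long) as in the 0.10.x client
      for (r <- records.asScala) println(s"${r.key()} -> ${r.value()}")
    }
  }
}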
MyNetWordCountRedis.scala
package day14
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.JPools
/**
 * Integrates Spark Streaming with Kafka and Redis to implement a word count.
 */
object MyNetWordCountRedis {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    // Create the StreamingContext
    val conf = new SparkConf().setAppName(s"${this.getClass.getName}").setMaster("local[*]")
    // Rate limit when pulling from Kafka: max records per batch = 5 * (number of partitions) * (batch interval in seconds);
    // e.g. with 3 partitions and the 2-second batch below, at most 5 * 3 * 2 = 30 records per batch
    conf.set("spark.streaming.kafka.maxRatePerPartition", "5")
    // Stop the streaming job gracefully on shutdown
    conf.set("spark.streaming.kafka.stopGracefullyOnShutdown", "true")
    val ssc = new StreamingContext(conf, Seconds(2))
    // Consumer group id
    val groupId = "day14_001"
    // Topic to consume
    val topic = "wordcount"
    /**
     * Kafka consumer parameters
     */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Connect to the Kafka source
    val stream = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent, // location strategy: spread partitions evenly across available executors
      ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams))
    // foreachRDD runs on the Driver; the per-partition work below runs on the executors
    stream.foreachRDD(rdd => {
      val reduced = rdd.map(x => (x.value(), 1)).reduceByKey(_ + _)
      reduced.foreachPartition(partition => {
        // Get a Redis connection from the pool
        val redis = JPools.getJedis
        partition.foreach(x => redis.hincrBy("wordcount", x._1, x._2.toLong))
        redis.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
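Once the job has processed a few batches, the running totals can be read back from the wordcount hash through the same pool. A minimal read-back sketch; the object name VerifyCounts is a throwaway, not part of the project.

import utils.JPools
import scala.collection.JavaConverters._

object VerifyCounts {
  def main(args: Array[String]): Unit = {
    val jedis = JPools.getJedis
    // hgetAll returns a java.util.Map[String, String] of letter -> running count
    val counts = jedis.hgetAll("wordcount").asScala
    counts.toSeq.sortBy(_._1).foreach { case (word, count) => println(s"$word : $count") }
    jedis.close()
  }
}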
JPools.scala (Redis connection pool)
package utils
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.JedisPool
/**
 * A simple Redis connection pool
 */
object JPools {
  private val poolConfig = new GenericObjectPoolConfig()
  poolConfig.setMaxIdle(5)     // maximum number of idle connections kept in the pool (default 8)
  poolConfig.setMaxTotal(2000) // maximum number of connections the pool will hand out (default 8)
  // The pool itself is private and not exposed to callers
  private lazy val jedisPool = new JedisPool(poolConfig, "hadoop02")

  def getJedis = {
    val jedis = jedisPool.getResource
    jedis.select(0) // use database 0
    jedis
  }
}
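Every caller of getJedis has to remember to call close() so the connection goes back to the pool. A small loan-pattern wrapper keeps that in one place; the withJedis helper below is only an illustration under that assumption, not part of the project.

import redis.clients.jedis.Jedis
import utils.JPools

object JedisLoan {
  // Borrow a connection, run the block, and always return the connection to the pool
  def withJedis[T](body: Jedis => T): T = {
    val jedis = JPools.getJedis
    try body(jedis)
    finally jedis.close()
  }
}

// Usage: JedisLoan.withJedis(j => j.hgetAll("wordcount"))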
JedisOffSet.scala (looks up the offsets saved in Redis for a given group id)
package day14
import java.util
import org.apache.kafka.common.TopicPartition
import utils.JPools
object JedisOffSet {
  def apply(groupid: String): Map[TopicPartition, Long] = {
    var fromDbOffset = Map[TopicPartition, Long]()
    val jedis = JPools.getJedis
    // Offsets live in a hash keyed by group id, with fields of the form "topic-partition"
    val topicPartitionOffset: util.Map[String, String] = jedis.hgetAll(groupid)
    import scala.collection.JavaConverters._
    val topicPartitionOffsetList: List[(String, String)] = topicPartitionOffset.asScala.toList
    for (topicPL <- topicPartitionOffsetList) {
      // Split "topic-partition" (assumes the topic name itself contains no '-')
      val split: Array[String] = topicPL._1.split("[-]")
      fromDbOffset += (new TopicPartition(split(0), split(1).toInt) -> topicPL._2.toLong)
    }
    jedis.close() // return the connection to the pool
    fromDbOffset
  }
}
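For reference, the layout this reads is one Redis hash per consumer group, with fields like wordcount-0, wordcount-1 mapped to offsets. The sketch below seeds such a hash for a made-up group (demo_group, with invented offsets 42 and 17) and reads it back through JedisOffSet; it is only meant to show the key format.

package day14

import utils.JPools

object JedisOffSetDemo {
  def main(args: Array[String]): Unit = {
    val jedis = JPools.getJedis
    // Field format is "<topic>-<partition>", value is the offset to resume from
    jedis.hset("demo_group", "wordcount-0", "42")
    jedis.hset("demo_group", "wordcount-1", "17")
    jedis.close()

    // Prints the entries as (TopicPartition, Long) pairs
    JedisOffSet("demo_group").foreach(println)
  }
}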
Streaming_Kafka_Redis_Offset.scala
package day14
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import utils.JPools
/**
 * Combines Spark Streaming, Kafka and Redis to implement a word count,
 * with the consumer offsets also managed in Redis.
 */
object Streaming_Kafka_Redis_Offset {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    // Create the StreamingContext
    val conf = new SparkConf().setAppName(s"${this.getClass.getName}").setMaster("local[*]")
    // Rate limit when pulling from Kafka: max records per batch = 5 * (number of partitions) * (batch interval in seconds)
    conf.set("spark.streaming.kafka.maxRatePerPartition", "5")
    // Stop the streaming job gracefully on shutdown
    conf.set("spark.streaming.kafka.stopGracefullyOnShutdown", "true")
    val ssc = new StreamingContext(conf, Seconds(2))
    // Consumer group id
    val groupId = "day14_001"
    // Topic to consume
    val topic = "wordcount"
    /**
     * Kafka consumer parameters
     */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Offsets previously saved in Redis for this group (empty on the very first run)
    val fromOffset: Map[TopicPartition, Long] = JedisOffSet(groupId)
    // Connect to the Kafka source
    val stream = if (fromOffset.isEmpty) {
      // No saved offsets yet: subscribe normally and start from auto.offset.reset
      KafkaUtils.createDirectStream(ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams))
    } else {
      // Saved offsets found: assign the partitions explicitly and resume from them
      KafkaUtils.createDirectStream(ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Assign[String, String](fromOffset.keys, kafkaParams, fromOffset))
    }
    // foreachRDD runs on the Driver; the per-partition work below runs on the executors
    stream.foreachRDD(rdd => {
      val offsetRange = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val reduced = rdd.map(x => (x.value(), 1)).reduceByKey(_ + _)
      reduced.foreachPartition(partition => {
        // Get a Redis connection from the pool
        val redis = JPools.getJedis
        partition.foreach(x => redis.hincrBy("wordcount", x._1, x._2.toLong))
        redis.close()
      })
      // Save the offsets to Redis only after the batch's counts have been written (at-least-once semantics)
      val jedis = JPools.getJedis
      for (o <- offsetRange) {
        jedis.hset(groupId, o.topic + "-" + o.partition, o.untilOffset.toString)
      }
      jedis.close()
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
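Storing offsets in Redis is one option; the spark-streaming-kafka-0-10 integration can also commit them back to Kafka itself once a batch has been processed, via CanCommitOffsets. The sketch below shows that alternative as a standalone helper (KafkaCommittedOffsets and wireUp are made-up names); it would take the place of the Redis hset loop above.

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

object KafkaCommittedOffsets {
  // Pass in the direct stream created by KafkaUtils.createDirectStream
  def wireUp(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // ... write the batch's results (e.g. the Redis hincrBy loop) here ...
      // Then commit the offsets back to Kafka asynchronously instead of hset-ing them into Redis
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
  }
}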