import java.util.Properties
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
/**
 * Demo producer built on the legacy Kafka 0.8 Scala producer API.
 *
 * Sends 1000 string messages ("producer send data:1" .. "producer send data:1000")
 * to the `testapi` topic and then closes the producer.
 *
 * Usage: ProducerDemo [brokerList]
 */
object ProducerDemo {
  def main(args: Array[String]): Unit = {
    // Producer configuration.
    val prop = new Properties()
    // Kafka broker list (a required producer setting).
    // NOTE(review): the original host list is not present in this file. It is read
    // from the first program argument and falls back to a local placeholder --
    // confirm the right value for your cluster.
    prop.put("metadata.broker.list", args.headOption.getOrElse("localhost:9092"))
    // Serializer class: the messages are Strings, so the String encoder is needed.
    prop.put("serializer.class", "kafka.serializer.StringEncoder")
    // Acknowledgement mode used after sending data.
    // NOTE(review): the original value is not present in this file; "1" (leader
    // acknowledgement) is an assumption -- confirm.
    prop.put("request.required.acks", "1")
    // Default partitioner, kept for reference:
    // prop.put("partitioner.class", "kafka.producer.DefaultPartitioner")
    // Custom partitioner class.
    prop.put("partitioner.class", "cry.day05.kafka.CustomPartitioner")

    val config: ProducerConfig = new ProducerConfig(prop)
    val producer: Producer[String, String] = new Producer(config)
    // Topic that receives the data.
    val topic = "testapi"

    try {
      // Simulated data.
      for (i <- 1 to 1000) {
        val msg = s"producer send data:$i"
        producer.send(new KeyedMessage[String, String](topic, msg))
      }
    } finally {
      // Release the producer's resources even if a send fails.
      producer.close()
    }
  }
}
import java.util.Properties
import java.util.concurrent.{ExecutorService, Executors}
import kafka.consumer.{Consumer, ConsumerConfig, KafkaStream}
import scala.collection.mutable
/**
 * Runnable that reads one Kafka stream and prints every message it receives.
 *
 * @param consumer label printed with each message to identify this worker
 * @param stream   Kafka stream (raw byte keys and values) to read from
 */
class ConsumerDemo(val consumer: String, val stream: KafkaStream[Array[Byte], Array[Byte]]) extends Runnable {
  override def run(): Unit = {
    val it = stream.iterator()
    while (it.hasNext()) {
      val data = it.next()
      val topic = data.topic
      val partition = data.partition
      val offset = data.offset
      // Decode explicitly as UTF-8 instead of relying on the platform default charset.
      val msg = new String(data.message(), java.nio.charset.StandardCharsets.UTF_8)
      println(s"consumer:$consumer, topic:$topic, partition:$partition, offset:$offset,msg:$msg")
    }
  }
}
/**
 * Entry point of the consumer demo (legacy Kafka 0.8 high-level consumer API).
 *
 * Requests two streams for the `testapi` topic and hands each one to its own
 * [[ConsumerDemo]] worker running on a thread pool.
 *
 * Usage: ConsumerDemo [zookeeperConnect]
 */
object ConsumerDemo {
  def main(args: Array[String]): Unit = {
    val topic = "testapi"
    // Number of streams (and therefore worker threads) requested for the topic.
    val threadsPerTopic = 2
    // key: topic name, value: number of threads used to read that topic.
    val topics = Map(topic -> threadsPerTopic)

    // Consumer configuration.
    val prop = new Properties()
    // ZooKeeper quorum.
    // NOTE(review): the original host list is not present in this file. It is read
    // from the first program argument and falls back to a local placeholder --
    // confirm the right value for your cluster.
    prop.put("zookeeper.connect", args.headOption.getOrElse("localhost:2181"))
    // Consumer group name (the original had an empty property key here).
    prop.put("group.id", "xxx")
    // Offset to start from when the stored offset is missing or invalid.
    prop.put("auto.offset.reset", "smallest")

    val config = new ConsumerConfig(prop)
    val consumer = Consumer.create(config)

    // In the returned map the key is the topic name and the value is that topic's streams.
    val streams: collection.Map[String, List[KafkaStream[Array[Byte], Array[Byte]]]] = consumer.createMessageStreams(topics)
    // Streams of the requested topic; empty if the topic is absent from the map.
    val topicStreams: List[KafkaStream[Array[Byte], Array[Byte]]] = streams.getOrElse(topic, Nil)

    // One pool thread per requested stream.
    val pool: ExecutorService = Executors.newFixedThreadPool(threadsPerTopic)
    // Iterate over the streams themselves: iterating over the Option's size (at most 1)
    // would start only one worker and leave the remaining streams unread.
    for ((stream, i) <- topicStreams.zipWithIndex) {
      pool.execute(new ConsumerDemo(s"Consumer:$i", stream))
    }
  }
}
import kafka.producer.Partitioner
import kafka.utils.{Utils, VerifiableProperties}
// 1. A custom partitioner class must extend Partitioner.
// 2. Its primary constructor must take a parameter of type VerifiableProperties,
//    otherwise an error is reported.
class CustomPartitioner(props: VerifiableProperties = null) extends Partitioner {
  /**
   * Chooses a partition from the key's hash code.
   *
   * @param key           message key
   * @param numPartitions number of partitions of the target topic
   * @return `Utils.abs(key.hashCode) % numPartitions`
   */
  override def partition(key: Any, numPartitions: Int): Int = {
    Utils.abs(key.hashCode) % numPartitions
  }
}
每个partition相当于一个巨型文件被平均分配到多个大小相等segment(段)数据文件中。但每个段segment file消息数量不一定相等,这种特性能方便old segment file快速被删除,默认保存7天的数据。
segment file:由2大部分组成,分别为index file和data file,此2个文件一一对应,后缀.index表示为segment索引文件,.log表示segment数据文件
00000000000000368769.index的消息量起始偏移量为368770 = 368769 + 1
00000000000000737337.index的起始偏移量为737338=737337 + 1
以起始偏移量命名并排序这些文件,只要根据offset 二分查找文件列表,就可以快速定位到具体文件。当offset=368776时定位到00000000000000368769.index和对应log文件。
spark streaming用于流式数据的处理,有高吞吐量和容错能力强等特点。spark streaming支持的数据输入源很多,例如kafka、flume、zeroMQ等等。数据输入后可以使用spark的高度抽象原语如map、reduce、join、window等进行计算。结果也能保存在很多地方,如hdfs、数据库等
spark streaming有易用、容错、易整合到spark体系等优点。
Discretized Stream(DStream)是Spark Streaming的基础抽象,代表持续性的数据流和经过各种Spark原语操作后的结果数据流。在内部实现上,DStream由一系列连续的RDD来表示,每个RDD含有一段时间间隔内的数据,如下图
计算过程由Spark engine来完成
DStream上的原语与RDD的算子类似,分为Transformations(转换)和Output Operations(输出)两种,此外转换操作中还有一些比较特殊的原语,如updateStateByKey()、transform()、以及各种Window相关的原语
Transformation | Meaning |
map(func) | Return a new DStream by passing each element of the source DStream through a function func. |
flatMap(func) | Similar to map, but each input item can be mapped to 0 or more output items. |
filter(func) | Return a new DStream by selecting only the records of the source DStream on which func returns true. |
repartition(numPartitions) | Changes the level of parallelism in this DStream by creating more or fewer partitions. |
union(otherStream) | Return a new DStream that contains the union of the elements in the source DStream and otherDStream. |
count() | Return a new DStream of single-element RDDs by counting the number of elements in each RDD of the source DStream. |
reduce(func) | Return a new DStream of single-element RDDs by aggregating the elements in each RDD of the source DStream using a function func (which takes two arguments and returns one). The function should be associative so that it can be computed in parallel. |
countByValue() | When called on a DStream of elements of type K, return a new DStream of (K, Long) pairs where the value of each key is its frequency in each RDD of the source DStream. |
reduceByKey(func, [numTasks]) | When called on a DStream of (K, V) pairs, return a new DStream of (K, V) pairs where the values for each key are aggregated using the given reduce function. Note: By default, this uses Spark’s default number of parallel tasks (2 for local mode, and in cluster mode the number is determined by the config property spark.default.parallelism) to do the grouping. You can pass an optional numTasks argument to set a different number of tasks. |
join(otherStream, [numTasks]) | When called on two DStreams of (K, V) and (K, W) pairs, return a new DStream of (K, (V, W)) pairs with all pairs of elements for each key. |
cogroup(otherStream, [numTasks]) | When called on a DStream of (K, V) and (K, W) pairs, return a new DStream of (K, Seq[V], Seq[W]) tuples. |
transform(func) | Return a new DStream by applying a RDD-to-RDD function to every RDD of the source DStream. This can be used to do arbitrary RDD operations on the DStream. |
updateStateByKey(func) | Return a new “state” DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values for the key. This can be used to maintain arbitrary state data for each key. |
updateStateByKey Operation
transform Operation
Transform原语允许DStream上执行任意的RDD-to-RDD函数。通过该函数可以方便的扩展Spark API。
window Operations
Output Operations可以将DStream的数据输出到外部的数据库或文件系统,当某个Output Operations原语被调用时(与RDD的Action相同),streaming程序才会开始真正的计算过程。
Output Operation | Meaning |
print() | Prints the first ten elements of every batch of data in a DStream on the driver node running the streaming application. This is useful for development and debugging. |
saveAsTextFiles(prefix, [suffix]) | Save this DStream’s contents as text files. The file name at each batch interval is generated based on prefix and suffix: “prefix-TIME_IN_MS[.suffix]”. |
saveAsObjectFiles(prefix, [suffix]) | Save this DStream’s contents as SequenceFiles of serialized Java objects. The file name at each batch interval is generated based on prefix and suffix: “prefix-TIME_IN_MS[.suffix]”. |
saveAsHadoopFiles(prefix, [suffix]) | Save this DStream’s contents as Hadoop files. The file name at each batch interval is generated based on prefix and suffix: “prefix-TIME_IN_MS[.suffix]”. |
foreachRDD(func) | The most generic output operator that applies a function, func, to each RDD generated from the stream. This function should push the data in each RDD to an external system, such as saving the RDD to files, or writing it over the network to a database. Note that the function func is executed in the driver process running the streaming application, and will usually have RDD actions in it that will force the computation of the streaming RDDs. |
yum install -y nc
nc -lk 6666
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Per-batch word count over text lines read from a socket (e.g. a netcat server).
 *
 * Every 5-second batch is split on spaces, the words are counted, and the counts
 * of that batch are printed.
 */
object StreamingWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StreamingWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Spark Streaming context with a 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    // Read the data served by netcat.
    val dStream: ReceiverInputDStream[String] = ssc.socketTextStream("cdhnocms01", 6666)
    // Analyse the data: split into words and count each word within the batch.
    val res: DStream[(String, Int)] = dStream.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    // Print to the console.
    res.print()
    // Start the streaming job.
    ssc.start()
    // Block this thread so the following batches keep being processed.
    ssc.awaitTermination()
  }
}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Word count over socket text that accumulates the counts across batches
 * by means of `updateStateByKey`.
 *
 * Usage: StreamingAccWordCount [checkpointDir]
 */
object StreamingAccWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StreamingAccWordCount").setMaster("local[2]")
    // Spark Streaming context with a 5-second batch interval.
    val ssc = new StreamingContext(conf, Seconds(5))
    // Set the checkpoint directory, preferably on HDFS. A checkpoint is mandatory
    // whenever the results of earlier batches have to be remembered.
    // NOTE(review): the original directory is not present in this file. It is read
    // from the first program argument and falls back to a local placeholder -- confirm.
    ssc.checkpoint(args.headOption.getOrElse("./checkpoint-StreamingAccWordCount"))
    // Read the data served by netcat.
    val dStream: ReceiverInputDStream[String] = ssc.socketTextStream("cdhnocms01", 6666)
    // Analyse the data.
    val tups: DStream[(String, Int)] = dStream.flatMap(_.split(" ")).map((_, 1))
    val res: DStream[(String, Int)] =
      tups.updateStateByKey(func, new HashPartitioner(ssc.sparkContext.defaultParallelism), rememberPartitioner = true)
    res.print()
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * State-update function passed to `updateStateByKey`; it combines the previous
   * batches with the current one. Each input element consists of:
   *  - String: the word, i.e. the key;
   *  - Seq[Int]: the values of that key in the current batch, e.g. Seq(1, 1, 1, 1);
   *  - Option[Int]: the total accumulated for that key by earlier batches, which
   *    may be absent -- hence the `getOrElse(0)`.
   */
  val func: Iterator[(String, Seq[Int], Option[Int])] => Iterator[(String, Int)] =
    it => it.map { case (word, counts, previous) =>
      (word, counts.sum + previous.getOrElse(0))
    }
}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Per-batch word count over socket text, written with `transform` so that plain
 * RDD operations are applied to the RDD of every batch.
 */
object TransformDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TransformDemo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    val dStream: ReceiverInputDStream[String] = ssc.socketTextStream("cdhnocms01", 6666)
    // Use transform to operate on the RDD inside the DStream.
    val res: DStream[(String, Int)] = dStream.transform { rdd =>
      rdd.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    }
    // An output operation and start() are needed before anything is computed.
    res.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Reads the data of one or more Kafka topics and computes a word count
 * whose totals accumulate across batches.
 *
 * Usage: LoadKafkaDataAndWC <zkQuorum> <group> <topics> <numThreads>
 * where `topics` is a comma-separated list of topic names.
 */
object LoadKafkaDataAndWC {
  def main(args: Array[String]): Unit = {
    // Local import: StorageLevel is not among the imports of this file.
    import org.apache.spark.storage.StorageLevel

    require(args.length == 4, "Usage: LoadKafkaDataAndWC <zkQuorum> <group> <topics> <numThreads>")

    val conf = new SparkConf().setAppName("LoadKafkaDataAndWC").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // updateStateByKey needs a checkpoint directory.
    // NOTE(review): no directory is present in the original; this local path is a
    // placeholder -- point it at HDFS for real deployments.
    ssc.checkpoint("./checkpoint-LoadKafkaDataAndWC")

    // Parameters used to connect to Kafka.
    val Array(zkQuorum, group, topics, numThreads) = args
    // Put every requested topic into a map: topic name -> number of threads.
    val topicMap: Map[String, Int] = topics.split(",").map((_, numThreads.toInt)).toMap
    // Fetch the Kafka data with the receiver-based approach.
    val data: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_AND_DISK)
    // Each record is a pair of (Kafka message key, message value); only the value
    // is needed here, so the key is dropped.
    val lines: DStream[String] = data.map(_._2)
    // Analyse the data.
    val tups: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))
    val res: DStream[(String, Int)] =
      tups.updateStateByKey(func, new HashPartitioner(ssc.sparkContext.defaultParallelism), rememberPartitioner = true)
    res.print()
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * State-update function for `updateStateByKey`: adds the values of the current
   * batch to the total accumulated so far (0 when there is no earlier total).
   */
  val func: Iterator[(String, Seq[Int], Option[Int])] => Iterator[(String, Int)] =
    it => it.map { case (x, y, z) =>
      (x, y.sum + z.getOrElse(0))
    }
}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Window-operation demo over socket text.
 *
 * Words are first accumulated across batches with `updateStateByKey`; the
 * resulting stream is then reduced per key over a 10-second window that slides
 * every 10 seconds (the batch interval is 5 seconds).
 */
object WindowOprationsDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WindowOprationsDemo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // updateStateByKey needs a checkpoint directory.
    // NOTE(review): no directory is present in the original; this local path is a
    // placeholder -- point it at HDFS for real deployments.
    ssc.checkpoint("./checkpoint-WindowOprationsDemo")

    val dStream: ReceiverInputDStream[String] = ssc.socketTextStream("cdhnocms01", 6666)
    val tups: DStream[(String, Int)] = dStream.flatMap(_.split(" ")).map((_, 1))
    val update: DStream[(String, Int)] =
      tups.updateStateByKey(func, new HashPartitioner(ssc.sparkContext.defaultParallelism), rememberPartitioner = true)
    // Window length 10 s, slide interval 10 s.
    val res: DStream[(String, Int)] = update.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(10), Seconds(10))
    res.print()
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * State-update function for `updateStateByKey`: adds the values of the current
   * batch to the total accumulated so far (0 when there is no earlier total).
   */
  val func: Iterator[(String, Seq[Int], Option[Int])] => Iterator[(String, Int)] =
    it => it.map { case (x, y, z) =>
      (x, y.sum + z.getOrElse(0))
    }
}