Contents
Spark Streaming Overview
What Spark Streaming Is
Spark Streaming Features
Spark Streaming Architecture
DStream
Getting Started with DStream
Creating DStreams
DStream Transformations
Stateless Transformations
Stateful Transformations
Custom Data Sources
Kafka Data Source
// What is streaming and what is batch processing?
// 1. By how the data is processed:
//    streaming data processing
//    batch data processing
// 2. By processing latency:
//    real-time processing: millisecond latency
//    offline processing: hour- or day-level latency
// Spark Streaming is a near-real-time (seconds to minutes), micro-batch (time-sliced) data processing framework.
Similar to SparkContext, StreamingContext is the entry point of a real-time application and serves as the bridge between the application and the Spark engine. DStream is to Spark Streaming what RDD is to Spark: it turns a potentially unbounded data stream into a sequence of discrete RDD batches.
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.12</artifactId>
    <version>2.4.5</version>
</dependency>
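To make the "discretized stream" idea above concrete, here is a minimal sketch (not part of the original example). It assumes an existing StreamingContext named ssc and the same socket source on port 9999 used in the full WordCount program below, and uses foreachRDD, which exposes each batch as an ordinary RDD.

// Minimal sketch: foreachRDD hands each batch of the DStream over as a plain RDD.
// Assumes an existing StreamingContext `ssc` and a socket source on port 9999,
// as in the full WordCount example below.
val lines = ssc.socketTextStream("localhost", 9999)
lines.foreachRDD { rdd =>
  // `rdd` holds exactly the records received during one batch interval
  println(s"records in this batch: ${rdd.count()}")
}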
package com.shujia.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DemoSpark01_WordCount {
  def main(args: Array[String]): Unit = {
    // TODO: create the environment object
    // StreamingContext takes two parameters:
    // the first parameter is the environment configuration
    val sparkconf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WordCount")
    // the second parameter is the batch interval (collection interval)
    val ssc = new StreamingContext(sparkconf, Seconds(3))

    // TODO processing logic
    // read data from a socket port
    val lines: ReceiverInputDStream[String] = ssc.socketTextStream("localhost", 9999)
    val words: DStream[String] = lines.flatMap(_.split(" "))
    val mapD: DStream[(String, Int)] = words.map((_, 1))
    val resultD: DStream[(String, Int)] = mapD.reduceByKey(_ + _)
    resultD.print()

    // TODO shutdown
    // The Spark Streaming collector is a long-running task, so it must not be stopped right away.
    // If main returns, the application exits as well, so main must not be allowed to finish.
    // ssc.stop()

    // 1. start the collector
    ssc.start()
    // 2. wait for the collector to terminate
    ssc.awaitTermination()
  }
}
// Open a terminal window and run:
// nc -lp 9999
package com.shujia.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

object DemoSpark02_Queue {
  def main(args: Array[String]): Unit = {
    // TODO: create the environment object
    // StreamingContext takes two parameters:
    // the first parameter is the environment configuration
    // 1. initialize the Spark configuration
    val sparkconf: SparkConf = new SparkConf()
      .setMaster("local[*]").setAppName("WordCount")
    // the second parameter is the batch interval (collection interval)
    // 2. initialize the StreamingContext
    val ssc = new StreamingContext(sparkconf, Seconds(3))

    // TODO processing logic
    // 3. create the RDD queue
    val queue: mutable.Queue[RDD[Int]] = new mutable.Queue[RDD[Int]]()
    // 4. create the QueueInputDStream
    val inputStream = ssc.queueStream(queue, oneAtATime = false)
    // 5. process the RDDs in the queue
    val mappedStream = inputStream.map((_, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    // 6. print the result
    reducedStream.print()

    // TODO shutdown
    // The Spark Streaming collector is a long-running task, so it must not be stopped right away.
    // If main returns, the application exits as well, so main must not be allowed to finish.
    // ssc.stop()

    // (1) start the collector
    ssc.start()
    // 7. create RDDs in a loop and push them into the queue
    for (i <- 1 to 5) {
      queue += ssc.sparkContext.makeRDD(1 to 300, 10)
      Thread.sleep(2000)
    }
    // (2) wait for the collector to terminate
    ssc.awaitTermination()
  }
}
package com.shujia.spark.test

import org.apache.commons.codec.StringDecoder
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DemoSpark05_Kafka {
  def main(args: Array[String]): Unit = {
    // TODO: create the environment object
    // StreamingContext takes two parameters:
    // the first parameter is the environment configuration
    // 1. initialize the Spark configuration
    val sparkconf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WordCount")
    // the second parameter is the batch interval (collection interval)
    // 2. initialize the StreamingContext
    val ssc = new StreamingContext(sparkconf, Seconds(3))

    // Kafka consumer parameters
    val kafkaParas: Map[String, Object] = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "master:9092,node1:9092,node2:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "shujia",
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
    )

    // TODO processing logic
    val kafkaDataDS: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](Set("shujia"), kafkaParas)
      )
    kafkaDataDS.map(_.value()).print()

    // (1) start the collector
    ssc.start()
    // (2) wait for the collector to terminate
    ssc.awaitTermination()
  }
}
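The Kafka stream can be transformed like any other DStream. As a sketch (not part of the original demo), the same word-count logic used in the socket example could be applied to the message values, reusing kafkaDataDS from the code above:

// Sketch: word count over the Kafka message values (reuses kafkaDataDS from the demo above).
val kafkaWordCounts: DStream[(String, Int)] = kafkaDataDS
  .map(_.value())
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
kafkaWordCounts.print()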
package com.shujia.spark.test

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DemoSpark06_State {
  def main(args: Array[String]): Unit = {
    // TODO: create the environment object
    val sparkconf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WordCount")
    val ssc = new StreamingContext(sparkconf, Seconds(3))

    // Stateless operations only process the data of the current batch interval.
    // In some scenarios the aggregated result (the state) must be kept across batches.
    // Stateful operations require a checkpoint directory.
    ssc.checkpoint("cp")

    val datas: ReceiverInputDStream[String] = ssc.socketTextStream("localhost", 9999)
    val wordToOne: DStream[(String, Int)] = datas.map((_, 1))
    // val wordCount: DStream[(String, Int)] = wordToOne.reduceByKey(_ + _)

    // updateStateByKey: updates the state associated with each key
    // The update function takes two arguments:
    // the first is the sequence of values for the same key in the current batch
    // the second is the buffered (previous) state for that key
    val state = wordToOne.updateStateByKey(
      (seq: Seq[Int], buff: Option[Int]) => {
        val newCount: Int = buff.getOrElse(0) + seq.sum
        Option(newCount)
      }
    )
    state.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
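updateStateByKey is one stateful transformation; window operations are another. The following sketch (not part of the original demo) could be appended to the logic above, reusing wordToOne and the 3-second batch interval; the window length and slide interval must both be multiples of the batch interval.

// Sketch of a window operation: word counts over the last 12 seconds, recomputed every 6 seconds.
// Reuses wordToOne from DemoSpark06_State above.
val windowedCounts: DStream[(String, Int)] = wordToOne.reduceByKeyAndWindow(
  (a: Int, b: Int) => a + b, // reduce function applied within the window
  Seconds(12),               // window length
  Seconds(6)                 // slide interval
)
windowedCounts.print()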
// Custom data source
package com.shujia.spark.test

import java.util.Random

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

object DemoSpark03_Diy {
  def main(args: Array[String]): Unit = {
    // TODO: create the environment object
    // StreamingContext takes two parameters:
    // the first parameter is the environment configuration
    // 1. initialize the Spark configuration
    val sparkconf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WordCount")
    // the second parameter is the batch interval (collection interval)
    // 2. initialize the StreamingContext
    val ssc = new StreamingContext(sparkconf, Seconds(3))

    // TODO processing logic
    val messageDS: ReceiverInputDStream[String] = ssc.receiverStream(new MyReceiver())
    messageDS.print()

    // (1) start the collector
    ssc.start()
    // (2) wait for the collector to terminate
    ssc.awaitTermination()
  }

  /**
   * Custom data collector (receiver)
   * 1. Extend Receiver, specify the type parameter and constructor arguments
   * 2. Override onStart and onStop
   */
  class MyReceiver extends Receiver[String](StorageLevel.MEMORY_ONLY) {
    private var flg = true

    override def onStart(): Unit = {
      new Thread(new Runnable {
        override def run(): Unit = {
          while (flg) {
            val message = "collected data: " + new Random().nextInt(10).toString
            store(message)
            Thread.sleep(500)
          }
        }
      }).start()
    }

    override def onStop(): Unit = {
      flg = false
    }
  }
}
package com.shujia.spark.test

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets
import java.util.Random

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DemoSpark04_Diy {
  def main(args: Array[String]): Unit = {
    // TODO: create the environment object
    // StreamingContext takes two parameters:
    // the first parameter is the environment configuration
    // 1. initialize the Spark configuration
    val sparkconf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("WordCount")
    // the second parameter is the batch interval (collection interval)
    // 2. initialize the StreamingContext
    val ssc = new StreamingContext(sparkconf, Seconds(3))

    // TODO processing logic
    // create a DStream backed by the custom receiver
    val lineStream: ReceiverInputDStream[String] = ssc.receiverStream(new MyReceiver("localhost", 9999))
    // split each line into words
    val wordStream = lineStream.flatMap(_.split("\t"))
    // map each word to a tuple (word, 1)
    val wordAndOneStream = wordStream.map((_, 1))
    // count the occurrences of each word
    val wordAndCountStream = wordAndOneStream.reduceByKey(_ + _)
    // print
    wordAndCountStream.print()

    // (1) start the collector
    ssc.start()
    // (2) wait for the collector to terminate
    ssc.awaitTermination()
  }

  /**
   * Custom data collector (receiver)
   * 1. Extend Receiver, specify the type parameter and constructor arguments
   * 2. Override onStart and onStop
   */
  class MyReceiver(host: String, port: Int) extends Receiver[String](StorageLevel.MEMORY_ONLY) {
    // Called when the receiver starts: read data and hand it to Spark.
    override def onStart(): Unit = {
      new Thread("Socket Receiver") {
        override def run(): Unit = {
          receiver()
        }
      }.start()
    }

    // Read data and hand it to Spark.
    def receiver(): Unit = {
      // open a socket
      val socket = new Socket(host, port)
      // variable holding the data read from the port
      var input: String = null
      // BufferedReader for reading data from the port
      val reader = new BufferedReader(new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8))
      // read data
      input = reader.readLine()
      // while the receiver is not stopped and the input is not null, keep sending data to Spark
      while (!isStopped() && input != null) {
        store(input)
        input = reader.readLine()
      }
      // after the loop, close the resources
      reader.close()
      socket.close()
      // restart the receiver
      restart("restart")
    }

    override def onStop(): Unit = {
    }
  }
}
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.10.1</version>
</dependency>
scp -r kafka_2.11-1.0.0 node2:`pwd`
scp -r kafka_2.11-1.0.0 node1:`pwd`
zkServer.sh start
zkServer.sh status
kafka-server-start.sh -daemon /usr/local/soft/kafka_2.11-1.0.0/config/server.properties
kafka-topics.sh --create --zookeeper master:2181,node1:2181,node2:2181 --replication-factor 2 --partitions 3 --topic test_topic1
kafka-topics.sh --describe --zookeeper master:2181,node1:2181,node2:2181 --topic test_topic1
kafka-topics.sh --list --zookeeper master:2181,node1:2181,node2:2181
kafka-console-producer.sh --broker-list master:9092,node1:9092,node2:9092 --topic test_topic1
kafka-console-consumer.sh --bootstrap-server master:9092,node1:9092,node2:9092 --from-beginning --topic test_topic1
bin/kafka-server-start.sh -daemon /usr/local/soft/kafka/config/server.properties
bin/kafka-topics.sh --create --zookeeper master:2181,node1:2181,node2:2181 --replication-factor 2 --partitions 3 --topic shujia
bin/kafka-topics.sh --describe --zookeeper master:2181,node1:2181,node2:2181 --topic kong
bin/kafka-topics.sh --list --zookeeper master:2181,node1:2181,node2:2181
bin/kafka-console-producer.sh --broker-list master:9092,node1:9092,node2:9092 --topic kong