import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
val conf=new SparkConf().setMaster("local[2]").setAppName("kgc streaming demo")
val ssc=new StreamingContext(conf,Seconds(8))
1. Only one StreamingContext can be active in a single JVM at a time.
2. Once a StreamingContext has been stopped, it cannot be restarted.
In spark-shell, this surfaces as the following error:
org.apache.spark.SparkException: Only one SparkContext may be running in this JVM
Workarounds:
Method 1: sc.stop // stop the SparkContext that spark-shell starts automatically, before creating the ssc
Method 2: or build the ssc from the existing sc: val ssc = new StreamingContext(sc, Seconds(8))
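Putting the two workarounds together, a minimal spark-shell sketch (use one or the other):
// Method 1: stop the shell's SparkContext, then create the ssc from a fresh conf
sc.stop
val conf = new SparkConf().setMaster("local[2]").setAppName("kgc streaming demo")
val ssc = new StreamingContext(conf, Seconds(8))
// Method 2 (alternative): keep the shell's sc and wrap it directly
// val ssc = new StreamingContext(sc, Seconds(8))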
$nc -lk 9999 // data server: after the ssc has started, type test data here and watch the Spark Streaming output
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
// local[n]: n must be greater than the number of receivers
val sparkConf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
val ssc = new StreamingContext(sparkConf, Seconds(1))
//DStream
val lines = ssc.socketTextStream("localhost", 9999) // specify the data source
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.print()
ssc.start()
ssc.awaitTermination()
Every input DStream (except file streams) is associated with a Receiver, a dedicated object that pulls data from the source into memory; a minimal custom receiver is sketched after the API list below.
def textFileStream(directory: String): DStream[String]
def socketTextStream(hostname: String, port: Int, storageLevel: StorageLevel): ReceiverInputDStream[String]
val ds = FlumeUtils.createPollingStream(streamCtx, [sink hostname], [sink port])
val ds = KafkaUtils.createStream(streamCtx, zooKeeper, consumerGrp, topicMap)
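To make the receiver concept concrete, here is a minimal custom-receiver sketch using the standard Receiver API; the class name LineReceiver and its socket data source are illustrative assumptions, not part of the course code:
import java.net.Socket
import scala.io.Source
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// Hypothetical receiver: pulls text lines from a TCP socket into Spark's memory via store()
class LineReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  override def onStart(): Unit = {
    // run the blocking read loop on its own thread so onStart() returns immediately
    new Thread("Line Receiver") {
      override def run(): Unit = receive()
    }.start()
  }

  override def onStop(): Unit = {} // the reading thread exits once isStopped() is true

  private def receive(): Unit = {
    val socket = new Socket(host, port)
    val reader = Source.fromInputStream(socket.getInputStream).getLines()
    while (!isStopped() && reader.hasNext) {
      store(reader.next()) // hand each record over to Spark Streaming
    }
    socket.close()
    restart("Trying to reconnect") // ask Spark to restart this receiver
  }
}
// usage: val lines = ssc.receiverStream(new LineReceiver("localhost", 9999))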
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
val input1 = List((1, true), (2, false), (3, false), (4, true), (5, false))
val input2 = List((1, false), (2, false), (3, true), (4, true), (5, true))
val rdd1 = sc.parallelize(input1)
val rdd2 = sc.parallelize(input2)
val ssc = new StreamingContext(sc, Seconds(3))
import scala.collection.mutable
val ds1 = ssc.queueStream[(Int, Boolean)](mutable.Queue(rdd1))
val ds2 = ssc.queueStream[(Int, Boolean)](mutable.Queue(rdd2))
val ds = ds1.join(ds2)
ds.print()
ssc.start()
ssc.awaitTerminationOrTimeout(5000)
ssc.stop()
// RDD containing spam information
// created from the Hadoop API
val spamRDD = ssc.sparkContext.newAPIHadoopRDD(...)
val cleanedDStream = wordCounts.transform { rdd =>
  // join the stream with the spam information to clean the data
  rdd.join(spamRDD).filter( /* code... */ )
  // other operations...
}
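Since the join and filter details are elided above, here is a self-contained sketch of the same transform pattern; the blacklist name, contents and field layout are made up for illustration:
// static blacklist RDD built once on the driver (illustrative data)
val blacklistRDD = ssc.sparkContext.parallelize(Seq(("spamword", true)))
val cleaned = wordCounts.transform { rdd =>
  // left-join each batch against the blacklist and drop flagged words
  rdd.leftOuterJoin(blacklistRDD)
    .filter { case (_, (_, flagged)) => flagged.isEmpty }
    .map { case (word, (count, _)) => (word, count) }
}
cleaned.print()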
// Wrong
dstream.foreachRDD { rdd =>
  val connection = createNewConnection() // executed on the driver
  rdd.foreach { record =>
    connection.send(record) // executed on the workers
  }
}
// Correct
dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    val connection = createNewConnection()
    partitionOfRecords.foreach(record => connection.send(record))
  }
}
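This can be improved further by reusing connections across batches; ConnectionPool below is a hypothetical, lazily initialized static pool, following the pattern suggested in the Spark Streaming programming guide:
// Better: reuse connections via a (hypothetical) static, lazily created pool
dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    val connection = ConnectionPool.getConnection() // assumed helper, not a Spark API
    partitionOfRecords.foreach(record => connection.send(record))
    ConnectionPool.returnConnection(connection) // return for reuse across batches
  }
}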
val sparkConf = new SparkConf().setAppName("HdfsWordCount").setMaster("local[2]")
val ssc = new StreamingContext(sparkConf, Seconds(2))
// create a FileInputDStream to read data from the file system
val lines = ssc.textFileStream("/data/input") // after starting, upload text files to this HDFS directory and watch the output
// split each line on spaces
val words = lines.flatMap(_.split(" "))
// as with RDD programming, map each word to 1 and sum the counts by key
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.print()
ssc.start()
ssc.awaitTermination()
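One way to test this, assuming words.txt is any local text file (the file name is arbitrary):
$hdfs dfs -put words.txt /data/input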
// define the state update function
def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
val curr = currentValues.sum
val pre = preValues.getOrElse(0)
Some(curr + pre)
}
val sparkConf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
val ssc = new StreamingContext(sparkConf, Seconds(5))
ssc.checkpoint(".")
val lines = ssc.socketTextStream("localhost", 6789)
val result = lines.flatMap(_.split(" ")).map((_, 1))
val state = result.updateStateByKey(updateFunction)
state.print()
ssc.start()
ssc.awaitTermination()
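An alternative worth noting (Spark 1.6+) is mapWithState, which only processes keys that appear in the current batch; a minimal sketch that could replace updateStateByKey above (checkpointing is still required):
import org.apache.spark.streaming.{State, StateSpec}
// (word, value from this batch, running state) => record to emit
val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
  val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
  state.update(sum) // keep the running total per word
  (word, sum)
}
val stateDStream = result.mapWithState(StateSpec.function(mappingFunc))
stateDStream.print()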
case class Word(word:String)
val sparkConf = new SparkConf().setAppName("NetworkSQLWordCount").setMaster("local[2]")
val ssc = new StreamingContext(sparkConf, Seconds(5))
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder.config(sparkConf).getOrCreate()
val lines = ssc.socketTextStream("localhost", 6789)
val result = lines.flatMap(_.split(" "))
result.print()
result.foreachRDD(rdd => {
  if (rdd.count() != 0) {
    import spark.implicits._
    // convert the RDD to a DataFrame
    val df = rdd.map(x => Word(x)).toDF
    df.createOrReplaceTempView("tb_word")
    spark.sql("select word, count(*) from tb_word group by word").show
  }
})
ssc.start()
ssc.awaitTermination()
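For reference, the same aggregation can be expressed with the DataFrame API instead of registering a view:
// equivalent to the SQL query above, run inside the same foreachRDD block
df.groupBy("word").count().show()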
# SparkSink (pull approach): Spark uses a Flume receiver to pull data from the sink
simple-agent.sinks.spark-sink.type=org.apache.spark.streaming.flume.sink.SparkSink
simple-agent.sinks.spark-sink.channel=netcat-memory-channel
simple-agent.sinks.spark-sink.hostname=localhost
simple-agent.sinks.spark-sink.port=41414
// key code for the pull approach
val flumeStream=FlumeUtils.createPollingStream(ssc,"localhost",41414,StorageLevel.MEMORY_ONLY_SER_2)
flumeStream.map(x=>new String(x.event.getBody.array()).trim).flatMap(_.split(" "))
……
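The elided part is not shown in the source; one plausible continuation, mirroring the earlier word-count examples, might look like this:
// hypothetical completion of the pull-mode word count
flumeStream.map(x => new String(x.event.getBody.array()).trim)
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .print()
ssc.start()
ssc.awaitTermination()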
$/opt/flume/bin/flume-ng agent --name simple-agent \
--conf-file ./flume_push_streaming.conf -Dflume.root.logger=INFO,console &
$spark-submit \
--class cn.kgc.FlumePushWordCount \
--jars spark-streaming-flume_2.11-2.3.0.cloudera1.jar,/opt/flume/lib/flume-ng-sdk-1.8.0.jar \
./sparkdemo-1.0-SNAPSHOT.jar localhost 41414
# Open a new terminal for testing; 44444 is the netcat port that the Flume agent source listens on
telnet localhost 44444
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, LocationStrategies}
val Array(brokers, topics) = args
val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount").setMaster("local[1]")
val ssc = new StreamingContext(sparkConf, Seconds(2))
val topicsSet = topics.split(",").toSet
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> brokers,
  // the deserializers and group.id are required by the kafka010 consumer; the group name here is arbitrary
  "key.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
  "value.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
  "group.id" -> "kafka-wordcount-demo")
val messages = KafkaUtils.createDirectStream[String, String](ssc,LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topicsSet,kafkaParams))
messages.map(_.value()) // extract the value
  .flatMap(_.split(" ")) // split each string on spaces
  .map(word => (word, 1)) // map each word to a pair
  .reduceByKey(_ + _) // sum the counts for each key
  .print() // print the first 10 records
ssc.start()
ssc.awaitTermination()
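When offsets need to be tracked explicitly, the kafka010 integration also lets the application commit them back to Kafka itself; a minimal sketch on the same messages stream (registered before ssc.start(), with enable.auto.commit set to false in kafkaParams):
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
messages.foreachRDD { rdd =>
  // offset ranges covered by this micro-batch
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch ...
  // commit the offsets back to Kafka asynchronously once processing is done
  messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}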